1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 
74 /* Note on debug output:
75  *
76  * This is set up so that -Dr turns on debugging like all other flags that are
77  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
78  * all regular expressions encountered in a program, and gives a huge amount of
79  * output for all but the shortest programs.
80  *
81  * The ability to output pattern debugging information lexically, and with much
82  * finer grained control was added, with 'use re qw(Debug ....);' available even
83  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
84  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85  * Those files are compiled and linked into the perl executable, and they are
86  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
87  * to re.pm.
88  *
89  * That would normally mean linking errors when two functions of the same name
90  * are attempted to be placed into the same executable.  That is solved in one
91  * of four ways:
92  *  1)  Static functions aren't known outside the file they are in, so for the
93  *      many functions of that type in this file, it just isn't a problem.
94  *  2)  Most externally known functions are enclosed in
95  *          #ifndef PERL_IN_XSUB_RE
96  *          ...
97  *          #endif
98  *      blocks, so there is only one defintion for them in the whole
99  *      executable, the one in regcomp.c (or regexec.c).  The implication of
100  *      that is any debugging info that comes from them is controlled only by
101  *      -Dr.  Further, any static function they call will also be the version
102  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
104  *      have different names, so that what gets loaded in the executable is
105  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
107  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108  *      versions and their callees are under control of re.pm.   The catch is
109  *      that references to all these go through the regexp_engine structure,
110  *      which is initialized in regcomp.h to the Perl_foo versions, and
111  *      substituted out in lexical scopes where 'use re' is in effect to the
112  *      'my_foo' ones.   That structure is public API, so it would be a hard
113  *      sell to add any additional members.
114  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
115  *      respectively, regexec.c and re_exec.c, they can have two different
116  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
117  *      embed.fnc.
118  *
119  * The bottom line is that if you add code to one of the public functions
120  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
121  * a new function that needs to do debugging or there is a chain of calls from
122  * it that need to do debugging, all functions in the chain should use options
123  * 2) or 4) above.
124  *
125  * A function may have to be split so that debugging stuff is static, but it
126  * calls out to some other function that only gets compiled in regcomp.c to
127  * access data that we don't want to duplicate.
128  */
129 
130 #include "EXTERN.h"
131 #define PERL_IN_REGCOMP_C
132 #include "perl.h"
133 
134 #define REG_COMP_C
135 #ifdef PERL_IN_XSUB_RE
136 #  include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
139 #else
140 #  include "regcomp.h"
141 #endif
142 
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
145 
146 #ifndef STATIC
147 #define	STATIC	static
148 #endif
149 
150 /* this is a chain of data about sub patterns we are processing that
151    need to be handled separately/specially in study_chunk. Its so
152    we can simulate recursion without losing state.  */
153 struct scan_frame;
154 typedef struct scan_frame {
155     regnode *last_regnode;      /* last node to process in this frame */
156     regnode *next_regnode;      /* next node to process when last is reached */
157     U32 prev_recursed_depth;
158     I32 stopparen;              /* what stopparen do we use */
159     bool in_gosub;              /* this or an outer frame is for GOSUB */
160 
161     struct scan_frame *this_prev_frame; /* this previous frame */
162     struct scan_frame *prev_frame;      /* previous frame */
163     struct scan_frame *next_frame;      /* next frame */
164 } scan_frame;
165 
166 /* Certain characters are output as a sequence with the first being a
167  * backslash. */
168 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
169 
170 
171 struct RExC_state_t {
172     U32		flags;			/* RXf_* are we folding, multilining? */
173     U32		pm_flags;		/* PMf_* stuff from the calling PMOP */
174     char	*precomp;		/* uncompiled string. */
175     char	*precomp_end;		/* pointer to end of uncompiled string. */
176     REGEXP	*rx_sv;			/* The SV that is the regexp. */
177     regexp	*rx;                    /* perl core regexp structure */
178     regexp_internal	*rxi;           /* internal data for regexp object
179                                            pprivate field */
180     char	*start;			/* Start of input for compile */
181     char	*end;			/* End of input for compile */
182     char	*parse;			/* Input-scan pointer. */
183     char        *copy_start;            /* start of copy of input within
184                                            constructed parse string */
185     char        *save_copy_start;       /* Provides one level of saving
186                                            and restoring 'copy_start' */
187     char        *copy_start_in_input;   /* Position in input string
188                                            corresponding to copy_start */
189     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
190     regnode	*emit_start;		/* Start of emitted-code area */
191     regnode_offset emit;		/* Code-emit pointer */
192     I32		naughty;		/* How bad is this pattern? */
193     I32		sawback;		/* Did we see \1, ...? */
194     SSize_t	size;			/* Number of regnode equivalents in
195                                            pattern */
196     Size_t      sets_depth;              /* Counts recursion depth of already-
197                                            compiled regex set patterns */
198     U32		seen;
199 
200     I32      parens_buf_size;           /* #slots malloced open/close_parens */
201     regnode_offset *open_parens;	/* offsets to open parens */
202     regnode_offset *close_parens;	/* offsets to close parens */
203     HV		*paren_names;		/* Paren names */
204 
205     /* position beyond 'precomp' of the warning message furthest away from
206      * 'precomp'.  During the parse, no warnings are raised for any problems
207      * earlier in the parse than this position.  This works if warnings are
208      * raised the first time a given spot is parsed, and if only one
209      * independent warning is raised for any given spot */
210     Size_t	latest_warn_offset;
211 
212     I32         npar;                   /* Capture buffer count so far in the
213                                            parse, (OPEN) plus one. ("par" 0 is
214                                            the whole pattern)*/
215     I32         total_par;              /* During initial parse, is either 0,
216                                            or -1; the latter indicating a
217                                            reparse is needed.  After that pass,
218                                            it is what 'npar' became after the
219                                            pass.  Hence, it being > 0 indicates
220                                            we are in a reparse situation */
221     I32		nestroot;		/* root parens we are in - used by
222                                            accept */
223     I32		seen_zerolen;
224     regnode     *end_op;                /* END node in program */
225     I32		utf8;		/* whether the pattern is utf8 or not */
226     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
227 				/* XXX use this for future optimisation of case
228 				 * where pattern must be upgraded to utf8. */
229     I32		uni_semantics;	/* If a d charset modifier should use unicode
230 				   rules, even if the pattern is not in
231 				   utf8 */
232 
233     I32         recurse_count;          /* Number of recurse regops we have generated */
234     regnode	**recurse;		/* Recurse regops */
235     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
236                                            through */
237     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
238     I32		in_lookaround;
239     I32		contains_locale;
240     I32		override_recoding;
241     I32         recode_x_to_native;
242     I32		in_multi_char_class;
243     int		code_index;		/* next code_blocks[] slot */
244     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
245 					    within pattern */
246     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
247     scan_frame *frame_head;
248     scan_frame *frame_last;
249     U32         frame_count;
250     AV         *warn_text;
251     HV         *unlexed_names;
252     SV		*runtime_code_qr;	/* qr with the runtime code blocks */
253 #ifdef DEBUGGING
254     const char  *lastparse;
255     I32         lastnum;
256     U32         study_chunk_recursed_count;
257     AV          *paren_name_list;       /* idx -> name */
258     SV          *mysv1;
259     SV          *mysv2;
260 
261 #define RExC_lastparse	(pRExC_state->lastparse)
262 #define RExC_lastnum	(pRExC_state->lastnum)
263 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
264 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
265 #define RExC_mysv	(pRExC_state->mysv1)
266 #define RExC_mysv1	(pRExC_state->mysv1)
267 #define RExC_mysv2	(pRExC_state->mysv2)
268 
269 #endif
270     bool        seen_d_op;
271     bool        strict;
272     bool        study_started;
273     bool        in_script_run;
274     bool        use_BRANCHJ;
275     bool        sWARN_EXPERIMENTAL__VLB;
276     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
277 };
278 
279 #define RExC_flags	(pRExC_state->flags)
280 #define RExC_pm_flags	(pRExC_state->pm_flags)
281 #define RExC_precomp	(pRExC_state->precomp)
282 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
283 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
284 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
285 #define RExC_precomp_end (pRExC_state->precomp_end)
286 #define RExC_rx_sv	(pRExC_state->rx_sv)
287 #define RExC_rx		(pRExC_state->rx)
288 #define RExC_rxi	(pRExC_state->rxi)
289 #define RExC_start	(pRExC_state->start)
290 #define RExC_end	(pRExC_state->end)
291 #define RExC_parse	(pRExC_state->parse)
292 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
293 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
294 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
295                                                    under /d from /u ? */
296 
297 #ifdef RE_TRACK_PATTERN_OFFSETS
298 #  define RExC_offsets	(RExC_rxi->u.offsets) /* I am not like the
299                                                          others */
300 #endif
301 #define RExC_emit	(pRExC_state->emit)
302 #define RExC_emit_start	(pRExC_state->emit_start)
303 #define RExC_sawback	(pRExC_state->sawback)
304 #define RExC_seen	(pRExC_state->seen)
305 #define RExC_size	(pRExC_state->size)
306 #define RExC_maxlen        (pRExC_state->maxlen)
307 #define RExC_npar	(pRExC_state->npar)
308 #define RExC_total_parens	(pRExC_state->total_par)
309 #define RExC_parens_buf_size	(pRExC_state->parens_buf_size)
310 #define RExC_nestroot   (pRExC_state->nestroot)
311 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
312 #define RExC_utf8	(pRExC_state->utf8)
313 #define RExC_uni_semantics	(pRExC_state->uni_semantics)
314 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
315 #define RExC_open_parens	(pRExC_state->open_parens)
316 #define RExC_close_parens	(pRExC_state->close_parens)
317 #define RExC_end_op	(pRExC_state->end_op)
318 #define RExC_paren_names	(pRExC_state->paren_names)
319 #define RExC_recurse	(pRExC_state->recurse)
320 #define RExC_recurse_count	(pRExC_state->recurse_count)
321 #define RExC_sets_depth         (pRExC_state->sets_depth)
322 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
323 #define RExC_study_chunk_recursed_bytes  \
324                                    (pRExC_state->study_chunk_recursed_bytes)
325 #define RExC_in_lookaround	(pRExC_state->in_lookaround)
326 #define RExC_contains_locale	(pRExC_state->contains_locale)
327 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
328 
329 #ifdef EBCDIC
330 #  define SET_recode_x_to_native(x)                                         \
331                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
332 #else
333 #  define SET_recode_x_to_native(x) NOOP
334 #endif
335 
336 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
337 #define RExC_frame_head (pRExC_state->frame_head)
338 #define RExC_frame_last (pRExC_state->frame_last)
339 #define RExC_frame_count (pRExC_state->frame_count)
340 #define RExC_strict (pRExC_state->strict)
341 #define RExC_study_started      (pRExC_state->study_started)
342 #define RExC_warn_text (pRExC_state->warn_text)
343 #define RExC_in_script_run      (pRExC_state->in_script_run)
344 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
345 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
346 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
347 #define RExC_unlexed_names (pRExC_state->unlexed_names)
348 
349 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
350  * a flag to disable back-off on the fixed/floating substrings - if it's
351  * a high complexity pattern we assume the benefit of avoiding a full match
352  * is worth the cost of checking for the substrings even if they rarely help.
353  */
354 #define RExC_naughty	(pRExC_state->naughty)
355 #define TOO_NAUGHTY (10)
356 #define MARK_NAUGHTY(add) \
357     if (RExC_naughty < TOO_NAUGHTY) \
358         RExC_naughty += (add)
359 #define MARK_NAUGHTY_EXP(exp, add) \
360     if (RExC_naughty < TOO_NAUGHTY) \
361         RExC_naughty += RExC_naughty / (exp) + (add)
362 
363 #define	isNON_BRACE_QUANTIFIER(c)   ((c) == '*' || (c) == '+' || (c) == '?')
364 #define	isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*s)                      \
365                             || ((*s) == '{' && regcurly(s, e, NULL)))
366 
367 /*
368  * Flags to be passed up and down.
369  */
370 #define	HASWIDTH	0x01	/* Known to not match null strings, could match
371                                    non-null ones. */
372 #define	SIMPLE		0x02    /* Exactly one character wide */
373                                 /* (or LNBREAK as a special case) */
374 #define POSTPONED	0x08    /* (?1),(?&name), (??{...}) or similar */
375 #define TRYAGAIN	0x10	/* Weeded out a declaration. */
376 #define RESTART_PARSE   0x20    /* Need to redo the parse */
377 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
378                                    calcuate sizes as UTF-8 */
379 
380 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
381 
382 /* whether trie related optimizations are enabled */
383 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
384 #define TRIE_STUDY_OPT
385 #define FULL_TRIE_STUDY
386 #define TRIE_STCLASS
387 #endif
388 
389 
390 
391 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
392 #define PBITVAL(paren) (1 << ((paren) & 7))
393 #define PAREN_OFFSET(depth) \
394     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
395 #define PAREN_TEST(depth, paren) \
396     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
397 #define PAREN_SET(depth, paren) \
398     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
399 #define PAREN_UNSET(depth, paren) \
400     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
401 
402 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
403                                      if (!UTF) {                           \
404                                          *flagp = RESTART_PARSE|NEED_UTF8; \
405                                          return 0;                         \
406                                      }                                     \
407                              } STMT_END
408 
409 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
410  * pattern is in UTF-8.  This latter condition is in case the outermost rules
411  * are locale.  See GH #17278 */
412 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
413 
414 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
415  * a flag that indicates we need to override /d with /u as a result of
416  * something in the pattern.  It should only be used in regards to calling
417  * set_regex_charset() or get_regex_charset() */
418 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
419     STMT_START {                                                            \
420             if (DEPENDS_SEMANTICS) {                                        \
421                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
422                 RExC_uni_semantics = 1;                                     \
423                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
424                     /* No need to restart the parse if we haven't seen      \
425                      * anything that differs between /u and /d, and no need \
426                      * to restart immediately if we're going to reparse     \
427                      * anyway to count parens */                            \
428                     *flagp |= RESTART_PARSE;                                \
429                     return restart_retval;                                  \
430                 }                                                           \
431             }                                                               \
432     } STMT_END
433 
434 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
435     STMT_START {                                                            \
436                 RExC_use_BRANCHJ = 1;                                       \
437                 *flagp |= RESTART_PARSE;                                    \
438                 return restart_retval;                                      \
439     } STMT_END
440 
441 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
442  * less.  After that, it must always be positive, because the whole re is
443  * considered to be surrounded by virtual parens.  Setting it to negative
444  * indicates there is some construct that needs to know the actual number of
445  * parens to be properly handled.  And that means an extra pass will be
446  * required after we've counted them all */
447 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
448 #define REQUIRE_PARENS_PASS                                                 \
449     STMT_START {  /* No-op if have completed a pass */                      \
450                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
451     } STMT_END
452 #define IN_PARENS_PASS (RExC_total_parens < 0)
453 
454 
455 /* This is used to return failure (zero) early from the calling function if
456  * various flags in 'flags' are set.  Two flags always cause a return:
457  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
458  * additional flags that should cause a return; 0 if none.  If the return will
459  * be done, '*flagp' is first set to be all of the flags that caused the
460  * return. */
461 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
462     STMT_START {                                                            \
463             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
464                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
465                 return 0;                                                   \
466             }                                                               \
467     } STMT_END
468 
469 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
470 
471 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
472                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
473 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
474                                     if (MUST_RESTART(*(flagp))) return 0
475 
476 /* This converts the named class defined in regcomp.h to its equivalent class
477  * number defined in handy.h. */
478 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
479 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
480 
481 #define _invlist_union_complement_2nd(a, b, output) \
482                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
483 #define _invlist_intersection_complement_2nd(a, b, output) \
484                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
485 
486 /* We add a marker if we are deferring expansion of a property that is both
487  * 1) potentiallly user-defined; and
488  * 2) could also be an official Unicode property.
489  *
490  * Without this marker, any deferred expansion can only be for a user-defined
491  * one.  This marker shouldn't conflict with any that could be in a legal name,
492  * and is appended to its name to indicate this.  There is a string and
493  * character form */
494 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
495 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
496 
497 /* What is infinity for optimization purposes */
498 #define OPTIMIZE_INFTY  SSize_t_MAX
499 
500 /* About scan_data_t.
501 
502   During optimisation we recurse through the regexp program performing
503   various inplace (keyhole style) optimisations. In addition study_chunk
504   and scan_commit populate this data structure with information about
505   what strings MUST appear in the pattern. We look for the longest
506   string that must appear at a fixed location, and we look for the
507   longest string that may appear at a floating location. So for instance
508   in the pattern:
509 
510     /FOO[xX]A.*B[xX]BAR/
511 
512   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
513   strings (because they follow a .* construct). study_chunk will identify
514   both FOO and BAR as being the longest fixed and floating strings respectively.
515 
516   The strings can be composites, for instance
517 
518      /(f)(o)(o)/
519 
520   will result in a composite fixed substring 'foo'.
521 
522   For each string some basic information is maintained:
523 
524   - min_offset
525     This is the position the string must appear at, or not before.
526     It also implicitly (when combined with minlenp) tells us how many
527     characters must match before the string we are searching for.
528     Likewise when combined with minlenp and the length of the string it
529     tells us how many characters must appear after the string we have
530     found.
531 
532   - max_offset
533     Only used for floating strings. This is the rightmost point that
534     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
535     string can occur infinitely far to the right.
536     For fixed strings, it is equal to min_offset.
537 
538   - minlenp
539     A pointer to the minimum number of characters of the pattern that the
540     string was found inside. This is important as in the case of positive
541     lookahead or positive lookbehind we can have multiple patterns
542     involved. Consider
543 
544     /(?=FOO).*F/
545 
546     The minimum length of the pattern overall is 3, the minimum length
547     of the lookahead part is 3, but the minimum length of the part that
548     will actually match is 1. So 'FOO's minimum length is 3, but the
549     minimum length for the F is 1. This is important as the minimum length
550     is used to determine offsets in front of and behind the string being
551     looked for.  Since strings can be composites this is the length of the
552     pattern at the time it was committed with a scan_commit. Note that
553     the length is calculated by study_chunk, so that the minimum lengths
554     are not known until the full pattern has been compiled, thus the
555     pointer to the value.
556 
557   - lookbehind
558 
559     In the case of lookbehind the string being searched for can be
560     offset past the start point of the final matching string.
561     If this value was just blithely removed from the min_offset it would
562     invalidate some of the calculations for how many chars must match
563     before or after (as they are derived from min_offset and minlen and
564     the length of the string being searched for).
565     When the final pattern is compiled and the data is moved from the
566     scan_data_t structure into the regexp structure the information
567     about lookbehind is factored in, with the information that would
568     have been lost precalculated in the end_shift field for the
569     associated string.
570 
571   The fields pos_min and pos_delta are used to store the minimum offset
572   and the delta to the maximum offset at the current point in the pattern.
573 
574 */
575 
576 struct scan_data_substrs {
577     SV      *str;       /* longest substring found in pattern */
578     SSize_t min_offset; /* earliest point in string it can appear */
579     SSize_t max_offset; /* latest point in string it can appear */
580     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
581     SSize_t lookbehind; /* is the pos of the string modified by LB */
582     I32 flags;          /* per substring SF_* and SCF_* flags */
583 };
584 
585 typedef struct scan_data_t {
586     /*I32 len_min;      unused */
587     /*I32 len_delta;    unused */
588     SSize_t pos_min;
589     SSize_t pos_delta;
590     SV *last_found;
591     SSize_t last_end;	    /* min value, <0 unless valid. */
592     SSize_t last_start_min;
593     SSize_t last_start_max;
594     U8      cur_is_floating; /* whether the last_* values should be set as
595                               * the next fixed (0) or floating (1)
596                               * substring */
597 
598     /* [0] is longest fixed substring so far, [1] is longest float so far */
599     struct scan_data_substrs  substrs[2];
600 
601     I32 flags;             /* common SF_* and SCF_* flags */
602     I32 whilem_c;
603     SSize_t *last_closep;
604     regnode_ssc *start_class;
605 } scan_data_t;
606 
607 /*
608  * Forward declarations for pregcomp()'s friends.
609  */
610 
611 static const scan_data_t zero_scan_data = {
612     0, 0, NULL, 0, 0, 0, 0,
613     {
614         { NULL, 0, 0, 0, 0, 0 },
615         { NULL, 0, 0, 0, 0, 0 },
616     },
617     0, 0, NULL, NULL
618 };
619 
620 /* study flags */
621 
622 #define SF_BEFORE_SEOL		0x0001
623 #define SF_BEFORE_MEOL		0x0002
624 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
625 
626 #define SF_IS_INF		0x0040
627 #define SF_HAS_PAR		0x0080
628 #define SF_IN_PAR		0x0100
629 #define SF_HAS_EVAL		0x0200
630 
631 
632 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
633  * longest substring in the pattern. When it is not set the optimiser keeps
634  * track of position, but does not keep track of the actual strings seen,
635  *
636  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
637  * /foo/i will not.
638  *
639  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
640  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
641  * turned off because of the alternation (BRANCH). */
642 #define SCF_DO_SUBSTR		0x0400
643 
644 #define SCF_DO_STCLASS_AND	0x0800
645 #define SCF_DO_STCLASS_OR	0x1000
646 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
647 #define SCF_WHILEM_VISITED_POS	0x2000
648 
649 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
650 #define SCF_SEEN_ACCEPT         0x8000
651 #define SCF_TRIE_DOING_RESTUDY 0x10000
652 #define SCF_IN_DEFINE          0x20000
653 
654 
655 
656 
657 #define UTF cBOOL(RExC_utf8)
658 
659 /* The enums for all these are ordered so things work out correctly */
660 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
661 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
662                                                      == REGEX_DEPENDS_CHARSET)
663 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
664 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
665                                                      >= REGEX_UNICODE_CHARSET)
666 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
667                                             == REGEX_ASCII_RESTRICTED_CHARSET)
668 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
669                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
670 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
671                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
672 
673 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
674 
675 /* For programs that want to be strictly Unicode compatible by dying if any
676  * attempt is made to match a non-Unicode code point against a Unicode
677  * property.  */
678 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
679 
680 #define OOB_NAMEDCLASS		-1
681 
682 /* There is no code point that is out-of-bounds, so this is problematic.  But
683  * its only current use is to initialize a variable that is always set before
684  * looked at. */
685 #define OOB_UNICODE		0xDEADBEEF
686 
687 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
688 
689 
690 /* length of regex to show in messages that don't mark a position within */
691 #define RegexLengthToShowInErrorMessages 127
692 
693 /*
694  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
695  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
696  * op/pragma/warn/regcomp.
697  */
698 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
699 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
700 
701 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
702                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
703 
704 /* The code in this file in places uses one level of recursion with parsing
705  * rebased to an alternate string constructed by us in memory.  This can take
706  * the form of something that is completely different from the input, or
707  * something that uses the input as part of the alternate.  In the first case,
708  * there should be no possibility of an error, as we are in complete control of
709  * the alternate string.  But in the second case we don't completely control
710  * the input portion, so there may be errors in that.  Here's an example:
711  *      /[abc\x{DF}def]/ui
712  * is handled specially because \x{df} folds to a sequence of more than one
713  * character: 'ss'.  What is done is to create and parse an alternate string,
714  * which looks like this:
715  *      /(?:\x{DF}|[abc\x{DF}def])/ui
716  * where it uses the input unchanged in the middle of something it constructs,
717  * which is a branch for the DF outside the character class, and clustering
718  * parens around the whole thing. (It knows enough to skip the DF inside the
719  * class while in this substitute parse.) 'abc' and 'def' may have errors that
720  * need to be reported.  The general situation looks like this:
721  *
722  *                                       |<------- identical ------>|
723  *              sI                       tI               xI       eI
724  * Input:       ---------------------------------------------------------------
725  * Constructed:         ---------------------------------------------------
726  *                      sC               tC               xC       eC     EC
727  *                                       |<------- identical ------>|
728  *
729  * sI..eI   is the portion of the input pattern we are concerned with here.
730  * sC..EC   is the constructed substitute parse string.
731  *  sC..tC  is constructed by us
732  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
733  *          In the diagram, these are vertically aligned.
734  *  eC..EC  is also constructed by us.
735  * xC       is the position in the substitute parse string where we found a
736  *          problem.
737  * xI       is the position in the original pattern corresponding to xC.
738  *
739  * We want to display a message showing the real input string.  Thus we need to
740  * translate from xC to xI.  We know that xC >= tC, since the portion of the
741  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
742  * get:
743  *      xI = tI + (xC - tC)
744  *
745  * When the substitute parse is constructed, the code needs to set:
746  *      RExC_start (sC)
747  *      RExC_end (eC)
748  *      RExC_copy_start_in_input  (tI)
749  *      RExC_copy_start_in_constructed (tC)
750  * and restore them when done.
751  *
752  * During normal processing of the input pattern, both
753  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
754  * sI, so that xC equals xI.
755  */
756 
757 #define sI              RExC_precomp
758 #define eI              RExC_precomp_end
759 #define sC              RExC_start
760 #define eC              RExC_end
761 #define tI              RExC_copy_start_in_input
762 #define tC              RExC_copy_start_in_constructed
763 #define xI(xC)          (tI + (xC - tC))
764 #define xI_offset(xC)   (xI(xC) - sI)
765 
766 #define REPORT_LOCATION_ARGS(xC)                                            \
767     UTF8fARG(UTF,                                                           \
768              (xI(xC) > eI) /* Don't run off end */                          \
769               ? eI - sI   /* Length before the <--HERE */                   \
770               : ((xI_offset(xC) >= 0)                                       \
771                  ? xI_offset(xC)                                            \
772                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
773                                     IVdf " trying to output message for "   \
774                                     " pattern %.*s",                        \
775                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
776                                     ((int) (eC - sC)), sC), 0)),            \
777              sI),         /* The input pattern printed up to the <--HERE */ \
778     UTF8fARG(UTF,                                                           \
779              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
780              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
781 
782 /* Used to point after bad bytes for an error message, but avoid skipping
783  * past a nul byte. */
784 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
785 
786 /* Set up to clean up after our imminent demise */
787 #define PREPARE_TO_DIE                                                      \
788     STMT_START {					                    \
789         if (RExC_rx_sv)                                                     \
790             SAVEFREESV(RExC_rx_sv);                                         \
791         if (RExC_open_parens)                                               \
792             SAVEFREEPV(RExC_open_parens);                                   \
793         if (RExC_close_parens)                                              \
794             SAVEFREEPV(RExC_close_parens);                                  \
795     } STMT_END
796 
797 /*
798  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
799  * arg. Show regex, up to a maximum length. If it's too long, chop and add
800  * "...".
801  */
802 #define _FAIL(code) STMT_START {					\
803     const char *ellipses = "";						\
804     IV len = RExC_precomp_end - RExC_precomp;				\
805 									\
806     PREPARE_TO_DIE;						        \
807     if (len > RegexLengthToShowInErrorMessages) {			\
808 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
809 	len = RegexLengthToShowInErrorMessages - 10;			\
810 	ellipses = "...";						\
811     }									\
812     code;                                                               \
813 } STMT_END
814 
815 #define	FAIL(msg) _FAIL(			    \
816     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",	    \
817 	    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
818 
819 #define	FAIL2(msg,arg) _FAIL(			    \
820     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",	    \
821 	    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
822 
823 #define	FAIL3(msg,arg1,arg2) _FAIL(			    \
824     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",	    \
825      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
826 
827 /*
828  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
829  */
830 #define	Simple_vFAIL(m) STMT_START {					\
831     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
832 	    m, REPORT_LOCATION_ARGS(RExC_parse));	                \
833 } STMT_END
834 
835 /*
836  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
837  */
838 #define	vFAIL(m) STMT_START {				\
839     PREPARE_TO_DIE;                                     \
840     Simple_vFAIL(m);					\
841 } STMT_END
842 
843 /*
844  * Like Simple_vFAIL(), but accepts two arguments.
845  */
846 #define	Simple_vFAIL2(m,a1) STMT_START {			\
847     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,		\
848                       REPORT_LOCATION_ARGS(RExC_parse));	\
849 } STMT_END
850 
851 /*
852  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
853  */
854 #define	vFAIL2(m,a1) STMT_START {			\
855     PREPARE_TO_DIE;                                     \
856     Simple_vFAIL2(m, a1);				\
857 } STMT_END
858 
859 
860 /*
861  * Like Simple_vFAIL(), but accepts three arguments.
862  */
863 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
864     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,		\
865 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
866 } STMT_END
867 
868 /*
869  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
870  */
871 #define	vFAIL3(m,a1,a2) STMT_START {			\
872     PREPARE_TO_DIE;                                     \
873     Simple_vFAIL3(m, a1, a2);				\
874 } STMT_END
875 
876 /*
877  * Like Simple_vFAIL(), but accepts four arguments.
878  */
879 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
880     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,	\
881 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
882 } STMT_END
883 
884 #define	vFAIL4(m,a1,a2,a3) STMT_START {			\
885     PREPARE_TO_DIE;                                     \
886     Simple_vFAIL4(m, a1, a2, a3);			\
887 } STMT_END
888 
889 /* A specialized version of vFAIL2 that works with UTF8f */
890 #define vFAIL2utf8f(m, a1) STMT_START {             \
891     PREPARE_TO_DIE;                                 \
892     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
893             REPORT_LOCATION_ARGS(RExC_parse));      \
894 } STMT_END
895 
896 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
897     PREPARE_TO_DIE;                                     \
898     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
899             REPORT_LOCATION_ARGS(RExC_parse));          \
900 } STMT_END
901 
902 /* Setting this to NULL is a signal to not output warnings */
903 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
904     STMT_START {                                                            \
905       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
906       RExC_copy_start_in_constructed = NULL;                                \
907     } STMT_END
908 #define RESTORE_WARNINGS                                                    \
909     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
910 
911 /* Since a warning can be generated multiple times as the input is reparsed, we
912  * output it the first time we come to that point in the parse, but suppress it
913  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
914  * generate any warnings */
915 #define TO_OUTPUT_WARNINGS(loc)                                         \
916   (   RExC_copy_start_in_constructed                                    \
917    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
918 
919 /* After we've emitted a warning, we save the position in the input so we don't
920  * output it again */
921 #define UPDATE_WARNINGS_LOC(loc)                                        \
922     STMT_START {                                                        \
923         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
924             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
925                                                        - RExC_precomp;  \
926         }                                                               \
927     } STMT_END
928 
929 /* 'warns' is the output of the packWARNx macro used in 'code' */
930 #define _WARN_HELPER(loc, warns, code)                                  \
931     STMT_START {                                                        \
932         if (! RExC_copy_start_in_constructed) {                         \
933             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
934                               " expected at '%s'",                      \
935                               __FILE__, __LINE__, loc);                 \
936         }                                                               \
937         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
938             if (ckDEAD(warns))                                          \
939                 PREPARE_TO_DIE;                                         \
940             code;                                                       \
941             UPDATE_WARNINGS_LOC(loc);                                   \
942         }                                                               \
943     } STMT_END
944 
945 /* m is not necessarily a "literal string", in this macro */
946 #define warn_non_literal_string(loc, packed_warn, m)                    \
947     _WARN_HELPER(loc, packed_warn,                                      \
948                       Perl_warner(aTHX_ packed_warn,                    \
949                                        "%s" REPORT_LOCATION,            \
950                                   m, REPORT_LOCATION_ARGS(loc)))
951 #define reg_warn_non_literal_string(loc, m)                             \
952                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
953 
954 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
955     STMT_START {                                                            \
956                 char * format;                                              \
957                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
958                 Newx(format, format_size, char);                            \
959                 my_strlcpy(format, m, format_size);                         \
960                 my_strlcat(format, REPORT_LOCATION, format_size);           \
961                 SAVEFREEPV(format);                                         \
962                 _WARN_HELPER(loc, packwarn,                                 \
963                       Perl_ck_warner(aTHX_ packwarn,                        \
964                                         format,                             \
965                                         a1, REPORT_LOCATION_ARGS(loc)));    \
966     } STMT_END
967 
968 #define	ckWARNreg(loc,m) 					        \
969     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
970                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
971                                           m REPORT_LOCATION,	        \
972 	                                  REPORT_LOCATION_ARGS(loc)))
973 
974 #define	vWARN(loc, m)           				        \
975     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
976                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
977                                        m REPORT_LOCATION,               \
978                                        REPORT_LOCATION_ARGS(loc)))      \
979 
980 #define	vWARN_dep(loc, m)           				        \
981     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
982                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
983                                        m REPORT_LOCATION,               \
984 	                               REPORT_LOCATION_ARGS(loc)))
985 
986 #define	ckWARNdep(loc,m)            				        \
987     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
988                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
989 	                                    m REPORT_LOCATION,          \
990 	                                    REPORT_LOCATION_ARGS(loc)))
991 
992 #define	ckWARNregdep(loc,m)             				    \
993     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
994                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
995                                                       WARN_REGEXP),         \
996 	                                     m REPORT_LOCATION,             \
997 	                                     REPORT_LOCATION_ARGS(loc)))
998 
999 #define	ckWARN2reg_d(loc,m, a1)             				    \
1000     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1001                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1002 	                                    m REPORT_LOCATION,              \
1003 	                                    a1, REPORT_LOCATION_ARGS(loc)))
1004 
1005 #define	ckWARN2reg(loc, m, a1)                                              \
1006     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1007                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1008                                           m REPORT_LOCATION,	            \
1009                                           a1, REPORT_LOCATION_ARGS(loc)))
1010 
1011 #define	vWARN3(loc, m, a1, a2)          				    \
1012     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1013                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1014                                        m REPORT_LOCATION,                   \
1015 	                               a1, a2, REPORT_LOCATION_ARGS(loc)))
1016 
1017 #define	ckWARN3reg(loc, m, a1, a2)          				    \
1018     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1019                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1020                                           m REPORT_LOCATION,                \
1021 	                                  a1, a2,                           \
1022                                           REPORT_LOCATION_ARGS(loc)))
1023 
1024 #define	vWARN4(loc, m, a1, a2, a3)          				\
1025     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1026                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1027                                        m REPORT_LOCATION,               \
1028 	                               a1, a2, a3,                      \
1029                                        REPORT_LOCATION_ARGS(loc)))
1030 
1031 #define	ckWARN4reg(loc, m, a1, a2, a3)          			\
1032     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1033                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1034                                           m REPORT_LOCATION,            \
1035 	                                  a1, a2, a3,                   \
1036                                           REPORT_LOCATION_ARGS(loc)))
1037 
1038 #define	vWARN5(loc, m, a1, a2, a3, a4)          			\
1039     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1040                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1041                                        m REPORT_LOCATION,		\
1042 	                               a1, a2, a3, a4,                  \
1043                                        REPORT_LOCATION_ARGS(loc)))
1044 
1045 #define	ckWARNexperimental(loc, class, m)                               \
1046     STMT_START {                                                        \
1047         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1048             RExC_warned_ ## class = 1;                                  \
1049             _WARN_HELPER(loc, packWARN(class),                          \
1050                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1051                                             m REPORT_LOCATION,          \
1052                                             REPORT_LOCATION_ARGS(loc)));\
1053         }                                                               \
1054     } STMT_END
1055 
1056 /* Convert between a pointer to a node and its offset from the beginning of the
1057  * program */
1058 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1059 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1060 
1061 /* Macros for recording node offsets.   20001227 mjd@plover.com
1062  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
1063  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
1064  * Element 0 holds the number n.
1065  * Position is 1 indexed.
1066  */
1067 #ifndef RE_TRACK_PATTERN_OFFSETS
1068 #define Set_Node_Offset_To_R(offset,byte)
1069 #define Set_Node_Offset(node,byte)
1070 #define Set_Cur_Node_Offset
1071 #define Set_Node_Length_To_R(node,len)
1072 #define Set_Node_Length(node,len)
1073 #define Set_Node_Cur_Length(node,start)
1074 #define Node_Offset(n)
1075 #define Node_Length(n)
1076 #define Set_Node_Offset_Length(node,offset,len)
1077 #define ProgLen(ri) ri->u.proglen
1078 #define SetProgLen(ri,x) ri->u.proglen = x
1079 #define Track_Code(code)
1080 #else
1081 #define ProgLen(ri) ri->u.offsets[0]
1082 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1083 #define Set_Node_Offset_To_R(offset,byte) STMT_START {			\
1084 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
1085 		    __LINE__, (int)(offset), (int)(byte)));		\
1086 	if((offset) < 0) {						\
1087 	    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1088                                          (int)(offset));                \
1089 	} else {							\
1090             RExC_offsets[2*(offset)-1] = (byte);	                \
1091 	}								\
1092 } STMT_END
1093 
1094 #define Set_Node_Offset(node,byte)                                      \
1095     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1096 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1097 
1098 #define Set_Node_Length_To_R(node,len) STMT_START {			\
1099 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
1100 		__LINE__, (int)(node), (int)(len)));			\
1101 	if((node) < 0) {						\
1102 	    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1103                                          (int)(node));                  \
1104 	} else {							\
1105 	    RExC_offsets[2*(node)] = (len);				\
1106 	}								\
1107 } STMT_END
1108 
1109 #define Set_Node_Length(node,len) \
1110     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1111 #define Set_Node_Cur_Length(node, start)                \
1112     Set_Node_Length(node, RExC_parse - start)
1113 
1114 /* Get offsets and lengths */
1115 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1116 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1117 
1118 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
1119     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));	\
1120     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));	\
1121 } STMT_END
1122 
1123 #define Track_Code(code) STMT_START { code } STMT_END
1124 #endif
1125 
1126 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1127 #define EXPERIMENTAL_INPLACESCAN
1128 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1129 
1130 #ifdef DEBUGGING
1131 int
Perl_re_printf(pTHX_ const char * fmt,...)1132 Perl_re_printf(pTHX_ const char *fmt, ...)
1133 {
1134     va_list ap;
1135     int result;
1136     PerlIO *f= Perl_debug_log;
1137     PERL_ARGS_ASSERT_RE_PRINTF;
1138     va_start(ap, fmt);
1139     result = PerlIO_vprintf(f, fmt, ap);
1140     va_end(ap);
1141     return result;
1142 }
1143 
1144 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)1145 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1146 {
1147     va_list ap;
1148     int result;
1149     PerlIO *f= Perl_debug_log;
1150     PERL_ARGS_ASSERT_RE_INDENTF;
1151     va_start(ap, depth);
1152     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1153     result = PerlIO_vprintf(f, fmt, ap);
1154     va_end(ap);
1155     return result;
1156 }
1157 #endif /* DEBUGGING */
1158 
1159 #define DEBUG_RExC_seen()                                                   \
1160         DEBUG_OPTIMISE_MORE_r({                                             \
1161             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1162                                                                             \
1163             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1164                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1165                                                                             \
1166             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1167                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1168                                                                             \
1169             if (RExC_seen & REG_GPOS_SEEN)                                  \
1170                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1171                                                                             \
1172             if (RExC_seen & REG_RECURSE_SEEN)                               \
1173                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1174                                                                             \
1175             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1176                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1177                                                                             \
1178             if (RExC_seen & REG_VERBARG_SEEN)                               \
1179                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1180                                                                             \
1181             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1182                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1183                                                                             \
1184             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1185                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1186                                                                             \
1187             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1188                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1189                                                                             \
1190             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1191                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1192                                                                             \
1193             Perl_re_printf( aTHX_ "\n");                                    \
1194         });
1195 
1196 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1197   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1198 
1199 
1200 #ifdef DEBUGGING
1201 static void
S_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)1202 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1203                                     const char *close_str)
1204 {
1205     if (!flags)
1206         return;
1207 
1208     Perl_re_printf( aTHX_  "%s", open_str);
1209     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1210     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1211     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1212     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1213     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1214     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1215     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1216     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1217     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1218     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1219     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1220     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1221     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1222     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1223     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1224     Perl_re_printf( aTHX_  "%s", close_str);
1225 }
1226 
1227 
1228 static void
S_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf)1229 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1230                     U32 depth, int is_inf)
1231 {
1232     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1233 
1234     DEBUG_OPTIMISE_MORE_r({
1235         if (!data)
1236             return;
1237         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1238             depth,
1239             where,
1240             (IV)data->pos_min,
1241             (IV)data->pos_delta,
1242             (UV)data->flags
1243         );
1244 
1245         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1246 
1247         Perl_re_printf( aTHX_
1248             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1249             (IV)data->whilem_c,
1250             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1251             is_inf ? "INF " : ""
1252         );
1253 
1254         if (data->last_found) {
1255             int i;
1256             Perl_re_printf(aTHX_
1257                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1258                     SvPVX_const(data->last_found),
1259                     (IV)data->last_end,
1260                     (IV)data->last_start_min,
1261                     (IV)data->last_start_max
1262             );
1263 
1264             for (i = 0; i < 2; i++) {
1265                 Perl_re_printf(aTHX_
1266                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1267                     data->cur_is_floating == i ? "*" : "",
1268                     i ? "Float" : "Fixed",
1269                     SvPVX_const(data->substrs[i].str),
1270                     (IV)data->substrs[i].min_offset,
1271                     (IV)data->substrs[i].max_offset
1272                 );
1273                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1274             }
1275         }
1276 
1277         Perl_re_printf( aTHX_ "\n");
1278     });
1279 }
1280 
1281 
1282 static void
S_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)1283 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1284                 regnode *scan, U32 depth, U32 flags)
1285 {
1286     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1287 
1288     DEBUG_OPTIMISE_r({
1289         regnode *Next;
1290 
1291         if (!scan)
1292             return;
1293         Next = regnext(scan);
1294         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1295         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1296             depth,
1297             str,
1298             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1299             Next ? (REG_NODE_NUM(Next)) : 0 );
1300         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1301         Perl_re_printf( aTHX_  "\n");
1302    });
1303 }
1304 
1305 
1306 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1307                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1308 
1309 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1310                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1311 
1312 #else
1313 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1314 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1315 #endif
1316 
1317 
1318 /* =========================================================
1319  * BEGIN edit_distance stuff.
1320  *
1321  * This calculates how many single character changes of any type are needed to
1322  * transform a string into another one.  It is taken from version 3.1 of
1323  *
1324  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1325  */
1326 
1327 /* Our unsorted dictionary linked list.   */
1328 /* Note we use UVs, not chars. */
1329 
1330 struct dictionary{
1331   UV key;
1332   UV value;
1333   struct dictionary* next;
1334 };
1335 typedef struct dictionary item;
1336 
1337 
1338 PERL_STATIC_INLINE item*
push(UV key,item * curr)1339 push(UV key, item* curr)
1340 {
1341     item* head;
1342     Newx(head, 1, item);
1343     head->key = key;
1344     head->value = 0;
1345     head->next = curr;
1346     return head;
1347 }
1348 
1349 
1350 PERL_STATIC_INLINE item*
find(item * head,UV key)1351 find(item* head, UV key)
1352 {
1353     item* iterator = head;
1354     while (iterator){
1355         if (iterator->key == key){
1356             return iterator;
1357         }
1358         iterator = iterator->next;
1359     }
1360 
1361     return NULL;
1362 }
1363 
1364 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)1365 uniquePush(item* head, UV key)
1366 {
1367     item* iterator = head;
1368 
1369     while (iterator){
1370         if (iterator->key == key) {
1371             return head;
1372         }
1373         iterator = iterator->next;
1374     }
1375 
1376     return push(key, head);
1377 }
1378 
1379 PERL_STATIC_INLINE void
dict_free(item * head)1380 dict_free(item* head)
1381 {
1382     item* iterator = head;
1383 
1384     while (iterator) {
1385         item* temp = iterator;
1386         iterator = iterator->next;
1387         Safefree(temp);
1388     }
1389 
1390     head = NULL;
1391 }
1392 
1393 /* End of Dictionary Stuff */
1394 
1395 /* All calculations/work are done here */
1396 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)1397 S_edit_distance(const UV* src,
1398                 const UV* tgt,
1399                 const STRLEN x,             /* length of src[] */
1400                 const STRLEN y,             /* length of tgt[] */
1401                 const SSize_t maxDistance
1402 )
1403 {
1404     item *head = NULL;
1405     UV swapCount, swapScore, targetCharCount, i, j;
1406     UV *scores;
1407     UV score_ceil = x + y;
1408 
1409     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1410 
1411     /* intialize matrix start values */
1412     Newx(scores, ( (x + 2) * (y + 2)), UV);
1413     scores[0] = score_ceil;
1414     scores[1 * (y + 2) + 0] = score_ceil;
1415     scores[0 * (y + 2) + 1] = score_ceil;
1416     scores[1 * (y + 2) + 1] = 0;
1417     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1418 
1419     /* work loops    */
1420     /* i = src index */
1421     /* j = tgt index */
1422     for (i=1;i<=x;i++) {
1423         if (i < x)
1424             head = uniquePush(head, src[i]);
1425         scores[(i+1) * (y + 2) + 1] = i;
1426         scores[(i+1) * (y + 2) + 0] = score_ceil;
1427         swapCount = 0;
1428 
1429         for (j=1;j<=y;j++) {
1430             if (i == 1) {
1431                 if(j < y)
1432                 head = uniquePush(head, tgt[j]);
1433                 scores[1 * (y + 2) + (j + 1)] = j;
1434                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1435             }
1436 
1437             targetCharCount = find(head, tgt[j-1])->value;
1438             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1439 
1440             if (src[i-1] != tgt[j-1]){
1441                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1442             }
1443             else {
1444                 swapCount = j;
1445                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1446             }
1447         }
1448 
1449         find(head, src[i-1])->value = i;
1450     }
1451 
1452     {
1453         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1454         dict_free(head);
1455         Safefree(scores);
1456         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1457     }
1458 }
1459 
1460 /* END of edit_distance() stuff
1461  * ========================================================= */
1462 
1463 /* Mark that we cannot extend a found fixed substring at this point.
1464    Update the longest found anchored substring or the longest found
1465    floating substrings if needed. */
1466 
1467 STATIC void
S_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)1468 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1469                     SSize_t *minlenp, int is_inf)
1470 {
1471     const STRLEN l = CHR_SVLEN(data->last_found);
1472     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1473     const STRLEN old_l = CHR_SVLEN(longest_sv);
1474     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1475 
1476     PERL_ARGS_ASSERT_SCAN_COMMIT;
1477 
1478     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1479         const U8 i = data->cur_is_floating;
1480 	SvSetMagicSV(longest_sv, data->last_found);
1481         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1482 
1483 	if (!i) /* fixed */
1484 	    data->substrs[0].max_offset = data->substrs[0].min_offset;
1485 	else { /* float */
1486 	    data->substrs[1].max_offset =
1487                       (is_inf)
1488                        ? OPTIMIZE_INFTY
1489                        : (l
1490                           ? data->last_start_max
1491                           /* temporary underflow guard for 5.32 */
1492                           : data->pos_delta < 0 ? OPTIMIZE_INFTY
1493                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1494 					 ? OPTIMIZE_INFTY
1495 					 : data->pos_min + data->pos_delta));
1496         }
1497 
1498         data->substrs[i].flags &= ~SF_BEFORE_EOL;
1499         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1500         data->substrs[i].minlenp = minlenp;
1501         data->substrs[i].lookbehind = 0;
1502     }
1503 
1504     SvCUR_set(data->last_found, 0);
1505     {
1506 	SV * const sv = data->last_found;
1507 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
1508 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1509 	    if (mg)
1510 		mg->mg_len = 0;
1511 	}
1512     }
1513     data->last_end = -1;
1514     data->flags &= ~SF_BEFORE_EOL;
1515     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1516 }
1517 
1518 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1519  * list that describes which code points it matches */
1520 
1521 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)1522 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1523 {
1524     /* Set the SSC 'ssc' to match an empty string or any code point */
1525 
1526     PERL_ARGS_ASSERT_SSC_ANYTHING;
1527 
1528     assert(is_ANYOF_SYNTHETIC(ssc));
1529 
1530     /* mortalize so won't leak */
1531     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1532     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1533 }
1534 
1535 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)1536 S_ssc_is_anything(const regnode_ssc *ssc)
1537 {
1538     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1539      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1540      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1541      * in any way, so there's no point in using it */
1542 
1543     UV start, end;
1544     bool ret;
1545 
1546     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1547 
1548     assert(is_ANYOF_SYNTHETIC(ssc));
1549 
1550     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1551         return FALSE;
1552     }
1553 
1554     /* See if the list consists solely of the range 0 - Infinity */
1555     invlist_iterinit(ssc->invlist);
1556     ret = invlist_iternext(ssc->invlist, &start, &end)
1557           && start == 0
1558           && end == UV_MAX;
1559 
1560     invlist_iterfinish(ssc->invlist);
1561 
1562     if (ret) {
1563         return TRUE;
1564     }
1565 
1566     /* If e.g., both \w and \W are set, matches everything */
1567     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1568         int i;
1569         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1570             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1571                 return TRUE;
1572             }
1573         }
1574     }
1575 
1576     return FALSE;
1577 }
1578 
1579 STATIC void
S_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)1580 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1581 {
1582     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1583      * string, any code point, or any posix class under locale */
1584 
1585     PERL_ARGS_ASSERT_SSC_INIT;
1586 
1587     Zero(ssc, 1, regnode_ssc);
1588     set_ANYOF_SYNTHETIC(ssc);
1589     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1590     ssc_anything(ssc);
1591 
1592     /* If any portion of the regex is to operate under locale rules that aren't
1593      * fully known at compile time, initialization includes it.  The reason
1594      * this isn't done for all regexes is that the optimizer was written under
1595      * the assumption that locale was all-or-nothing.  Given the complexity and
1596      * lack of documentation in the optimizer, and that there are inadequate
1597      * test cases for locale, many parts of it may not work properly, it is
1598      * safest to avoid locale unless necessary. */
1599     if (RExC_contains_locale) {
1600 	ANYOF_POSIXL_SETALL(ssc);
1601     }
1602     else {
1603 	ANYOF_POSIXL_ZERO(ssc);
1604     }
1605 }
1606 
1607 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)1608 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1609                         const regnode_ssc *ssc)
1610 {
1611     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1612      * to the list of code points matched, and locale posix classes; hence does
1613      * not check its flags) */
1614 
1615     UV start, end;
1616     bool ret;
1617 
1618     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1619 
1620     assert(is_ANYOF_SYNTHETIC(ssc));
1621 
1622     invlist_iterinit(ssc->invlist);
1623     ret = invlist_iternext(ssc->invlist, &start, &end)
1624           && start == 0
1625           && end == UV_MAX;
1626 
1627     invlist_iterfinish(ssc->invlist);
1628 
1629     if (! ret) {
1630         return FALSE;
1631     }
1632 
1633     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1634         return FALSE;
1635     }
1636 
1637     return TRUE;
1638 }
1639 
1640 #define INVLIST_INDEX 0
1641 #define ONLY_LOCALE_MATCHES_INDEX 1
1642 #define DEFERRED_USER_DEFINED_INDEX 2
1643 
1644 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)1645 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1646                                const regnode_charclass* const node)
1647 {
1648     /* Returns a mortal inversion list defining which code points are matched
1649      * by 'node', which is of type ANYOF.  Handles complementing the result if
1650      * appropriate.  If some code points aren't knowable at this time, the
1651      * returned list must, and will, contain every code point that is a
1652      * possibility. */
1653 
1654     SV* invlist = NULL;
1655     SV* only_utf8_locale_invlist = NULL;
1656     unsigned int i;
1657     const U32 n = ARG(node);
1658     bool new_node_has_latin1 = FALSE;
1659     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1660                       ? 0
1661                       : ANYOF_FLAGS(node);
1662 
1663     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1664 
1665     /* Look at the data structure created by S_set_ANYOF_arg() */
1666     if (n != ANYOF_ONLY_HAS_BITMAP) {
1667         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1668         AV * const av = MUTABLE_AV(SvRV(rv));
1669         SV **const ary = AvARRAY(av);
1670         assert(RExC_rxi->data->what[n] == 's');
1671 
1672         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1673 
1674             /* Here there are things that won't be known until runtime -- we
1675              * have to assume it could be anything */
1676             invlist = sv_2mortal(_new_invlist(1));
1677             return _add_range_to_invlist(invlist, 0, UV_MAX);
1678         }
1679         else if (ary[INVLIST_INDEX]) {
1680 
1681             /* Use the node's inversion list */
1682             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1683         }
1684 
1685         /* Get the code points valid only under UTF-8 locales */
1686         if (   (flags & ANYOFL_FOLD)
1687             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1688         {
1689             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1690         }
1691     }
1692 
1693     if (! invlist) {
1694         invlist = sv_2mortal(_new_invlist(0));
1695     }
1696 
1697     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1698      * code points, and an inversion list for the others, but if there are code
1699      * points that should match only conditionally on the target string being
1700      * UTF-8, those are placed in the inversion list, and not the bitmap.
1701      * Since there are circumstances under which they could match, they are
1702      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1703      * to exclude them here, so that when we invert below, the end result
1704      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1705      * have to do this here before we add the unconditionally matched code
1706      * points */
1707     if (flags & ANYOF_INVERT) {
1708         _invlist_intersection_complement_2nd(invlist,
1709                                              PL_UpperLatin1,
1710                                              &invlist);
1711     }
1712 
1713     /* Add in the points from the bit map */
1714     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1715         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1716             if (ANYOF_BITMAP_TEST(node, i)) {
1717                 unsigned int start = i++;
1718 
1719                 for (;    i < NUM_ANYOF_CODE_POINTS
1720                        && ANYOF_BITMAP_TEST(node, i); ++i)
1721                 {
1722                     /* empty */
1723                 }
1724                 invlist = _add_range_to_invlist(invlist, start, i-1);
1725                 new_node_has_latin1 = TRUE;
1726             }
1727         }
1728     }
1729 
1730     /* If this can match all upper Latin1 code points, have to add them
1731      * as well.  But don't add them if inverting, as when that gets done below,
1732      * it would exclude all these characters, including the ones it shouldn't
1733      * that were added just above */
1734     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1735         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1736     {
1737         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1738     }
1739 
1740     /* Similarly for these */
1741     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1742         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1743     }
1744 
1745     if (flags & ANYOF_INVERT) {
1746         _invlist_invert(invlist);
1747     }
1748     else if (flags & ANYOFL_FOLD) {
1749         if (new_node_has_latin1) {
1750 
1751             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1752              * the locale.  We can skip this if there are no 0-255 at all. */
1753             _invlist_union(invlist, PL_Latin1, &invlist);
1754 
1755             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1756             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1757         }
1758         else {
1759             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1760                 invlist = add_cp_to_invlist(invlist, 'I');
1761             }
1762             if (_invlist_contains_cp(invlist,
1763                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1764             {
1765                 invlist = add_cp_to_invlist(invlist, 'i');
1766             }
1767         }
1768     }
1769 
1770     /* Similarly add the UTF-8 locale possible matches.  These have to be
1771      * deferred until after the non-UTF-8 locale ones are taken care of just
1772      * above, or it leads to wrong results under ANYOF_INVERT */
1773     if (only_utf8_locale_invlist) {
1774         _invlist_union_maybe_complement_2nd(invlist,
1775                                             only_utf8_locale_invlist,
1776                                             flags & ANYOF_INVERT,
1777                                             &invlist);
1778     }
1779 
1780     return invlist;
1781 }
1782 
1783 /* These two functions currently do the exact same thing */
1784 #define ssc_init_zero		ssc_init
1785 
1786 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1787 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1788 
1789 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1790  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1791  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1792 
1793 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)1794 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1795                 const regnode_charclass *and_with)
1796 {
1797     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1798      * another SSC or a regular ANYOF class.  Can create false positives. */
1799 
1800     SV* anded_cp_list;
1801     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1802                           ? 0
1803                           : ANYOF_FLAGS(and_with);
1804     U8  anded_flags;
1805 
1806     PERL_ARGS_ASSERT_SSC_AND;
1807 
1808     assert(is_ANYOF_SYNTHETIC(ssc));
1809 
1810     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1811      * the code point inversion list and just the relevant flags */
1812     if (is_ANYOF_SYNTHETIC(and_with)) {
1813         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1814         anded_flags = and_with_flags;
1815 
1816         /* XXX This is a kludge around what appears to be deficiencies in the
1817          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1818          * there are paths through the optimizer where it doesn't get weeded
1819          * out when it should.  And if we don't make some extra provision for
1820          * it like the code just below, it doesn't get added when it should.
1821          * This solution is to add it only when AND'ing, which is here, and
1822          * only when what is being AND'ed is the pristine, original node
1823          * matching anything.  Thus it is like adding it to ssc_anything() but
1824          * only when the result is to be AND'ed.  Probably the same solution
1825          * could be adopted for the same problem we have with /l matching,
1826          * which is solved differently in S_ssc_init(), and that would lead to
1827          * fewer false positives than that solution has.  But if this solution
1828          * creates bugs, the consequences are only that a warning isn't raised
1829          * that should be; while the consequences for having /l bugs is
1830          * incorrect matches */
1831         if (ssc_is_anything((regnode_ssc *)and_with)) {
1832             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1833         }
1834     }
1835     else {
1836         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1837         if (OP(and_with) == ANYOFD) {
1838             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1839         }
1840         else {
1841             anded_flags = and_with_flags
1842             &( ANYOF_COMMON_FLAGS
1843               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1844               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1845             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1846                 anded_flags &=
1847                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1848             }
1849         }
1850     }
1851 
1852     ANYOF_FLAGS(ssc) &= anded_flags;
1853 
1854     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1855      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1856      * 'and_with' may be inverted.  When not inverted, we have the situation of
1857      * computing:
1858      *  (C1 | P1) & (C2 | P2)
1859      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1860      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1861      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1862      *                    <=  ((C1 & C2) | P1 | P2)
1863      * Alternatively, the last few steps could be:
1864      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1865      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1866      *                    <=  (C1 | C2 | (P1 & P2))
1867      * We favor the second approach if either P1 or P2 is non-empty.  This is
1868      * because these components are a barrier to doing optimizations, as what
1869      * they match cannot be known until the moment of matching as they are
1870      * dependent on the current locale, 'AND"ing them likely will reduce or
1871      * eliminate them.
1872      * But we can do better if we know that C1,P1 are in their initial state (a
1873      * frequent occurrence), each matching everything:
1874      *  (<everything>) & (C2 | P2) =  C2 | P2
1875      * Similarly, if C2,P2 are in their initial state (again a frequent
1876      * occurrence), the result is a no-op
1877      *  (C1 | P1) & (<everything>) =  C1 | P1
1878      *
1879      * Inverted, we have
1880      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1881      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1882      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1883      * */
1884 
1885     if ((and_with_flags & ANYOF_INVERT)
1886         && ! is_ANYOF_SYNTHETIC(and_with))
1887     {
1888         unsigned int i;
1889 
1890         ssc_intersection(ssc,
1891                          anded_cp_list,
1892                          FALSE /* Has already been inverted */
1893                          );
1894 
1895         /* If either P1 or P2 is empty, the intersection will be also; can skip
1896          * the loop */
1897         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1898             ANYOF_POSIXL_ZERO(ssc);
1899         }
1900         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1901 
1902             /* Note that the Posix class component P from 'and_with' actually
1903              * looks like:
1904              *      P = Pa | Pb | ... | Pn
1905              * where each component is one posix class, such as in [\w\s].
1906              * Thus
1907              *      ~P = ~(Pa | Pb | ... | Pn)
1908              *         = ~Pa & ~Pb & ... & ~Pn
1909              *        <= ~Pa | ~Pb | ... | ~Pn
1910              * The last is something we can easily calculate, but unfortunately
1911              * is likely to have many false positives.  We could do better
1912              * in some (but certainly not all) instances if two classes in
1913              * P have known relationships.  For example
1914              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1915              * So
1916              *      :lower: & :print: = :lower:
1917              * And similarly for classes that must be disjoint.  For example,
1918              * since \s and \w can have no elements in common based on rules in
1919              * the POSIX standard,
1920              *      \w & ^\S = nothing
1921              * Unfortunately, some vendor locales do not meet the Posix
1922              * standard, in particular almost everything by Microsoft.
1923              * The loop below just changes e.g., \w into \W and vice versa */
1924 
1925             regnode_charclass_posixl temp;
1926             int add = 1;    /* To calculate the index of the complement */
1927 
1928             Zero(&temp, 1, regnode_charclass_posixl);
1929             ANYOF_POSIXL_ZERO(&temp);
1930             for (i = 0; i < ANYOF_MAX; i++) {
1931                 assert(i % 2 != 0
1932                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1933                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1934 
1935                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1936                     ANYOF_POSIXL_SET(&temp, i + add);
1937                 }
1938                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1939             }
1940             ANYOF_POSIXL_AND(&temp, ssc);
1941 
1942         } /* else ssc already has no posixes */
1943     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1944          in its initial state */
1945     else if (! is_ANYOF_SYNTHETIC(and_with)
1946              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1947     {
1948         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1949          * copy it over 'ssc' */
1950         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1951             if (is_ANYOF_SYNTHETIC(and_with)) {
1952                 StructCopy(and_with, ssc, regnode_ssc);
1953             }
1954             else {
1955                 ssc->invlist = anded_cp_list;
1956                 ANYOF_POSIXL_ZERO(ssc);
1957                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1958                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1959                 }
1960             }
1961         }
1962         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1963                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1964         {
1965             /* One or the other of P1, P2 is non-empty. */
1966             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1967                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1968             }
1969             ssc_union(ssc, anded_cp_list, FALSE);
1970         }
1971         else { /* P1 = P2 = empty */
1972             ssc_intersection(ssc, anded_cp_list, FALSE);
1973         }
1974     }
1975 }
1976 
1977 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)1978 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1979                const regnode_charclass *or_with)
1980 {
1981     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1982      * another SSC or a regular ANYOF class.  Can create false positives if
1983      * 'or_with' is to be inverted. */
1984 
1985     SV* ored_cp_list;
1986     U8 ored_flags;
1987     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1988                          ? 0
1989                          : ANYOF_FLAGS(or_with);
1990 
1991     PERL_ARGS_ASSERT_SSC_OR;
1992 
1993     assert(is_ANYOF_SYNTHETIC(ssc));
1994 
1995     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1996      * the code point inversion list and just the relevant flags */
1997     if (is_ANYOF_SYNTHETIC(or_with)) {
1998         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1999         ored_flags = or_with_flags;
2000     }
2001     else {
2002         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2003         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2004         if (OP(or_with) != ANYOFD) {
2005             ored_flags
2006             |= or_with_flags
2007              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2008                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2009             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2010                 ored_flags |=
2011                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2012             }
2013         }
2014     }
2015 
2016     ANYOF_FLAGS(ssc) |= ored_flags;
2017 
2018     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2019      * C2 is the list of code points in 'or-with'; P2, its posix classes.
2020      * 'or_with' may be inverted.  When not inverted, we have the simple
2021      * situation of computing:
2022      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
2023      * If P1|P2 yields a situation with both a class and its complement are
2024      * set, like having both \w and \W, this matches all code points, and we
2025      * can delete these from the P component of the ssc going forward.  XXX We
2026      * might be able to delete all the P components, but I (khw) am not certain
2027      * about this, and it is better to be safe.
2028      *
2029      * Inverted, we have
2030      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
2031      *                         <=  (C1 | P1) | ~C2
2032      *                         <=  (C1 | ~C2) | P1
2033      * (which results in actually simpler code than the non-inverted case)
2034      * */
2035 
2036     if ((or_with_flags & ANYOF_INVERT)
2037         && ! is_ANYOF_SYNTHETIC(or_with))
2038     {
2039         /* We ignore P2, leaving P1 going forward */
2040     }   /* else  Not inverted */
2041     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2042         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2043         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2044             unsigned int i;
2045             for (i = 0; i < ANYOF_MAX; i += 2) {
2046                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2047                 {
2048                     ssc_match_all_cp(ssc);
2049                     ANYOF_POSIXL_CLEAR(ssc, i);
2050                     ANYOF_POSIXL_CLEAR(ssc, i+1);
2051                 }
2052             }
2053         }
2054     }
2055 
2056     ssc_union(ssc,
2057               ored_cp_list,
2058               FALSE /* Already has been inverted */
2059               );
2060 }
2061 
2062 STATIC void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)2063 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2064 {
2065     PERL_ARGS_ASSERT_SSC_UNION;
2066 
2067     assert(is_ANYOF_SYNTHETIC(ssc));
2068 
2069     _invlist_union_maybe_complement_2nd(ssc->invlist,
2070                                         invlist,
2071                                         invert2nd,
2072                                         &ssc->invlist);
2073 }
2074 
2075 STATIC void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)2076 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2077                          SV* const invlist,
2078                          const bool invert2nd)
2079 {
2080     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2081 
2082     assert(is_ANYOF_SYNTHETIC(ssc));
2083 
2084     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2085                                                invlist,
2086                                                invert2nd,
2087                                                &ssc->invlist);
2088 }
2089 
2090 STATIC void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)2091 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2092 {
2093     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2094 
2095     assert(is_ANYOF_SYNTHETIC(ssc));
2096 
2097     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2098 }
2099 
2100 STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)2101 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2102 {
2103     /* AND just the single code point 'cp' into the SSC 'ssc' */
2104 
2105     SV* cp_list = _new_invlist(2);
2106 
2107     PERL_ARGS_ASSERT_SSC_CP_AND;
2108 
2109     assert(is_ANYOF_SYNTHETIC(ssc));
2110 
2111     cp_list = add_cp_to_invlist(cp_list, cp);
2112     ssc_intersection(ssc, cp_list,
2113                      FALSE /* Not inverted */
2114                      );
2115     SvREFCNT_dec_NN(cp_list);
2116 }
2117 
2118 STATIC void
S_ssc_clear_locale(regnode_ssc * ssc)2119 S_ssc_clear_locale(regnode_ssc *ssc)
2120 {
2121     /* Set the SSC 'ssc' to not match any locale things */
2122     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2123 
2124     assert(is_ANYOF_SYNTHETIC(ssc));
2125 
2126     ANYOF_POSIXL_ZERO(ssc);
2127     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2128 }
2129 
2130 STATIC bool
S_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)2131 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2132 {
2133     /* The synthetic start class is used to hopefully quickly winnow down
2134      * places where a pattern could start a match in the target string.  If it
2135      * doesn't really narrow things down that much, there isn't much point to
2136      * having the overhead of using it.  This function uses some very crude
2137      * heuristics to decide if to use the ssc or not.
2138      *
2139      * It returns TRUE if 'ssc' rules out more than half what it considers to
2140      * be the "likely" possible matches, but of course it doesn't know what the
2141      * actual things being matched are going to be; these are only guesses
2142      *
2143      * For /l matches, it assumes that the only likely matches are going to be
2144      *      in the 0-255 range, uniformly distributed, so half of that is 127
2145      * For /a and /d matches, it assumes that the likely matches will be just
2146      *      the ASCII range, so half of that is 63
2147      * For /u and there isn't anything matching above the Latin1 range, it
2148      *      assumes that that is the only range likely to be matched, and uses
2149      *      half that as the cut-off: 127.  If anything matches above Latin1,
2150      *      it assumes that all of Unicode could match (uniformly), except for
2151      *      non-Unicode code points and things in the General Category "Other"
2152      *      (unassigned, private use, surrogates, controls and formats).  This
2153      *      is a much large number. */
2154 
2155     U32 count = 0;      /* Running total of number of code points matched by
2156                            'ssc' */
2157     UV start, end;      /* Start and end points of current range in inversion
2158                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2159     const U32 max_code_points = (LOC)
2160                                 ?  256
2161                                 : ((  ! UNI_SEMANTICS
2162                                     ||  invlist_highest(ssc->invlist) < 256)
2163                                   ? 128
2164                                   : NON_OTHER_COUNT);
2165     const U32 max_match = max_code_points / 2;
2166 
2167     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2168 
2169     invlist_iterinit(ssc->invlist);
2170     while (invlist_iternext(ssc->invlist, &start, &end)) {
2171         if (start >= max_code_points) {
2172             break;
2173         }
2174         end = MIN(end, max_code_points - 1);
2175         count += end - start + 1;
2176         if (count >= max_match) {
2177             invlist_iterfinish(ssc->invlist);
2178             return FALSE;
2179         }
2180     }
2181 
2182     return TRUE;
2183 }
2184 
2185 
2186 STATIC void
S_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)2187 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2188 {
2189     /* The inversion list in the SSC is marked mortal; now we need a more
2190      * permanent copy, which is stored the same way that is done in a regular
2191      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2192      * map */
2193 
2194     SV* invlist = invlist_clone(ssc->invlist, NULL);
2195 
2196     PERL_ARGS_ASSERT_SSC_FINALIZE;
2197 
2198     assert(is_ANYOF_SYNTHETIC(ssc));
2199 
2200     /* The code in this file assumes that all but these flags aren't relevant
2201      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2202      * by the time we reach here */
2203     assert(! (ANYOF_FLAGS(ssc)
2204         & ~( ANYOF_COMMON_FLAGS
2205             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2206             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2207 
2208     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2209 
2210     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2211     SvREFCNT_dec(invlist);
2212 
2213     /* Make sure is clone-safe */
2214     ssc->invlist = NULL;
2215 
2216     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2217         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2218         OP(ssc) = ANYOFPOSIXL;
2219     }
2220     else if (RExC_contains_locale) {
2221         OP(ssc) = ANYOFL;
2222     }
2223 
2224     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2225 }
2226 
2227 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2228 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2229 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2230 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2231                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2232                                : 0 )
2233 
2234 
2235 #ifdef DEBUGGING
2236 /*
2237    dump_trie(trie,widecharmap,revcharmap)
2238    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2239    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2240 
2241    These routines dump out a trie in a somewhat readable format.
2242    The _interim_ variants are used for debugging the interim
2243    tables that are used to generate the final compressed
2244    representation which is what dump_trie expects.
2245 
2246    Part of the reason for their existence is to provide a form
2247    of documentation as to how the different representations function.
2248 
2249 */
2250 
2251 /*
2252   Dumps the final compressed table form of the trie to Perl_debug_log.
2253   Used for debugging make_trie().
2254 */
2255 
2256 STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 depth)2257 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2258 	    AV *revcharmap, U32 depth)
2259 {
2260     U32 state;
2261     SV *sv=sv_newmortal();
2262     int colwidth= widecharmap ? 6 : 4;
2263     U16 word;
2264     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2265 
2266     PERL_ARGS_ASSERT_DUMP_TRIE;
2267 
2268     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2269         depth+1, "Match","Base","Ofs" );
2270 
2271     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2272 	SV ** const tmp = av_fetch( revcharmap, state, 0);
2273         if ( tmp ) {
2274             Perl_re_printf( aTHX_  "%*s",
2275                 colwidth,
2276                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2277 	                    PL_colors[0], PL_colors[1],
2278 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2279 	                    PERL_PV_ESCAPE_FIRSTCHAR
2280                 )
2281             );
2282         }
2283     }
2284     Perl_re_printf( aTHX_  "\n");
2285     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2286 
2287     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2288         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2289     Perl_re_printf( aTHX_  "\n");
2290 
2291     for( state = 1 ; state < trie->statecount ; state++ ) {
2292 	const U32 base = trie->states[ state ].trans.base;
2293 
2294         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2295 
2296         if ( trie->states[ state ].wordnum ) {
2297             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2298         } else {
2299             Perl_re_printf( aTHX_  "%6s", "" );
2300         }
2301 
2302         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2303 
2304         if ( base ) {
2305             U32 ofs = 0;
2306 
2307             while( ( base + ofs  < trie->uniquecharcount ) ||
2308                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2309                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2310                                                                     != state))
2311                     ofs++;
2312 
2313             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2314 
2315             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2316                 if ( ( base + ofs >= trie->uniquecharcount )
2317                         && ( base + ofs - trie->uniquecharcount
2318                                                         < trie->lasttrans )
2319                         && trie->trans[ base + ofs
2320                                     - trie->uniquecharcount ].check == state )
2321                 {
2322                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2323                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2324                    );
2325                 } else {
2326                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2327                 }
2328             }
2329 
2330             Perl_re_printf( aTHX_  "]");
2331 
2332         }
2333         Perl_re_printf( aTHX_  "\n" );
2334     }
2335     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2336                                 depth);
2337     for (word=1; word <= trie->wordcount; word++) {
2338         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2339 	    (int)word, (int)(trie->wordinfo[word].prev),
2340 	    (int)(trie->wordinfo[word].len));
2341     }
2342     Perl_re_printf( aTHX_  "\n" );
2343 }
2344 /*
2345   Dumps a fully constructed but uncompressed trie in list form.
2346   List tries normally only are used for construction when the number of
2347   possible chars (trie->uniquecharcount) is very high.
2348   Used for debugging make_trie().
2349 */
2350 STATIC void
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2351 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2352 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2353 			 U32 depth)
2354 {
2355     U32 state;
2356     SV *sv=sv_newmortal();
2357     int colwidth= widecharmap ? 6 : 4;
2358     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2359 
2360     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2361 
2362     /* print out the table precompression.  */
2363     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2364             depth+1 );
2365     Perl_re_indentf( aTHX_  "%s",
2366             depth+1, "------:-----+-----------------\n" );
2367 
2368     for( state=1 ; state < next_alloc ; state ++ ) {
2369         U16 charid;
2370 
2371         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2372             depth+1, (UV)state  );
2373         if ( ! trie->states[ state ].wordnum ) {
2374             Perl_re_printf( aTHX_  "%5s| ","");
2375         } else {
2376             Perl_re_printf( aTHX_  "W%4x| ",
2377                 trie->states[ state ].wordnum
2378             );
2379         }
2380         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2381 	    SV ** const tmp = av_fetch( revcharmap,
2382                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2383 	    if ( tmp ) {
2384                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2385                     colwidth,
2386                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2387                               colwidth,
2388                               PL_colors[0], PL_colors[1],
2389                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2390                               | PERL_PV_ESCAPE_FIRSTCHAR
2391                     ) ,
2392                     TRIE_LIST_ITEM(state, charid).forid,
2393                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2394                 );
2395                 if (!(charid % 10))
2396                     Perl_re_printf( aTHX_  "\n%*s| ",
2397                         (int)((depth * 2) + 14), "");
2398             }
2399         }
2400         Perl_re_printf( aTHX_  "\n");
2401     }
2402 }
2403 
2404 /*
2405   Dumps a fully constructed but uncompressed trie in table form.
2406   This is the normal DFA style state transition table, with a few
2407   twists to facilitate compression later.
2408   Used for debugging make_trie().
2409 */
2410 STATIC void
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2411 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2412 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
2413 			  U32 depth)
2414 {
2415     U32 state;
2416     U16 charid;
2417     SV *sv=sv_newmortal();
2418     int colwidth= widecharmap ? 6 : 4;
2419     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2420 
2421     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2422 
2423     /*
2424        print out the table precompression so that we can do a visual check
2425        that they are identical.
2426      */
2427 
2428     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2429 
2430     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2431 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
2432         if ( tmp ) {
2433             Perl_re_printf( aTHX_  "%*s",
2434                 colwidth,
2435                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2436 	                    PL_colors[0], PL_colors[1],
2437 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2438 	                    PERL_PV_ESCAPE_FIRSTCHAR
2439                 )
2440             );
2441         }
2442     }
2443 
2444     Perl_re_printf( aTHX_ "\n");
2445     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2446 
2447     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2448         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2449     }
2450 
2451     Perl_re_printf( aTHX_  "\n" );
2452 
2453     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2454 
2455         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2456             depth+1,
2457             (UV)TRIE_NODENUM( state ) );
2458 
2459         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2460             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2461             if (v)
2462                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2463             else
2464                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2465         }
2466         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2467             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2468                                             (UV)trie->trans[ state ].check );
2469         } else {
2470             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2471                                             (UV)trie->trans[ state ].check,
2472             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2473         }
2474     }
2475 }
2476 
2477 #endif
2478 
2479 
2480 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2481   startbranch: the first branch in the whole branch sequence
2482   first      : start branch of sequence of branch-exact nodes.
2483 	       May be the same as startbranch
2484   last       : Thing following the last branch.
2485 	       May be the same as tail.
2486   tail       : item following the branch sequence
2487   count      : words in the sequence
2488   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2489   depth      : indent depth
2490 
2491 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2492 
2493 A trie is an N'ary tree where the branches are determined by digital
2494 decomposition of the key. IE, at the root node you look up the 1st character and
2495 follow that branch repeat until you find the end of the branches. Nodes can be
2496 marked as "accepting" meaning they represent a complete word. Eg:
2497 
2498   /he|she|his|hers/
2499 
2500 would convert into the following structure. Numbers represent states, letters
2501 following numbers represent valid transitions on the letter from that state, if
2502 the number is in square brackets it represents an accepting state, otherwise it
2503 will be in parenthesis.
2504 
2505       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2506       |    |
2507       |   (2)
2508       |    |
2509      (1)   +-i->(6)-+-s->[7]
2510       |
2511       +-s->(3)-+-h->(4)-+-e->[5]
2512 
2513       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2514 
2515 This shows that when matching against the string 'hers' we will begin at state 1
2516 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2517 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2518 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2519 single traverse. We store a mapping from accepting to state to which word was
2520 matched, and then when we have multiple possibilities we try to complete the
2521 rest of the regex in the order in which they occurred in the alternation.
2522 
2523 The only prior NFA like behaviour that would be changed by the TRIE support is
2524 the silent ignoring of duplicate alternations which are of the form:
2525 
2526  / (DUPE|DUPE) X? (?{ ... }) Y /x
2527 
2528 Thus EVAL blocks following a trie may be called a different number of times with
2529 and without the optimisation. With the optimisations dupes will be silently
2530 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2531 the following demonstrates:
2532 
2533  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2534 
2535 which prints out 'word' three times, but
2536 
2537  'words'=~/(word|word|word)(?{ print $1 })S/
2538 
2539 which doesnt print it out at all. This is due to other optimisations kicking in.
2540 
2541 Example of what happens on a structural level:
2542 
2543 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2544 
2545    1: CURLYM[1] {1,32767}(18)
2546    5:   BRANCH(8)
2547    6:     EXACT <ac>(16)
2548    8:   BRANCH(11)
2549    9:     EXACT <ad>(16)
2550   11:   BRANCH(14)
2551   12:     EXACT <ab>(16)
2552   16:   SUCCEED(0)
2553   17:   NOTHING(18)
2554   18: END(0)
2555 
2556 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2557 and should turn into:
2558 
2559    1: CURLYM[1] {1,32767}(18)
2560    5:   TRIE(16)
2561 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2562 	  <ac>
2563 	  <ad>
2564 	  <ab>
2565   16:   SUCCEED(0)
2566   17:   NOTHING(18)
2567   18: END(0)
2568 
2569 Cases where tail != last would be like /(?foo|bar)baz/:
2570 
2571    1: BRANCH(4)
2572    2:   EXACT <foo>(8)
2573    4: BRANCH(7)
2574    5:   EXACT <bar>(8)
2575    7: TAIL(8)
2576    8: EXACT <baz>(10)
2577   10: END(0)
2578 
2579 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2580 and would end up looking like:
2581 
2582     1: TRIE(8)
2583       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2584 	<foo>
2585 	<bar>
2586    7: TAIL(8)
2587    8: EXACT <baz>(10)
2588   10: END(0)
2589 
2590     d = uvchr_to_utf8_flags(d, uv, 0);
2591 
2592 is the recommended Unicode-aware way of saying
2593 
2594     *(d++) = uv;
2595 */
2596 
2597 #define TRIE_STORE_REVCHAR(val)                                            \
2598     STMT_START {                                                           \
2599 	if (UTF) {							   \
2600             SV *zlopp = newSV(UTF8_MAXBYTES);				   \
2601 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
2602             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2603             *kapow = '\0';                                                 \
2604 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
2605 	    SvPOK_on(zlopp);						   \
2606 	    SvUTF8_on(zlopp);						   \
2607 	    av_push(revcharmap, zlopp);					   \
2608 	} else {							   \
2609             char ooooff = (char)val;                                           \
2610 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
2611 	}								   \
2612         } STMT_END
2613 
2614 /* This gets the next character from the input, folding it if not already
2615  * folded. */
2616 #define TRIE_READ_CHAR STMT_START {                                           \
2617     wordlen++;                                                                \
2618     if ( UTF ) {                                                              \
2619         /* if it is UTF then it is either already folded, or does not need    \
2620          * folding */                                                         \
2621         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2622     }                                                                         \
2623     else if (folder == PL_fold_latin1) {                                      \
2624         /* This folder implies Unicode rules, which in the range expressible  \
2625          *  by not UTF is the lower case, with the two exceptions, one of     \
2626          *  which should have been taken care of before calling this */       \
2627         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2628         uvc = toLOWER_L1(*uc);                                                \
2629         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2630         len = 1;                                                              \
2631     } else {                                                                  \
2632         /* raw data, will be folded later if needed */                        \
2633         uvc = (U32)*uc;                                                       \
2634         len = 1;                                                              \
2635     }                                                                         \
2636 } STMT_END
2637 
2638 
2639 
2640 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2641     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2642 	U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2643 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2644         TRIE_LIST_LEN( state ) = ging;                          \
2645     }                                                           \
2646     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2647     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2648     TRIE_LIST_CUR( state )++;                                   \
2649 } STMT_END
2650 
2651 #define TRIE_LIST_NEW(state) STMT_START {                       \
2652     Newx( trie->states[ state ].trans.list,                     \
2653 	4, reg_trie_trans_le );                                 \
2654      TRIE_LIST_CUR( state ) = 1;                                \
2655      TRIE_LIST_LEN( state ) = 4;                                \
2656 } STMT_END
2657 
2658 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2659     U16 dupe= trie->states[ state ].wordnum;                    \
2660     regnode * const noper_next = regnext( noper );              \
2661                                                                 \
2662     DEBUG_r({                                                   \
2663         /* store the word for dumping */                        \
2664         SV* tmp;                                                \
2665         if (OP(noper) != NOTHING)                               \
2666             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
2667         else                                                    \
2668             tmp = newSVpvn_utf8( "", 0, UTF );			\
2669         av_push( trie_words, tmp );                             \
2670     });                                                         \
2671                                                                 \
2672     curword++;                                                  \
2673     trie->wordinfo[curword].prev   = 0;                         \
2674     trie->wordinfo[curword].len    = wordlen;                   \
2675     trie->wordinfo[curword].accept = state;                     \
2676                                                                 \
2677     if ( noper_next < tail ) {                                  \
2678         if (!trie->jump)                                        \
2679             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2680                                                  sizeof(U16) ); \
2681         trie->jump[curword] = (U16)(noper_next - convert);      \
2682         if (!jumper)                                            \
2683             jumper = noper_next;                                \
2684         if (!nextbranch)                                        \
2685             nextbranch= regnext(cur);                           \
2686     }                                                           \
2687                                                                 \
2688     if ( dupe ) {                                               \
2689         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2690         /* chain, so that when the bits of chain are later    */\
2691         /* linked together, the dups appear in the chain      */\
2692 	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2693 	trie->wordinfo[dupe].prev = curword;                    \
2694     } else {                                                    \
2695         /* we haven't inserted this word yet.                */ \
2696         trie->states[ state ].wordnum = curword;                \
2697     }                                                           \
2698 } STMT_END
2699 
2700 
2701 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
2702      ( ( base + charid >=  ucharcount					\
2703          && base + charid < ubound					\
2704          && state == trie->trans[ base - ucharcount + charid ].check	\
2705          && trie->trans[ base - ucharcount + charid ].next )		\
2706            ? trie->trans[ base - ucharcount + charid ].next		\
2707            : ( state==1 ? special : 0 )					\
2708       )
2709 
2710 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2711 STMT_START {                                                \
2712     TRIE_BITMAP_SET(trie, uvc);                             \
2713     /* store the folded codepoint */                        \
2714     if ( folder )                                           \
2715         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2716                                                             \
2717     if ( !UTF ) {                                           \
2718         /* store first byte of utf8 representation of */    \
2719         /* variant codepoints */                            \
2720         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2721             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2722         }                                                   \
2723     }                                                       \
2724 } STMT_END
2725 #define MADE_TRIE       1
2726 #define MADE_JUMP_TRIE  2
2727 #define MADE_EXACT_TRIE 4
2728 
2729 STATIC I32
S_make_trie(pTHX_ RExC_state_t * pRExC_state,regnode * startbranch,regnode * first,regnode * last,regnode * tail,U32 word_count,U32 flags,U32 depth)2730 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2731                   regnode *first, regnode *last, regnode *tail,
2732                   U32 word_count, U32 flags, U32 depth)
2733 {
2734     /* first pass, loop through and scan words */
2735     reg_trie_data *trie;
2736     HV *widecharmap = NULL;
2737     AV *revcharmap = newAV();
2738     regnode *cur;
2739     STRLEN len = 0;
2740     UV uvc = 0;
2741     U16 curword = 0;
2742     U32 next_alloc = 0;
2743     regnode *jumper = NULL;
2744     regnode *nextbranch = NULL;
2745     regnode *convert = NULL;
2746     U32 *prev_states; /* temp array mapping each state to previous one */
2747     /* we just use folder as a flag in utf8 */
2748     const U8 * folder = NULL;
2749 
2750     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2751      * which stands for one trie structure, one hash, optionally followed
2752      * by two arrays */
2753 #ifdef DEBUGGING
2754     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2755     AV *trie_words = NULL;
2756     /* along with revcharmap, this only used during construction but both are
2757      * useful during debugging so we store them in the struct when debugging.
2758      */
2759 #else
2760     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2761     STRLEN trie_charcount=0;
2762 #endif
2763     SV *re_trie_maxbuff;
2764     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2765 
2766     PERL_ARGS_ASSERT_MAKE_TRIE;
2767 #ifndef DEBUGGING
2768     PERL_UNUSED_ARG(depth);
2769 #endif
2770 
2771     switch (flags) {
2772         case EXACT: case EXACT_REQ8: case EXACTL: break;
2773 	case EXACTFAA:
2774         case EXACTFUP:
2775 	case EXACTFU:
2776 	case EXACTFLU8: folder = PL_fold_latin1; break;
2777 	case EXACTF:  folder = PL_fold; break;
2778         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2779     }
2780 
2781     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2782     trie->refcount = 1;
2783     trie->startstate = 1;
2784     trie->wordcount = word_count;
2785     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2786     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2787     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2788 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2789     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2790                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2791 
2792     DEBUG_r({
2793         trie_words = newAV();
2794     });
2795 
2796     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2797     assert(re_trie_maxbuff);
2798     if (!SvIOK(re_trie_maxbuff)) {
2799         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2800     }
2801     DEBUG_TRIE_COMPILE_r({
2802         Perl_re_indentf( aTHX_
2803           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2804           depth+1,
2805           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2806           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2807     });
2808 
2809    /* Find the node we are going to overwrite */
2810     if ( first == startbranch && OP( last ) != BRANCH ) {
2811         /* whole branch chain */
2812         convert = first;
2813     } else {
2814         /* branch sub-chain */
2815         convert = NEXTOPER( first );
2816     }
2817 
2818     /*  -- First loop and Setup --
2819 
2820        We first traverse the branches and scan each word to determine if it
2821        contains widechars, and how many unique chars there are, this is
2822        important as we have to build a table with at least as many columns as we
2823        have unique chars.
2824 
2825        We use an array of integers to represent the character codes 0..255
2826        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2827        the native representation of the character value as the key and IV's for
2828        the coded index.
2829 
2830        *TODO* If we keep track of how many times each character is used we can
2831        remap the columns so that the table compression later on is more
2832        efficient in terms of memory by ensuring the most common value is in the
2833        middle and the least common are on the outside.  IMO this would be better
2834        than a most to least common mapping as theres a decent chance the most
2835        common letter will share a node with the least common, meaning the node
2836        will not be compressible. With a middle is most common approach the worst
2837        case is when we have the least common nodes twice.
2838 
2839      */
2840 
2841     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2842         regnode *noper = NEXTOPER( cur );
2843         const U8 *uc;
2844         const U8 *e;
2845         int foldlen = 0;
2846         U32 wordlen      = 0;         /* required init */
2847         STRLEN minchars = 0;
2848         STRLEN maxchars = 0;
2849         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2850                                                bitmap?*/
2851 
2852         if (OP(noper) == NOTHING) {
2853             /* skip past a NOTHING at the start of an alternation
2854              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2855              *
2856              * If the next node is not something we are supposed to process
2857              * we will just ignore it due to the condition guarding the
2858              * next block.
2859              */
2860 
2861             regnode *noper_next= regnext(noper);
2862             if (noper_next < tail)
2863                 noper= noper_next;
2864         }
2865 
2866         if (    noper < tail
2867             && (    OP(noper) == flags
2868                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2869                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2870                                          || OP(noper) == EXACTFUP))))
2871         {
2872             uc= (U8*)STRING(noper);
2873             e= uc + STR_LEN(noper);
2874         } else {
2875             trie->minlen= 0;
2876             continue;
2877         }
2878 
2879 
2880         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2881             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2882                                           regardless of encoding */
2883             if (OP( noper ) == EXACTFUP) {
2884                 /* false positives are ok, so just set this */
2885                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2886             }
2887         }
2888 
2889         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2890                                            branch */
2891             TRIE_CHARCOUNT(trie)++;
2892             TRIE_READ_CHAR;
2893 
2894             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2895              * is in effect.  Under /i, this character can match itself, or
2896              * anything that folds to it.  If not under /i, it can match just
2897              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2898              * all fold to k, and all are single characters.   But some folds
2899              * expand to more than one character, so for example LATIN SMALL
2900              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2901              * the string beginning at 'uc' is 'ffi', it could be matched by
2902              * three characters, or just by the one ligature character. (It
2903              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2904              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2905              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2906              * match.)  The trie needs to know the minimum and maximum number
2907              * of characters that could match so that it can use size alone to
2908              * quickly reject many match attempts.  The max is simple: it is
2909              * the number of folded characters in this branch (since a fold is
2910              * never shorter than what folds to it. */
2911 
2912             maxchars++;
2913 
2914             /* And the min is equal to the max if not under /i (indicated by
2915              * 'folder' being NULL), or there are no multi-character folds.  If
2916              * there is a multi-character fold, the min is incremented just
2917              * once, for the character that folds to the sequence.  Each
2918              * character in the sequence needs to be added to the list below of
2919              * characters in the trie, but we count only the first towards the
2920              * min number of characters needed.  This is done through the
2921              * variable 'foldlen', which is returned by the macros that look
2922              * for these sequences as the number of bytes the sequence
2923              * occupies.  Each time through the loop, we decrement 'foldlen' by
2924              * how many bytes the current char occupies.  Only when it reaches
2925              * 0 do we increment 'minchars' or look for another multi-character
2926              * sequence. */
2927             if (folder == NULL) {
2928                 minchars++;
2929             }
2930             else if (foldlen > 0) {
2931                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2932             }
2933             else {
2934                 minchars++;
2935 
2936                 /* See if *uc is the beginning of a multi-character fold.  If
2937                  * so, we decrement the length remaining to look at, to account
2938                  * for the current character this iteration.  (We can use 'uc'
2939                  * instead of the fold returned by TRIE_READ_CHAR because the
2940                  * macro is smart enough to account for any unfolded
2941                  * characters. */
2942                 if (UTF) {
2943                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2944                         foldlen -= UTF8SKIP(uc);
2945                     }
2946                 }
2947                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2948                     foldlen--;
2949                 }
2950             }
2951 
2952             /* The current character (and any potential folds) should be added
2953              * to the possible matching characters for this position in this
2954              * branch */
2955             if ( uvc < 256 ) {
2956                 if ( folder ) {
2957                     U8 folded= folder[ (U8) uvc ];
2958                     if ( !trie->charmap[ folded ] ) {
2959                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2960                         TRIE_STORE_REVCHAR( folded );
2961                     }
2962                 }
2963                 if ( !trie->charmap[ uvc ] ) {
2964                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2965                     TRIE_STORE_REVCHAR( uvc );
2966                 }
2967                 if ( set_bit ) {
2968 		    /* store the codepoint in the bitmap, and its folded
2969 		     * equivalent. */
2970                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2971                     set_bit = 0; /* We've done our bit :-) */
2972                 }
2973             } else {
2974 
2975                 /* XXX We could come up with the list of code points that fold
2976                  * to this using PL_utf8_foldclosures, except not for
2977                  * multi-char folds, as there may be multiple combinations
2978                  * there that could work, which needs to wait until runtime to
2979                  * resolve (The comment about LIGATURE FFI above is such an
2980                  * example */
2981 
2982                 SV** svpp;
2983                 if ( !widecharmap )
2984                     widecharmap = newHV();
2985 
2986                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2987 
2988                 if ( !svpp )
2989                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2990 
2991                 if ( !SvTRUE( *svpp ) ) {
2992                     sv_setiv( *svpp, ++trie->uniquecharcount );
2993                     TRIE_STORE_REVCHAR(uvc);
2994                 }
2995             }
2996         } /* end loop through characters in this branch of the trie */
2997 
2998         /* We take the min and max for this branch and combine to find the min
2999          * and max for all branches processed so far */
3000         if( cur == first ) {
3001             trie->minlen = minchars;
3002             trie->maxlen = maxchars;
3003         } else if (minchars < trie->minlen) {
3004             trie->minlen = minchars;
3005         } else if (maxchars > trie->maxlen) {
3006             trie->maxlen = maxchars;
3007         }
3008     } /* end first pass */
3009     DEBUG_TRIE_COMPILE_r(
3010         Perl_re_indentf( aTHX_
3011                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3012                 depth+1,
3013                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3014 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3015 		(int)trie->minlen, (int)trie->maxlen )
3016     );
3017 
3018     /*
3019         We now know what we are dealing with in terms of unique chars and
3020         string sizes so we can calculate how much memory a naive
3021         representation using a flat table  will take. If it's over a reasonable
3022         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3023         conservative but potentially much slower representation using an array
3024         of lists.
3025 
3026         At the end we convert both representations into the same compressed
3027         form that will be used in regexec.c for matching with. The latter
3028         is a form that cannot be used to construct with but has memory
3029         properties similar to the list form and access properties similar
3030         to the table form making it both suitable for fast searches and
3031         small enough that its feasable to store for the duration of a program.
3032 
3033         See the comment in the code where the compressed table is produced
3034         inplace from the flat tabe representation for an explanation of how
3035         the compression works.
3036 
3037     */
3038 
3039 
3040     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3041     prev_states[1] = 0;
3042 
3043     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3044                                                     > SvIV(re_trie_maxbuff) )
3045     {
3046         /*
3047             Second Pass -- Array Of Lists Representation
3048 
3049             Each state will be represented by a list of charid:state records
3050             (reg_trie_trans_le) the first such element holds the CUR and LEN
3051             points of the allocated array. (See defines above).
3052 
3053             We build the initial structure using the lists, and then convert
3054             it into the compressed table form which allows faster lookups
3055             (but cant be modified once converted).
3056         */
3057 
3058         STRLEN transcount = 1;
3059 
3060         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3061             depth+1));
3062 
3063 	trie->states = (reg_trie_state *)
3064 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3065 				  sizeof(reg_trie_state) );
3066         TRIE_LIST_NEW(1);
3067         next_alloc = 2;
3068 
3069         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3070 
3071             regnode *noper   = NEXTOPER( cur );
3072 	    U32 state        = 1;         /* required init */
3073 	    U16 charid       = 0;         /* sanity init */
3074             U32 wordlen      = 0;         /* required init */
3075 
3076             if (OP(noper) == NOTHING) {
3077                 regnode *noper_next= regnext(noper);
3078                 if (noper_next < tail)
3079                     noper= noper_next;
3080                 /* we will undo this assignment if noper does not
3081                  * point at a trieable type in the else clause of
3082                  * the following statement. */
3083             }
3084 
3085             if (    noper < tail
3086                 && (    OP(noper) == flags
3087                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3088                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3089                                              || OP(noper) == EXACTFUP))))
3090             {
3091                 const U8 *uc= (U8*)STRING(noper);
3092                 const U8 *e= uc + STR_LEN(noper);
3093 
3094                 for ( ; uc < e ; uc += len ) {
3095 
3096                     TRIE_READ_CHAR;
3097 
3098                     if ( uvc < 256 ) {
3099                         charid = trie->charmap[ uvc ];
3100 		    } else {
3101                         SV** const svpp = hv_fetch( widecharmap,
3102                                                     (char*)&uvc,
3103                                                     sizeof( UV ),
3104                                                     0);
3105                         if ( !svpp ) {
3106                             charid = 0;
3107                         } else {
3108                             charid=(U16)SvIV( *svpp );
3109                         }
3110 		    }
3111                     /* charid is now 0 if we dont know the char read, or
3112                      * nonzero if we do */
3113                     if ( charid ) {
3114 
3115                         U16 check;
3116                         U32 newstate = 0;
3117 
3118                         charid--;
3119                         if ( !trie->states[ state ].trans.list ) {
3120                             TRIE_LIST_NEW( state );
3121 			}
3122                         for ( check = 1;
3123                               check <= TRIE_LIST_USED( state );
3124                               check++ )
3125                         {
3126                             if ( TRIE_LIST_ITEM( state, check ).forid
3127                                                                     == charid )
3128                             {
3129                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3130                                 break;
3131                             }
3132                         }
3133                         if ( ! newstate ) {
3134                             newstate = next_alloc++;
3135 			    prev_states[newstate] = state;
3136                             TRIE_LIST_PUSH( state, charid, newstate );
3137                             transcount++;
3138                         }
3139                         state = newstate;
3140                     } else {
3141                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3142 		    }
3143 		}
3144             } else {
3145                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3146                  * on a trieable type. So we need to reset noper back to point at the first regop
3147                  * in the branch before we call TRIE_HANDLE_WORD()
3148                 */
3149                 noper= NEXTOPER(cur);
3150             }
3151             TRIE_HANDLE_WORD(state);
3152 
3153         } /* end second pass */
3154 
3155         /* next alloc is the NEXT state to be allocated */
3156         trie->statecount = next_alloc;
3157         trie->states = (reg_trie_state *)
3158 	    PerlMemShared_realloc( trie->states,
3159 				   next_alloc
3160 				   * sizeof(reg_trie_state) );
3161 
3162         /* and now dump it out before we compress it */
3163         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3164 							 revcharmap, next_alloc,
3165 							 depth+1)
3166         );
3167 
3168         trie->trans = (reg_trie_trans *)
3169 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3170         {
3171             U32 state;
3172             U32 tp = 0;
3173             U32 zp = 0;
3174 
3175 
3176             for( state=1 ; state < next_alloc ; state ++ ) {
3177                 U32 base=0;
3178 
3179                 /*
3180                 DEBUG_TRIE_COMPILE_MORE_r(
3181                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3182                 );
3183                 */
3184 
3185                 if (trie->states[state].trans.list) {
3186                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3187                     U16 maxid=minid;
3188 		    U16 idx;
3189 
3190                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3191 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3192 			if ( forid < minid ) {
3193 			    minid=forid;
3194 			} else if ( forid > maxid ) {
3195 			    maxid=forid;
3196 			}
3197                     }
3198                     if ( transcount < tp + maxid - minid + 1) {
3199                         transcount *= 2;
3200 			trie->trans = (reg_trie_trans *)
3201 			    PerlMemShared_realloc( trie->trans,
3202 						     transcount
3203 						     * sizeof(reg_trie_trans) );
3204                         Zero( trie->trans + (transcount / 2),
3205                               transcount / 2,
3206                               reg_trie_trans );
3207                     }
3208                     base = trie->uniquecharcount + tp - minid;
3209                     if ( maxid == minid ) {
3210                         U32 set = 0;
3211                         for ( ; zp < tp ; zp++ ) {
3212                             if ( ! trie->trans[ zp ].next ) {
3213                                 base = trie->uniquecharcount + zp - minid;
3214                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3215                                                                    1).newstate;
3216                                 trie->trans[ zp ].check = state;
3217                                 set = 1;
3218                                 break;
3219                             }
3220                         }
3221                         if ( !set ) {
3222                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3223                                                                    1).newstate;
3224                             trie->trans[ tp ].check = state;
3225                             tp++;
3226                             zp = tp;
3227                         }
3228                     } else {
3229                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3230                             const U32 tid = base
3231                                            - trie->uniquecharcount
3232                                            + TRIE_LIST_ITEM( state, idx ).forid;
3233                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3234                                                                 idx ).newstate;
3235                             trie->trans[ tid ].check = state;
3236                         }
3237                         tp += ( maxid - minid + 1 );
3238                     }
3239                     Safefree(trie->states[ state ].trans.list);
3240                 }
3241                 /*
3242                 DEBUG_TRIE_COMPILE_MORE_r(
3243                     Perl_re_printf( aTHX_  " base: %d\n",base);
3244                 );
3245                 */
3246                 trie->states[ state ].trans.base=base;
3247             }
3248             trie->lasttrans = tp + 1;
3249         }
3250     } else {
3251         /*
3252            Second Pass -- Flat Table Representation.
3253 
3254            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3255            each.  We know that we will need Charcount+1 trans at most to store
3256            the data (one row per char at worst case) So we preallocate both
3257            structures assuming worst case.
3258 
3259            We then construct the trie using only the .next slots of the entry
3260            structs.
3261 
3262            We use the .check field of the first entry of the node temporarily
3263            to make compression both faster and easier by keeping track of how
3264            many non zero fields are in the node.
3265 
3266            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3267            transition.
3268 
3269            There are two terms at use here: state as a TRIE_NODEIDX() which is
3270            a number representing the first entry of the node, and state as a
3271            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3272            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3273            if there are 2 entrys per node. eg:
3274 
3275              A B       A B
3276           1. 2 4    1. 3 7
3277           2. 0 3    3. 0 5
3278           3. 0 0    5. 0 0
3279           4. 0 0    7. 0 0
3280 
3281            The table is internally in the right hand, idx form. However as we
3282            also have to deal with the states array which is indexed by nodenum
3283            we have to use TRIE_NODENUM() to convert.
3284 
3285         */
3286         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3287             depth+1));
3288 
3289 	trie->trans = (reg_trie_trans *)
3290 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3291 				  * trie->uniquecharcount + 1,
3292 				  sizeof(reg_trie_trans) );
3293         trie->states = (reg_trie_state *)
3294 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3295 				  sizeof(reg_trie_state) );
3296         next_alloc = trie->uniquecharcount + 1;
3297 
3298 
3299         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3300 
3301             regnode *noper   = NEXTOPER( cur );
3302 
3303             U32 state        = 1;         /* required init */
3304 
3305             U16 charid       = 0;         /* sanity init */
3306             U32 accept_state = 0;         /* sanity init */
3307 
3308             U32 wordlen      = 0;         /* required init */
3309 
3310             if (OP(noper) == NOTHING) {
3311                 regnode *noper_next= regnext(noper);
3312                 if (noper_next < tail)
3313                     noper= noper_next;
3314                 /* we will undo this assignment if noper does not
3315                  * point at a trieable type in the else clause of
3316                  * the following statement. */
3317             }
3318 
3319             if (    noper < tail
3320                 && (    OP(noper) == flags
3321                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3322                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3323                                              || OP(noper) == EXACTFUP))))
3324             {
3325                 const U8 *uc= (U8*)STRING(noper);
3326                 const U8 *e= uc + STR_LEN(noper);
3327 
3328                 for ( ; uc < e ; uc += len ) {
3329 
3330                     TRIE_READ_CHAR;
3331 
3332                     if ( uvc < 256 ) {
3333                         charid = trie->charmap[ uvc ];
3334                     } else {
3335                         SV* const * const svpp = hv_fetch( widecharmap,
3336                                                            (char*)&uvc,
3337                                                            sizeof( UV ),
3338                                                            0);
3339                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3340                     }
3341                     if ( charid ) {
3342                         charid--;
3343                         if ( !trie->trans[ state + charid ].next ) {
3344                             trie->trans[ state + charid ].next = next_alloc;
3345                             trie->trans[ state ].check++;
3346 			    prev_states[TRIE_NODENUM(next_alloc)]
3347 				    = TRIE_NODENUM(state);
3348                             next_alloc += trie->uniquecharcount;
3349                         }
3350                         state = trie->trans[ state + charid ].next;
3351                     } else {
3352                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3353                     }
3354                     /* charid is now 0 if we dont know the char read, or
3355                      * nonzero if we do */
3356                 }
3357             } else {
3358                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3359                  * on a trieable type. So we need to reset noper back to point at the first regop
3360                  * in the branch before we call TRIE_HANDLE_WORD().
3361                 */
3362                 noper= NEXTOPER(cur);
3363             }
3364             accept_state = TRIE_NODENUM( state );
3365             TRIE_HANDLE_WORD(accept_state);
3366 
3367         } /* end second pass */
3368 
3369         /* and now dump it out before we compress it */
3370         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3371 							  revcharmap,
3372 							  next_alloc, depth+1));
3373 
3374         {
3375         /*
3376            * Inplace compress the table.*
3377 
3378            For sparse data sets the table constructed by the trie algorithm will
3379            be mostly 0/FAIL transitions or to put it another way mostly empty.
3380            (Note that leaf nodes will not contain any transitions.)
3381 
3382            This algorithm compresses the tables by eliminating most such
3383            transitions, at the cost of a modest bit of extra work during lookup:
3384 
3385            - Each states[] entry contains a .base field which indicates the
3386            index in the state[] array wheres its transition data is stored.
3387 
3388            - If .base is 0 there are no valid transitions from that node.
3389 
3390            - If .base is nonzero then charid is added to it to find an entry in
3391            the trans array.
3392 
3393            -If trans[states[state].base+charid].check!=state then the
3394            transition is taken to be a 0/Fail transition. Thus if there are fail
3395            transitions at the front of the node then the .base offset will point
3396            somewhere inside the previous nodes data (or maybe even into a node
3397            even earlier), but the .check field determines if the transition is
3398            valid.
3399 
3400            XXX - wrong maybe?
3401            The following process inplace converts the table to the compressed
3402            table: We first do not compress the root node 1,and mark all its
3403            .check pointers as 1 and set its .base pointer as 1 as well. This
3404            allows us to do a DFA construction from the compressed table later,
3405            and ensures that any .base pointers we calculate later are greater
3406            than 0.
3407 
3408            - We set 'pos' to indicate the first entry of the second node.
3409 
3410            - We then iterate over the columns of the node, finding the first and
3411            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3412            and set the .check pointers accordingly, and advance pos
3413            appropriately and repreat for the next node. Note that when we copy
3414            the next pointers we have to convert them from the original
3415            NODEIDX form to NODENUM form as the former is not valid post
3416            compression.
3417 
3418            - If a node has no transitions used we mark its base as 0 and do not
3419            advance the pos pointer.
3420 
3421            - If a node only has one transition we use a second pointer into the
3422            structure to fill in allocated fail transitions from other states.
3423            This pointer is independent of the main pointer and scans forward
3424            looking for null transitions that are allocated to a state. When it
3425            finds one it writes the single transition into the "hole".  If the
3426            pointer doesnt find one the single transition is appended as normal.
3427 
3428            - Once compressed we can Renew/realloc the structures to release the
3429            excess space.
3430 
3431            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3432            specifically Fig 3.47 and the associated pseudocode.
3433 
3434            demq
3435         */
3436         const U32 laststate = TRIE_NODENUM( next_alloc );
3437 	U32 state, charid;
3438         U32 pos = 0, zp=0;
3439         trie->statecount = laststate;
3440 
3441         for ( state = 1 ; state < laststate ; state++ ) {
3442             U8 flag = 0;
3443 	    const U32 stateidx = TRIE_NODEIDX( state );
3444 	    const U32 o_used = trie->trans[ stateidx ].check;
3445 	    U32 used = trie->trans[ stateidx ].check;
3446             trie->trans[ stateidx ].check = 0;
3447 
3448             for ( charid = 0;
3449                   used && charid < trie->uniquecharcount;
3450                   charid++ )
3451             {
3452                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3453                     if ( trie->trans[ stateidx + charid ].next ) {
3454                         if (o_used == 1) {
3455                             for ( ; zp < pos ; zp++ ) {
3456                                 if ( ! trie->trans[ zp ].next ) {
3457                                     break;
3458                                 }
3459                             }
3460                             trie->states[ state ].trans.base
3461                                                     = zp
3462                                                       + trie->uniquecharcount
3463                                                       - charid ;
3464                             trie->trans[ zp ].next
3465                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3466                                                              + charid ].next );
3467                             trie->trans[ zp ].check = state;
3468                             if ( ++zp > pos ) pos = zp;
3469                             break;
3470                         }
3471                         used--;
3472                     }
3473                     if ( !flag ) {
3474                         flag = 1;
3475                         trie->states[ state ].trans.base
3476                                        = pos + trie->uniquecharcount - charid ;
3477                     }
3478                     trie->trans[ pos ].next
3479                         = SAFE_TRIE_NODENUM(
3480                                        trie->trans[ stateidx + charid ].next );
3481                     trie->trans[ pos ].check = state;
3482                     pos++;
3483                 }
3484             }
3485         }
3486         trie->lasttrans = pos + 1;
3487         trie->states = (reg_trie_state *)
3488 	    PerlMemShared_realloc( trie->states, laststate
3489 				   * sizeof(reg_trie_state) );
3490         DEBUG_TRIE_COMPILE_MORE_r(
3491             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3492                 depth+1,
3493                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3494                        + 1 ),
3495                 (IV)next_alloc,
3496                 (IV)pos,
3497                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3498             );
3499 
3500         } /* end table compress */
3501     }
3502     DEBUG_TRIE_COMPILE_MORE_r(
3503             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3504                 depth+1,
3505                 (UV)trie->statecount,
3506                 (UV)trie->lasttrans)
3507     );
3508     /* resize the trans array to remove unused space */
3509     trie->trans = (reg_trie_trans *)
3510 	PerlMemShared_realloc( trie->trans, trie->lasttrans
3511 			       * sizeof(reg_trie_trans) );
3512 
3513     {   /* Modify the program and insert the new TRIE node */
3514         U8 nodetype =(U8)(flags & 0xFF);
3515         char *str=NULL;
3516 
3517 #ifdef DEBUGGING
3518         regnode *optimize = NULL;
3519 #ifdef RE_TRACK_PATTERN_OFFSETS
3520 
3521         U32 mjd_offset = 0;
3522         U32 mjd_nodelen = 0;
3523 #endif /* RE_TRACK_PATTERN_OFFSETS */
3524 #endif /* DEBUGGING */
3525         /*
3526            This means we convert either the first branch or the first Exact,
3527            depending on whether the thing following (in 'last') is a branch
3528            or not and whther first is the startbranch (ie is it a sub part of
3529            the alternation or is it the whole thing.)
3530            Assuming its a sub part we convert the EXACT otherwise we convert
3531            the whole branch sequence, including the first.
3532          */
3533         /* Find the node we are going to overwrite */
3534         if ( first != startbranch || OP( last ) == BRANCH ) {
3535             /* branch sub-chain */
3536             NEXT_OFF( first ) = (U16)(last - first);
3537 #ifdef RE_TRACK_PATTERN_OFFSETS
3538             DEBUG_r({
3539                 mjd_offset= Node_Offset((convert));
3540                 mjd_nodelen= Node_Length((convert));
3541             });
3542 #endif
3543             /* whole branch chain */
3544         }
3545 #ifdef RE_TRACK_PATTERN_OFFSETS
3546         else {
3547             DEBUG_r({
3548                 const  regnode *nop = NEXTOPER( convert );
3549                 mjd_offset= Node_Offset((nop));
3550                 mjd_nodelen= Node_Length((nop));
3551             });
3552         }
3553         DEBUG_OPTIMISE_r(
3554             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3555                 depth+1,
3556                 (UV)mjd_offset, (UV)mjd_nodelen)
3557         );
3558 #endif
3559         /* But first we check to see if there is a common prefix we can
3560            split out as an EXACT and put in front of the TRIE node.  */
3561         trie->startstate= 1;
3562         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3563             /* we want to find the first state that has more than
3564              * one transition, if that state is not the first state
3565              * then we have a common prefix which we can remove.
3566              */
3567             U32 state;
3568             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3569                 U32 ofs = 0;
3570                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3571                                        transition, -1 means none */
3572                 U32 count = 0;
3573                 const U32 base = trie->states[ state ].trans.base;
3574 
3575                 /* does this state terminate an alternation? */
3576                 if ( trie->states[state].wordnum )
3577                         count = 1;
3578 
3579                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3580                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3581                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3582                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3583                     {
3584                         if ( ++count > 1 ) {
3585                             /* we have more than one transition */
3586                             SV **tmp;
3587                             U8 *ch;
3588                             /* if this is the first state there is no common prefix
3589                              * to extract, so we can exit */
3590                             if ( state == 1 ) break;
3591                             tmp = av_fetch( revcharmap, ofs, 0);
3592                             ch = (U8*)SvPV_nolen_const( *tmp );
3593 
3594                             /* if we are on count 2 then we need to initialize the
3595                              * bitmap, and store the previous char if there was one
3596                              * in it*/
3597                             if ( count == 2 ) {
3598                                 /* clear the bitmap */
3599                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3600                                 DEBUG_OPTIMISE_r(
3601                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3602                                         depth+1,
3603                                         (UV)state));
3604                                 if (first_ofs >= 0) {
3605                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3606 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3607 
3608                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3609                                     DEBUG_OPTIMISE_r(
3610                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3611                                     );
3612 				}
3613 			    }
3614                             /* store the current firstchar in the bitmap */
3615                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3616                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3617 			}
3618                         first_ofs = ofs;
3619 		    }
3620                 }
3621                 if ( count == 1 ) {
3622                     /* This state has only one transition, its transition is part
3623                      * of a common prefix - we need to concatenate the char it
3624                      * represents to what we have so far. */
3625                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3626                     STRLEN len;
3627                     char *ch = SvPV( *tmp, len );
3628                     DEBUG_OPTIMISE_r({
3629                         SV *sv=sv_newmortal();
3630                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3631                             depth+1,
3632                             (UV)state, (UV)first_ofs,
3633                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3634 	                        PL_colors[0], PL_colors[1],
3635 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3636 	                        PERL_PV_ESCAPE_FIRSTCHAR
3637                             )
3638                         );
3639                     });
3640                     if ( state==1 ) {
3641                         OP( convert ) = nodetype;
3642                         str=STRING(convert);
3643                         setSTR_LEN(convert, 0);
3644                     }
3645                     assert( ( STR_LEN(convert) + len ) < 256 );
3646                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3647                     while (len--)
3648                         *str++ = *ch++;
3649 		} else {
3650 #ifdef DEBUGGING
3651 		    if (state>1)
3652                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3653 #endif
3654 		    break;
3655 		}
3656 	    }
3657 	    trie->prefixlen = (state-1);
3658             if (str) {
3659                 regnode *n = convert+NODE_SZ_STR(convert);
3660                 assert( NODE_SZ_STR(convert) <= U16_MAX );
3661                 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3662                 trie->startstate = state;
3663                 trie->minlen -= (state - 1);
3664                 trie->maxlen -= (state - 1);
3665 #ifdef DEBUGGING
3666                /* At least the UNICOS C compiler choked on this
3667                 * being argument to DEBUG_r(), so let's just have
3668                 * it right here. */
3669                if (
3670 #ifdef PERL_EXT_RE_BUILD
3671                    1
3672 #else
3673                    DEBUG_r_TEST
3674 #endif
3675                    ) {
3676                    regnode *fix = convert;
3677                    U32 word = trie->wordcount;
3678 #ifdef RE_TRACK_PATTERN_OFFSETS
3679                    mjd_nodelen++;
3680 #endif
3681                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3682                    while( ++fix < n ) {
3683                        Set_Node_Offset_Length(fix, 0, 0);
3684                    }
3685                    while (word--) {
3686                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3687                        if (tmp) {
3688                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3689                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3690                            else
3691                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3692                        }
3693                    }
3694                }
3695 #endif
3696                 if (trie->maxlen) {
3697                     convert = n;
3698 		} else {
3699                     NEXT_OFF(convert) = (U16)(tail - convert);
3700                     DEBUG_r(optimize= n);
3701                 }
3702             }
3703         }
3704         if (!jumper)
3705             jumper = last;
3706         if ( trie->maxlen ) {
3707 	    NEXT_OFF( convert ) = (U16)(tail - convert);
3708 	    ARG_SET( convert, data_slot );
3709 	    /* Store the offset to the first unabsorbed branch in
3710 	       jump[0], which is otherwise unused by the jump logic.
3711 	       We use this when dumping a trie and during optimisation. */
3712 	    if (trie->jump)
3713 	        trie->jump[0] = (U16)(nextbranch - convert);
3714 
3715             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3716 	     *   and there is a bitmap
3717 	     *   and the first "jump target" node we found leaves enough room
3718 	     * then convert the TRIE node into a TRIEC node, with the bitmap
3719 	     * embedded inline in the opcode - this is hypothetically faster.
3720 	     */
3721             if ( !trie->states[trie->startstate].wordnum
3722 		 && trie->bitmap
3723 		 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3724             {
3725                 OP( convert ) = TRIEC;
3726                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3727                 PerlMemShared_free(trie->bitmap);
3728                 trie->bitmap= NULL;
3729             } else
3730                 OP( convert ) = TRIE;
3731 
3732             /* store the type in the flags */
3733             convert->flags = nodetype;
3734             DEBUG_r({
3735             optimize = convert
3736                       + NODE_STEP_REGNODE
3737                       + regarglen[ OP( convert ) ];
3738             });
3739             /* XXX We really should free up the resource in trie now,
3740                    as we won't use them - (which resources?) dmq */
3741         }
3742         /* needed for dumping*/
3743         DEBUG_r(if (optimize) {
3744             regnode *opt = convert;
3745 
3746             while ( ++opt < optimize) {
3747                 Set_Node_Offset_Length(opt, 0, 0);
3748             }
3749             /*
3750                 Try to clean up some of the debris left after the
3751                 optimisation.
3752              */
3753             while( optimize < jumper ) {
3754                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3755                 OP( optimize ) = OPTIMIZED;
3756                 Set_Node_Offset_Length(optimize, 0, 0);
3757                 optimize++;
3758             }
3759             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3760         });
3761     } /* end node insert */
3762 
3763     /*  Finish populating the prev field of the wordinfo array.  Walk back
3764      *  from each accept state until we find another accept state, and if
3765      *  so, point the first word's .prev field at the second word. If the
3766      *  second already has a .prev field set, stop now. This will be the
3767      *  case either if we've already processed that word's accept state,
3768      *  or that state had multiple words, and the overspill words were
3769      *  already linked up earlier.
3770      */
3771     {
3772 	U16 word;
3773 	U32 state;
3774 	U16 prev;
3775 
3776 	for (word=1; word <= trie->wordcount; word++) {
3777 	    prev = 0;
3778 	    if (trie->wordinfo[word].prev)
3779 		continue;
3780 	    state = trie->wordinfo[word].accept;
3781 	    while (state) {
3782 		state = prev_states[state];
3783 		if (!state)
3784 		    break;
3785 		prev = trie->states[state].wordnum;
3786 		if (prev)
3787 		    break;
3788 	    }
3789 	    trie->wordinfo[word].prev = prev;
3790 	}
3791 	Safefree(prev_states);
3792     }
3793 
3794 
3795     /* and now dump out the compressed format */
3796     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3797 
3798     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3799 #ifdef DEBUGGING
3800     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3801     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3802 #else
3803     SvREFCNT_dec_NN(revcharmap);
3804 #endif
3805     return trie->jump
3806            ? MADE_JUMP_TRIE
3807            : trie->startstate>1
3808              ? MADE_EXACT_TRIE
3809              : MADE_TRIE;
3810 }
3811 
3812 STATIC regnode *
S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t * pRExC_state,regnode * source,U32 depth)3813 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3814 {
3815 /* The Trie is constructed and compressed now so we can build a fail array if
3816  * it's needed
3817 
3818    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3819    3.32 in the
3820    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3821    Ullman 1985/88
3822    ISBN 0-201-10088-6
3823 
3824    We find the fail state for each state in the trie, this state is the longest
3825    proper suffix of the current state's 'word' that is also a proper prefix of
3826    another word in our trie. State 1 represents the word '' and is thus the
3827    default fail state. This allows the DFA not to have to restart after its
3828    tried and failed a word at a given point, it simply continues as though it
3829    had been matching the other word in the first place.
3830    Consider
3831       'abcdgu'=~/abcdefg|cdgu/
3832    When we get to 'd' we are still matching the first word, we would encounter
3833    'g' which would fail, which would bring us to the state representing 'd' in
3834    the second word where we would try 'g' and succeed, proceeding to match
3835    'cdgu'.
3836  */
3837  /* add a fail transition */
3838     const U32 trie_offset = ARG(source);
3839     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3840     U32 *q;
3841     const U32 ucharcount = trie->uniquecharcount;
3842     const U32 numstates = trie->statecount;
3843     const U32 ubound = trie->lasttrans + ucharcount;
3844     U32 q_read = 0;
3845     U32 q_write = 0;
3846     U32 charid;
3847     U32 base = trie->states[ 1 ].trans.base;
3848     U32 *fail;
3849     reg_ac_data *aho;
3850     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3851     regnode *stclass;
3852     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3853 
3854     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3855     PERL_UNUSED_CONTEXT;
3856 #ifndef DEBUGGING
3857     PERL_UNUSED_ARG(depth);
3858 #endif
3859 
3860     if ( OP(source) == TRIE ) {
3861         struct regnode_1 *op = (struct regnode_1 *)
3862             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3863         StructCopy(source, op, struct regnode_1);
3864         stclass = (regnode *)op;
3865     } else {
3866         struct regnode_charclass *op = (struct regnode_charclass *)
3867             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3868         StructCopy(source, op, struct regnode_charclass);
3869         stclass = (regnode *)op;
3870     }
3871     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3872 
3873     ARG_SET( stclass, data_slot );
3874     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3875     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3876     aho->trie=trie_offset;
3877     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3878     Copy( trie->states, aho->states, numstates, reg_trie_state );
3879     Newx( q, numstates, U32);
3880     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3881     aho->refcount = 1;
3882     fail = aho->fail;
3883     /* initialize fail[0..1] to be 1 so that we always have
3884        a valid final fail state */
3885     fail[ 0 ] = fail[ 1 ] = 1;
3886 
3887     for ( charid = 0; charid < ucharcount ; charid++ ) {
3888 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3889 	if ( newstate ) {
3890             q[ q_write ] = newstate;
3891             /* set to point at the root */
3892             fail[ q[ q_write++ ] ]=1;
3893         }
3894     }
3895     while ( q_read < q_write) {
3896 	const U32 cur = q[ q_read++ % numstates ];
3897         base = trie->states[ cur ].trans.base;
3898 
3899         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3900 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3901 	    if (ch_state) {
3902                 U32 fail_state = cur;
3903                 U32 fail_base;
3904                 do {
3905                     fail_state = fail[ fail_state ];
3906                     fail_base = aho->states[ fail_state ].trans.base;
3907                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3908 
3909                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3910                 fail[ ch_state ] = fail_state;
3911                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3912                 {
3913                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3914                 }
3915                 q[ q_write++ % numstates] = ch_state;
3916             }
3917         }
3918     }
3919     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3920        when we fail in state 1, this allows us to use the
3921        charclass scan to find a valid start char. This is based on the principle
3922        that theres a good chance the string being searched contains lots of stuff
3923        that cant be a start char.
3924      */
3925     fail[ 0 ] = fail[ 1 ] = 0;
3926     DEBUG_TRIE_COMPILE_r({
3927         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3928                       depth, (UV)numstates
3929         );
3930         for( q_read=1; q_read<numstates; q_read++ ) {
3931             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3932         }
3933         Perl_re_printf( aTHX_  "\n");
3934     });
3935     Safefree(q);
3936     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3937     return stclass;
3938 }
3939 
3940 
3941 /* The below joins as many adjacent EXACTish nodes as possible into a single
3942  * one.  The regop may be changed if the node(s) contain certain sequences that
3943  * require special handling.  The joining is only done if:
3944  * 1) there is room in the current conglomerated node to entirely contain the
3945  *    next one.
3946  * 2) they are compatible node types
3947  *
3948  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3949  * these get optimized out
3950  *
3951  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3952  * as possible, even if that means splitting an existing node so that its first
3953  * part is moved to the preceeding node.  This would maximise the efficiency of
3954  * memEQ during matching.
3955  *
3956  * If a node is to match under /i (folded), the number of characters it matches
3957  * can be different than its character length if it contains a multi-character
3958  * fold.  *min_subtract is set to the total delta number of characters of the
3959  * input nodes.
3960  *
3961  * And *unfolded_multi_char is set to indicate whether or not the node contains
3962  * an unfolded multi-char fold.  This happens when it won't be known until
3963  * runtime whether the fold is valid or not; namely
3964  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3965  *      target string being matched against turns out to be UTF-8 is that fold
3966  *      valid; or
3967  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3968  *      runtime.
3969  * (Multi-char folds whose components are all above the Latin1 range are not
3970  * run-time locale dependent, and have already been folded by the time this
3971  * function is called.)
3972  *
3973  * This is as good a place as any to discuss the design of handling these
3974  * multi-character fold sequences.  It's been wrong in Perl for a very long
3975  * time.  There are three code points in Unicode whose multi-character folds
3976  * were long ago discovered to mess things up.  The previous designs for
3977  * dealing with these involved assigning a special node for them.  This
3978  * approach doesn't always work, as evidenced by this example:
3979  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3980  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3981  * would match just the \xDF, it won't be able to handle the case where a
3982  * successful match would have to cross the node's boundary.  The new approach
3983  * that hopefully generally solves the problem generates an EXACTFUP node
3984  * that is "sss" in this case.
3985  *
3986  * It turns out that there are problems with all multi-character folds, and not
3987  * just these three.  Now the code is general, for all such cases.  The
3988  * approach taken is:
3989  * 1)   This routine examines each EXACTFish node that could contain multi-
3990  *      character folded sequences.  Since a single character can fold into
3991  *      such a sequence, the minimum match length for this node is less than
3992  *      the number of characters in the node.  This routine returns in
3993  *      *min_subtract how many characters to subtract from the actual
3994  *      length of the string to get a real minimum match length; it is 0 if
3995  *      there are no multi-char foldeds.  This delta is used by the caller to
3996  *      adjust the min length of the match, and the delta between min and max,
3997  *      so that the optimizer doesn't reject these possibilities based on size
3998  *      constraints.
3999  *
4000  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4001  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
4002  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4003  *      EXACTFU nodes.  The node type of such nodes is then changed to
4004  *      EXACTFUP, indicating it is problematic, and needs careful handling.
4005  *      (The procedures in step 1) above are sufficient to handle this case in
4006  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
4007  *      the only case where there is a possible fold length change in non-UTF-8
4008  *      patterns.  By reserving a special node type for problematic cases, the
4009  *      far more common regular EXACTFU nodes can be processed faster.
4010  *      regexec.c takes advantage of this.
4011  *
4012  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4013  *      problematic cases.   These all only occur when the pattern is not
4014  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
4015  *      length change, it handles the situation where the string cannot be
4016  *      entirely folded.  The strings in an EXACTFish node are folded as much
4017  *      as possible during compilation in regcomp.c.  This saves effort in
4018  *      regex matching.  By using an EXACTFUP node when it is not possible to
4019  *      fully fold at compile time, regexec.c can know that everything in an
4020  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
4021  *      case where folding in EXACTFU nodes can't be done at compile time is
4022  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
4023  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
4024  *      handle two very different cases.  Alternatively, there could have been
4025  *      a node type where there are length changes, one for unfolded, and one
4026  *      for both.  If yet another special case needed to be created, the number
4027  *      of required node types would have to go to 7.  khw figures that even
4028  *      though there are plenty of node types to spare, that the maintenance
4029  *      cost wasn't worth the small speedup of doing it that way, especially
4030  *      since he thinks the MICRO SIGN is rarely encountered in practice.
4031  *
4032  *      There are other cases where folding isn't done at compile time, but
4033  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
4034  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
4035  *      changes.  Some folds in EXACTF depend on if the runtime target string
4036  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
4037  *      when no fold in it depends on the UTF-8ness of the target string.)
4038  *
4039  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
4040  *      validity of the fold won't be known until runtime, and so must remain
4041  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
4042  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
4043  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
4044  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4045  *      The reason this is a problem is that the optimizer part of regexec.c
4046  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4047  *      that a character in the pattern corresponds to at most a single
4048  *      character in the target string.  (And I do mean character, and not byte
4049  *      here, unlike other parts of the documentation that have never been
4050  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
4051  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4052  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
4053  *      EXACTFL nodes, violate the assumption, and they are the only instances
4054  *      where it is violated.  I'm reluctant to try to change the assumption,
4055  *      as the code involved is impenetrable to me (khw), so instead the code
4056  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
4057  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4058  *      boolean indicating whether or not the node contains such a fold.  When
4059  *      it is true, the caller sets a flag that later causes the optimizer in
4060  *      this file to not set values for the floating and fixed string lengths,
4061  *      and thus avoids the optimizer code in regexec.c that makes the invalid
4062  *      assumption.  Thus, there is no optimization based on string lengths for
4063  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4064  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
4065  *      assumption is wrong only in these cases is that all other non-UTF-8
4066  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4067  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
4068  *      EXACTF nodes because we don't know at compile time if it actually
4069  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
4070  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
4071  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
4072  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4073  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4074  *      string would require the pattern to be forced into UTF-8, the overhead
4075  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
4076  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4077  *      locale.)
4078  *
4079  *      Similarly, the code that generates tries doesn't currently handle
4080  *      not-already-folded multi-char folds, and it looks like a pain to change
4081  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4082  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4083  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4084  *      using /iaa matching will be doing so almost entirely with ASCII
4085  *      strings, so this should rarely be encountered in practice */
4086 
4087 STATIC U32
S_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)4088 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4089                    UV *min_subtract, bool *unfolded_multi_char,
4090                    U32 flags, regnode *val, U32 depth)
4091 {
4092     /* Merge several consecutive EXACTish nodes into one. */
4093 
4094     regnode *n = regnext(scan);
4095     U32 stringok = 1;
4096     regnode *next = scan + NODE_SZ_STR(scan);
4097     U32 merged = 0;
4098     U32 stopnow = 0;
4099 #ifdef DEBUGGING
4100     regnode *stop = scan;
4101     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4102 #else
4103     PERL_UNUSED_ARG(depth);
4104 #endif
4105 
4106     PERL_ARGS_ASSERT_JOIN_EXACT;
4107 #ifndef EXPERIMENTAL_INPLACESCAN
4108     PERL_UNUSED_ARG(flags);
4109     PERL_UNUSED_ARG(val);
4110 #endif
4111     DEBUG_PEEP("join", scan, depth, 0);
4112 
4113     assert(PL_regkind[OP(scan)] == EXACT);
4114 
4115     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4116      * EXACT ones that are mergeable to the current one. */
4117     while (    n
4118            && (    PL_regkind[OP(n)] == NOTHING
4119                || (stringok && PL_regkind[OP(n)] == EXACT))
4120            && NEXT_OFF(n)
4121            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4122     {
4123 
4124         if (OP(n) == TAIL || n > next)
4125             stringok = 0;
4126         if (PL_regkind[OP(n)] == NOTHING) {
4127             DEBUG_PEEP("skip:", n, depth, 0);
4128             NEXT_OFF(scan) += NEXT_OFF(n);
4129             next = n + NODE_STEP_REGNODE;
4130 #ifdef DEBUGGING
4131             if (stringok)
4132                 stop = n;
4133 #endif
4134             n = regnext(n);
4135         }
4136         else if (stringok) {
4137             const unsigned int oldl = STR_LEN(scan);
4138             regnode * const nnext = regnext(n);
4139 
4140             /* XXX I (khw) kind of doubt that this works on platforms (should
4141              * Perl ever run on one) where U8_MAX is above 255 because of lots
4142              * of other assumptions */
4143             /* Don't join if the sum can't fit into a single node */
4144             if (oldl + STR_LEN(n) > U8_MAX)
4145                 break;
4146 
4147             /* Joining something that requires UTF-8 with something that
4148              * doesn't, means the result requires UTF-8. */
4149             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4150                 OP(scan) = EXACT_REQ8;
4151             }
4152             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4153                 ;   /* join is compatible, no need to change OP */
4154             }
4155             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4156                 OP(scan) = EXACTFU_REQ8;
4157             }
4158             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4159                 ;   /* join is compatible, no need to change OP */
4160             }
4161             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4162                 ;   /* join is compatible, no need to change OP */
4163             }
4164             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4165 
4166                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4167                   * which can join with EXACTFU ones.  We check for this case
4168                   * here.  These need to be resolved to either EXACTFU or
4169                   * EXACTF at joining time.  They have nothing in them that
4170                   * would forbid them from being the more desirable EXACTFU
4171                   * nodes except that they begin and/or end with a single [Ss].
4172                   * The reason this is problematic is because they could be
4173                   * joined in this loop with an adjacent node that ends and/or
4174                   * begins with [Ss] which would then form the sequence 'ss',
4175                   * which matches differently under /di than /ui, in which case
4176                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4177                   * formed, the nodes get absorbed into any adjacent EXACTFU
4178                   * node.  And if the only adjacent node is EXACTF, they get
4179                   * absorbed into that, under the theory that a longer node is
4180                   * better than two shorter ones, even if one is EXACTFU.  Note
4181                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4182                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4183 
4184                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4185 
4186                     /* Here the joined node would end with 's'.  If the node
4187                      * following the combination is an EXACTF one, it's better to
4188                      * join this trailing edge 's' node with that one, leaving the
4189                      * current one in 'scan' be the more desirable EXACTFU */
4190                     if (OP(nnext) == EXACTF) {
4191                         break;
4192                     }
4193 
4194                     OP(scan) = EXACTFU_S_EDGE;
4195 
4196                 }   /* Otherwise, the beginning 's' of the 2nd node just
4197                        becomes an interior 's' in 'scan' */
4198             }
4199             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4200                 ;   /* join is compatible, no need to change OP */
4201             }
4202             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4203 
4204                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4205                  * nodes.  But the latter nodes can be also joined with EXACTFU
4206                  * ones, and that is a better outcome, so if the node following
4207                  * 'n' is EXACTFU, quit now so that those two can be joined
4208                  * later */
4209                 if (OP(nnext) == EXACTFU) {
4210                     break;
4211                 }
4212 
4213                 /* The join is compatible, and the combined node will be
4214                  * EXACTF.  (These don't care if they begin or end with 's' */
4215             }
4216             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4217                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4218                     && STRING(n)[0] == 's')
4219                 {
4220                     /* When combined, we have the sequence 'ss', which means we
4221                      * have to remain /di */
4222                     OP(scan) = EXACTF;
4223                 }
4224             }
4225             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4226                 if (STRING(n)[0] == 's') {
4227                     ;   /* Here the join is compatible and the combined node
4228                            starts with 's', no need to change OP */
4229                 }
4230                 else {  /* Now the trailing 's' is in the interior */
4231                     OP(scan) = EXACTFU;
4232                 }
4233             }
4234             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4235 
4236                 /* The join is compatible, and the combined node will be
4237                  * EXACTF.  (These don't care if they begin or end with 's' */
4238                 OP(scan) = EXACTF;
4239             }
4240             else if (OP(scan) != OP(n)) {
4241 
4242                 /* The only other compatible joinings are the same node type */
4243                 break;
4244             }
4245 
4246             DEBUG_PEEP("merg", n, depth, 0);
4247             merged++;
4248 
4249             NEXT_OFF(scan) += NEXT_OFF(n);
4250             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4251             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4252             next = n + NODE_SZ_STR(n);
4253             /* Now we can overwrite *n : */
4254             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4255 #ifdef DEBUGGING
4256             stop = next - 1;
4257 #endif
4258             n = nnext;
4259             if (stopnow) break;
4260         }
4261 
4262 #ifdef EXPERIMENTAL_INPLACESCAN
4263 	if (flags && !NEXT_OFF(n)) {
4264 	    DEBUG_PEEP("atch", val, depth, 0);
4265 	    if (reg_off_by_arg[OP(n)]) {
4266 		ARG_SET(n, val - n);
4267 	    }
4268 	    else {
4269 		NEXT_OFF(n) = val - n;
4270 	    }
4271 	    stopnow = 1;
4272 	}
4273 #endif
4274     }
4275 
4276     /* This temporary node can now be turned into EXACTFU, and must, as
4277      * regexec.c doesn't handle it */
4278     if (OP(scan) == EXACTFU_S_EDGE) {
4279         OP(scan) = EXACTFU;
4280     }
4281 
4282     *min_subtract = 0;
4283     *unfolded_multi_char = FALSE;
4284 
4285     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4286      * can now analyze for sequences of problematic code points.  (Prior to
4287      * this final joining, sequences could have been split over boundaries, and
4288      * hence missed).  The sequences only happen in folding, hence for any
4289      * non-EXACT EXACTish node */
4290     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4291         U8* s0 = (U8*) STRING(scan);
4292         U8* s = s0;
4293         U8* s_end = s0 + STR_LEN(scan);
4294 
4295         int total_count_delta = 0;  /* Total delta number of characters that
4296                                        multi-char folds expand to */
4297 
4298 	/* One pass is made over the node's string looking for all the
4299 	 * possibilities.  To avoid some tests in the loop, there are two main
4300 	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4301 	 * non-UTF-8 */
4302 	if (UTF) {
4303             U8* folded = NULL;
4304 
4305             if (OP(scan) == EXACTFL) {
4306                 U8 *d;
4307 
4308                 /* An EXACTFL node would already have been changed to another
4309                  * node type unless there is at least one character in it that
4310                  * is problematic; likely a character whose fold definition
4311                  * won't be known until runtime, and so has yet to be folded.
4312                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4313                  * to handle the UTF-8 case, we need to create a temporary
4314                  * folded copy using UTF-8 locale rules in order to analyze it.
4315                  * This is because our macros that look to see if a sequence is
4316                  * a multi-char fold assume everything is folded (otherwise the
4317                  * tests in those macros would be too complicated and slow).
4318                  * Note that here, the non-problematic folds will have already
4319                  * been done, so we can just copy such characters.  We actually
4320                  * don't completely fold the EXACTFL string.  We skip the
4321                  * unfolded multi-char folds, as that would just create work
4322                  * below to figure out the size they already are */
4323 
4324                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4325                 d = folded;
4326                 while (s < s_end) {
4327                     STRLEN s_len = UTF8SKIP(s);
4328                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4329                         Copy(s, d, s_len, U8);
4330                         d += s_len;
4331                     }
4332                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4333                         *unfolded_multi_char = TRUE;
4334                         Copy(s, d, s_len, U8);
4335                         d += s_len;
4336                     }
4337                     else if (isASCII(*s)) {
4338                         *(d++) = toFOLD(*s);
4339                     }
4340                     else {
4341                         STRLEN len;
4342                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4343                         d += len;
4344                     }
4345                     s += s_len;
4346                 }
4347 
4348                 /* Point the remainder of the routine to look at our temporary
4349                  * folded copy */
4350                 s = folded;
4351                 s_end = d;
4352             } /* End of creating folded copy of EXACTFL string */
4353 
4354             /* Examine the string for a multi-character fold sequence.  UTF-8
4355              * patterns have all characters pre-folded by the time this code is
4356              * executed */
4357             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4358                                      length sequence we are looking for is 2 */
4359 	    {
4360                 int count = 0;  /* How many characters in a multi-char fold */
4361                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4362                 if (! len) {    /* Not a multi-char fold: get next char */
4363                     s += UTF8SKIP(s);
4364                     continue;
4365                 }
4366 
4367                 { /* Here is a generic multi-char fold. */
4368                     U8* multi_end  = s + len;
4369 
4370                     /* Count how many characters are in it.  In the case of
4371                      * /aa, no folds which contain ASCII code points are
4372                      * allowed, so check for those, and skip if found. */
4373                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4374                         count = utf8_length(s, multi_end);
4375                         s = multi_end;
4376                     }
4377                     else {
4378                         while (s < multi_end) {
4379                             if (isASCII(*s)) {
4380                                 s++;
4381                                 goto next_iteration;
4382                             }
4383                             else {
4384                                 s += UTF8SKIP(s);
4385                             }
4386                             count++;
4387                         }
4388                     }
4389                 }
4390 
4391                 /* The delta is how long the sequence is minus 1 (1 is how long
4392                  * the character that folds to the sequence is) */
4393                 total_count_delta += count - 1;
4394               next_iteration: ;
4395 	    }
4396 
4397             /* We created a temporary folded copy of the string in EXACTFL
4398              * nodes.  Therefore we need to be sure it doesn't go below zero,
4399              * as the real string could be shorter */
4400             if (OP(scan) == EXACTFL) {
4401                 int total_chars = utf8_length((U8*) STRING(scan),
4402                                            (U8*) STRING(scan) + STR_LEN(scan));
4403                 if (total_count_delta > total_chars) {
4404                     total_count_delta = total_chars;
4405                 }
4406             }
4407 
4408             *min_subtract += total_count_delta;
4409             Safefree(folded);
4410 	}
4411 	else if (OP(scan) == EXACTFAA) {
4412 
4413             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4414              * fold to the ASCII range (and there are no existing ones in the
4415              * upper latin1 range).  But, as outlined in the comments preceding
4416              * this function, we need to flag any occurrences of the sharp s.
4417              * This character forbids trie formation (because of added
4418              * complexity) */
4419 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4420    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4421                                       || UNICODE_DOT_DOT_VERSION > 0)
4422 	    while (s < s_end) {
4423                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4424                     OP(scan) = EXACTFAA_NO_TRIE;
4425                     *unfolded_multi_char = TRUE;
4426                     break;
4427                 }
4428                 s++;
4429             }
4430         }
4431 	else if (OP(scan) != EXACTFAA_NO_TRIE) {
4432 
4433             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4434              * folds that are all Latin1.  As explained in the comments
4435              * preceding this function, we look also for the sharp s in EXACTF
4436              * and EXACTFL nodes; it can be in the final position.  Otherwise
4437              * we can stop looking 1 byte earlier because have to find at least
4438              * two characters for a multi-fold */
4439 	    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4440                               ? s_end
4441                               : s_end -1;
4442 
4443 	    while (s < upper) {
4444                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4445                 if (! len) {    /* Not a multi-char fold. */
4446                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4447                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4448                     {
4449                         *unfolded_multi_char = TRUE;
4450                     }
4451                     s++;
4452                     continue;
4453                 }
4454 
4455                 if (len == 2
4456                     && isALPHA_FOLD_EQ(*s, 's')
4457                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4458                 {
4459 
4460                     /* EXACTF nodes need to know that the minimum length
4461                      * changed so that a sharp s in the string can match this
4462                      * ss in the pattern, but they remain EXACTF nodes, as they
4463                      * won't match this unless the target string is in UTF-8,
4464                      * which we don't know until runtime.  EXACTFL nodes can't
4465                      * transform into EXACTFU nodes */
4466                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4467                         OP(scan) = EXACTFUP;
4468                     }
4469 		}
4470 
4471                 *min_subtract += len - 1;
4472                 s += len;
4473 	    }
4474 #endif
4475 	}
4476     }
4477 
4478 #ifdef DEBUGGING
4479     /* Allow dumping but overwriting the collection of skipped
4480      * ops and/or strings with fake optimized ops */
4481     n = scan + NODE_SZ_STR(scan);
4482     while (n <= stop) {
4483 	OP(n) = OPTIMIZED;
4484 	FLAGS(n) = 0;
4485 	NEXT_OFF(n) = 0;
4486         n++;
4487     }
4488 #endif
4489     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4490     return stopnow;
4491 }
4492 
4493 /* REx optimizer.  Converts nodes into quicker variants "in place".
4494    Finds fixed substrings.  */
4495 
4496 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4497    to the position after last scanned or to NULL. */
4498 
4499 #define INIT_AND_WITHP \
4500     assert(!and_withp); \
4501     Newx(and_withp, 1, regnode_ssc); \
4502     SAVEFREEPV(and_withp)
4503 
4504 
4505 static void
S_unwind_scan_frames(pTHX_ const void * p)4506 S_unwind_scan_frames(pTHX_ const void *p)
4507 {
4508     scan_frame *f= (scan_frame *)p;
4509     do {
4510         scan_frame *n= f->next_frame;
4511         Safefree(f);
4512         f= n;
4513     } while (f);
4514 }
4515 
4516 /* Follow the next-chain of the current node and optimize away
4517    all the NOTHINGs from it.
4518  */
4519 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)4520 S_rck_elide_nothing(pTHX_ regnode *node)
4521 {
4522     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4523 
4524     if (OP(node) != CURLYX) {
4525         const int max = (reg_off_by_arg[OP(node)]
4526                         ? I32_MAX
4527                           /* I32 may be smaller than U16 on CRAYs! */
4528                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4529         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4530         int noff;
4531         regnode *n = node;
4532 
4533         /* Skip NOTHING and LONGJMP. */
4534         while (
4535             (n = regnext(n))
4536             && (
4537                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4538                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4539             )
4540             && off + noff < max
4541         ) {
4542             off += noff;
4543         }
4544         if (reg_off_by_arg[OP(node)])
4545             ARG(node) = off;
4546         else
4547             NEXT_OFF(node) = off;
4548     }
4549     return;
4550 }
4551 
4552 /* the return from this sub is the minimum length that could possibly match */
4553 STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)4554 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4555                         SSize_t *minlenp, SSize_t *deltap,
4556 			regnode *last,
4557 			scan_data_t *data,
4558 			I32 stopparen,
4559                         U32 recursed_depth,
4560 			regnode_ssc *and_withp,
4561 			U32 flags, U32 depth, bool was_mutate_ok)
4562 			/* scanp: Start here (read-write). */
4563 			/* deltap: Write maxlen-minlen here. */
4564 			/* last: Stop before this one. */
4565 			/* data: string data about the pattern */
4566 			/* stopparen: treat close N as END */
4567 			/* recursed: which subroutines have we recursed into */
4568 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4569 {
4570     SSize_t final_minlen;
4571     /* There must be at least this number of characters to match */
4572     SSize_t min = 0;
4573     I32 pars = 0, code;
4574     regnode *scan = *scanp, *next;
4575     SSize_t delta = 0;
4576     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4577     int is_inf_internal = 0;		/* The studied chunk is infinite */
4578     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4579     scan_data_t data_fake;
4580     SV *re_trie_maxbuff = NULL;
4581     regnode *first_non_open = scan;
4582     SSize_t stopmin = OPTIMIZE_INFTY;
4583     scan_frame *frame = NULL;
4584     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4585 
4586     PERL_ARGS_ASSERT_STUDY_CHUNK;
4587     RExC_study_started= 1;
4588 
4589     Zero(&data_fake, 1, scan_data_t);
4590 
4591     if ( depth == 0 ) {
4592         while (first_non_open && OP(first_non_open) == OPEN)
4593             first_non_open=regnext(first_non_open);
4594     }
4595 
4596 
4597   fake_study_recurse:
4598     DEBUG_r(
4599         RExC_study_chunk_recursed_count++;
4600     );
4601     DEBUG_OPTIMISE_MORE_r(
4602     {
4603         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4604             depth, (long)stopparen,
4605             (unsigned long)RExC_study_chunk_recursed_count,
4606             (unsigned long)depth, (unsigned long)recursed_depth,
4607             scan,
4608             last);
4609         if (recursed_depth) {
4610             U32 i;
4611             U32 j;
4612             for ( j = 0 ; j < recursed_depth ; j++ ) {
4613                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4614                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4615                         Perl_re_printf( aTHX_ " %d",(int)i);
4616                         break;
4617                     }
4618                 }
4619                 if ( j + 1 < recursed_depth ) {
4620                     Perl_re_printf( aTHX_  ",");
4621                 }
4622             }
4623         }
4624         Perl_re_printf( aTHX_ "\n");
4625     }
4626     );
4627     while ( scan && OP(scan) != END && scan < last ){
4628         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4629                                    node length to get a real minimum (because
4630                                    the folded version may be shorter) */
4631 	bool unfolded_multi_char = FALSE;
4632         /* avoid mutating ops if we are anywhere within the recursed or
4633          * enframed handling for a GOSUB: the outermost level will handle it.
4634          */
4635         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4636 	/* Peephole optimizer: */
4637         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4638         DEBUG_PEEP("Peep", scan, depth, flags);
4639 
4640 
4641         /* The reason we do this here is that we need to deal with things like
4642          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4643          * parsing code, as each (?:..) is handled by a different invocation of
4644          * reg() -- Yves
4645          */
4646         if (PL_regkind[OP(scan)] == EXACT
4647             && OP(scan) != LEXACT
4648             && OP(scan) != LEXACT_REQ8
4649             && mutate_ok
4650         ) {
4651             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4652                     0, NULL, depth + 1);
4653         }
4654 
4655         /* Follow the next-chain of the current node and optimize
4656            away all the NOTHINGs from it.
4657          */
4658         rck_elide_nothing(scan);
4659 
4660         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4661          * several different things.  */
4662         if ( OP(scan) == DEFINEP ) {
4663             SSize_t minlen = 0;
4664             SSize_t deltanext = 0;
4665             SSize_t fake_last_close = 0;
4666             I32 f = SCF_IN_DEFINE;
4667 
4668             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4669             scan = regnext(scan);
4670             assert( OP(scan) == IFTHEN );
4671             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4672 
4673             data_fake.last_closep= &fake_last_close;
4674             minlen = *minlenp;
4675             next = regnext(scan);
4676             scan = NEXTOPER(NEXTOPER(scan));
4677             DEBUG_PEEP("scan", scan, depth, flags);
4678             DEBUG_PEEP("next", next, depth, flags);
4679 
4680             /* we suppose the run is continuous, last=next...
4681              * NOTE we dont use the return here! */
4682             /* DEFINEP study_chunk() recursion */
4683             (void)study_chunk(pRExC_state, &scan, &minlen,
4684                               &deltanext, next, &data_fake, stopparen,
4685                               recursed_depth, NULL, f, depth+1, mutate_ok);
4686 
4687             scan = next;
4688         } else
4689         if (
4690             OP(scan) == BRANCH  ||
4691             OP(scan) == BRANCHJ ||
4692             OP(scan) == IFTHEN
4693         ) {
4694 	    next = regnext(scan);
4695 	    code = OP(scan);
4696 
4697             /* The op(next)==code check below is to see if we
4698              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4699              * IFTHEN is special as it might not appear in pairs.
4700              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4701              * we dont handle it cleanly. */
4702 	    if (OP(next) == code || code == IFTHEN) {
4703                 /* NOTE - There is similar code to this block below for
4704                  * handling TRIE nodes on a re-study.  If you change stuff here
4705                  * check there too. */
4706 		SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4707 		regnode_ssc accum;
4708 		regnode * const startbranch=scan;
4709 
4710                 if (flags & SCF_DO_SUBSTR) {
4711                     /* Cannot merge strings after this. */
4712                     scan_commit(pRExC_state, data, minlenp, is_inf);
4713                 }
4714 
4715                 if (flags & SCF_DO_STCLASS)
4716 		    ssc_init_zero(pRExC_state, &accum);
4717 
4718 		while (OP(scan) == code) {
4719 		    SSize_t deltanext, minnext, fake;
4720 		    I32 f = 0;
4721 		    regnode_ssc this_class;
4722 
4723                     DEBUG_PEEP("Branch", scan, depth, flags);
4724 
4725 		    num++;
4726                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4727 		    if (data) {
4728 			data_fake.whilem_c = data->whilem_c;
4729 			data_fake.last_closep = data->last_closep;
4730 		    }
4731 		    else
4732 			data_fake.last_closep = &fake;
4733 
4734 		    data_fake.pos_delta = delta;
4735 		    next = regnext(scan);
4736 
4737                     scan = NEXTOPER(scan); /* everything */
4738                     if (code != BRANCH)    /* everything but BRANCH */
4739 			scan = NEXTOPER(scan);
4740 
4741 		    if (flags & SCF_DO_STCLASS) {
4742 			ssc_init(pRExC_state, &this_class);
4743 			data_fake.start_class = &this_class;
4744 			f = SCF_DO_STCLASS_AND;
4745 		    }
4746 		    if (flags & SCF_WHILEM_VISITED_POS)
4747 			f |= SCF_WHILEM_VISITED_POS;
4748 
4749 		    /* we suppose the run is continuous, last=next...*/
4750                     /* recurse study_chunk() for each BRANCH in an alternation */
4751 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
4752                                       &deltanext, next, &data_fake, stopparen,
4753                                       recursed_depth, NULL, f, depth+1,
4754                                       mutate_ok);
4755 
4756 		    if (min1 > minnext)
4757 			min1 = minnext;
4758 		    if (deltanext == OPTIMIZE_INFTY) {
4759 			is_inf = is_inf_internal = 1;
4760 			max1 = OPTIMIZE_INFTY;
4761 		    } else if (max1 < minnext + deltanext)
4762 			max1 = minnext + deltanext;
4763 		    scan = next;
4764 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4765 			pars++;
4766 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
4767 	                if ( stopmin > minnext)
4768 	                    stopmin = min + min1;
4769 	                flags &= ~SCF_DO_SUBSTR;
4770 	                if (data)
4771 	                    data->flags |= SCF_SEEN_ACCEPT;
4772 	            }
4773 		    if (data) {
4774 			if (data_fake.flags & SF_HAS_EVAL)
4775 			    data->flags |= SF_HAS_EVAL;
4776 			data->whilem_c = data_fake.whilem_c;
4777 		    }
4778 		    if (flags & SCF_DO_STCLASS)
4779 			ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4780 		}
4781 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4782 		    min1 = 0;
4783 		if (flags & SCF_DO_SUBSTR) {
4784 		    data->pos_min += min1;
4785 		    if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4786 		        data->pos_delta = OPTIMIZE_INFTY;
4787 		    else
4788 		        data->pos_delta += max1 - min1;
4789 		    if (max1 != min1 || is_inf)
4790 			data->cur_is_floating = 1;
4791 		}
4792 		min += min1;
4793 		if (delta == OPTIMIZE_INFTY
4794 		 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4795 		    delta = OPTIMIZE_INFTY;
4796 		else
4797 		    delta += max1 - min1;
4798 		if (flags & SCF_DO_STCLASS_OR) {
4799 		    ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4800 		    if (min1) {
4801 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4802 			flags &= ~SCF_DO_STCLASS;
4803 		    }
4804 		}
4805 		else if (flags & SCF_DO_STCLASS_AND) {
4806 		    if (min1) {
4807 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4808 			flags &= ~SCF_DO_STCLASS;
4809 		    }
4810 		    else {
4811 			/* Switch to OR mode: cache the old value of
4812 			 * data->start_class */
4813 			INIT_AND_WITHP;
4814 			StructCopy(data->start_class, and_withp, regnode_ssc);
4815 			flags &= ~SCF_DO_STCLASS_AND;
4816 			StructCopy(&accum, data->start_class, regnode_ssc);
4817 			flags |= SCF_DO_STCLASS_OR;
4818 		    }
4819 		}
4820 
4821                 if (PERL_ENABLE_TRIE_OPTIMISATION
4822                     && OP(startbranch) == BRANCH
4823                     && mutate_ok
4824                 ) {
4825 		/* demq.
4826 
4827                    Assuming this was/is a branch we are dealing with: 'scan'
4828                    now points at the item that follows the branch sequence,
4829                    whatever it is. We now start at the beginning of the
4830                    sequence and look for subsequences of
4831 
4832 		   BRANCH->EXACT=>x1
4833 		   BRANCH->EXACT=>x2
4834 		   tail
4835 
4836                    which would be constructed from a pattern like
4837                    /A|LIST|OF|WORDS/
4838 
4839 		   If we can find such a subsequence we need to turn the first
4840 		   element into a trie and then add the subsequent branch exact
4841 		   strings to the trie.
4842 
4843 		   We have two cases
4844 
4845                      1. patterns where the whole set of branches can be
4846                         converted.
4847 
4848 		     2. patterns where only a subset can be converted.
4849 
4850 		   In case 1 we can replace the whole set with a single regop
4851 		   for the trie. In case 2 we need to keep the start and end
4852 		   branches so
4853 
4854 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4855 		     becomes BRANCH TRIE; BRANCH X;
4856 
4857 		  There is an additional case, that being where there is a
4858 		  common prefix, which gets split out into an EXACT like node
4859 		  preceding the TRIE node.
4860 
4861 		  If x(1..n)==tail then we can do a simple trie, if not we make
4862 		  a "jump" trie, such that when we match the appropriate word
4863 		  we "jump" to the appropriate tail node. Essentially we turn
4864 		  a nested if into a case structure of sorts.
4865 
4866 		*/
4867 
4868 		    int made=0;
4869 		    if (!re_trie_maxbuff) {
4870 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4871 			if (!SvIOK(re_trie_maxbuff))
4872 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4873 		    }
4874                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4875                         regnode *cur;
4876                         regnode *first = (regnode *)NULL;
4877                         regnode *prev = (regnode *)NULL;
4878                         regnode *tail = scan;
4879                         U8 trietype = 0;
4880                         U32 count=0;
4881 
4882                         /* var tail is used because there may be a TAIL
4883                            regop in the way. Ie, the exacts will point to the
4884                            thing following the TAIL, but the last branch will
4885                            point at the TAIL. So we advance tail. If we
4886                            have nested (?:) we may have to move through several
4887                            tails.
4888                          */
4889 
4890                         while ( OP( tail ) == TAIL ) {
4891                             /* this is the TAIL generated by (?:) */
4892                             tail = regnext( tail );
4893                         }
4894 
4895 
4896                         DEBUG_TRIE_COMPILE_r({
4897                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4898                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4899                               depth+1,
4900                               "Looking for TRIE'able sequences. Tail node is ",
4901                               (UV) REGNODE_OFFSET(tail),
4902                               SvPV_nolen_const( RExC_mysv )
4903                             );
4904                         });
4905 
4906                         /*
4907 
4908                             Step through the branches
4909                                 cur represents each branch,
4910                                 noper is the first thing to be matched as part
4911                                       of that branch
4912                                 noper_next is the regnext() of that node.
4913 
4914                             We normally handle a case like this
4915                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4916                             support building with NOJUMPTRIE, which restricts
4917                             the trie logic to structures like /FOO|BAR/.
4918 
4919                             If noper is a trieable nodetype then the branch is
4920                             a possible optimization target. If we are building
4921                             under NOJUMPTRIE then we require that noper_next is
4922                             the same as scan (our current position in the regex
4923                             program).
4924 
4925                             Once we have two or more consecutive such branches
4926                             we can create a trie of the EXACT's contents and
4927                             stitch it in place into the program.
4928 
4929                             If the sequence represents all of the branches in
4930                             the alternation we replace the entire thing with a
4931                             single TRIE node.
4932 
4933                             Otherwise when it is a subsequence we need to
4934                             stitch it in place and replace only the relevant
4935                             branches. This means the first branch has to remain
4936                             as it is used by the alternation logic, and its
4937                             next pointer, and needs to be repointed at the item
4938                             on the branch chain following the last branch we
4939                             have optimized away.
4940 
4941                             This could be either a BRANCH, in which case the
4942                             subsequence is internal, or it could be the item
4943                             following the branch sequence in which case the
4944                             subsequence is at the end (which does not
4945                             necessarily mean the first node is the start of the
4946                             alternation).
4947 
4948                             TRIE_TYPE(X) is a define which maps the optype to a
4949                             trietype.
4950 
4951                                 optype          |  trietype
4952                                 ----------------+-----------
4953                                 NOTHING         | NOTHING
4954                                 EXACT           | EXACT
4955                                 EXACT_REQ8     | EXACT
4956                                 EXACTFU         | EXACTFU
4957                                 EXACTFU_REQ8   | EXACTFU
4958                                 EXACTFUP        | EXACTFU
4959                                 EXACTFAA        | EXACTFAA
4960                                 EXACTL          | EXACTL
4961                                 EXACTFLU8       | EXACTFLU8
4962 
4963 
4964                         */
4965 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4966                        ? NOTHING                                            \
4967                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4968                          ? EXACT                                            \
4969                          : (     EXACTFU == (X)                             \
4970                               || EXACTFU_REQ8 == (X)                       \
4971                               || EXACTFUP == (X) )                          \
4972                            ? EXACTFU                                        \
4973                            : ( EXACTFAA == (X) )                            \
4974                              ? EXACTFAA                                     \
4975                              : ( EXACTL == (X) )                            \
4976                                ? EXACTL                                     \
4977                                : ( EXACTFLU8 == (X) )                       \
4978                                  ? EXACTFLU8                                \
4979                                  : 0 )
4980 
4981                         /* dont use tail as the end marker for this traverse */
4982                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4983                             regnode * const noper = NEXTOPER( cur );
4984                             U8 noper_type = OP( noper );
4985                             U8 noper_trietype = TRIE_TYPE( noper_type );
4986 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4987                             regnode * const noper_next = regnext( noper );
4988                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4989                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4990 #endif
4991 
4992                             DEBUG_TRIE_COMPILE_r({
4993                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4994                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4995                                    depth+1,
4996                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4997 
4998                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4999                                 Perl_re_printf( aTHX_  " -> %d:%s",
5000                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5001 
5002                                 if ( noper_next ) {
5003                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5004                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5005                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5006                                 }
5007                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5008                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5009 				   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5010 				);
5011                             });
5012 
5013                             /* Is noper a trieable nodetype that can be merged
5014                              * with the current trie (if there is one)? */
5015                             if ( noper_trietype
5016                                   &&
5017                                   (
5018                                         ( noper_trietype == NOTHING )
5019                                         || ( trietype == NOTHING )
5020                                         || ( trietype == noper_trietype )
5021                                   )
5022 #ifdef NOJUMPTRIE
5023                                   && noper_next >= tail
5024 #endif
5025                                   && count < U16_MAX)
5026                             {
5027                                 /* Handle mergable triable node Either we are
5028                                  * the first node in a new trieable sequence,
5029                                  * in which case we do some bookkeeping,
5030                                  * otherwise we update the end pointer. */
5031                                 if ( !first ) {
5032                                     first = cur;
5033 				    if ( noper_trietype == NOTHING ) {
5034 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5035 					regnode * const noper_next = regnext( noper );
5036                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5037 					U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5038 #endif
5039 
5040                                         if ( noper_next_trietype ) {
5041 					    trietype = noper_next_trietype;
5042                                         } else if (noper_next_type)  {
5043                                             /* a NOTHING regop is 1 regop wide.
5044                                              * We need at least two for a trie
5045                                              * so we can't merge this in */
5046                                             first = NULL;
5047                                         }
5048                                     } else {
5049                                         trietype = noper_trietype;
5050                                     }
5051                                 } else {
5052                                     if ( trietype == NOTHING )
5053                                         trietype = noper_trietype;
5054                                     prev = cur;
5055                                 }
5056 				if (first)
5057 				    count++;
5058                             } /* end handle mergable triable node */
5059                             else {
5060                                 /* handle unmergable node -
5061                                  * noper may either be a triable node which can
5062                                  * not be tried together with the current trie,
5063                                  * or a non triable node */
5064                                 if ( prev ) {
5065                                     /* If last is set and trietype is not
5066                                      * NOTHING then we have found at least two
5067                                      * triable branch sequences in a row of a
5068                                      * similar trietype so we can turn them
5069                                      * into a trie. If/when we allow NOTHING to
5070                                      * start a trie sequence this condition
5071                                      * will be required, and it isn't expensive
5072                                      * so we leave it in for now. */
5073                                     if ( trietype && trietype != NOTHING )
5074                                         make_trie( pRExC_state,
5075                                                 startbranch, first, cur, tail,
5076                                                 count, trietype, depth+1 );
5077                                     prev = NULL; /* note: we clear/update
5078                                                     first, trietype etc below,
5079                                                     so we dont do it here */
5080                                 }
5081                                 if ( noper_trietype
5082 #ifdef NOJUMPTRIE
5083                                      && noper_next >= tail
5084 #endif
5085                                 ){
5086                                     /* noper is triable, so we can start a new
5087                                      * trie sequence */
5088                                     count = 1;
5089                                     first = cur;
5090                                     trietype = noper_trietype;
5091                                 } else if (first) {
5092                                     /* if we already saw a first but the
5093                                      * current node is not triable then we have
5094                                      * to reset the first information. */
5095                                     count = 0;
5096                                     first = NULL;
5097                                     trietype = 0;
5098                                 }
5099                             } /* end handle unmergable node */
5100                         } /* loop over branches */
5101                         DEBUG_TRIE_COMPILE_r({
5102                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5103                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5104                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5105                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5106                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5107                                PL_reg_name[trietype]
5108                             );
5109 
5110                         });
5111                         if ( prev && trietype ) {
5112                             if ( trietype != NOTHING ) {
5113                                 /* the last branch of the sequence was part of
5114                                  * a trie, so we have to construct it here
5115                                  * outside of the loop */
5116                                 made= make_trie( pRExC_state, startbranch,
5117                                                  first, scan, tail, count,
5118                                                  trietype, depth+1 );
5119 #ifdef TRIE_STUDY_OPT
5120                                 if ( ((made == MADE_EXACT_TRIE &&
5121                                      startbranch == first)
5122                                      || ( first_non_open == first )) &&
5123                                      depth==0 ) {
5124                                     flags |= SCF_TRIE_RESTUDY;
5125                                     if ( startbranch == first
5126                                          && scan >= tail )
5127                                     {
5128                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5129                                     }
5130                                 }
5131 #endif
5132                             } else {
5133                                 /* at this point we know whatever we have is a
5134                                  * NOTHING sequence/branch AND if 'startbranch'
5135                                  * is 'first' then we can turn the whole thing
5136                                  * into a NOTHING
5137                                  */
5138                                 if ( startbranch == first ) {
5139                                     regnode *opt;
5140                                     /* the entire thing is a NOTHING sequence,
5141                                      * something like this: (?:|) So we can
5142                                      * turn it into a plain NOTHING op. */
5143                                     DEBUG_TRIE_COMPILE_r({
5144                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5145                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5146                                           depth+1,
5147                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5148 
5149                                     });
5150                                     OP(startbranch)= NOTHING;
5151                                     NEXT_OFF(startbranch)= tail - startbranch;
5152                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5153                                         OP(opt)= OPTIMIZED;
5154                                 }
5155                             }
5156                         } /* end if ( prev) */
5157                     } /* TRIE_MAXBUF is non zero */
5158                 } /* do trie */
5159 
5160 	    }
5161 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5162 		scan = NEXTOPER(NEXTOPER(scan));
5163 	    } else			/* single branch is optimized. */
5164 		scan = NEXTOPER(scan);
5165 	    continue;
5166         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5167             I32 paren = 0;
5168             regnode *start = NULL;
5169             regnode *end = NULL;
5170             U32 my_recursed_depth= recursed_depth;
5171 
5172             if (OP(scan) != SUSPEND) { /* GOSUB */
5173                 /* Do setup, note this code has side effects beyond
5174                  * the rest of this block. Specifically setting
5175                  * RExC_recurse[] must happen at least once during
5176                  * study_chunk(). */
5177                 paren = ARG(scan);
5178                 RExC_recurse[ARG2L(scan)] = scan;
5179                 start = REGNODE_p(RExC_open_parens[paren]);
5180                 end   = REGNODE_p(RExC_close_parens[paren]);
5181 
5182                 /* NOTE we MUST always execute the above code, even
5183                  * if we do nothing with a GOSUB */
5184                 if (
5185                     ( flags & SCF_IN_DEFINE )
5186                     ||
5187                     (
5188                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5189                         &&
5190                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5191                     )
5192                 ) {
5193                     /* no need to do anything here if we are in a define. */
5194                     /* or we are after some kind of infinite construct
5195                      * so we can skip recursing into this item.
5196                      * Since it is infinite we will not change the maxlen
5197                      * or delta, and if we miss something that might raise
5198                      * the minlen it will merely pessimise a little.
5199                      *
5200                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5201                      * might result in a minlen of 1 and not of 4,
5202                      * but this doesn't make us mismatch, just try a bit
5203                      * harder than we should.
5204                      *
5205                      * However we must assume this GOSUB is infinite, to
5206                      * avoid wrongly applying other optimizations in the
5207                      * enclosing scope - see GH 18096, for example.
5208                      */
5209                     is_inf = is_inf_internal = 1;
5210                     scan= regnext(scan);
5211                     continue;
5212                 }
5213 
5214                 if (
5215                     !recursed_depth
5216                     || !PAREN_TEST(recursed_depth - 1, paren)
5217                 ) {
5218                     /* it is quite possible that there are more efficient ways
5219                      * to do this. We maintain a bitmap per level of recursion
5220                      * of which patterns we have entered so we can detect if a
5221                      * pattern creates a possible infinite loop. When we
5222                      * recurse down a level we copy the previous levels bitmap
5223                      * down. When we are at recursion level 0 we zero the top
5224                      * level bitmap. It would be nice to implement a different
5225                      * more efficient way of doing this. In particular the top
5226                      * level bitmap may be unnecessary.
5227                      */
5228                     if (!recursed_depth) {
5229                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5230                     } else {
5231                         Copy(PAREN_OFFSET(recursed_depth - 1),
5232                              PAREN_OFFSET(recursed_depth),
5233                              RExC_study_chunk_recursed_bytes, U8);
5234                     }
5235                     /* we havent recursed into this paren yet, so recurse into it */
5236                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5237                     PAREN_SET(recursed_depth, paren);
5238                     my_recursed_depth= recursed_depth + 1;
5239                 } else {
5240                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5241                     /* some form of infinite recursion, assume infinite length
5242                      * */
5243                     if (flags & SCF_DO_SUBSTR) {
5244                         scan_commit(pRExC_state, data, minlenp, is_inf);
5245                         data->cur_is_floating = 1;
5246                     }
5247                     is_inf = is_inf_internal = 1;
5248                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5249                         ssc_anything(data->start_class);
5250                     flags &= ~SCF_DO_STCLASS;
5251 
5252                     start= NULL; /* reset start so we dont recurse later on. */
5253 	        }
5254             } else {
5255 	        paren = stopparen;
5256                 start = scan + 2;
5257 	        end = regnext(scan);
5258 	    }
5259             if (start) {
5260                 scan_frame *newframe;
5261                 assert(end);
5262                 if (!RExC_frame_last) {
5263                     Newxz(newframe, 1, scan_frame);
5264                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5265                     RExC_frame_head= newframe;
5266                     RExC_frame_count++;
5267                 } else if (!RExC_frame_last->next_frame) {
5268                     Newxz(newframe, 1, scan_frame);
5269                     RExC_frame_last->next_frame= newframe;
5270                     newframe->prev_frame= RExC_frame_last;
5271                     RExC_frame_count++;
5272                 } else {
5273                     newframe= RExC_frame_last->next_frame;
5274                 }
5275                 RExC_frame_last= newframe;
5276 
5277                 newframe->next_regnode = regnext(scan);
5278                 newframe->last_regnode = last;
5279                 newframe->stopparen = stopparen;
5280                 newframe->prev_recursed_depth = recursed_depth;
5281                 newframe->this_prev_frame= frame;
5282                 newframe->in_gosub = (
5283                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5284                 );
5285 
5286                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5287                 DEBUG_PEEP("fnew", scan, depth, flags);
5288 
5289 	        frame = newframe;
5290 	        scan =  start;
5291 	        stopparen = paren;
5292 	        last = end;
5293                 depth = depth + 1;
5294                 recursed_depth= my_recursed_depth;
5295 
5296 	        continue;
5297 	    }
5298 	}
5299 	else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) {
5300 	    SSize_t bytelen = STR_LEN(scan), charlen;
5301 	    UV uc;
5302             assert(bytelen);
5303 	    if (UTF) {
5304 		const U8 * const s = (U8*)STRING(scan);
5305 		uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5306 		charlen = utf8_length(s, s + bytelen);
5307 	    } else {
5308 		uc = *((U8*)STRING(scan));
5309                 charlen = bytelen;
5310 	    }
5311 	    min += charlen;
5312 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5313 		/* The code below prefers earlier match for fixed
5314 		   offset, later match for variable offset.  */
5315 		if (data->last_end == -1) { /* Update the start info. */
5316 		    data->last_start_min = data->pos_min;
5317                     data->last_start_max =
5318                         is_inf ? OPTIMIZE_INFTY
5319                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5320                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5321 		}
5322 		sv_catpvn(data->last_found, STRING(scan), bytelen);
5323 		if (UTF)
5324 		    SvUTF8_on(data->last_found);
5325 		{
5326 		    SV * const sv = data->last_found;
5327 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5328 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
5329 		    if (mg && mg->mg_len >= 0)
5330 			mg->mg_len += charlen;
5331 		}
5332 		data->last_end = data->pos_min + charlen;
5333 		data->pos_min += charlen; /* As in the first entry. */
5334 		data->flags &= ~SF_BEFORE_EOL;
5335 	    }
5336 
5337             /* ANDing the code point leaves at most it, and not in locale, and
5338              * can't match null string */
5339 	    if (flags & SCF_DO_STCLASS_AND) {
5340                 ssc_cp_and(data->start_class, uc);
5341                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5342                 ssc_clear_locale(data->start_class);
5343 	    }
5344 	    else if (flags & SCF_DO_STCLASS_OR) {
5345                 ssc_add_cp(data->start_class, uc);
5346 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5347 
5348                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5349                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5350 	    }
5351 	    flags &= ~SCF_DO_STCLASS;
5352 	}
5353         else if (PL_regkind[OP(scan)] == EXACT) {
5354             /* But OP != EXACT!, so is EXACTFish */
5355 	    SSize_t bytelen = STR_LEN(scan), charlen;
5356             const U8 * s = (U8*)STRING(scan);
5357 
5358             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5359              * with the mask set to the complement of the bit that differs
5360              * between upper and lower case, and the lowest code point of the
5361              * pair (which the '&' forces) */
5362             if (     bytelen == 1
5363                 &&   isALPHA_A(*s)
5364                 &&  (         OP(scan) == EXACTFAA
5365                      || (     OP(scan) == EXACTFU
5366                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5367                 &&   mutate_ok
5368             ) {
5369                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5370 
5371                 OP(scan) = ANYOFM;
5372                 ARG_SET(scan, *s & mask);
5373                 FLAGS(scan) = mask;
5374                 /* we're not EXACTFish any more, so restudy */
5375                 continue;
5376             }
5377 
5378 	    /* Search for fixed substrings supports EXACT only. */
5379 	    if (flags & SCF_DO_SUBSTR) {
5380 		assert(data);
5381                 scan_commit(pRExC_state, data, minlenp, is_inf);
5382 	    }
5383             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5384 	    if (unfolded_multi_char) {
5385                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5386 	    }
5387 	    min += charlen - min_subtract;
5388             assert (min >= 0);
5389             delta += min_subtract;
5390 	    if (flags & SCF_DO_SUBSTR) {
5391 		data->pos_min += charlen - min_subtract;
5392 		if (data->pos_min < 0) {
5393                     data->pos_min = 0;
5394                 }
5395                 data->pos_delta += min_subtract;
5396 		if (min_subtract) {
5397 		    data->cur_is_floating = 1; /* float */
5398 		}
5399 	    }
5400 
5401             if (flags & SCF_DO_STCLASS) {
5402                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5403 
5404                 assert(EXACTF_invlist);
5405                 if (flags & SCF_DO_STCLASS_AND) {
5406                     if (OP(scan) != EXACTFL)
5407                         ssc_clear_locale(data->start_class);
5408                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5409                     ANYOF_POSIXL_ZERO(data->start_class);
5410                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5411                 }
5412                 else {  /* SCF_DO_STCLASS_OR */
5413                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5414                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5415 
5416                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5417                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5418                 }
5419                 flags &= ~SCF_DO_STCLASS;
5420                 SvREFCNT_dec(EXACTF_invlist);
5421             }
5422 	}
5423 	else if (REGNODE_VARIES(OP(scan))) {
5424 	    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5425 	    I32 fl = 0, f = flags;
5426 	    regnode * const oscan = scan;
5427 	    regnode_ssc this_class;
5428 	    regnode_ssc *oclass = NULL;
5429 	    I32 next_is_eval = 0;
5430 
5431 	    switch (PL_regkind[OP(scan)]) {
5432 	    case WHILEM:		/* End of (?:...)* . */
5433 		scan = NEXTOPER(scan);
5434 		goto finish;
5435 	    case PLUS:
5436 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5437 		    next = NEXTOPER(scan);
5438 		    if (   (     PL_regkind[OP(next)] == EXACT
5439                             && ! isEXACTFish(OP(next)))
5440                         || (flags & SCF_DO_STCLASS))
5441                     {
5442 			mincount = 1;
5443 			maxcount = REG_INFTY;
5444 			next = regnext(scan);
5445 			scan = NEXTOPER(scan);
5446 			goto do_curly;
5447 		    }
5448 		}
5449 		if (flags & SCF_DO_SUBSTR)
5450 		    data->pos_min++;
5451                 /* This will bypass the formal 'min += minnext * mincount'
5452                  * calculation in the do_curly path, so assumes min width
5453                  * of the PLUS payload is exactly one. */
5454 		min++;
5455 		/* FALLTHROUGH */
5456 	    case STAR:
5457                 next = NEXTOPER(scan);
5458 
5459                 /* This temporary node can now be turned into EXACTFU, and
5460                  * must, as regexec.c doesn't handle it */
5461                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5462                     OP(next) = EXACTFU;
5463                 }
5464 
5465                 if (     STR_LEN(next) == 1
5466                     &&   isALPHA_A(* STRING(next))
5467                     && (         OP(next) == EXACTFAA
5468                         || (     OP(next) == EXACTFU
5469                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5470                     &&   mutate_ok
5471                 ) {
5472                     /* These differ in just one bit */
5473                     U8 mask = ~ ('A' ^ 'a');
5474 
5475                     assert(isALPHA_A(* STRING(next)));
5476 
5477                     /* Then replace it by an ANYOFM node, with
5478                     * the mask set to the complement of the
5479                     * bit that differs between upper and lower
5480                     * case, and the lowest code point of the
5481                     * pair (which the '&' forces) */
5482                     OP(next) = ANYOFM;
5483                     ARG_SET(next, *STRING(next) & mask);
5484                     FLAGS(next) = mask;
5485                 }
5486 
5487 		if (flags & SCF_DO_STCLASS) {
5488 		    mincount = 0;
5489 		    maxcount = REG_INFTY;
5490 		    next = regnext(scan);
5491 		    scan = NEXTOPER(scan);
5492 		    goto do_curly;
5493 		}
5494 		if (flags & SCF_DO_SUBSTR) {
5495                     scan_commit(pRExC_state, data, minlenp, is_inf);
5496                     /* Cannot extend fixed substrings */
5497 		    data->cur_is_floating = 1; /* float */
5498 		}
5499                 is_inf = is_inf_internal = 1;
5500                 scan = regnext(scan);
5501 		goto optimize_curly_tail;
5502 	    case CURLY:
5503 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5504 	            && (scan->flags == stopparen))
5505 		{
5506 		    mincount = 1;
5507 		    maxcount = 1;
5508 		} else {
5509 		    mincount = ARG1(scan);
5510 		    maxcount = ARG2(scan);
5511 		}
5512 		next = regnext(scan);
5513 		if (OP(scan) == CURLYX) {
5514 		    I32 lp = (data ? *(data->last_closep) : 0);
5515 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5516 		}
5517 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5518 		next_is_eval = (OP(scan) == EVAL);
5519 	      do_curly:
5520 		if (flags & SCF_DO_SUBSTR) {
5521                     if (mincount == 0)
5522                         scan_commit(pRExC_state, data, minlenp, is_inf);
5523                     /* Cannot extend fixed substrings */
5524 		    pos_before = data->pos_min;
5525 		}
5526 		if (data) {
5527 		    fl = data->flags;
5528 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5529 		    if (is_inf)
5530 			data->flags |= SF_IS_INF;
5531 		}
5532 		if (flags & SCF_DO_STCLASS) {
5533 		    ssc_init(pRExC_state, &this_class);
5534 		    oclass = data->start_class;
5535 		    data->start_class = &this_class;
5536 		    f |= SCF_DO_STCLASS_AND;
5537 		    f &= ~SCF_DO_STCLASS_OR;
5538 		}
5539 	        /* Exclude from super-linear cache processing any {n,m}
5540 		   regops for which the combination of input pos and regex
5541 		   pos is not enough information to determine if a match
5542 		   will be possible.
5543 
5544 		   For example, in the regex /foo(bar\s*){4,8}baz/ with the
5545 		   regex pos at the \s*, the prospects for a match depend not
5546 		   only on the input position but also on how many (bar\s*)
5547 		   repeats into the {4,8} we are. */
5548                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5549 		    f &= ~SCF_WHILEM_VISITED_POS;
5550 
5551 		/* This will finish on WHILEM, setting scan, or on NULL: */
5552                 /* recurse study_chunk() on loop bodies */
5553 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5554                                   last, data, stopparen, recursed_depth, NULL,
5555                                   (mincount == 0
5556                                    ? (f & ~SCF_DO_SUBSTR)
5557                                    : f)
5558                                   , depth+1, mutate_ok);
5559 
5560 		if (flags & SCF_DO_STCLASS)
5561 		    data->start_class = oclass;
5562 		if (mincount == 0 || minnext == 0) {
5563 		    if (flags & SCF_DO_STCLASS_OR) {
5564 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5565 		    }
5566 		    else if (flags & SCF_DO_STCLASS_AND) {
5567 			/* Switch to OR mode: cache the old value of
5568 			 * data->start_class */
5569 			INIT_AND_WITHP;
5570 			StructCopy(data->start_class, and_withp, regnode_ssc);
5571 			flags &= ~SCF_DO_STCLASS_AND;
5572 			StructCopy(&this_class, data->start_class, regnode_ssc);
5573 			flags |= SCF_DO_STCLASS_OR;
5574                         ANYOF_FLAGS(data->start_class)
5575                                                 |= SSC_MATCHES_EMPTY_STRING;
5576 		    }
5577 		} else {		/* Non-zero len */
5578 		    if (flags & SCF_DO_STCLASS_OR) {
5579 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5580 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5581 		    }
5582 		    else if (flags & SCF_DO_STCLASS_AND)
5583 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5584 		    flags &= ~SCF_DO_STCLASS;
5585 		}
5586 		if (!scan) 		/* It was not CURLYX, but CURLY. */
5587 		    scan = next;
5588 		if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5589 		    /* ? quantifier ok, except for (?{ ... }) */
5590 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
5591 		    && (minnext == 0) && (deltanext == 0)
5592 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5593                     && maxcount <= REG_INFTY/3) /* Complement check for big
5594                                                    count */
5595 		{
5596 		    _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5597                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5598                             "Quantifier unexpected on zero-length expression "
5599                             "in regex m/%" UTF8f "/",
5600 			     UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5601 				  RExC_precomp)));
5602                 }
5603 
5604                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5605                     || min >= SSize_t_MAX - minnext * mincount )
5606                 {
5607                     FAIL("Regexp out of space");
5608                 }
5609 
5610 		min += minnext * mincount;
5611 		is_inf_internal |= deltanext == OPTIMIZE_INFTY
5612                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5613 		is_inf |= is_inf_internal;
5614                 if (is_inf) {
5615 		    delta = OPTIMIZE_INFTY;
5616                 } else {
5617 		    delta += (minnext + deltanext) * maxcount
5618                              - minnext * mincount;
5619                 }
5620 		/* Try powerful optimization CURLYX => CURLYN. */
5621 		if (  OP(oscan) == CURLYX && data
5622 		      && data->flags & SF_IN_PAR
5623 		      && !(data->flags & SF_HAS_EVAL)
5624 		      && !deltanext && minnext == 1
5625                       && mutate_ok
5626                 ) {
5627 		    /* Try to optimize to CURLYN.  */
5628 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5629 		    regnode * const nxt1 = nxt;
5630 #ifdef DEBUGGING
5631 		    regnode *nxt2;
5632 #endif
5633 
5634 		    /* Skip open. */
5635 		    nxt = regnext(nxt);
5636 		    if (!REGNODE_SIMPLE(OP(nxt))
5637 			&& !(PL_regkind[OP(nxt)] == EXACT
5638 			     && STR_LEN(nxt) == 1))
5639 			goto nogo;
5640 #ifdef DEBUGGING
5641 		    nxt2 = nxt;
5642 #endif
5643 		    nxt = regnext(nxt);
5644 		    if (OP(nxt) != CLOSE)
5645 			goto nogo;
5646 		    if (RExC_open_parens) {
5647 
5648                         /*open->CURLYM*/
5649                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5650 
5651                         /*close->while*/
5652                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5653 		    }
5654 		    /* Now we know that nxt2 is the only contents: */
5655 		    oscan->flags = (U8)ARG(nxt);
5656 		    OP(oscan) = CURLYN;
5657 		    OP(nxt1) = NOTHING;	/* was OPEN. */
5658 
5659 #ifdef DEBUGGING
5660 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5661 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5662 		    NEXT_OFF(nxt2) = 0;	/* just for consistency with CURLY. */
5663 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5664 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
5665 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5666 #endif
5667 		}
5668 	      nogo:
5669 
5670 		/* Try optimization CURLYX => CURLYM. */
5671 		if (  OP(oscan) == CURLYX && data
5672 		      && !(data->flags & SF_HAS_PAR)
5673 		      && !(data->flags & SF_HAS_EVAL)
5674 		      && !deltanext	/* atom is fixed width */
5675 		      && minnext != 0	/* CURLYM can't handle zero width */
5676                          /* Nor characters whose fold at run-time may be
5677                           * multi-character */
5678                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5679                       && mutate_ok
5680 		) {
5681 		    /* XXXX How to optimize if data == 0? */
5682 		    /* Optimize to a simpler form.  */
5683 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5684 		    regnode *nxt2;
5685 
5686 		    OP(oscan) = CURLYM;
5687 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5688 			    && (OP(nxt2) != WHILEM))
5689 			nxt = nxt2;
5690 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5691 		    /* Need to optimize away parenths. */
5692 		    if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5693 			/* Set the parenth number.  */
5694 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5695 
5696 			oscan->flags = (U8)ARG(nxt);
5697 			if (RExC_open_parens) {
5698                              /*open->CURLYM*/
5699                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5700 
5701                             /*close->NOTHING*/
5702                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5703                                                          + 1;
5704 			}
5705 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
5706 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5707 
5708 #ifdef DEBUGGING
5709 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5710 			OP(nxt + 1) = OPTIMIZED; /* was count. */
5711 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5712 			NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5713 #endif
5714 #if 0
5715 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
5716 			    regnode *nnxt = regnext(nxt1);
5717 			    if (nnxt == nxt) {
5718 				if (reg_off_by_arg[OP(nxt1)])
5719 				    ARG_SET(nxt1, nxt2 - nxt1);
5720 				else if (nxt2 - nxt1 < U16_MAX)
5721 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
5722 				else
5723 				    OP(nxt) = NOTHING;	/* Cannot beautify */
5724 			    }
5725 			    nxt1 = nnxt;
5726 			}
5727 #endif
5728 			/* Optimize again: */
5729                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5730 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5731                                     NULL, stopparen, recursed_depth, NULL, 0,
5732                                     depth+1, mutate_ok);
5733 		    }
5734 		    else
5735 			oscan->flags = 0;
5736 		}
5737 		else if ((OP(oscan) == CURLYX)
5738 			 && (flags & SCF_WHILEM_VISITED_POS)
5739 			 /* See the comment on a similar expression above.
5740 			    However, this time it's not a subexpression
5741 			    we care about, but the expression itself. */
5742 			 && (maxcount == REG_INFTY)
5743 			 && data) {
5744 		    /* This stays as CURLYX, we can put the count/of pair. */
5745 		    /* Find WHILEM (as in regexec.c) */
5746 		    regnode *nxt = oscan + NEXT_OFF(oscan);
5747 
5748 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5749 			nxt += ARG(nxt);
5750                     nxt = PREVOPER(nxt);
5751                     if (nxt->flags & 0xf) {
5752                         /* we've already set whilem count on this node */
5753                     } else if (++data->whilem_c < 16) {
5754                         assert(data->whilem_c <= RExC_whilem_seen);
5755                         nxt->flags = (U8)(data->whilem_c
5756                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5757                     }
5758 		}
5759 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5760 		    pars++;
5761 		if (flags & SCF_DO_SUBSTR) {
5762 		    SV *last_str = NULL;
5763                     STRLEN last_chrs = 0;
5764 		    int counted = mincount != 0;
5765 
5766                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5767                                                                   string. */
5768 			SSize_t b = pos_before >= data->last_start_min
5769 			    ? pos_before : data->last_start_min;
5770 			STRLEN l;
5771 			const char * const s = SvPV_const(data->last_found, l);
5772 			SSize_t old = b - data->last_start_min;
5773                         assert(old >= 0);
5774 
5775 			if (UTF)
5776 			    old = utf8_hop_forward((U8*)s, old,
5777                                                (U8 *) SvEND(data->last_found))
5778                                 - (U8*)s;
5779 			l -= old;
5780 			/* Get the added string: */
5781 			last_str = newSVpvn_utf8(s  + old, l, UTF);
5782                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5783                                             (U8*)(s + old + l)) : l;
5784 			if (deltanext == 0 && pos_before == b) {
5785 			    /* What was added is a constant string */
5786 			    if (mincount > 1) {
5787 
5788 				SvGROW(last_str, (mincount * l) + 1);
5789 				repeatcpy(SvPVX(last_str) + l,
5790 					  SvPVX_const(last_str), l,
5791                                           mincount - 1);
5792 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
5793 				/* Add additional parts. */
5794 				SvCUR_set(data->last_found,
5795 					  SvCUR(data->last_found) - l);
5796 				sv_catsv(data->last_found, last_str);
5797 				{
5798 				    SV * sv = data->last_found;
5799 				    MAGIC *mg =
5800 					SvUTF8(sv) && SvMAGICAL(sv) ?
5801 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
5802 				    if (mg && mg->mg_len >= 0)
5803 					mg->mg_len += last_chrs * (mincount-1);
5804 				}
5805                                 last_chrs *= mincount;
5806 				data->last_end += l * (mincount - 1);
5807 			    }
5808 			} else {
5809 			    /* start offset must point into the last copy */
5810 			    data->last_start_min += minnext * (mincount - 1);
5811 			    data->last_start_max =
5812                               is_inf
5813                                ? OPTIMIZE_INFTY
5814 			       : data->last_start_max +
5815                                  (maxcount - 1) * (minnext + data->pos_delta);
5816 			}
5817 		    }
5818 		    /* It is counted once already... */
5819 		    data->pos_min += minnext * (mincount - counted);
5820 #if 0
5821 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5822                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5823                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5824     (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5825     (UV)mincount);
5826 if (deltanext != OPTIMIZE_INFTY)
5827 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5828     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5829           - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5830 #endif
5831 		    if (deltanext == OPTIMIZE_INFTY
5832                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5833 		        data->pos_delta = OPTIMIZE_INFTY;
5834 		    else
5835 		        data->pos_delta += - counted * deltanext +
5836 			(minnext + deltanext) * maxcount - minnext * mincount;
5837 		    if (mincount != maxcount) {
5838 			 /* Cannot extend fixed substrings found inside
5839 			    the group.  */
5840                         scan_commit(pRExC_state, data, minlenp, is_inf);
5841 			if (mincount && last_str) {
5842 			    SV * const sv = data->last_found;
5843 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5844 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
5845 
5846 			    if (mg)
5847 				mg->mg_len = -1;
5848 			    sv_setsv(sv, last_str);
5849 			    data->last_end = data->pos_min;
5850 			    data->last_start_min = data->pos_min - last_chrs;
5851 			    data->last_start_max = is_inf
5852 				? OPTIMIZE_INFTY
5853 				: data->pos_min + data->pos_delta - last_chrs;
5854 			}
5855 			data->cur_is_floating = 1; /* float */
5856 		    }
5857 		    SvREFCNT_dec(last_str);
5858 		}
5859 		if (data && (fl & SF_HAS_EVAL))
5860 		    data->flags |= SF_HAS_EVAL;
5861 	      optimize_curly_tail:
5862 		rck_elide_nothing(oscan);
5863 		continue;
5864 
5865 	    default:
5866                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5867                                                                     OP(scan));
5868             case REF:
5869             case CLUMP:
5870 		if (flags & SCF_DO_SUBSTR) {
5871                     /* Cannot expect anything... */
5872                     scan_commit(pRExC_state, data, minlenp, is_inf);
5873 		    data->cur_is_floating = 1; /* float */
5874 		}
5875 		is_inf = is_inf_internal = 1;
5876 		if (flags & SCF_DO_STCLASS_OR) {
5877                     if (OP(scan) == CLUMP) {
5878                         /* Actually is any start char, but very few code points
5879                          * aren't start characters */
5880                         ssc_match_all_cp(data->start_class);
5881                     }
5882                     else {
5883                         ssc_anything(data->start_class);
5884                     }
5885                 }
5886 		flags &= ~SCF_DO_STCLASS;
5887 		break;
5888 	    }
5889 	}
5890 	else if (OP(scan) == LNBREAK) {
5891 	    if (flags & SCF_DO_STCLASS) {
5892                 if (flags & SCF_DO_STCLASS_AND) {
5893                     ssc_intersection(data->start_class,
5894                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5895                     ssc_clear_locale(data->start_class);
5896                     ANYOF_FLAGS(data->start_class)
5897                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5898                 }
5899                 else if (flags & SCF_DO_STCLASS_OR) {
5900                     ssc_union(data->start_class,
5901                               PL_XPosix_ptrs[_CC_VERTSPACE],
5902                               FALSE);
5903 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5904 
5905                     /* See commit msg for
5906                      * 749e076fceedeb708a624933726e7989f2302f6a */
5907                     ANYOF_FLAGS(data->start_class)
5908                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5909                 }
5910 		flags &= ~SCF_DO_STCLASS;
5911             }
5912 	    min++;
5913             if (delta != OPTIMIZE_INFTY)
5914                 delta++;    /* Because of the 2 char string cr-lf */
5915             if (flags & SCF_DO_SUBSTR) {
5916                 /* Cannot expect anything... */
5917                 scan_commit(pRExC_state, data, minlenp, is_inf);
5918                 data->pos_min += 1;
5919                 if (data->pos_delta != OPTIMIZE_INFTY) {
5920                     data->pos_delta += 1;
5921                 }
5922 		data->cur_is_floating = 1; /* float */
5923             }
5924 	}
5925 	else if (REGNODE_SIMPLE(OP(scan))) {
5926 
5927 	    if (flags & SCF_DO_SUBSTR) {
5928                 scan_commit(pRExC_state, data, minlenp, is_inf);
5929 		data->pos_min++;
5930 	    }
5931 	    min++;
5932 	    if (flags & SCF_DO_STCLASS) {
5933                 bool invert = 0;
5934                 SV* my_invlist = NULL;
5935                 U8 namedclass;
5936 
5937                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5938                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5939 
5940 		/* Some of the logic below assumes that switching
5941 		   locale on will only add false positives. */
5942 		switch (OP(scan)) {
5943 
5944 		default:
5945 #ifdef DEBUGGING
5946                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5947                                                                      OP(scan));
5948 #endif
5949 		case SANY:
5950 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5951 			ssc_match_all_cp(data->start_class);
5952 		    break;
5953 
5954 		case REG_ANY:
5955                     {
5956                         SV* REG_ANY_invlist = _new_invlist(2);
5957                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5958                                                             '\n');
5959                         if (flags & SCF_DO_STCLASS_OR) {
5960                             ssc_union(data->start_class,
5961                                       REG_ANY_invlist,
5962                                       TRUE /* TRUE => invert, hence all but \n
5963                                             */
5964                                       );
5965                         }
5966                         else if (flags & SCF_DO_STCLASS_AND) {
5967                             ssc_intersection(data->start_class,
5968                                              REG_ANY_invlist,
5969                                              TRUE  /* TRUE => invert */
5970                                              );
5971                             ssc_clear_locale(data->start_class);
5972                         }
5973                         SvREFCNT_dec_NN(REG_ANY_invlist);
5974 		    }
5975 		    break;
5976 
5977                 case ANYOFD:
5978                 case ANYOFL:
5979                 case ANYOFPOSIXL:
5980                 case ANYOFH:
5981                 case ANYOFHb:
5982                 case ANYOFHr:
5983                 case ANYOFHs:
5984                 case ANYOF:
5985 		    if (flags & SCF_DO_STCLASS_AND)
5986 			ssc_and(pRExC_state, data->start_class,
5987                                 (regnode_charclass *) scan);
5988 		    else
5989 			ssc_or(pRExC_state, data->start_class,
5990                                                           (regnode_charclass *) scan);
5991 		    break;
5992 
5993                 case NANYOFM: /* NANYOFM already contains the inversion of the
5994                                  input ANYOF data, so, unlike things like
5995                                  NPOSIXA, don't change 'invert' to TRUE */
5996                     /* FALLTHROUGH */
5997                 case ANYOFM:
5998                   {
5999                     SV* cp_list = get_ANYOFM_contents(scan);
6000 
6001                     if (flags & SCF_DO_STCLASS_OR) {
6002                         ssc_union(data->start_class, cp_list, invert);
6003                     }
6004                     else if (flags & SCF_DO_STCLASS_AND) {
6005                         ssc_intersection(data->start_class, cp_list, invert);
6006                     }
6007 
6008                     SvREFCNT_dec_NN(cp_list);
6009                     break;
6010                   }
6011 
6012                 case ANYOFR:
6013                 case ANYOFRb:
6014                   {
6015                     SV* cp_list = NULL;
6016 
6017                     cp_list = _add_range_to_invlist(cp_list,
6018                                         ANYOFRbase(scan),
6019                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
6020 
6021                     if (flags & SCF_DO_STCLASS_OR) {
6022                         ssc_union(data->start_class, cp_list, invert);
6023                     }
6024                     else if (flags & SCF_DO_STCLASS_AND) {
6025                         ssc_intersection(data->start_class, cp_list, invert);
6026                     }
6027 
6028                     SvREFCNT_dec_NN(cp_list);
6029                     break;
6030                   }
6031 
6032 		case NPOSIXL:
6033                     invert = 1;
6034                     /* FALLTHROUGH */
6035 
6036 		case POSIXL:
6037                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6038                     if (flags & SCF_DO_STCLASS_AND) {
6039                         bool was_there = cBOOL(
6040                                           ANYOF_POSIXL_TEST(data->start_class,
6041                                                                  namedclass));
6042                         ANYOF_POSIXL_ZERO(data->start_class);
6043                         if (was_there) {    /* Do an AND */
6044                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6045                         }
6046                         /* No individual code points can now match */
6047                         data->start_class->invlist
6048                                                 = sv_2mortal(_new_invlist(0));
6049                     }
6050                     else {
6051                         int complement = namedclass + ((invert) ? -1 : 1);
6052 
6053                         assert(flags & SCF_DO_STCLASS_OR);
6054 
6055                         /* If the complement of this class was already there,
6056                          * the result is that they match all code points,
6057                          * (\d + \D == everything).  Remove the classes from
6058                          * future consideration.  Locale is not relevant in
6059                          * this case */
6060                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6061                             ssc_match_all_cp(data->start_class);
6062                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6063                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
6064                         }
6065                         else {  /* The usual case; just add this class to the
6066                                    existing set */
6067                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6068                         }
6069                     }
6070                     break;
6071 
6072                 case NPOSIXA:   /* For these, we always know the exact set of
6073                                    what's matched */
6074                     invert = 1;
6075                     /* FALLTHROUGH */
6076 		case POSIXA:
6077                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6078                     goto join_posix_and_ascii;
6079 
6080 		case NPOSIXD:
6081 		case NPOSIXU:
6082                     invert = 1;
6083                     /* FALLTHROUGH */
6084 		case POSIXD:
6085 		case POSIXU:
6086                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6087 
6088                     /* NPOSIXD matches all upper Latin1 code points unless the
6089                      * target string being matched is UTF-8, which is
6090                      * unknowable until match time.  Since we are going to
6091                      * invert, we want to get rid of all of them so that the
6092                      * inversion will match all */
6093                     if (OP(scan) == NPOSIXD) {
6094                         _invlist_subtract(my_invlist, PL_UpperLatin1,
6095                                           &my_invlist);
6096                     }
6097 
6098                   join_posix_and_ascii:
6099 
6100                     if (flags & SCF_DO_STCLASS_AND) {
6101                         ssc_intersection(data->start_class, my_invlist, invert);
6102                         ssc_clear_locale(data->start_class);
6103                     }
6104                     else {
6105                         assert(flags & SCF_DO_STCLASS_OR);
6106                         ssc_union(data->start_class, my_invlist, invert);
6107                     }
6108                     SvREFCNT_dec(my_invlist);
6109 		}
6110 		if (flags & SCF_DO_STCLASS_OR)
6111 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6112 		flags &= ~SCF_DO_STCLASS;
6113 	    }
6114 	}
6115 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6116 	    data->flags |= (OP(scan) == MEOL
6117 			    ? SF_BEFORE_MEOL
6118 			    : SF_BEFORE_SEOL);
6119             scan_commit(pRExC_state, data, minlenp, is_inf);
6120 
6121 	}
6122 	else if (  PL_regkind[OP(scan)] == BRANCHJ
6123 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
6124 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
6125 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6126         {
6127             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6128                 || OP(scan) == UNLESSM )
6129             {
6130                 /* Negative Lookahead/lookbehind
6131                    In this case we can't do fixed string optimisation.
6132                 */
6133 
6134                 SSize_t deltanext, minnext, fake = 0;
6135                 regnode *nscan;
6136                 regnode_ssc intrnl;
6137                 int f = 0;
6138 
6139                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6140                 if (data) {
6141                     data_fake.whilem_c = data->whilem_c;
6142                     data_fake.last_closep = data->last_closep;
6143 		}
6144                 else
6145                     data_fake.last_closep = &fake;
6146 		data_fake.pos_delta = delta;
6147                 if ( flags & SCF_DO_STCLASS && !scan->flags
6148                      && OP(scan) == IFMATCH ) { /* Lookahead */
6149                     ssc_init(pRExC_state, &intrnl);
6150                     data_fake.start_class = &intrnl;
6151                     f |= SCF_DO_STCLASS_AND;
6152 		}
6153                 if (flags & SCF_WHILEM_VISITED_POS)
6154                     f |= SCF_WHILEM_VISITED_POS;
6155                 next = regnext(scan);
6156                 nscan = NEXTOPER(NEXTOPER(scan));
6157 
6158                 /* recurse study_chunk() for lookahead body */
6159                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6160                                       last, &data_fake, stopparen,
6161                                       recursed_depth, NULL, f, depth+1,
6162                                       mutate_ok);
6163                 if (scan->flags) {
6164                     if (   deltanext < 0
6165                         || deltanext > (I32) U8_MAX
6166                         || minnext > (I32)U8_MAX
6167                         || minnext + deltanext > (I32)U8_MAX)
6168                     {
6169 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6170                               (UV)U8_MAX);
6171                     }
6172 
6173                     /* The 'next_off' field has been repurposed to count the
6174                      * additional starting positions to try beyond the initial
6175                      * one.  (This leaves it at 0 for non-variable length
6176                      * matches to avoid breakage for those not using this
6177                      * extension) */
6178                     if (deltanext) {
6179                         scan->next_off = deltanext;
6180                         ckWARNexperimental(RExC_parse,
6181                             WARN_EXPERIMENTAL__VLB,
6182                             "Variable length lookbehind is experimental");
6183                     }
6184                     scan->flags = (U8)minnext + deltanext;
6185                 }
6186                 if (data) {
6187                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6188                         pars++;
6189                     if (data_fake.flags & SF_HAS_EVAL)
6190                         data->flags |= SF_HAS_EVAL;
6191                     data->whilem_c = data_fake.whilem_c;
6192                 }
6193                 if (f & SCF_DO_STCLASS_AND) {
6194 		    if (flags & SCF_DO_STCLASS_OR) {
6195 			/* OR before, AND after: ideally we would recurse with
6196 			 * data_fake to get the AND applied by study of the
6197 			 * remainder of the pattern, and then derecurse;
6198 			 * *** HACK *** for now just treat as "no information".
6199 			 * See [perl #56690].
6200 			 */
6201 			ssc_init(pRExC_state, data->start_class);
6202 		    }  else {
6203                         /* AND before and after: combine and continue.  These
6204                          * assertions are zero-length, so can match an EMPTY
6205                          * string */
6206 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6207                         ANYOF_FLAGS(data->start_class)
6208                                                    |= SSC_MATCHES_EMPTY_STRING;
6209 		    }
6210                 }
6211 	    }
6212 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6213             else {
6214                 /* Positive Lookahead/lookbehind
6215                    In this case we can do fixed string optimisation,
6216                    but we must be careful about it. Note in the case of
6217                    lookbehind the positions will be offset by the minimum
6218                    length of the pattern, something we won't know about
6219                    until after the recurse.
6220                 */
6221                 SSize_t deltanext, fake = 0;
6222                 regnode *nscan;
6223                 regnode_ssc intrnl;
6224                 int f = 0;
6225                 /* We use SAVEFREEPV so that when the full compile
6226                     is finished perl will clean up the allocated
6227                     minlens when it's all done. This way we don't
6228                     have to worry about freeing them when we know
6229                     they wont be used, which would be a pain.
6230                  */
6231                 SSize_t *minnextp;
6232                 Newx( minnextp, 1, SSize_t );
6233                 SAVEFREEPV(minnextp);
6234 
6235                 if (data) {
6236                     StructCopy(data, &data_fake, scan_data_t);
6237                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6238                         f |= SCF_DO_SUBSTR;
6239                         if (scan->flags)
6240                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6241                         data_fake.last_found=newSVsv(data->last_found);
6242                     }
6243                 }
6244                 else
6245                     data_fake.last_closep = &fake;
6246                 data_fake.flags = 0;
6247                 data_fake.substrs[0].flags = 0;
6248                 data_fake.substrs[1].flags = 0;
6249 		data_fake.pos_delta = delta;
6250                 if (is_inf)
6251 	            data_fake.flags |= SF_IS_INF;
6252                 if ( flags & SCF_DO_STCLASS && !scan->flags
6253                      && OP(scan) == IFMATCH ) { /* Lookahead */
6254                     ssc_init(pRExC_state, &intrnl);
6255                     data_fake.start_class = &intrnl;
6256                     f |= SCF_DO_STCLASS_AND;
6257                 }
6258                 if (flags & SCF_WHILEM_VISITED_POS)
6259                     f |= SCF_WHILEM_VISITED_POS;
6260                 next = regnext(scan);
6261                 nscan = NEXTOPER(NEXTOPER(scan));
6262 
6263                 /* positive lookahead study_chunk() recursion */
6264                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6265                                         &deltanext, last, &data_fake,
6266                                         stopparen, recursed_depth, NULL,
6267                                         f, depth+1, mutate_ok);
6268                 if (scan->flags) {
6269                     assert(0);  /* This code has never been tested since this
6270                                    is normally not compiled */
6271                     if (   deltanext < 0
6272                         || deltanext > (I32) U8_MAX
6273                         || *minnextp > (I32)U8_MAX
6274                         || *minnextp + deltanext > (I32)U8_MAX)
6275                     {
6276 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6277                               (UV)U8_MAX);
6278                     }
6279 
6280                     if (deltanext) {
6281                         scan->next_off = deltanext;
6282                     }
6283                     scan->flags = (U8)*minnextp + deltanext;
6284                 }
6285 
6286                 *minnextp += min;
6287 
6288                 if (f & SCF_DO_STCLASS_AND) {
6289                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6290                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6291                 }
6292                 if (data) {
6293                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6294                         pars++;
6295                     if (data_fake.flags & SF_HAS_EVAL)
6296                         data->flags |= SF_HAS_EVAL;
6297                     data->whilem_c = data_fake.whilem_c;
6298                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6299                         int i;
6300                         if (RExC_rx->minlen<*minnextp)
6301                             RExC_rx->minlen=*minnextp;
6302                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6303                         SvREFCNT_dec_NN(data_fake.last_found);
6304 
6305                         for (i = 0; i < 2; i++) {
6306                             if (data_fake.substrs[i].minlenp != minlenp) {
6307                                 data->substrs[i].min_offset =
6308                                             data_fake.substrs[i].min_offset;
6309                                 data->substrs[i].max_offset =
6310                                             data_fake.substrs[i].max_offset;
6311                                 data->substrs[i].minlenp =
6312                                             data_fake.substrs[i].minlenp;
6313                                 data->substrs[i].lookbehind += scan->flags;
6314                             }
6315                         }
6316                     }
6317                 }
6318 	    }
6319 #endif
6320 	}
6321 	else if (OP(scan) == OPEN) {
6322 	    if (stopparen != (I32)ARG(scan))
6323 	        pars++;
6324 	}
6325 	else if (OP(scan) == CLOSE) {
6326 	    if (stopparen == (I32)ARG(scan)) {
6327 	        break;
6328 	    }
6329 	    if ((I32)ARG(scan) == is_par) {
6330 		next = regnext(scan);
6331 
6332 		if ( next && (OP(next) != WHILEM) && next < last)
6333 		    is_par = 0;		/* Disable optimization */
6334 	    }
6335 	    if (data)
6336 		*(data->last_closep) = ARG(scan);
6337 	}
6338 	else if (OP(scan) == EVAL) {
6339 		if (data)
6340 		    data->flags |= SF_HAS_EVAL;
6341 	}
6342 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6343 	    if (flags & SCF_DO_SUBSTR) {
6344                 scan_commit(pRExC_state, data, minlenp, is_inf);
6345 		flags &= ~SCF_DO_SUBSTR;
6346 	    }
6347 	    if (data && OP(scan)==ACCEPT) {
6348 	        data->flags |= SCF_SEEN_ACCEPT;
6349 	        if (stopmin > min)
6350 	            stopmin = min;
6351 	    }
6352 	}
6353 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6354 	{
6355 		if (flags & SCF_DO_SUBSTR) {
6356                     scan_commit(pRExC_state, data, minlenp, is_inf);
6357 		    data->cur_is_floating = 1; /* float */
6358 		}
6359 		is_inf = is_inf_internal = 1;
6360 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6361 		    ssc_anything(data->start_class);
6362 		flags &= ~SCF_DO_STCLASS;
6363 	}
6364 	else if (OP(scan) == GPOS) {
6365             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6366 	        !(delta || is_inf || (data && data->pos_delta)))
6367 	    {
6368                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6369                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6370 	        if (RExC_rx->gofs < (STRLEN)min)
6371 		    RExC_rx->gofs = min;
6372             } else {
6373                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6374                 RExC_rx->gofs = 0;
6375             }
6376 	}
6377 #ifdef TRIE_STUDY_OPT
6378 #ifdef FULL_TRIE_STUDY
6379         else if (PL_regkind[OP(scan)] == TRIE) {
6380             /* NOTE - There is similar code to this block above for handling
6381                BRANCH nodes on the initial study.  If you change stuff here
6382                check there too. */
6383             regnode *trie_node= scan;
6384             regnode *tail= regnext(scan);
6385             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6386             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6387             regnode_ssc accum;
6388 
6389             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6390                 /* Cannot merge strings after this. */
6391                 scan_commit(pRExC_state, data, minlenp, is_inf);
6392             }
6393             if (flags & SCF_DO_STCLASS)
6394                 ssc_init_zero(pRExC_state, &accum);
6395 
6396             if (!trie->jump) {
6397                 min1= trie->minlen;
6398                 max1= trie->maxlen;
6399             } else {
6400                 const regnode *nextbranch= NULL;
6401                 U32 word;
6402 
6403                 for ( word=1 ; word <= trie->wordcount ; word++)
6404                 {
6405                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6406                     regnode_ssc this_class;
6407 
6408                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6409                     if (data) {
6410                         data_fake.whilem_c = data->whilem_c;
6411                         data_fake.last_closep = data->last_closep;
6412                     }
6413                     else
6414                         data_fake.last_closep = &fake;
6415 		    data_fake.pos_delta = delta;
6416                     if (flags & SCF_DO_STCLASS) {
6417                         ssc_init(pRExC_state, &this_class);
6418                         data_fake.start_class = &this_class;
6419                         f = SCF_DO_STCLASS_AND;
6420                     }
6421                     if (flags & SCF_WHILEM_VISITED_POS)
6422                         f |= SCF_WHILEM_VISITED_POS;
6423 
6424                     if (trie->jump[word]) {
6425                         if (!nextbranch)
6426                             nextbranch = trie_node + trie->jump[0];
6427                         scan= trie_node + trie->jump[word];
6428                         /* We go from the jump point to the branch that follows
6429                            it. Note this means we need the vestigal unused
6430                            branches even though they arent otherwise used. */
6431                         /* optimise study_chunk() for TRIE */
6432                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6433                             &deltanext, (regnode *)nextbranch, &data_fake,
6434                             stopparen, recursed_depth, NULL, f, depth+1,
6435                             mutate_ok);
6436                     }
6437                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6438                         nextbranch= regnext((regnode*)nextbranch);
6439 
6440                     if (min1 > (SSize_t)(minnext + trie->minlen))
6441                         min1 = minnext + trie->minlen;
6442                     if (deltanext == OPTIMIZE_INFTY) {
6443                         is_inf = is_inf_internal = 1;
6444                         max1 = OPTIMIZE_INFTY;
6445                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6446                         max1 = minnext + deltanext + trie->maxlen;
6447 
6448                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6449                         pars++;
6450                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6451                         if ( stopmin > min + min1)
6452 	                    stopmin = min + min1;
6453 	                flags &= ~SCF_DO_SUBSTR;
6454 	                if (data)
6455 	                    data->flags |= SCF_SEEN_ACCEPT;
6456 	            }
6457                     if (data) {
6458                         if (data_fake.flags & SF_HAS_EVAL)
6459                             data->flags |= SF_HAS_EVAL;
6460                         data->whilem_c = data_fake.whilem_c;
6461                     }
6462                     if (flags & SCF_DO_STCLASS)
6463                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6464                 }
6465             }
6466             if (flags & SCF_DO_SUBSTR) {
6467                 data->pos_min += min1;
6468                 data->pos_delta += max1 - min1;
6469                 if (max1 != min1 || is_inf)
6470                     data->cur_is_floating = 1; /* float */
6471             }
6472             min += min1;
6473             if (delta != OPTIMIZE_INFTY) {
6474                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6475                     delta += max1 - min1;
6476                 else
6477                     delta = OPTIMIZE_INFTY;
6478             }
6479             if (flags & SCF_DO_STCLASS_OR) {
6480                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6481                 if (min1) {
6482                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6483                     flags &= ~SCF_DO_STCLASS;
6484                 }
6485             }
6486             else if (flags & SCF_DO_STCLASS_AND) {
6487                 if (min1) {
6488                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6489                     flags &= ~SCF_DO_STCLASS;
6490                 }
6491                 else {
6492                     /* Switch to OR mode: cache the old value of
6493                      * data->start_class */
6494 		    INIT_AND_WITHP;
6495                     StructCopy(data->start_class, and_withp, regnode_ssc);
6496                     flags &= ~SCF_DO_STCLASS_AND;
6497                     StructCopy(&accum, data->start_class, regnode_ssc);
6498                     flags |= SCF_DO_STCLASS_OR;
6499                 }
6500             }
6501             scan= tail;
6502             continue;
6503         }
6504 #else
6505 	else if (PL_regkind[OP(scan)] == TRIE) {
6506 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6507 	    U8*bang=NULL;
6508 
6509 	    min += trie->minlen;
6510 	    delta += (trie->maxlen - trie->minlen);
6511 	    flags &= ~SCF_DO_STCLASS; /* xxx */
6512             if (flags & SCF_DO_SUBSTR) {
6513                 /* Cannot expect anything... */
6514                 scan_commit(pRExC_state, data, minlenp, is_inf);
6515                 data->pos_min += trie->minlen;
6516                 data->pos_delta += (trie->maxlen - trie->minlen);
6517 		if (trie->maxlen != trie->minlen)
6518 		    data->cur_is_floating = 1; /* float */
6519             }
6520             if (trie->jump) /* no more substrings -- for now /grr*/
6521                flags &= ~SCF_DO_SUBSTR;
6522 	}
6523         else if (OP(scan) == REGEX_SET) {
6524             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6525                              " before optimization", reg_name[REGEX_SET]);
6526         }
6527 
6528 #endif /* old or new */
6529 #endif /* TRIE_STUDY_OPT */
6530 
6531 	/* Else: zero-length, ignore. */
6532 	scan = regnext(scan);
6533     }
6534 
6535   finish:
6536     if (frame) {
6537         /* we need to unwind recursion. */
6538         depth = depth - 1;
6539 
6540         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6541         DEBUG_PEEP("fend", scan, depth, flags);
6542 
6543         /* restore previous context */
6544         last = frame->last_regnode;
6545         scan = frame->next_regnode;
6546         stopparen = frame->stopparen;
6547         recursed_depth = frame->prev_recursed_depth;
6548 
6549         RExC_frame_last = frame->prev_frame;
6550         frame = frame->this_prev_frame;
6551         goto fake_study_recurse;
6552     }
6553 
6554     assert(!frame);
6555     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6556 
6557     *scanp = scan;
6558     *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6559 
6560     if (flags & SCF_DO_SUBSTR && is_inf)
6561 	data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6562     if (is_par > (I32)U8_MAX)
6563 	is_par = 0;
6564     if (is_par && pars==1 && data) {
6565 	data->flags |= SF_IN_PAR;
6566 	data->flags &= ~SF_HAS_PAR;
6567     }
6568     else if (pars && data) {
6569 	data->flags |= SF_HAS_PAR;
6570 	data->flags &= ~SF_IN_PAR;
6571     }
6572     if (flags & SCF_DO_STCLASS_OR)
6573 	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6574     if (flags & SCF_TRIE_RESTUDY)
6575         data->flags |= 	SCF_TRIE_RESTUDY;
6576 
6577     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6578 
6579     final_minlen = min < stopmin
6580             ? min : stopmin;
6581 
6582     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6583         if (final_minlen > OPTIMIZE_INFTY - delta)
6584             RExC_maxlen = OPTIMIZE_INFTY;
6585         else if (RExC_maxlen < final_minlen + delta)
6586             RExC_maxlen = final_minlen + delta;
6587     }
6588     return final_minlen;
6589 }
6590 
6591 STATIC U32
S_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)6592 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6593 {
6594     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6595 
6596     PERL_ARGS_ASSERT_ADD_DATA;
6597 
6598     Renewc(RExC_rxi->data,
6599 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6600 	   char, struct reg_data);
6601     if(count)
6602 	Renew(RExC_rxi->data->what, count + n, U8);
6603     else
6604 	Newx(RExC_rxi->data->what, n, U8);
6605     RExC_rxi->data->count = count + n;
6606     Copy(s, RExC_rxi->data->what + count, n, U8);
6607     return count;
6608 }
6609 
6610 /*XXX: todo make this not included in a non debugging perl, but appears to be
6611  * used anyway there, in 'use re' */
6612 #ifndef PERL_IN_XSUB_RE
6613 void
Perl_reginitcolors(pTHX)6614 Perl_reginitcolors(pTHX)
6615 {
6616     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6617     if (s) {
6618 	char *t = savepv(s);
6619 	int i = 0;
6620 	PL_colors[0] = t;
6621 	while (++i < 6) {
6622 	    t = strchr(t, '\t');
6623 	    if (t) {
6624 		*t = '\0';
6625 		PL_colors[i] = ++t;
6626 	    }
6627 	    else
6628 		PL_colors[i] = t = (char *)"";
6629 	}
6630     } else {
6631 	int i = 0;
6632 	while (i < 6)
6633 	    PL_colors[i++] = (char *)"";
6634     }
6635     PL_colorset = 1;
6636 }
6637 #endif
6638 
6639 
6640 #ifdef TRIE_STUDY_OPT
6641 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6642     STMT_START {                                            \
6643         if (                                                \
6644               (data.flags & SCF_TRIE_RESTUDY)               \
6645               && ! restudied++                              \
6646         ) {                                                 \
6647             dOsomething;                                    \
6648             goto reStudy;                                   \
6649         }                                                   \
6650     } STMT_END
6651 #else
6652 #define CHECK_RESTUDY_GOTO_butfirst
6653 #endif
6654 
6655 /*
6656  * pregcomp - compile a regular expression into internal code
6657  *
6658  * Decides which engine's compiler to call based on the hint currently in
6659  * scope
6660  */
6661 
6662 #ifndef PERL_IN_XSUB_RE
6663 
6664 /* return the currently in-scope regex engine (or the default if none)  */
6665 
6666 regexp_engine const *
Perl_current_re_engine(pTHX)6667 Perl_current_re_engine(pTHX)
6668 {
6669     if (IN_PERL_COMPILETIME) {
6670 	HV * const table = GvHV(PL_hintgv);
6671 	SV **ptr;
6672 
6673 	if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6674 	    return &PL_core_reg_engine;
6675 	ptr = hv_fetchs(table, "regcomp", FALSE);
6676 	if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6677 	    return &PL_core_reg_engine;
6678 	return INT2PTR(regexp_engine*, SvIV(*ptr));
6679     }
6680     else {
6681 	SV *ptr;
6682 	if (!PL_curcop->cop_hints_hash)
6683 	    return &PL_core_reg_engine;
6684 	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6685 	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6686 	    return &PL_core_reg_engine;
6687 	return INT2PTR(regexp_engine*, SvIV(ptr));
6688     }
6689 }
6690 
6691 
6692 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)6693 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6694 {
6695     regexp_engine const *eng = current_re_engine();
6696     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6697 
6698     PERL_ARGS_ASSERT_PREGCOMP;
6699 
6700     /* Dispatch a request to compile a regexp to correct regexp engine. */
6701     DEBUG_COMPILE_r({
6702         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6703 			PTR2UV(eng));
6704     });
6705     return CALLREGCOMP_ENG(eng, pattern, flags);
6706 }
6707 #endif
6708 
6709 /* public(ish) entry point for the perl core's own regex compiling code.
6710  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6711  * pattern rather than a list of OPs, and uses the internal engine rather
6712  * than the current one */
6713 
6714 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)6715 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6716 {
6717     SV *pat = pattern; /* defeat constness! */
6718 
6719     PERL_ARGS_ASSERT_RE_COMPILE;
6720 
6721     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6722 #ifdef PERL_IN_XSUB_RE
6723                                 &my_reg_engine,
6724 #else
6725                                 &PL_core_reg_engine,
6726 #endif
6727                                 NULL, NULL, rx_flags, 0);
6728 }
6729 
6730 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)6731 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6732 {
6733     int n;
6734 
6735     if (--cbs->refcnt > 0)
6736         return;
6737     for (n = 0; n < cbs->count; n++) {
6738         REGEXP *rx = cbs->cb[n].src_regex;
6739         if (rx) {
6740             cbs->cb[n].src_regex = NULL;
6741             SvREFCNT_dec_NN(rx);
6742         }
6743     }
6744     Safefree(cbs->cb);
6745     Safefree(cbs);
6746 }
6747 
6748 
6749 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)6750 S_alloc_code_blocks(pTHX_  int ncode)
6751 {
6752      struct reg_code_blocks *cbs;
6753     Newx(cbs, 1, struct reg_code_blocks);
6754     cbs->count = ncode;
6755     cbs->refcnt = 1;
6756     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6757     if (ncode)
6758         Newx(cbs->cb, ncode, struct reg_code_block);
6759     else
6760         cbs->cb = NULL;
6761     return cbs;
6762 }
6763 
6764 
6765 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6766  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6767  * point to the realloced string and length.
6768  *
6769  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6770  * stuff added */
6771 
6772 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)6773 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6774 		    char **pat_p, STRLEN *plen_p, int num_code_blocks)
6775 {
6776     U8 *const src = (U8*)*pat_p;
6777     U8 *dst, *d;
6778     int n=0;
6779     STRLEN s = 0;
6780     bool do_end = 0;
6781     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6782 
6783     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6784         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6785 
6786     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6787     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6788     d = dst;
6789 
6790     while (s < *plen_p) {
6791         append_utf8_from_native_byte(src[s], &d);
6792 
6793         if (n < num_code_blocks) {
6794             assert(pRExC_state->code_blocks);
6795             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6796                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6797                 assert(*(d - 1) == '(');
6798                 do_end = 1;
6799             }
6800             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6801                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6802                 assert(*(d - 1) == ')');
6803                 do_end = 0;
6804                 n++;
6805             }
6806         }
6807         s++;
6808     }
6809     *d = '\0';
6810     *plen_p = d - dst;
6811     *pat_p = (char*) dst;
6812     SAVEFREEPV(*pat_p);
6813     RExC_orig_utf8 = RExC_utf8 = 1;
6814 }
6815 
6816 
6817 
6818 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6819  * while recording any code block indices, and handling overloading,
6820  * nested qr// objects etc.  If pat is null, it will allocate a new
6821  * string, or just return the first arg, if there's only one.
6822  *
6823  * Returns the malloced/updated pat.
6824  * patternp and pat_count is the array of SVs to be concatted;
6825  * oplist is the optional list of ops that generated the SVs;
6826  * recompile_p is a pointer to a boolean that will be set if
6827  *   the regex will need to be recompiled.
6828  * delim, if non-null is an SV that will be inserted between each element
6829  */
6830 
6831 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)6832 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6833                 SV *pat, SV ** const patternp, int pat_count,
6834                 OP *oplist, bool *recompile_p, SV *delim)
6835 {
6836     SV **svp;
6837     int n = 0;
6838     bool use_delim = FALSE;
6839     bool alloced = FALSE;
6840 
6841     /* if we know we have at least two args, create an empty string,
6842      * then concatenate args to that. For no args, return an empty string */
6843     if (!pat && pat_count != 1) {
6844         pat = newSVpvs("");
6845         SAVEFREESV(pat);
6846         alloced = TRUE;
6847     }
6848 
6849     for (svp = patternp; svp < patternp + pat_count; svp++) {
6850         SV *sv;
6851         SV *rx  = NULL;
6852         STRLEN orig_patlen = 0;
6853         bool code = 0;
6854         SV *msv = use_delim ? delim : *svp;
6855         if (!msv) msv = &PL_sv_undef;
6856 
6857         /* if we've got a delimiter, we go round the loop twice for each
6858          * svp slot (except the last), using the delimiter the second
6859          * time round */
6860         if (use_delim) {
6861             svp--;
6862             use_delim = FALSE;
6863         }
6864         else if (delim)
6865             use_delim = TRUE;
6866 
6867         if (SvTYPE(msv) == SVt_PVAV) {
6868             /* we've encountered an interpolated array within
6869              * the pattern, e.g. /...@a..../. Expand the list of elements,
6870              * then recursively append elements.
6871              * The code in this block is based on S_pushav() */
6872 
6873             AV *const av = (AV*)msv;
6874             const SSize_t maxarg = AvFILL(av) + 1;
6875             SV **array;
6876 
6877             if (oplist) {
6878                 assert(oplist->op_type == OP_PADAV
6879                     || oplist->op_type == OP_RV2AV);
6880                 oplist = OpSIBLING(oplist);
6881             }
6882 
6883             if (SvRMAGICAL(av)) {
6884                 SSize_t i;
6885 
6886                 Newx(array, maxarg, SV*);
6887                 SAVEFREEPV(array);
6888                 for (i=0; i < maxarg; i++) {
6889                     SV ** const svp = av_fetch(av, i, FALSE);
6890                     array[i] = svp ? *svp : &PL_sv_undef;
6891                 }
6892             }
6893             else
6894                 array = AvARRAY(av);
6895 
6896             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6897                                 array, maxarg, NULL, recompile_p,
6898                                 /* $" */
6899                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6900 
6901             continue;
6902         }
6903 
6904 
6905         /* we make the assumption here that each op in the list of
6906          * op_siblings maps to one SV pushed onto the stack,
6907          * except for code blocks, with have both an OP_NULL and
6908          * an OP_CONST.
6909          * This allows us to match up the list of SVs against the
6910          * list of OPs to find the next code block.
6911          *
6912          * Note that       PUSHMARK PADSV PADSV ..
6913          * is optimised to
6914          *                 PADRANGE PADSV  PADSV  ..
6915          * so the alignment still works. */
6916 
6917         if (oplist) {
6918             if (oplist->op_type == OP_NULL
6919                 && (oplist->op_flags & OPf_SPECIAL))
6920             {
6921                 assert(n < pRExC_state->code_blocks->count);
6922                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6923                 pRExC_state->code_blocks->cb[n].block = oplist;
6924                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6925                 n++;
6926                 code = 1;
6927                 oplist = OpSIBLING(oplist); /* skip CONST */
6928                 assert(oplist);
6929             }
6930             oplist = OpSIBLING(oplist);;
6931         }
6932 
6933 	/* apply magic and QR overloading to arg */
6934 
6935         SvGETMAGIC(msv);
6936         if (SvROK(msv) && SvAMAGIC(msv)) {
6937             SV *sv = AMG_CALLunary(msv, regexp_amg);
6938             if (sv) {
6939                 if (SvROK(sv))
6940                     sv = SvRV(sv);
6941                 if (SvTYPE(sv) != SVt_REGEXP)
6942                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6943                 msv = sv;
6944             }
6945         }
6946 
6947         /* try concatenation overload ... */
6948         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6949                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6950         {
6951             sv_setsv(pat, sv);
6952             /* overloading involved: all bets are off over literal
6953              * code. Pretend we haven't seen it */
6954             if (n)
6955                 pRExC_state->code_blocks->count -= n;
6956             n = 0;
6957         }
6958         else {
6959             /* ... or failing that, try "" overload */
6960             while (SvAMAGIC(msv)
6961                     && (sv = AMG_CALLunary(msv, string_amg))
6962                     && sv != msv
6963                     &&  !(   SvROK(msv)
6964                           && SvROK(sv)
6965                           && SvRV(msv) == SvRV(sv))
6966             ) {
6967                 msv = sv;
6968                 SvGETMAGIC(msv);
6969             }
6970             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6971                 msv = SvRV(msv);
6972 
6973             if (pat) {
6974                 /* this is a partially unrolled
6975                  *     sv_catsv_nomg(pat, msv);
6976                  * that allows us to adjust code block indices if
6977                  * needed */
6978                 STRLEN dlen;
6979                 char *dst = SvPV_force_nomg(pat, dlen);
6980                 orig_patlen = dlen;
6981                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6982                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6983                     sv_setpvn(pat, dst, dlen);
6984                     SvUTF8_on(pat);
6985                 }
6986                 sv_catsv_nomg(pat, msv);
6987                 rx = msv;
6988             }
6989             else {
6990                 /* We have only one SV to process, but we need to verify
6991                  * it is properly null terminated or we will fail asserts
6992                  * later. In theory we probably shouldn't get such SV's,
6993                  * but if we do we should handle it gracefully. */
6994                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6995                     /* not a string, or a string with a trailing null */
6996                     pat = msv;
6997                 } else {
6998                     /* a string with no trailing null, we need to copy it
6999                      * so it has a trailing null */
7000                     pat = sv_2mortal(newSVsv(msv));
7001                 }
7002             }
7003 
7004             if (code)
7005                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7006         }
7007 
7008         /* extract any code blocks within any embedded qr//'s */
7009         if (rx && SvTYPE(rx) == SVt_REGEXP
7010             && RX_ENGINE((REGEXP*)rx)->op_comp)
7011         {
7012 
7013             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7014             if (ri->code_blocks && ri->code_blocks->count) {
7015                 int i;
7016                 /* the presence of an embedded qr// with code means
7017                  * we should always recompile: the text of the
7018                  * qr// may not have changed, but it may be a
7019                  * different closure than last time */
7020                 *recompile_p = 1;
7021                 if (pRExC_state->code_blocks) {
7022                     int new_count = pRExC_state->code_blocks->count
7023                             + ri->code_blocks->count;
7024                     Renew(pRExC_state->code_blocks->cb,
7025                             new_count, struct reg_code_block);
7026                     pRExC_state->code_blocks->count = new_count;
7027                 }
7028                 else
7029                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7030                                                     ri->code_blocks->count);
7031 
7032                 for (i=0; i < ri->code_blocks->count; i++) {
7033                     struct reg_code_block *src, *dst;
7034                     STRLEN offset =  orig_patlen
7035                         + ReANY((REGEXP *)rx)->pre_prefix;
7036                     assert(n < pRExC_state->code_blocks->count);
7037                     src = &ri->code_blocks->cb[i];
7038                     dst = &pRExC_state->code_blocks->cb[n];
7039                     dst->start	    = src->start + offset;
7040                     dst->end	    = src->end   + offset;
7041                     dst->block	    = src->block;
7042                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
7043                                             src->src_regex
7044                                                 ? src->src_regex
7045                                                 : (REGEXP*)rx);
7046                     n++;
7047                 }
7048             }
7049         }
7050     }
7051     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7052     if (alloced)
7053         SvSETMAGIC(pat);
7054 
7055     return pat;
7056 }
7057 
7058 
7059 
7060 /* see if there are any run-time code blocks in the pattern.
7061  * False positives are allowed */
7062 
7063 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)7064 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7065 		    char *pat, STRLEN plen)
7066 {
7067     int n = 0;
7068     STRLEN s;
7069 
7070     PERL_UNUSED_CONTEXT;
7071 
7072     for (s = 0; s < plen; s++) {
7073 	if (   pRExC_state->code_blocks
7074             && n < pRExC_state->code_blocks->count
7075 	    && s == pRExC_state->code_blocks->cb[n].start)
7076 	{
7077 	    s = pRExC_state->code_blocks->cb[n].end;
7078 	    n++;
7079 	    continue;
7080 	}
7081 	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
7082 	 * positives here */
7083 	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7084 	    (pat[s+2] == '{'
7085                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7086 	)
7087 	    return 1;
7088     }
7089     return 0;
7090 }
7091 
7092 /* Handle run-time code blocks. We will already have compiled any direct
7093  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7094  * copy of it, but with any literal code blocks blanked out and
7095  * appropriate chars escaped; then feed it into
7096  *
7097  *    eval "qr'modified_pattern'"
7098  *
7099  * For example,
7100  *
7101  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7102  *
7103  * becomes
7104  *
7105  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7106  *
7107  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7108  * and merge them with any code blocks of the original regexp.
7109  *
7110  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7111  * instead, just save the qr and return FALSE; this tells our caller that
7112  * the original pattern needs upgrading to utf8.
7113  */
7114 
7115 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)7116 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7117     char *pat, STRLEN plen)
7118 {
7119     SV *qr;
7120 
7121     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7122 
7123     if (pRExC_state->runtime_code_qr) {
7124 	/* this is the second time we've been called; this should
7125 	 * only happen if the main pattern got upgraded to utf8
7126 	 * during compilation; re-use the qr we compiled first time
7127 	 * round (which should be utf8 too)
7128 	 */
7129 	qr = pRExC_state->runtime_code_qr;
7130 	pRExC_state->runtime_code_qr = NULL;
7131 	assert(RExC_utf8 && SvUTF8(qr));
7132     }
7133     else {
7134 	int n = 0;
7135 	STRLEN s;
7136 	char *p, *newpat;
7137 	int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7138 	SV *sv, *qr_ref;
7139 	dSP;
7140 
7141 	/* determine how many extra chars we need for ' and \ escaping */
7142 	for (s = 0; s < plen; s++) {
7143 	    if (pat[s] == '\'' || pat[s] == '\\')
7144 		newlen++;
7145 	}
7146 
7147 	Newx(newpat, newlen, char);
7148 	p = newpat;
7149 	*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7150 
7151 	for (s = 0; s < plen; s++) {
7152 	    if (   pRExC_state->code_blocks
7153 	        && n < pRExC_state->code_blocks->count
7154 		&& s == pRExC_state->code_blocks->cb[n].start)
7155 	    {
7156 		/* blank out literal code block so that they aren't
7157                  * recompiled: eg change from/to:
7158                  *     /(?{xyz})/
7159                  *     /(?=====)/
7160                  * and
7161                  *     /(??{xyz})/
7162                  *     /(?======)/
7163                  * and
7164                  *     /(?(?{xyz}))/
7165                  *     /(?(?=====))/
7166                 */
7167 		assert(pat[s]   == '(');
7168 		assert(pat[s+1] == '?');
7169                 *p++ = '(';
7170                 *p++ = '?';
7171                 s += 2;
7172 		while (s < pRExC_state->code_blocks->cb[n].end) {
7173 		    *p++ = '=';
7174 		    s++;
7175 		}
7176                 *p++ = ')';
7177 		n++;
7178 		continue;
7179 	    }
7180 	    if (pat[s] == '\'' || pat[s] == '\\')
7181 		*p++ = '\\';
7182 	    *p++ = pat[s];
7183 	}
7184 	*p++ = '\'';
7185 	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7186 	    *p++ = 'x';
7187             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7188                 *p++ = 'x';
7189             }
7190         }
7191 	*p++ = '\0';
7192 	DEBUG_COMPILE_r({
7193             Perl_re_printf( aTHX_
7194 		"%sre-parsing pattern for runtime code:%s %s\n",
7195 		PL_colors[4], PL_colors[5], newpat);
7196 	});
7197 
7198 	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7199 	Safefree(newpat);
7200 
7201 	ENTER;
7202 	SAVETMPS;
7203 	save_re_context();
7204 	PUSHSTACKi(PERLSI_REQUIRE);
7205         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7206          * parsing qr''; normally only q'' does this. It also alters
7207          * hints handling */
7208 	eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7209 	SvREFCNT_dec_NN(sv);
7210 	SPAGAIN;
7211 	qr_ref = POPs;
7212 	PUTBACK;
7213 	{
7214 	    SV * const errsv = ERRSV;
7215 	    if (SvTRUE_NN(errsv))
7216                 /* use croak_sv ? */
7217 		Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7218 	}
7219 	assert(SvROK(qr_ref));
7220 	qr = SvRV(qr_ref);
7221 	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7222 	/* the leaving below frees the tmp qr_ref.
7223 	 * Give qr a life of its own */
7224 	SvREFCNT_inc(qr);
7225 	POPSTACK;
7226 	FREETMPS;
7227 	LEAVE;
7228 
7229     }
7230 
7231     if (!RExC_utf8 && SvUTF8(qr)) {
7232 	/* first time through; the pattern got upgraded; save the
7233 	 * qr for the next time through */
7234 	assert(!pRExC_state->runtime_code_qr);
7235 	pRExC_state->runtime_code_qr = qr;
7236 	return 0;
7237     }
7238 
7239 
7240     /* extract any code blocks within the returned qr//  */
7241 
7242 
7243     /* merge the main (r1) and run-time (r2) code blocks into one */
7244     {
7245 	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7246 	struct reg_code_block *new_block, *dst;
7247 	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7248 	int i1 = 0, i2 = 0;
7249         int r1c, r2c;
7250 
7251 	if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7252 	{
7253 	    SvREFCNT_dec_NN(qr);
7254 	    return 1;
7255 	}
7256 
7257         if (!r1->code_blocks)
7258             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7259 
7260         r1c = r1->code_blocks->count;
7261         r2c = r2->code_blocks->count;
7262 
7263 	Newx(new_block, r1c + r2c, struct reg_code_block);
7264 
7265 	dst = new_block;
7266 
7267 	while (i1 < r1c || i2 < r2c) {
7268 	    struct reg_code_block *src;
7269 	    bool is_qr = 0;
7270 
7271 	    if (i1 == r1c) {
7272 		src = &r2->code_blocks->cb[i2++];
7273 		is_qr = 1;
7274 	    }
7275 	    else if (i2 == r2c)
7276 		src = &r1->code_blocks->cb[i1++];
7277 	    else if (  r1->code_blocks->cb[i1].start
7278 	             < r2->code_blocks->cb[i2].start)
7279 	    {
7280 		src = &r1->code_blocks->cb[i1++];
7281 		assert(src->end < r2->code_blocks->cb[i2].start);
7282 	    }
7283 	    else {
7284 		assert(  r1->code_blocks->cb[i1].start
7285 		       > r2->code_blocks->cb[i2].start);
7286 		src = &r2->code_blocks->cb[i2++];
7287 		is_qr = 1;
7288 		assert(src->end < r1->code_blocks->cb[i1].start);
7289 	    }
7290 
7291 	    assert(pat[src->start] == '(');
7292 	    assert(pat[src->end]   == ')');
7293 	    dst->start	    = src->start;
7294 	    dst->end	    = src->end;
7295 	    dst->block	    = src->block;
7296 	    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7297 				    : src->src_regex;
7298 	    dst++;
7299 	}
7300 	r1->code_blocks->count += r2c;
7301 	Safefree(r1->code_blocks->cb);
7302 	r1->code_blocks->cb = new_block;
7303     }
7304 
7305     SvREFCNT_dec_NN(qr);
7306     return 1;
7307 }
7308 
7309 
7310 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)7311 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7312                       struct reg_substr_datum  *rsd,
7313                       struct scan_data_substrs *sub,
7314                       STRLEN longest_length)
7315 {
7316     /* This is the common code for setting up the floating and fixed length
7317      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7318      * as to whether succeeded or not */
7319 
7320     I32 t;
7321     SSize_t ml;
7322     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7323     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7324 
7325     if (! (longest_length
7326            || (eol /* Can't have SEOL and MULTI */
7327                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7328           )
7329             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7330         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7331     {
7332         return FALSE;
7333     }
7334 
7335     /* copy the information about the longest from the reg_scan_data
7336         over to the program. */
7337     if (SvUTF8(sub->str)) {
7338         rsd->substr      = NULL;
7339         rsd->utf8_substr = sub->str;
7340     } else {
7341         rsd->substr      = sub->str;
7342         rsd->utf8_substr = NULL;
7343     }
7344     /* end_shift is how many chars that must be matched that
7345         follow this item. We calculate it ahead of time as once the
7346         lookbehind offset is added in we lose the ability to correctly
7347         calculate it.*/
7348     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7349     rsd->end_shift = ml - sub->min_offset
7350         - longest_length
7351             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7352              * intead? - DAPM
7353             + (SvTAIL(sub->str) != 0)
7354             */
7355         + sub->lookbehind;
7356 
7357     t = (eol/* Can't have SEOL and MULTI */
7358          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7359     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7360 
7361     return TRUE;
7362 }
7363 
7364 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)7365 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7366 {
7367     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7368      * properly wrapped with the right modifiers */
7369 
7370     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7371     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7372                                                 != REGEX_DEPENDS_CHARSET);
7373 
7374     /* The caret is output if there are any defaults: if not all the STD
7375         * flags are set, or if no character set specifier is needed */
7376     bool has_default =
7377                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7378                 || ! has_charset);
7379     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7380                                                 == REG_RUN_ON_COMMENT_SEEN);
7381     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7382                         >> RXf_PMf_STD_PMMOD_SHIFT);
7383     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7384     char *p;
7385     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7386 
7387     /* We output all the necessary flags; we never output a minus, as all
7388         * those are defaults, so are
7389         * covered by the caret */
7390     const STRLEN wraplen = pat_len + has_p + has_runon
7391         + has_default       /* If needs a caret */
7392         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7393 
7394             /* If needs a character set specifier */
7395         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7396         + (sizeof("(?:)") - 1);
7397 
7398     PERL_ARGS_ASSERT_SET_REGEX_PV;
7399 
7400     /* make sure PL_bitcount bounds not exceeded */
7401     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7402 
7403     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7404     SvPOK_on(Rx);
7405     if (RExC_utf8)
7406         SvFLAGS(Rx) |= SVf_UTF8;
7407     *p++='('; *p++='?';
7408 
7409     /* If a default, cover it using the caret */
7410     if (has_default) {
7411         *p++= DEFAULT_PAT_MOD;
7412     }
7413     if (has_charset) {
7414         STRLEN len;
7415         const char* name;
7416 
7417         name = get_regex_charset_name(RExC_rx->extflags, &len);
7418         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7419             assert(RExC_utf8);
7420             name = UNICODE_PAT_MODS;
7421             len = sizeof(UNICODE_PAT_MODS) - 1;
7422         }
7423         Copy(name, p, len, char);
7424         p += len;
7425     }
7426     if (has_p)
7427         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7428     {
7429         char ch;
7430         while((ch = *fptr++)) {
7431             if(reganch & 1)
7432                 *p++ = ch;
7433             reganch >>= 1;
7434         }
7435     }
7436 
7437     *p++ = ':';
7438     Copy(RExC_precomp, p, pat_len, char);
7439     assert ((RX_WRAPPED(Rx) - p) < 16);
7440     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7441     p += pat_len;
7442 
7443     /* Adding a trailing \n causes this to compile properly:
7444             my $R = qr / A B C # D E/x; /($R)/
7445         Otherwise the parens are considered part of the comment */
7446     if (has_runon)
7447         *p++ = '\n';
7448     *p++ = ')';
7449     *p = 0;
7450     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7451 }
7452 
7453 /*
7454  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7455  * regular expression into internal code.
7456  * The pattern may be passed either as:
7457  *    a list of SVs (patternp plus pat_count)
7458  *    a list of OPs (expr)
7459  * If both are passed, the SV list is used, but the OP list indicates
7460  * which SVs are actually pre-compiled code blocks
7461  *
7462  * The SVs in the list have magic and qr overloading applied to them (and
7463  * the list may be modified in-place with replacement SVs in the latter
7464  * case).
7465  *
7466  * If the pattern hasn't changed from old_re, then old_re will be
7467  * returned.
7468  *
7469  * eng is the current engine. If that engine has an op_comp method, then
7470  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7471  * do the initial concatenation of arguments and pass on to the external
7472  * engine.
7473  *
7474  * If is_bare_re is not null, set it to a boolean indicating whether the
7475  * arg list reduced (after overloading) to a single bare regex which has
7476  * been returned (i.e. /$qr/).
7477  *
7478  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7479  *
7480  * pm_flags contains the PMf_* flags, typically based on those from the
7481  * pm_flags field of the related PMOP. Currently we're only interested in
7482  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7483  *
7484  * For many years this code had an initial sizing pass that calculated
7485  * (sometimes incorrectly, leading to security holes) the size needed for the
7486  * compiled pattern.  That was changed by commit
7487  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7488  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7489  * references to this sizing pass.
7490  *
7491  * Now, an initial crude guess as to the size needed is made, based on the
7492  * length of the pattern.  Patches welcome to improve that guess.  That amount
7493  * of space is malloc'd and then immediately freed, and then clawed back node
7494  * by node.  This design is to minimze, to the extent possible, memory churn
7495  * when doing the reallocs.
7496  *
7497  * A separate parentheses counting pass may be needed in some cases.
7498  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7499  * of these cases.
7500  *
7501  * The existence of a sizing pass necessitated design decisions that are no
7502  * longer needed.  There are potential areas of simplification.
7503  *
7504  * Beware that the optimization-preparation code in here knows about some
7505  * of the structure of the compiled regexp.  [I'll say.]
7506  */
7507 
7508 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)7509 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7510 		    OP *expr, const regexp_engine* eng, REGEXP *old_re,
7511 		     bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7512 {
7513     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7514     STRLEN plen;
7515     char *exp;
7516     regnode *scan;
7517     I32 flags;
7518     SSize_t minlen = 0;
7519     U32 rx_flags;
7520     SV *pat;
7521     SV** new_patternp = patternp;
7522 
7523     /* these are all flags - maybe they should be turned
7524      * into a single int with different bit masks */
7525     I32 sawlookahead = 0;
7526     I32 sawplus = 0;
7527     I32 sawopen = 0;
7528     I32 sawminmod = 0;
7529 
7530     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7531     bool recompile = 0;
7532     bool runtime_code = 0;
7533     scan_data_t data;
7534     RExC_state_t RExC_state;
7535     RExC_state_t * const pRExC_state = &RExC_state;
7536 #ifdef TRIE_STUDY_OPT
7537     int restudied = 0;
7538     RExC_state_t copyRExC_state;
7539 #endif
7540     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7541 
7542     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7543 
7544     DEBUG_r(if (!PL_colorset) reginitcolors());
7545 
7546 
7547     pRExC_state->warn_text = NULL;
7548     pRExC_state->unlexed_names = NULL;
7549     pRExC_state->code_blocks = NULL;
7550 
7551     if (is_bare_re)
7552 	*is_bare_re = FALSE;
7553 
7554     if (expr && (expr->op_type == OP_LIST ||
7555 		(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7556 	/* allocate code_blocks if needed */
7557 	OP *o;
7558 	int ncode = 0;
7559 
7560 	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7561 	    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7562 		ncode++; /* count of DO blocks */
7563 
7564 	if (ncode)
7565             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7566     }
7567 
7568     if (!pat_count) {
7569         /* compile-time pattern with just OP_CONSTs and DO blocks */
7570 
7571         int n;
7572         OP *o;
7573 
7574         /* find how many CONSTs there are */
7575         assert(expr);
7576         n = 0;
7577         if (expr->op_type == OP_CONST)
7578             n = 1;
7579         else
7580             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7581                 if (o->op_type == OP_CONST)
7582                     n++;
7583             }
7584 
7585         /* fake up an SV array */
7586 
7587         assert(!new_patternp);
7588         Newx(new_patternp, n, SV*);
7589         SAVEFREEPV(new_patternp);
7590         pat_count = n;
7591 
7592         n = 0;
7593         if (expr->op_type == OP_CONST)
7594             new_patternp[n] = cSVOPx_sv(expr);
7595         else
7596             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7597                 if (o->op_type == OP_CONST)
7598                     new_patternp[n++] = cSVOPo_sv;
7599             }
7600 
7601     }
7602 
7603     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7604         "Assembling pattern from %d elements%s\n", pat_count,
7605             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7606 
7607     /* set expr to the first arg op */
7608 
7609     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7610          && expr->op_type != OP_CONST)
7611     {
7612             expr = cLISTOPx(expr)->op_first;
7613             assert(   expr->op_type == OP_PUSHMARK
7614                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7615                    || expr->op_type == OP_PADRANGE);
7616             expr = OpSIBLING(expr);
7617     }
7618 
7619     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7620                         expr, &recompile, NULL);
7621 
7622     /* handle bare (possibly after overloading) regex: foo =~ $re */
7623     {
7624         SV *re = pat;
7625         if (SvROK(re))
7626             re = SvRV(re);
7627         if (SvTYPE(re) == SVt_REGEXP) {
7628             if (is_bare_re)
7629                 *is_bare_re = TRUE;
7630             SvREFCNT_inc(re);
7631             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7632                 "Precompiled pattern%s\n",
7633                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7634 
7635             return (REGEXP*)re;
7636         }
7637     }
7638 
7639     exp = SvPV_nomg(pat, plen);
7640 
7641     if (!eng->op_comp) {
7642 	if ((SvUTF8(pat) && IN_BYTES)
7643 		|| SvGMAGICAL(pat) || SvAMAGIC(pat))
7644 	{
7645 	    /* make a temporary copy; either to convert to bytes,
7646 	     * or to avoid repeating get-magic / overloaded stringify */
7647 	    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7648 					(IN_BYTES ? 0 : SvUTF8(pat)));
7649 	}
7650 	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7651     }
7652 
7653     /* ignore the utf8ness if the pattern is 0 length */
7654     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7655     RExC_uni_semantics = 0;
7656     RExC_contains_locale = 0;
7657     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7658     RExC_in_script_run = 0;
7659     RExC_study_started = 0;
7660     pRExC_state->runtime_code_qr = NULL;
7661     RExC_frame_head= NULL;
7662     RExC_frame_last= NULL;
7663     RExC_frame_count= 0;
7664     RExC_latest_warn_offset = 0;
7665     RExC_use_BRANCHJ = 0;
7666     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7667     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7668     RExC_total_parens = 0;
7669     RExC_open_parens = NULL;
7670     RExC_close_parens = NULL;
7671     RExC_paren_names = NULL;
7672     RExC_size = 0;
7673     RExC_seen_d_op = FALSE;
7674 #ifdef DEBUGGING
7675     RExC_paren_name_list = NULL;
7676 #endif
7677 
7678     DEBUG_r({
7679         RExC_mysv1= sv_newmortal();
7680         RExC_mysv2= sv_newmortal();
7681     });
7682 
7683     DEBUG_COMPILE_r({
7684             SV *dsv= sv_newmortal();
7685             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7686             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7687                           PL_colors[4], PL_colors[5], s);
7688         });
7689 
7690     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7691      * to utf8 */
7692 
7693     if ((pm_flags & PMf_USE_RE_EVAL)
7694 		/* this second condition covers the non-regex literal case,
7695 		 * i.e.  $foo =~ '(?{})'. */
7696 		|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7697     )
7698 	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7699 
7700   redo_parse:
7701     /* return old regex if pattern hasn't changed */
7702     /* XXX: note in the below we have to check the flags as well as the
7703      * pattern.
7704      *
7705      * Things get a touch tricky as we have to compare the utf8 flag
7706      * independently from the compile flags.  */
7707 
7708     if (   old_re
7709         && !recompile
7710         && !!RX_UTF8(old_re) == !!RExC_utf8
7711         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7712 	&& RX_PRECOMP(old_re)
7713 	&& RX_PRELEN(old_re) == plen
7714         && memEQ(RX_PRECOMP(old_re), exp, plen)
7715 	&& !runtime_code /* with runtime code, always recompile */ )
7716     {
7717         DEBUG_COMPILE_r({
7718             SV *dsv= sv_newmortal();
7719             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7720             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7721                           PL_colors[4], PL_colors[5], s);
7722         });
7723         return old_re;
7724     }
7725 
7726     /* Allocate the pattern's SV */
7727     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7728     RExC_rx = ReANY(Rx);
7729     if ( RExC_rx == NULL )
7730         FAIL("Regexp out of space");
7731 
7732     rx_flags = orig_rx_flags;
7733 
7734     if (   toUSE_UNI_CHARSET_NOT_DEPENDS
7735         && initial_charset == REGEX_DEPENDS_CHARSET)
7736     {
7737 
7738 	/* Set to use unicode semantics if the pattern is in utf8 and has the
7739 	 * 'depends' charset specified, as it means unicode when utf8  */
7740 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7741         RExC_uni_semantics = 1;
7742     }
7743 
7744     RExC_pm_flags = pm_flags;
7745 
7746     if (runtime_code) {
7747         assert(TAINTING_get || !TAINT_get);
7748 	if (TAINT_get)
7749 	    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7750 
7751 	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7752 	    /* whoops, we have a non-utf8 pattern, whilst run-time code
7753 	     * got compiled as utf8. Try again with a utf8 pattern */
7754             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7755                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7756             goto redo_parse;
7757 	}
7758     }
7759     assert(!pRExC_state->runtime_code_qr);
7760 
7761     RExC_sawback = 0;
7762 
7763     RExC_seen = 0;
7764     RExC_maxlen = 0;
7765     RExC_in_lookaround = 0;
7766     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7767     RExC_recode_x_to_native = 0;
7768     RExC_in_multi_char_class = 0;
7769 
7770     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7771     RExC_precomp_end = RExC_end = exp + plen;
7772     RExC_nestroot = 0;
7773     RExC_whilem_seen = 0;
7774     RExC_end_op = NULL;
7775     RExC_recurse = NULL;
7776     RExC_study_chunk_recursed = NULL;
7777     RExC_study_chunk_recursed_bytes= 0;
7778     RExC_recurse_count = 0;
7779     RExC_sets_depth = 0;
7780     pRExC_state->code_index = 0;
7781 
7782     /* Initialize the string in the compiled pattern.  This is so that there is
7783      * something to output if necessary */
7784     set_regex_pv(pRExC_state, Rx);
7785 
7786     DEBUG_PARSE_r({
7787         Perl_re_printf( aTHX_
7788             "Starting parse and generation\n");
7789         RExC_lastnum=0;
7790         RExC_lastparse=NULL;
7791     });
7792 
7793     /* Allocate space and zero-initialize. Note, the two step process
7794        of zeroing when in debug mode, thus anything assigned has to
7795        happen after that */
7796     if (!  RExC_size) {
7797 
7798         /* On the first pass of the parse, we guess how big this will be.  Then
7799          * we grow in one operation to that amount and then give it back.  As
7800          * we go along, we re-allocate what we need.
7801          *
7802          * XXX Currently the guess is essentially that the pattern will be an
7803          * EXACT node with one byte input, one byte output.  This is crude, and
7804          * better heuristics are welcome.
7805          *
7806          * On any subsequent passes, we guess what we actually computed in the
7807          * latest earlier pass.  Such a pass probably didn't complete so is
7808          * missing stuff.  We could improve those guesses by knowing where the
7809          * parse stopped, and use the length so far plus apply the above
7810          * assumption to what's left. */
7811         RExC_size = STR_SZ(RExC_end - RExC_start);
7812     }
7813 
7814     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7815     if ( RExC_rxi == NULL )
7816         FAIL("Regexp out of space");
7817 
7818     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7819     RXi_SET( RExC_rx, RExC_rxi );
7820 
7821     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7822      * node parsed will give back any excess memory we have allocated so far).
7823      * */
7824     RExC_size = 0;
7825 
7826     /* non-zero initialization begins here */
7827     RExC_rx->engine= eng;
7828     RExC_rx->extflags = rx_flags;
7829     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7830 
7831     if (pm_flags & PMf_IS_QR) {
7832 	RExC_rxi->code_blocks = pRExC_state->code_blocks;
7833         if (RExC_rxi->code_blocks) {
7834             RExC_rxi->code_blocks->refcnt++;
7835         }
7836     }
7837 
7838     RExC_rx->intflags = 0;
7839 
7840     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
7841     RExC_parse = exp;
7842 
7843     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7844      * code makes sure the final byte is an uncounted NUL.  But should this
7845      * ever not be the case, lots of things could read beyond the end of the
7846      * buffer: loops like
7847      *      while(isFOO(*RExC_parse)) RExC_parse++;
7848      *      strchr(RExC_parse, "foo");
7849      * etc.  So it is worth noting. */
7850     assert(*RExC_end == '\0');
7851 
7852     RExC_naughty = 0;
7853     RExC_npar = 1;
7854     RExC_parens_buf_size = 0;
7855     RExC_emit_start = RExC_rxi->program;
7856     pRExC_state->code_index = 0;
7857 
7858     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7859     RExC_emit = 1;
7860 
7861     /* Do the parse */
7862     if (reg(pRExC_state, 0, &flags, 1)) {
7863 
7864         /* Success!, But we may need to redo the parse knowing how many parens
7865          * there actually are */
7866         if (IN_PARENS_PASS) {
7867             flags |= RESTART_PARSE;
7868         }
7869 
7870         /* We have that number in RExC_npar */
7871         RExC_total_parens = RExC_npar;
7872     }
7873     else if (! MUST_RESTART(flags)) {
7874 	ReREFCNT_dec(Rx);
7875         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7876     }
7877 
7878     /* Here, we either have success, or we have to redo the parse for some reason */
7879     if (MUST_RESTART(flags)) {
7880 
7881         /* It's possible to write a regexp in ascii that represents Unicode
7882         codepoints outside of the byte range, such as via \x{100}. If we
7883         detect such a sequence we have to convert the entire pattern to utf8
7884         and then recompile, as our sizing calculation will have been based
7885         on 1 byte == 1 character, but we will need to use utf8 to encode
7886         at least some part of the pattern, and therefore must convert the whole
7887         thing.
7888         -- dmq */
7889         if (flags & NEED_UTF8) {
7890 
7891             /* We have stored the offset of the final warning output so far.
7892              * That must be adjusted.  Any variant characters between the start
7893              * of the pattern and this warning count for 2 bytes in the final,
7894              * so just add them again */
7895             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7896                 RExC_latest_warn_offset +=
7897                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7898                                                 + RExC_latest_warn_offset);
7899             }
7900             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7901             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7902             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7903         }
7904         else {
7905             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7906         }
7907 
7908         if (ALL_PARENS_COUNTED) {
7909             /* Make enough room for all the known parens, and zero it */
7910             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7911             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7912             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7913 
7914             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7915             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7916         }
7917         else { /* Parse did not complete.  Reinitialize the parentheses
7918                   structures */
7919             RExC_total_parens = 0;
7920             if (RExC_open_parens) {
7921                 Safefree(RExC_open_parens);
7922                 RExC_open_parens = NULL;
7923             }
7924             if (RExC_close_parens) {
7925                 Safefree(RExC_close_parens);
7926                 RExC_close_parens = NULL;
7927             }
7928         }
7929 
7930         /* Clean up what we did in this parse */
7931         SvREFCNT_dec_NN(RExC_rx_sv);
7932 
7933         goto redo_parse;
7934     }
7935 
7936     /* Here, we have successfully parsed and generated the pattern's program
7937      * for the regex engine.  We are ready to finish things up and look for
7938      * optimizations. */
7939 
7940     /* Update the string to compile, with correct modifiers, etc */
7941     set_regex_pv(pRExC_state, Rx);
7942 
7943     RExC_rx->nparens = RExC_total_parens - 1;
7944 
7945     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7946     if (RExC_whilem_seen > 15)
7947         RExC_whilem_seen = 15;
7948 
7949     DEBUG_PARSE_r({
7950         Perl_re_printf( aTHX_
7951             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7952         RExC_lastnum=0;
7953         RExC_lastparse=NULL;
7954     });
7955 
7956 #ifdef RE_TRACK_PATTERN_OFFSETS
7957     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7958                           "%s %" UVuf " bytes for offset annotations.\n",
7959                           RExC_offsets ? "Got" : "Couldn't get",
7960                           (UV)((RExC_offsets[0] * 2 + 1))));
7961     DEBUG_OFFSETS_r(if (RExC_offsets) {
7962         const STRLEN len = RExC_offsets[0];
7963         STRLEN i;
7964         DECLARE_AND_GET_RE_DEBUG_FLAGS;
7965         Perl_re_printf( aTHX_
7966                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7967         for (i = 1; i <= len; i++) {
7968             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7969                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7970                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7971         }
7972         Perl_re_printf( aTHX_  "\n");
7973     });
7974 
7975 #else
7976     SetProgLen(RExC_rxi,RExC_size);
7977 #endif
7978 
7979     DEBUG_DUMP_PRE_OPTIMIZE_r({
7980         SV * const sv = sv_newmortal();
7981         RXi_GET_DECL(RExC_rx, ri);
7982         DEBUG_RExC_seen();
7983         Perl_re_printf( aTHX_ "Program before optimization:\n");
7984 
7985         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7986                         sv, 0, 0);
7987     });
7988 
7989     DEBUG_OPTIMISE_r(
7990         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7991     );
7992 
7993     /* XXXX To minimize changes to RE engine we always allocate
7994        3-units-long substrs field. */
7995     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7996     if (RExC_recurse_count) {
7997         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7998         SAVEFREEPV(RExC_recurse);
7999     }
8000 
8001     if (RExC_seen & REG_RECURSE_SEEN) {
8002         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8003          * So its 1 if there are no parens. */
8004         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8005                                          ((RExC_total_parens & 0x07) != 0);
8006         Newx(RExC_study_chunk_recursed,
8007              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8008         SAVEFREEPV(RExC_study_chunk_recursed);
8009     }
8010 
8011   reStudy:
8012     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8013     DEBUG_r(
8014         RExC_study_chunk_recursed_count= 0;
8015     );
8016     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8017     if (RExC_study_chunk_recursed) {
8018         Zero(RExC_study_chunk_recursed,
8019              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8020     }
8021 
8022 
8023 #ifdef TRIE_STUDY_OPT
8024     if (!restudied) {
8025         StructCopy(&zero_scan_data, &data, scan_data_t);
8026         copyRExC_state = RExC_state;
8027     } else {
8028         U32 seen=RExC_seen;
8029         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8030 
8031         RExC_state = copyRExC_state;
8032         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8033             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8034         else
8035             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8036 	StructCopy(&zero_scan_data, &data, scan_data_t);
8037     }
8038 #else
8039     StructCopy(&zero_scan_data, &data, scan_data_t);
8040 #endif
8041 
8042     /* Dig out information for optimizations. */
8043     RExC_rx->extflags = RExC_flags; /* was pm_op */
8044     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8045 
8046     if (UTF)
8047 	SvUTF8_on(Rx);	/* Unicode in it? */
8048     RExC_rxi->regstclass = NULL;
8049     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
8050 	RExC_rx->intflags |= PREGf_NAUGHTY;
8051     scan = RExC_rxi->program + 1;		/* First BRANCH. */
8052 
8053     /* testing for BRANCH here tells us whether there is "must appear"
8054        data in the pattern. If there is then we can use it for optimisations */
8055     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
8056                                                   */
8057 	SSize_t fake;
8058 	STRLEN longest_length[2];
8059 	regnode_ssc ch_class; /* pointed to by data */
8060 	int stclass_flag;
8061 	SSize_t last_close = 0; /* pointed to by data */
8062         regnode *first= scan;
8063         regnode *first_next= regnext(first);
8064         int i;
8065 
8066 	/*
8067 	 * Skip introductions and multiplicators >= 1
8068 	 * so that we can extract the 'meat' of the pattern that must
8069 	 * match in the large if() sequence following.
8070 	 * NOTE that EXACT is NOT covered here, as it is normally
8071 	 * picked up by the optimiser separately.
8072 	 *
8073 	 * This is unfortunate as the optimiser isnt handling lookahead
8074 	 * properly currently.
8075 	 *
8076 	 */
8077 	while ((OP(first) == OPEN && (sawopen = 1)) ||
8078 	       /* An OR of *one* alternative - should not happen now. */
8079 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8080 	    /* for now we can't handle lookbehind IFMATCH*/
8081 	    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8082 	    (OP(first) == PLUS) ||
8083 	    (OP(first) == MINMOD) ||
8084 	       /* An {n,m} with n>0 */
8085 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8086 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8087 	{
8088 		/*
8089 		 * the only op that could be a regnode is PLUS, all the rest
8090 		 * will be regnode_1 or regnode_2.
8091 		 *
8092                  * (yves doesn't think this is true)
8093 		 */
8094 		if (OP(first) == PLUS)
8095 		    sawplus = 1;
8096                 else {
8097                     if (OP(first) == MINMOD)
8098                         sawminmod = 1;
8099 		    first += regarglen[OP(first)];
8100                 }
8101 		first = NEXTOPER(first);
8102 		first_next= regnext(first);
8103 	}
8104 
8105 	/* Starting-point info. */
8106       again:
8107         DEBUG_PEEP("first:", first, 0, 0);
8108         /* Ignore EXACT as we deal with it later. */
8109 	if (PL_regkind[OP(first)] == EXACT) {
8110 	    if (! isEXACTFish(OP(first))) {
8111 		NOOP;	/* Empty, get anchored substr later. */
8112             }
8113 	    else
8114 		RExC_rxi->regstclass = first;
8115 	}
8116 #ifdef TRIE_STCLASS
8117 	else if (PL_regkind[OP(first)] == TRIE &&
8118 	        ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8119 	{
8120             /* this can happen only on restudy */
8121             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8122 	}
8123 #endif
8124 	else if (REGNODE_SIMPLE(OP(first)))
8125 	    RExC_rxi->regstclass = first;
8126 	else if (PL_regkind[OP(first)] == BOUND ||
8127 		 PL_regkind[OP(first)] == NBOUND)
8128 	    RExC_rxi->regstclass = first;
8129 	else if (PL_regkind[OP(first)] == BOL) {
8130             RExC_rx->intflags |= (OP(first) == MBOL
8131                            ? PREGf_ANCH_MBOL
8132                            : PREGf_ANCH_SBOL);
8133 	    first = NEXTOPER(first);
8134 	    goto again;
8135 	}
8136 	else if (OP(first) == GPOS) {
8137             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8138 	    first = NEXTOPER(first);
8139 	    goto again;
8140 	}
8141 	else if ((!sawopen || !RExC_sawback) &&
8142             !sawlookahead &&
8143 	    (OP(first) == STAR &&
8144 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8145             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8146 	{
8147 	    /* turn .* into ^.* with an implied $*=1 */
8148 	    const int type =
8149 		(OP(NEXTOPER(first)) == REG_ANY)
8150                     ? PREGf_ANCH_MBOL
8151                     : PREGf_ANCH_SBOL;
8152             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8153 	    first = NEXTOPER(first);
8154 	    goto again;
8155 	}
8156         if (sawplus && !sawminmod && !sawlookahead
8157             && (!sawopen || !RExC_sawback)
8158 	    && !pRExC_state->code_blocks) /* May examine pos and $& */
8159 	    /* x+ must match at the 1st pos of run of x's */
8160 	    RExC_rx->intflags |= PREGf_SKIP;
8161 
8162 	/* Scan is after the zeroth branch, first is atomic matcher. */
8163 #ifdef TRIE_STUDY_OPT
8164 	DEBUG_PARSE_r(
8165 	    if (!restudied)
8166                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8167 			      (IV)(first - scan + 1))
8168         );
8169 #else
8170 	DEBUG_PARSE_r(
8171             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8172 	        (IV)(first - scan + 1))
8173         );
8174 #endif
8175 
8176 
8177 	/*
8178 	* If there's something expensive in the r.e., find the
8179 	* longest literal string that must appear and make it the
8180 	* regmust.  Resolve ties in favor of later strings, since
8181 	* the regstart check works with the beginning of the r.e.
8182 	* and avoiding duplication strengthens checking.  Not a
8183 	* strong reason, but sufficient in the absence of others.
8184 	* [Now we resolve ties in favor of the earlier string if
8185 	* it happens that c_offset_min has been invalidated, since the
8186 	* earlier string may buy us something the later one won't.]
8187 	*/
8188 
8189 	data.substrs[0].str = newSVpvs("");
8190 	data.substrs[1].str = newSVpvs("");
8191 	data.last_found = newSVpvs("");
8192 	data.cur_is_floating = 0; /* initially any found substring is fixed */
8193 	ENTER_with_name("study_chunk");
8194 	SAVEFREESV(data.substrs[0].str);
8195 	SAVEFREESV(data.substrs[1].str);
8196 	SAVEFREESV(data.last_found);
8197 	first = scan;
8198 	if (!RExC_rxi->regstclass) {
8199 	    ssc_init(pRExC_state, &ch_class);
8200 	    data.start_class = &ch_class;
8201 	    stclass_flag = SCF_DO_STCLASS_AND;
8202 	} else				/* XXXX Check for BOUND? */
8203 	    stclass_flag = 0;
8204 	data.last_closep = &last_close;
8205 
8206         DEBUG_RExC_seen();
8207         /*
8208          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8209          * (NO top level branches)
8210          */
8211 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8212                              scan + RExC_size, /* Up to end */
8213             &data, -1, 0, NULL,
8214             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8215                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8216             0, TRUE);
8217 
8218 
8219         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8220 
8221 
8222 	if ( RExC_total_parens == 1 && !data.cur_is_floating
8223 	     && data.last_start_min == 0 && data.last_end > 0
8224 	     && !RExC_seen_zerolen
8225              && !(RExC_seen & REG_VERBARG_SEEN)
8226              && !(RExC_seen & REG_GPOS_SEEN)
8227         ){
8228 	    RExC_rx->extflags |= RXf_CHECK_ALL;
8229         }
8230 	scan_commit(pRExC_state, &data,&minlen, 0);
8231 
8232 
8233         /* XXX this is done in reverse order because that's the way the
8234          * code was before it was parameterised. Don't know whether it
8235          * actually needs doing in reverse order. DAPM */
8236         for (i = 1; i >= 0; i--) {
8237             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8238 
8239             if (   !(   i
8240                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8241                      &&    data.substrs[0].min_offset
8242                         == data.substrs[1].min_offset
8243                      &&    SvCUR(data.substrs[0].str)
8244                         == SvCUR(data.substrs[1].str)
8245                     )
8246                 && S_setup_longest (aTHX_ pRExC_state,
8247                                         &(RExC_rx->substrs->data[i]),
8248                                         &(data.substrs[i]),
8249                                         longest_length[i]))
8250             {
8251                 RExC_rx->substrs->data[i].min_offset =
8252                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8253 
8254                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8255                 /* Don't offset infinity */
8256                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8257                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8258                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8259             }
8260             else {
8261                 RExC_rx->substrs->data[i].substr      = NULL;
8262                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8263                 longest_length[i] = 0;
8264             }
8265         }
8266 
8267 	LEAVE_with_name("study_chunk");
8268 
8269 	if (RExC_rxi->regstclass
8270 	    && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8271 	    RExC_rxi->regstclass = NULL;
8272 
8273 	if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8274               || RExC_rx->substrs->data[0].min_offset)
8275 	    && stclass_flag
8276             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8277 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8278 	{
8279 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8280 
8281             ssc_finalize(pRExC_state, data.start_class);
8282 
8283 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8284 	    StructCopy(data.start_class,
8285 		       (regnode_ssc*)RExC_rxi->data->data[n],
8286 		       regnode_ssc);
8287 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8288 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8289 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8290                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8291                       Perl_re_printf( aTHX_
8292 				    "synthetic stclass \"%s\".\n",
8293 				    SvPVX_const(sv));});
8294             data.start_class = NULL;
8295 	}
8296 
8297         /* A temporary algorithm prefers floated substr to fixed one of
8298          * same length to dig more info. */
8299 	i = (longest_length[0] <= longest_length[1]);
8300         RExC_rx->substrs->check_ix = i;
8301         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8302         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8303         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8304         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8305         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8306         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8307             RExC_rx->intflags |= PREGf_NOSCAN;
8308 
8309 	if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8310 	    RExC_rx->extflags |= RXf_USE_INTUIT;
8311 	    if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8312 		RExC_rx->extflags |= RXf_INTUIT_TAIL;
8313 	}
8314 
8315 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8316 	if ( (STRLEN)minlen < longest_length[1] )
8317             minlen= longest_length[1];
8318         if ( (STRLEN)minlen < longest_length[0] )
8319             minlen= longest_length[0];
8320         */
8321     }
8322     else {
8323 	/* Several toplevels. Best we can is to set minlen. */
8324 	SSize_t fake;
8325 	regnode_ssc ch_class;
8326 	SSize_t last_close = 0;
8327 
8328         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8329 
8330 	scan = RExC_rxi->program + 1;
8331 	ssc_init(pRExC_state, &ch_class);
8332 	data.start_class = &ch_class;
8333 	data.last_closep = &last_close;
8334 
8335         DEBUG_RExC_seen();
8336         /*
8337          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8338          * (patterns WITH top level branches)
8339          */
8340 	minlen = study_chunk(pRExC_state,
8341             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8342             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8343                                                       ? SCF_TRIE_DOING_RESTUDY
8344                                                       : 0),
8345             0, TRUE);
8346 
8347         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8348 
8349 	RExC_rx->check_substr = NULL;
8350         RExC_rx->check_utf8 = NULL;
8351         RExC_rx->substrs->data[0].substr      = NULL;
8352         RExC_rx->substrs->data[0].utf8_substr = NULL;
8353         RExC_rx->substrs->data[1].substr      = NULL;
8354         RExC_rx->substrs->data[1].utf8_substr = NULL;
8355 
8356         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8357 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8358         {
8359 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8360 
8361             ssc_finalize(pRExC_state, data.start_class);
8362 
8363 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8364 	    StructCopy(data.start_class,
8365 		       (regnode_ssc*)RExC_rxi->data->data[n],
8366 		       regnode_ssc);
8367 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8368 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8369 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8370                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8371                       Perl_re_printf( aTHX_
8372 				    "synthetic stclass \"%s\".\n",
8373 				    SvPVX_const(sv));});
8374             data.start_class = NULL;
8375 	}
8376     }
8377 
8378     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8379         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8380         RExC_rx->maxlen = REG_INFTY;
8381     }
8382     else {
8383         RExC_rx->maxlen = RExC_maxlen;
8384     }
8385 
8386     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8387        the "real" pattern. */
8388     DEBUG_OPTIMISE_r({
8389         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8390                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8391     });
8392     RExC_rx->minlenret = minlen;
8393     if (RExC_rx->minlen < minlen)
8394         RExC_rx->minlen = minlen;
8395 
8396     if (RExC_seen & REG_RECURSE_SEEN ) {
8397         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8398         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8399     }
8400     if (RExC_seen & REG_GPOS_SEEN)
8401         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8402     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8403         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8404                                                 lookbehind */
8405     if (pRExC_state->code_blocks)
8406 	RExC_rx->extflags |= RXf_EVAL_SEEN;
8407     if (RExC_seen & REG_VERBARG_SEEN)
8408     {
8409 	RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8410         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8411     }
8412     if (RExC_seen & REG_CUTGROUP_SEEN)
8413 	RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8414     if (pm_flags & PMf_USE_RE_EVAL)
8415 	RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8416     if (RExC_paren_names)
8417         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8418     else
8419         RXp_PAREN_NAMES(RExC_rx) = NULL;
8420 
8421     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8422      * so it can be used in pp.c */
8423     if (RExC_rx->intflags & PREGf_ANCH)
8424         RExC_rx->extflags |= RXf_IS_ANCHORED;
8425 
8426 
8427     {
8428         /* this is used to identify "special" patterns that might result
8429          * in Perl NOT calling the regex engine and instead doing the match "itself",
8430          * particularly special cases in split//. By having the regex compiler
8431          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8432          * we avoid weird issues with equivalent patterns resulting in different behavior,
8433          * AND we allow non Perl engines to get the same optimizations by the setting the
8434          * flags appropriately - Yves */
8435         regnode *first = RExC_rxi->program + 1;
8436         U8 fop = OP(first);
8437         regnode *next = NEXTOPER(first);
8438         /* It's safe to read through *next only if OP(first) is a regop of
8439          * the right type (not EXACT, for example).
8440          */
8441         U8 nop = (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS)
8442                 ? OP(next) : 0;
8443 
8444         if (PL_regkind[fop] == NOTHING && nop == END)
8445             RExC_rx->extflags |= RXf_NULL;
8446         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8447             /* when fop is SBOL first->flags will be true only when it was
8448              * produced by parsing /\A/, and not when parsing /^/. This is
8449              * very important for the split code as there we want to
8450              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8451              * See rt #122761 for more details. -- Yves */
8452             RExC_rx->extflags |= RXf_START_ONLY;
8453         else if (fop == PLUS
8454                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8455                  && OP(regnext(first)) == END)
8456             RExC_rx->extflags |= RXf_WHITE;
8457         else if ( RExC_rx->extflags & RXf_SPLIT
8458                   && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop))
8459                   && STR_LEN(first) == 1
8460                   && *(STRING(first)) == ' '
8461                   && OP(regnext(first)) == END )
8462             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8463 
8464     }
8465 
8466     if (RExC_contains_locale) {
8467         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8468     }
8469 
8470 #ifdef DEBUGGING
8471     if (RExC_paren_names) {
8472         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8473         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8474                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8475     } else
8476 #endif
8477     RExC_rxi->name_list_idx = 0;
8478 
8479     while ( RExC_recurse_count > 0 ) {
8480         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8481         /*
8482          * This data structure is set up in study_chunk() and is used
8483          * to calculate the distance between a GOSUB regopcode and
8484          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8485          * it refers to.
8486          *
8487          * If for some reason someone writes code that optimises
8488          * away a GOSUB opcode then the assert should be changed to
8489          * an if(scan) to guard the ARG2L_SET() - Yves
8490          *
8491          */
8492         assert(scan && OP(scan) == GOSUB);
8493         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8494     }
8495 
8496     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8497     /* assume we don't need to swap parens around before we match */
8498     DEBUG_TEST_r({
8499         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8500             (unsigned long)RExC_study_chunk_recursed_count);
8501     });
8502     DEBUG_DUMP_r({
8503         DEBUG_RExC_seen();
8504         Perl_re_printf( aTHX_ "Final program:\n");
8505         regdump(RExC_rx);
8506     });
8507 
8508     if (RExC_open_parens) {
8509         Safefree(RExC_open_parens);
8510         RExC_open_parens = NULL;
8511     }
8512     if (RExC_close_parens) {
8513         Safefree(RExC_close_parens);
8514         RExC_close_parens = NULL;
8515     }
8516 
8517 #ifdef USE_ITHREADS
8518     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8519      * by setting the regexp SV to readonly-only instead. If the
8520      * pattern's been recompiled, the USEDness should remain. */
8521     if (old_re && SvREADONLY(old_re))
8522         SvREADONLY_on(Rx);
8523 #endif
8524     return Rx;
8525 }
8526 
8527 
8528 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)8529 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8530                     const U32 flags)
8531 {
8532     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8533 
8534     PERL_UNUSED_ARG(value);
8535 
8536     if (flags & RXapif_FETCH) {
8537         return reg_named_buff_fetch(rx, key, flags);
8538     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8539         Perl_croak_no_modify();
8540         return NULL;
8541     } else if (flags & RXapif_EXISTS) {
8542         return reg_named_buff_exists(rx, key, flags)
8543             ? &PL_sv_yes
8544             : &PL_sv_no;
8545     } else if (flags & RXapif_REGNAMES) {
8546         return reg_named_buff_all(rx, flags);
8547     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8548         return reg_named_buff_scalar(rx, flags);
8549     } else {
8550         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8551         return NULL;
8552     }
8553 }
8554 
8555 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)8556 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8557                          const U32 flags)
8558 {
8559     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8560     PERL_UNUSED_ARG(lastkey);
8561 
8562     if (flags & RXapif_FIRSTKEY)
8563         return reg_named_buff_firstkey(rx, flags);
8564     else if (flags & RXapif_NEXTKEY)
8565         return reg_named_buff_nextkey(rx, flags);
8566     else {
8567         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8568                                             (int)flags);
8569         return NULL;
8570     }
8571 }
8572 
8573 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)8574 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8575 			  const U32 flags)
8576 {
8577     SV *ret;
8578     struct regexp *const rx = ReANY(r);
8579 
8580     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8581 
8582     if (rx && RXp_PAREN_NAMES(rx)) {
8583         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8584         if (he_str) {
8585             IV i;
8586             SV* sv_dat=HeVAL(he_str);
8587             I32 *nums=(I32*)SvPVX(sv_dat);
8588             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8589             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8590                 if ((I32)(rx->nparens) >= nums[i]
8591                     && rx->offs[nums[i]].start != -1
8592                     && rx->offs[nums[i]].end != -1)
8593                 {
8594                     ret = newSVpvs("");
8595                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8596                     if (!retarray)
8597                         return ret;
8598                 } else {
8599                     if (retarray)
8600                         ret = newSVsv(&PL_sv_undef);
8601                 }
8602                 if (retarray)
8603                     av_push(retarray, ret);
8604             }
8605             if (retarray)
8606                 return newRV_noinc(MUTABLE_SV(retarray));
8607         }
8608     }
8609     return NULL;
8610 }
8611 
8612 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)8613 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8614                            const U32 flags)
8615 {
8616     struct regexp *const rx = ReANY(r);
8617 
8618     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8619 
8620     if (rx && RXp_PAREN_NAMES(rx)) {
8621         if (flags & RXapif_ALL) {
8622             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8623         } else {
8624 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8625             if (sv) {
8626 		SvREFCNT_dec_NN(sv);
8627                 return TRUE;
8628             } else {
8629                 return FALSE;
8630             }
8631         }
8632     } else {
8633         return FALSE;
8634     }
8635 }
8636 
8637 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)8638 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8639 {
8640     struct regexp *const rx = ReANY(r);
8641 
8642     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8643 
8644     if ( rx && RXp_PAREN_NAMES(rx) ) {
8645 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
8646 
8647 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8648     } else {
8649 	return FALSE;
8650     }
8651 }
8652 
8653 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)8654 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8655 {
8656     struct regexp *const rx = ReANY(r);
8657     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8658 
8659     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8660 
8661     if (rx && RXp_PAREN_NAMES(rx)) {
8662         HV *hv = RXp_PAREN_NAMES(rx);
8663         HE *temphe;
8664         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8665             IV i;
8666             IV parno = 0;
8667             SV* sv_dat = HeVAL(temphe);
8668             I32 *nums = (I32*)SvPVX(sv_dat);
8669             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8670                 if ((I32)(rx->lastparen) >= nums[i] &&
8671                     rx->offs[nums[i]].start != -1 &&
8672                     rx->offs[nums[i]].end != -1)
8673                 {
8674                     parno = nums[i];
8675                     break;
8676                 }
8677             }
8678             if (parno || flags & RXapif_ALL) {
8679 		return newSVhek(HeKEY_hek(temphe));
8680             }
8681         }
8682     }
8683     return NULL;
8684 }
8685 
8686 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)8687 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8688 {
8689     SV *ret;
8690     AV *av;
8691     SSize_t length;
8692     struct regexp *const rx = ReANY(r);
8693 
8694     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8695 
8696     if (rx && RXp_PAREN_NAMES(rx)) {
8697         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8698             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8699         } else if (flags & RXapif_ONE) {
8700             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8701             av = MUTABLE_AV(SvRV(ret));
8702             length = av_count(av);
8703 	    SvREFCNT_dec_NN(ret);
8704             return newSViv(length);
8705         } else {
8706             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8707                                                 (int)flags);
8708             return NULL;
8709         }
8710     }
8711     return &PL_sv_undef;
8712 }
8713 
8714 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)8715 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8716 {
8717     struct regexp *const rx = ReANY(r);
8718     AV *av = newAV();
8719 
8720     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8721 
8722     if (rx && RXp_PAREN_NAMES(rx)) {
8723         HV *hv= RXp_PAREN_NAMES(rx);
8724         HE *temphe;
8725         (void)hv_iterinit(hv);
8726         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8727             IV i;
8728             IV parno = 0;
8729             SV* sv_dat = HeVAL(temphe);
8730             I32 *nums = (I32*)SvPVX(sv_dat);
8731             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8732                 if ((I32)(rx->lastparen) >= nums[i] &&
8733                     rx->offs[nums[i]].start != -1 &&
8734                     rx->offs[nums[i]].end != -1)
8735                 {
8736                     parno = nums[i];
8737                     break;
8738                 }
8739             }
8740             if (parno || flags & RXapif_ALL) {
8741                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8742             }
8743         }
8744     }
8745 
8746     return newRV_noinc(MUTABLE_SV(av));
8747 }
8748 
8749 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r,const I32 paren,SV * const sv)8750 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8751 			     SV * const sv)
8752 {
8753     struct regexp *const rx = ReANY(r);
8754     char *s = NULL;
8755     SSize_t i = 0;
8756     SSize_t s1, t1;
8757     I32 n = paren;
8758 
8759     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8760 
8761     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8762            || n == RX_BUFF_IDX_CARET_FULLMATCH
8763            || n == RX_BUFF_IDX_CARET_POSTMATCH
8764        )
8765     {
8766         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8767         if (!keepcopy) {
8768             /* on something like
8769              *    $r = qr/.../;
8770              *    /$qr/p;
8771              * the KEEPCOPY is set on the PMOP rather than the regex */
8772             if (PL_curpm && r == PM_GETRE(PL_curpm))
8773                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8774         }
8775         if (!keepcopy)
8776             goto ret_undef;
8777     }
8778 
8779     if (!rx->subbeg)
8780         goto ret_undef;
8781 
8782     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8783         /* no need to distinguish between them any more */
8784         n = RX_BUFF_IDX_FULLMATCH;
8785 
8786     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8787         && rx->offs[0].start != -1)
8788     {
8789         /* $`, ${^PREMATCH} */
8790 	i = rx->offs[0].start;
8791 	s = rx->subbeg;
8792     }
8793     else
8794     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8795         && rx->offs[0].end != -1)
8796     {
8797         /* $', ${^POSTMATCH} */
8798 	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8799 	i = rx->sublen + rx->suboffset - rx->offs[0].end;
8800     }
8801     else
8802     if (inRANGE(n, 0, (I32)rx->nparens) &&
8803         (s1 = rx->offs[n].start) != -1  &&
8804         (t1 = rx->offs[n].end) != -1)
8805     {
8806         /* $&, ${^MATCH},  $1 ... */
8807         i = t1 - s1;
8808         s = rx->subbeg + s1 - rx->suboffset;
8809     } else {
8810         goto ret_undef;
8811     }
8812 
8813     assert(s >= rx->subbeg);
8814     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8815     if (i >= 0) {
8816 #ifdef NO_TAINT_SUPPORT
8817         sv_setpvn(sv, s, i);
8818 #else
8819         const int oldtainted = TAINT_get;
8820         TAINT_NOT;
8821         sv_setpvn(sv, s, i);
8822         TAINT_set(oldtainted);
8823 #endif
8824         if (RXp_MATCH_UTF8(rx))
8825             SvUTF8_on(sv);
8826         else
8827             SvUTF8_off(sv);
8828         if (TAINTING_get) {
8829             if (RXp_MATCH_TAINTED(rx)) {
8830                 if (SvTYPE(sv) >= SVt_PVMG) {
8831                     MAGIC* const mg = SvMAGIC(sv);
8832                     MAGIC* mgt;
8833                     TAINT;
8834                     SvMAGIC_set(sv, mg->mg_moremagic);
8835                     SvTAINT(sv);
8836                     if ((mgt = SvMAGIC(sv))) {
8837                         mg->mg_moremagic = mgt;
8838                         SvMAGIC_set(sv, mg);
8839                     }
8840                 } else {
8841                     TAINT;
8842                     SvTAINT(sv);
8843                 }
8844             } else
8845                 SvTAINTED_off(sv);
8846         }
8847     } else {
8848       ret_undef:
8849         sv_set_undef(sv);
8850         return;
8851     }
8852 }
8853 
8854 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)8855 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8856 							 SV const * const value)
8857 {
8858     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8859 
8860     PERL_UNUSED_ARG(rx);
8861     PERL_UNUSED_ARG(paren);
8862     PERL_UNUSED_ARG(value);
8863 
8864     if (!PL_localizing)
8865         Perl_croak_no_modify();
8866 }
8867 
8868 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)8869 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8870                               const I32 paren)
8871 {
8872     struct regexp *const rx = ReANY(r);
8873     I32 i;
8874     I32 s1, t1;
8875 
8876     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8877 
8878     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8879         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8880         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8881     )
8882     {
8883         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8884         if (!keepcopy) {
8885             /* on something like
8886              *    $r = qr/.../;
8887              *    /$qr/p;
8888              * the KEEPCOPY is set on the PMOP rather than the regex */
8889             if (PL_curpm && r == PM_GETRE(PL_curpm))
8890                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8891         }
8892         if (!keepcopy)
8893             goto warn_undef;
8894     }
8895 
8896     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8897     switch (paren) {
8898       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8899       case RX_BUFF_IDX_PREMATCH:       /* $` */
8900         if (rx->offs[0].start != -1) {
8901 			i = rx->offs[0].start;
8902 			if (i > 0) {
8903 				s1 = 0;
8904 				t1 = i;
8905 				goto getlen;
8906 			}
8907 	    }
8908         return 0;
8909 
8910       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8911       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8912 	    if (rx->offs[0].end != -1) {
8913 			i = rx->sublen - rx->offs[0].end;
8914 			if (i > 0) {
8915 				s1 = rx->offs[0].end;
8916 				t1 = rx->sublen;
8917 				goto getlen;
8918 			}
8919 	    }
8920         return 0;
8921 
8922       default: /* $& / ${^MATCH}, $1, $2, ... */
8923 	    if (paren <= (I32)rx->nparens &&
8924             (s1 = rx->offs[paren].start) != -1 &&
8925             (t1 = rx->offs[paren].end) != -1)
8926 	    {
8927             i = t1 - s1;
8928             goto getlen;
8929         } else {
8930           warn_undef:
8931             if (ckWARN(WARN_UNINITIALIZED))
8932                 report_uninit((const SV *)sv);
8933             return 0;
8934         }
8935     }
8936   getlen:
8937     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8938         const char * const s = rx->subbeg - rx->suboffset + s1;
8939         const U8 *ep;
8940         STRLEN el;
8941 
8942         i = t1 - s1;
8943         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8944             i = el;
8945     }
8946     return i;
8947 }
8948 
8949 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)8950 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8951 {
8952     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8953 	PERL_UNUSED_ARG(rx);
8954 	if (0)
8955 	    return NULL;
8956 	else
8957 	    return newSVpvs("Regexp");
8958 }
8959 
8960 /* Scans the name of a named buffer from the pattern.
8961  * If flags is REG_RSN_RETURN_NULL returns null.
8962  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8963  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8964  * to the parsed name as looked up in the RExC_paren_names hash.
8965  * If there is an error throws a vFAIL().. type exception.
8966  */
8967 
8968 #define REG_RSN_RETURN_NULL    0
8969 #define REG_RSN_RETURN_NAME    1
8970 #define REG_RSN_RETURN_DATA    2
8971 
8972 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)8973 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8974 {
8975     char *name_start = RExC_parse;
8976     SV* sv_name;
8977 
8978     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8979 
8980     assert (RExC_parse <= RExC_end);
8981     if (RExC_parse == RExC_end) NOOP;
8982     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8983          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8984           * using do...while */
8985 	if (UTF)
8986 	    do {
8987 		RExC_parse += UTF8SKIP(RExC_parse);
8988 	    } while (   RExC_parse < RExC_end
8989                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8990 	else
8991 	    do {
8992 		RExC_parse++;
8993 	    } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8994     } else {
8995         RExC_parse++; /* so the <- from the vFAIL is after the offending
8996                          character */
8997         vFAIL("Group name must start with a non-digit word character");
8998     }
8999     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9000 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9001     if ( flags == REG_RSN_RETURN_NAME)
9002         return sv_name;
9003     else if (flags==REG_RSN_RETURN_DATA) {
9004         HE *he_str = NULL;
9005         SV *sv_dat = NULL;
9006         if ( ! sv_name )      /* should not happen*/
9007             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9008         if (RExC_paren_names)
9009             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9010         if ( he_str )
9011             sv_dat = HeVAL(he_str);
9012         if ( ! sv_dat ) {   /* Didn't find group */
9013 
9014             /* It might be a forward reference; we can't fail until we
9015                 * know, by completing the parse to get all the groups, and
9016                 * then reparsing */
9017             if (ALL_PARENS_COUNTED)  {
9018                 vFAIL("Reference to nonexistent named group");
9019             }
9020             else {
9021                 REQUIRE_PARENS_PASS;
9022             }
9023         }
9024         return sv_dat;
9025     }
9026 
9027     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9028                      (unsigned long) flags);
9029 }
9030 
9031 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
9032     if (RExC_lastparse!=RExC_parse) {                           \
9033         Perl_re_printf( aTHX_  "%s",                            \
9034             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
9035                 RExC_end - RExC_parse, 16,                      \
9036                 "", "",                                         \
9037                 PERL_PV_ESCAPE_UNI_DETECT |                     \
9038                 PERL_PV_PRETTY_ELLIPSES   |                     \
9039                 PERL_PV_PRETTY_LTGT       |                     \
9040                 PERL_PV_ESCAPE_RE         |                     \
9041                 PERL_PV_PRETTY_EXACTSIZE                        \
9042             )                                                   \
9043         );                                                      \
9044     } else                                                      \
9045         Perl_re_printf( aTHX_ "%16s","");                       \
9046                                                                 \
9047     if (RExC_lastnum!=RExC_emit)                                \
9048        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
9049     else                                                        \
9050        Perl_re_printf( aTHX_ "|%4s","");                        \
9051     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
9052         (int)((depth*2)), "",                                   \
9053         (funcname)                                              \
9054     );                                                          \
9055     RExC_lastnum=RExC_emit;                                     \
9056     RExC_lastparse=RExC_parse;                                  \
9057 })
9058 
9059 
9060 
9061 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
9062     DEBUG_PARSE_MSG((funcname));                            \
9063     Perl_re_printf( aTHX_ "%4s","\n");                                  \
9064 })
9065 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
9066     DEBUG_PARSE_MSG((funcname));                            \
9067     Perl_re_printf( aTHX_ fmt "\n",args);                               \
9068 })
9069 
9070 /* This section of code defines the inversion list object and its methods.  The
9071  * interfaces are highly subject to change, so as much as possible is static to
9072  * this file.  An inversion list is here implemented as a malloc'd C UV array
9073  * as an SVt_INVLIST scalar.
9074  *
9075  * An inversion list for Unicode is an array of code points, sorted by ordinal
9076  * number.  Each element gives the code point that begins a range that extends
9077  * up-to but not including the code point given by the next element.  The final
9078  * element gives the first code point of a range that extends to the platform's
9079  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
9080  * ...) give ranges whose code points are all in the inversion list.  We say
9081  * that those ranges are in the set.  The odd-numbered elements give ranges
9082  * whose code points are not in the inversion list, and hence not in the set.
9083  * Thus, element [0] is the first code point in the list.  Element [1]
9084  * is the first code point beyond that not in the list; and element [2] is the
9085  * first code point beyond that that is in the list.  In other words, the first
9086  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9087  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
9088  * all code points in that range are not in the inversion list.  The third
9089  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9090  * list, and so forth.  Thus every element whose index is divisible by two
9091  * gives the beginning of a range that is in the list, and every element whose
9092  * index is not divisible by two gives the beginning of a range not in the
9093  * list.  If the final element's index is divisible by two, the inversion list
9094  * extends to the platform's infinity; otherwise the highest code point in the
9095  * inversion list is the contents of that element minus 1.
9096  *
9097  * A range that contains just a single code point N will look like
9098  *  invlist[i]   == N
9099  *  invlist[i+1] == N+1
9100  *
9101  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9102  * impossible to represent, so element [i+1] is omitted.  The single element
9103  * inversion list
9104  *  invlist[0] == UV_MAX
9105  * contains just UV_MAX, but is interpreted as matching to infinity.
9106  *
9107  * Taking the complement (inverting) an inversion list is quite simple, if the
9108  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9109  * This implementation reserves an element at the beginning of each inversion
9110  * list to always contain 0; there is an additional flag in the header which
9111  * indicates if the list begins at the 0, or is offset to begin at the next
9112  * element.  This means that the inversion list can be inverted without any
9113  * copying; just flip the flag.
9114  *
9115  * More about inversion lists can be found in "Unicode Demystified"
9116  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9117  *
9118  * The inversion list data structure is currently implemented as an SV pointing
9119  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9120  * array of UV whose memory management is automatically handled by the existing
9121  * facilities for SV's.
9122  *
9123  * Some of the methods should always be private to the implementation, and some
9124  * should eventually be made public */
9125 
9126 /* The header definitions are in F<invlist_inline.h> */
9127 
9128 #ifndef PERL_IN_XSUB_RE
9129 
9130 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)9131 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9132 {
9133     /* Returns a pointer to the first element in the inversion list's array.
9134      * This is called upon initialization of an inversion list.  Where the
9135      * array begins depends on whether the list has the code point U+0000 in it
9136      * or not.  The other parameter tells it whether the code that follows this
9137      * call is about to put a 0 in the inversion list or not.  The first
9138      * element is either the element reserved for 0, if TRUE, or the element
9139      * after it, if FALSE */
9140 
9141     bool* offset = get_invlist_offset_addr(invlist);
9142     UV* zero_addr = (UV *) SvPVX(invlist);
9143 
9144     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9145 
9146     /* Must be empty */
9147     assert(! _invlist_len(invlist));
9148 
9149     *zero_addr = 0;
9150 
9151     /* 1^1 = 0; 1^0 = 1 */
9152     *offset = 1 ^ will_have_0;
9153     return zero_addr + *offset;
9154 }
9155 
9156 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)9157 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9158 {
9159     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9160      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9161      * is similar to what SvSetMagicSV() would do, if it were implemented on
9162      * inversion lists, though this routine avoids a copy */
9163 
9164     const UV src_len          = _invlist_len(src);
9165     const bool src_offset     = *get_invlist_offset_addr(src);
9166     const STRLEN src_byte_len = SvLEN(src);
9167     char * array              = SvPVX(src);
9168 
9169     const int oldtainted = TAINT_get;
9170 
9171     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9172 
9173     assert(is_invlist(src));
9174     assert(is_invlist(dest));
9175     assert(! invlist_is_iterating(src));
9176     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9177 
9178     /* Make sure it ends in the right place with a NUL, as our inversion list
9179      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9180      * asserts it */
9181     array[src_byte_len - 1] = '\0';
9182 
9183     TAINT_NOT;      /* Otherwise it breaks */
9184     sv_usepvn_flags(dest,
9185                     (char *) array,
9186                     src_byte_len - 1,
9187 
9188                     /* This flag is documented to cause a copy to be avoided */
9189                     SV_HAS_TRAILING_NUL);
9190     TAINT_set(oldtainted);
9191     SvPV_set(src, 0);
9192     SvLEN_set(src, 0);
9193     SvCUR_set(src, 0);
9194 
9195     /* Finish up copying over the other fields in an inversion list */
9196     *get_invlist_offset_addr(dest) = src_offset;
9197     invlist_set_len(dest, src_len, src_offset);
9198     *get_invlist_previous_index_addr(dest) = 0;
9199     invlist_iterfinish(dest);
9200 }
9201 
9202 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)9203 S_get_invlist_previous_index_addr(SV* invlist)
9204 {
9205     /* Return the address of the IV that is reserved to hold the cached index
9206      * */
9207     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9208 
9209     assert(is_invlist(invlist));
9210 
9211     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9212 }
9213 
9214 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)9215 S_invlist_previous_index(SV* const invlist)
9216 {
9217     /* Returns cached index of previous search */
9218 
9219     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9220 
9221     return *get_invlist_previous_index_addr(invlist);
9222 }
9223 
9224 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)9225 S_invlist_set_previous_index(SV* const invlist, const IV index)
9226 {
9227     /* Caches <index> for later retrieval */
9228 
9229     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9230 
9231     assert(index == 0 || index < (int) _invlist_len(invlist));
9232 
9233     *get_invlist_previous_index_addr(invlist) = index;
9234 }
9235 
9236 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)9237 S_invlist_trim(SV* invlist)
9238 {
9239     /* Free the not currently-being-used space in an inversion list */
9240 
9241     /* But don't free up the space needed for the 0 UV that is always at the
9242      * beginning of the list, nor the trailing NUL */
9243     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9244 
9245     PERL_ARGS_ASSERT_INVLIST_TRIM;
9246 
9247     assert(is_invlist(invlist));
9248 
9249     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9250 }
9251 
9252 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)9253 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9254 {
9255     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9256 
9257     assert(is_invlist(invlist));
9258 
9259     invlist_set_len(invlist, 0, 0);
9260     invlist_trim(invlist);
9261 }
9262 
9263 #endif /* ifndef PERL_IN_XSUB_RE */
9264 
9265 PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV * const invlist)9266 S_invlist_is_iterating(SV* const invlist)
9267 {
9268     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9269 
9270     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9271 }
9272 
9273 #ifndef PERL_IN_XSUB_RE
9274 
9275 PERL_STATIC_INLINE UV
S_invlist_max(SV * const invlist)9276 S_invlist_max(SV* const invlist)
9277 {
9278     /* Returns the maximum number of elements storable in the inversion list's
9279      * array, without having to realloc() */
9280 
9281     PERL_ARGS_ASSERT_INVLIST_MAX;
9282 
9283     assert(is_invlist(invlist));
9284 
9285     /* Assumes worst case, in which the 0 element is not counted in the
9286      * inversion list, so subtracts 1 for that */
9287     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9288            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9289            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9290 }
9291 
9292 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)9293 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9294 {
9295     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9296 
9297     /* First 1 is in case the zero element isn't in the list; second 1 is for
9298      * trailing NUL */
9299     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9300     invlist_set_len(invlist, 0, 0);
9301 
9302     /* Force iterinit() to be used to get iteration to work */
9303     invlist_iterfinish(invlist);
9304 
9305     *get_invlist_previous_index_addr(invlist) = 0;
9306     SvPOK_on(invlist);  /* This allows B to extract the PV */
9307 }
9308 
9309 SV*
Perl__new_invlist(pTHX_ IV initial_size)9310 Perl__new_invlist(pTHX_ IV initial_size)
9311 {
9312 
9313     /* Return a pointer to a newly constructed inversion list, with enough
9314      * space to store 'initial_size' elements.  If that number is negative, a
9315      * system default is used instead */
9316 
9317     SV* new_list;
9318 
9319     if (initial_size < 0) {
9320 	initial_size = 10;
9321     }
9322 
9323     new_list = newSV_type(SVt_INVLIST);
9324     initialize_invlist_guts(new_list, initial_size);
9325 
9326     return new_list;
9327 }
9328 
9329 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)9330 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9331 {
9332     /* Return a pointer to a newly constructed inversion list, initialized to
9333      * point to <list>, which has to be in the exact correct inversion list
9334      * form, including internal fields.  Thus this is a dangerous routine that
9335      * should not be used in the wrong hands.  The passed in 'list' contains
9336      * several header fields at the beginning that are not part of the
9337      * inversion list body proper */
9338 
9339     const STRLEN length = (STRLEN) list[0];
9340     const UV version_id =          list[1];
9341     const bool offset   =    cBOOL(list[2]);
9342 #define HEADER_LENGTH 3
9343     /* If any of the above changes in any way, you must change HEADER_LENGTH
9344      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9345      *      perl -E 'say int(rand 2**31-1)'
9346      */
9347 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9348                                         data structure type, so that one being
9349                                         passed in can be validated to be an
9350                                         inversion list of the correct vintage.
9351                                        */
9352 
9353     SV* invlist = newSV_type(SVt_INVLIST);
9354 
9355     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9356 
9357     if (version_id != INVLIST_VERSION_ID) {
9358         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9359     }
9360 
9361     /* The generated array passed in includes header elements that aren't part
9362      * of the list proper, so start it just after them */
9363     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9364 
9365     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9366 			       shouldn't touch it */
9367 
9368     *(get_invlist_offset_addr(invlist)) = offset;
9369 
9370     /* The 'length' passed to us is the physical number of elements in the
9371      * inversion list.  But if there is an offset the logical number is one
9372      * less than that */
9373     invlist_set_len(invlist, length  - offset, offset);
9374 
9375     invlist_set_previous_index(invlist, 0);
9376 
9377     /* Initialize the iteration pointer. */
9378     invlist_iterfinish(invlist);
9379 
9380     SvREADONLY_on(invlist);
9381     SvPOK_on(invlist);
9382 
9383     return invlist;
9384 }
9385 
9386 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)9387 S__append_range_to_invlist(pTHX_ SV* const invlist,
9388                                  const UV start, const UV end)
9389 {
9390    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9391     * the end of the inversion list.  The range must be above any existing
9392     * ones. */
9393 
9394     UV* array;
9395     UV max = invlist_max(invlist);
9396     UV len = _invlist_len(invlist);
9397     bool offset;
9398 
9399     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9400 
9401     if (len == 0) { /* Empty lists must be initialized */
9402         offset = start != 0;
9403         array = _invlist_array_init(invlist, ! offset);
9404     }
9405     else {
9406 	/* Here, the existing list is non-empty. The current max entry in the
9407 	 * list is generally the first value not in the set, except when the
9408 	 * set extends to the end of permissible values, in which case it is
9409 	 * the first entry in that final set, and so this call is an attempt to
9410 	 * append out-of-order */
9411 
9412 	UV final_element = len - 1;
9413 	array = invlist_array(invlist);
9414 	if (   array[final_element] > start
9415 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9416 	{
9417 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9418 		     array[final_element], start,
9419 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9420 	}
9421 
9422         /* Here, it is a legal append.  If the new range begins 1 above the end
9423          * of the range below it, it is extending the range below it, so the
9424          * new first value not in the set is one greater than the newly
9425          * extended range.  */
9426         offset = *get_invlist_offset_addr(invlist);
9427 	if (array[final_element] == start) {
9428 	    if (end != UV_MAX) {
9429 		array[final_element] = end + 1;
9430 	    }
9431 	    else {
9432 		/* But if the end is the maximum representable on the machine,
9433                  * assume that infinity was actually what was meant.  Just let
9434                  * the range that this would extend to have no end */
9435 		invlist_set_len(invlist, len - 1, offset);
9436 	    }
9437 	    return;
9438 	}
9439     }
9440 
9441     /* Here the new range doesn't extend any existing set.  Add it */
9442 
9443     len += 2;	/* Includes an element each for the start and end of range */
9444 
9445     /* If wll overflow the existing space, extend, which may cause the array to
9446      * be moved */
9447     if (max < len) {
9448 	invlist_extend(invlist, len);
9449 
9450         /* Have to set len here to avoid assert failure in invlist_array() */
9451         invlist_set_len(invlist, len, offset);
9452 
9453 	array = invlist_array(invlist);
9454     }
9455     else {
9456 	invlist_set_len(invlist, len, offset);
9457     }
9458 
9459     /* The next item on the list starts the range, the one after that is
9460      * one past the new range.  */
9461     array[len - 2] = start;
9462     if (end != UV_MAX) {
9463 	array[len - 1] = end + 1;
9464     }
9465     else {
9466 	/* But if the end is the maximum representable on the machine, just let
9467 	 * the range have no end */
9468 	invlist_set_len(invlist, len - 1, offset);
9469     }
9470 }
9471 
9472 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)9473 Perl__invlist_search(SV* const invlist, const UV cp)
9474 {
9475     /* Searches the inversion list for the entry that contains the input code
9476      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9477      * return value is the index into the list's array of the range that
9478      * contains <cp>, that is, 'i' such that
9479      *	array[i] <= cp < array[i+1]
9480      */
9481 
9482     IV low = 0;
9483     IV mid;
9484     IV high = _invlist_len(invlist);
9485     const IV highest_element = high - 1;
9486     const UV* array;
9487 
9488     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9489 
9490     /* If list is empty, return failure. */
9491     if (high == 0) {
9492 	return -1;
9493     }
9494 
9495     /* (We can't get the array unless we know the list is non-empty) */
9496     array = invlist_array(invlist);
9497 
9498     mid = invlist_previous_index(invlist);
9499     assert(mid >=0);
9500     if (mid > highest_element) {
9501         mid = highest_element;
9502     }
9503 
9504     /* <mid> contains the cache of the result of the previous call to this
9505      * function (0 the first time).  See if this call is for the same result,
9506      * or if it is for mid-1.  This is under the theory that calls to this
9507      * function will often be for related code points that are near each other.
9508      * And benchmarks show that caching gives better results.  We also test
9509      * here if the code point is within the bounds of the list.  These tests
9510      * replace others that would have had to be made anyway to make sure that
9511      * the array bounds were not exceeded, and these give us extra information
9512      * at the same time */
9513     if (cp >= array[mid]) {
9514         if (cp >= array[highest_element]) {
9515             return highest_element;
9516         }
9517 
9518         /* Here, array[mid] <= cp < array[highest_element].  This means that
9519          * the final element is not the answer, so can exclude it; it also
9520          * means that <mid> is not the final element, so can refer to 'mid + 1'
9521          * safely */
9522         if (cp < array[mid + 1]) {
9523             return mid;
9524         }
9525         high--;
9526         low = mid + 1;
9527     }
9528     else { /* cp < aray[mid] */
9529         if (cp < array[0]) { /* Fail if outside the array */
9530             return -1;
9531         }
9532         high = mid;
9533         if (cp >= array[mid - 1]) {
9534             goto found_entry;
9535         }
9536     }
9537 
9538     /* Binary search.  What we are looking for is <i> such that
9539      *	array[i] <= cp < array[i+1]
9540      * The loop below converges on the i+1.  Note that there may not be an
9541      * (i+1)th element in the array, and things work nonetheless */
9542     while (low < high) {
9543 	mid = (low + high) / 2;
9544         assert(mid <= highest_element);
9545 	if (array[mid] <= cp) { /* cp >= array[mid] */
9546 	    low = mid + 1;
9547 
9548 	    /* We could do this extra test to exit the loop early.
9549 	    if (cp < array[low]) {
9550 		return mid;
9551 	    }
9552 	    */
9553 	}
9554 	else { /* cp < array[mid] */
9555 	    high = mid;
9556 	}
9557     }
9558 
9559   found_entry:
9560     high--;
9561     invlist_set_previous_index(invlist, high);
9562     return high;
9563 }
9564 
9565 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)9566 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9567                                          const bool complement_b, SV** output)
9568 {
9569     /* Take the union of two inversion lists and point '*output' to it.  On
9570      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9571      * even 'a' or 'b').  If to an inversion list, the contents of the original
9572      * list will be replaced by the union.  The first list, 'a', may be
9573      * NULL, in which case a copy of the second list is placed in '*output'.
9574      * If 'complement_b' is TRUE, the union is taken of the complement
9575      * (inversion) of 'b' instead of b itself.
9576      *
9577      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9578      * Richard Gillam, published by Addison-Wesley, and explained at some
9579      * length there.  The preface says to incorporate its examples into your
9580      * code at your own risk.
9581      *
9582      * The algorithm is like a merge sort. */
9583 
9584     const UV* array_a;    /* a's array */
9585     const UV* array_b;
9586     UV len_a;	    /* length of a's array */
9587     UV len_b;
9588 
9589     SV* u;			/* the resulting union */
9590     UV* array_u;
9591     UV len_u = 0;
9592 
9593     UV i_a = 0;		    /* current index into a's array */
9594     UV i_b = 0;
9595     UV i_u = 0;
9596 
9597     /* running count, as explained in the algorithm source book; items are
9598      * stopped accumulating and are output when the count changes to/from 0.
9599      * The count is incremented when we start a range that's in an input's set,
9600      * and decremented when we start a range that's not in a set.  So this
9601      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9602      * and hence nothing goes into the union; 1, just one of the inputs is in
9603      * its set (and its current range gets added to the union); and 2 when both
9604      * inputs are in their sets.  */
9605     UV count = 0;
9606 
9607     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9608     assert(a != b);
9609     assert(*output == NULL || is_invlist(*output));
9610 
9611     len_b = _invlist_len(b);
9612     if (len_b == 0) {
9613 
9614         /* Here, 'b' is empty, hence it's complement is all possible code
9615          * points.  So if the union includes the complement of 'b', it includes
9616          * everything, and we need not even look at 'a'.  It's easiest to
9617          * create a new inversion list that matches everything.  */
9618         if (complement_b) {
9619             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9620 
9621             if (*output == NULL) { /* If the output didn't exist, just point it
9622                                       at the new list */
9623                 *output = everything;
9624             }
9625             else { /* Otherwise, replace its contents with the new list */
9626                 invlist_replace_list_destroys_src(*output, everything);
9627                 SvREFCNT_dec_NN(everything);
9628             }
9629 
9630             return;
9631         }
9632 
9633         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9634          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9635          * output will be empty */
9636 
9637         if (a == NULL || _invlist_len(a) == 0) {
9638             if (*output == NULL) {
9639                 *output = _new_invlist(0);
9640             }
9641             else {
9642                 invlist_clear(*output);
9643             }
9644             return;
9645         }
9646 
9647         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9648          * union.  We can just return a copy of 'a' if '*output' doesn't point
9649          * to an existing list */
9650         if (*output == NULL) {
9651             *output = invlist_clone(a, NULL);
9652             return;
9653         }
9654 
9655         /* If the output is to overwrite 'a', we have a no-op, as it's
9656          * already in 'a' */
9657         if (*output == a) {
9658             return;
9659         }
9660 
9661         /* Here, '*output' is to be overwritten by 'a' */
9662         u = invlist_clone(a, NULL);
9663         invlist_replace_list_destroys_src(*output, u);
9664         SvREFCNT_dec_NN(u);
9665 
9666         return;
9667     }
9668 
9669     /* Here 'b' is not empty.  See about 'a' */
9670 
9671     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9672 
9673         /* Here, 'a' is empty (and b is not).  That means the union will come
9674          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9675          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9676          * the clone */
9677 
9678         SV ** dest = (*output == NULL) ? output : &u;
9679         *dest = invlist_clone(b, NULL);
9680         if (complement_b) {
9681             _invlist_invert(*dest);
9682         }
9683 
9684         if (dest == &u) {
9685             invlist_replace_list_destroys_src(*output, u);
9686             SvREFCNT_dec_NN(u);
9687         }
9688 
9689 	return;
9690     }
9691 
9692     /* Here both lists exist and are non-empty */
9693     array_a = invlist_array(a);
9694     array_b = invlist_array(b);
9695 
9696     /* If are to take the union of 'a' with the complement of b, set it
9697      * up so are looking at b's complement. */
9698     if (complement_b) {
9699 
9700 	/* To complement, we invert: if the first element is 0, remove it.  To
9701 	 * do this, we just pretend the array starts one later */
9702         if (array_b[0] == 0) {
9703             array_b++;
9704             len_b--;
9705         }
9706         else {
9707 
9708             /* But if the first element is not zero, we pretend the list starts
9709              * at the 0 that is always stored immediately before the array. */
9710             array_b--;
9711             len_b++;
9712         }
9713     }
9714 
9715     /* Size the union for the worst case: that the sets are completely
9716      * disjoint */
9717     u = _new_invlist(len_a + len_b);
9718 
9719     /* Will contain U+0000 if either component does */
9720     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9721                                       || (len_b > 0 && array_b[0] == 0));
9722 
9723     /* Go through each input list item by item, stopping when have exhausted
9724      * one of them */
9725     while (i_a < len_a && i_b < len_b) {
9726 	UV cp;	    /* The element to potentially add to the union's array */
9727 	bool cp_in_set;   /* is it in the input list's set or not */
9728 
9729 	/* We need to take one or the other of the two inputs for the union.
9730 	 * Since we are merging two sorted lists, we take the smaller of the
9731          * next items.  In case of a tie, we take first the one that is in its
9732          * set.  If we first took the one not in its set, it would decrement
9733          * the count, possibly to 0 which would cause it to be output as ending
9734          * the range, and the next time through we would take the same number,
9735          * and output it again as beginning the next range.  By doing it the
9736          * opposite way, there is no possibility that the count will be
9737          * momentarily decremented to 0, and thus the two adjoining ranges will
9738          * be seamlessly merged.  (In a tie and both are in the set or both not
9739          * in the set, it doesn't matter which we take first.) */
9740 	if (       array_a[i_a] < array_b[i_b]
9741 	    || (   array_a[i_a] == array_b[i_b]
9742 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9743 	{
9744 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9745 	    cp = array_a[i_a++];
9746 	}
9747 	else {
9748 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9749 	    cp = array_b[i_b++];
9750 	}
9751 
9752 	/* Here, have chosen which of the two inputs to look at.  Only output
9753 	 * if the running count changes to/from 0, which marks the
9754 	 * beginning/end of a range that's in the set */
9755 	if (cp_in_set) {
9756 	    if (count == 0) {
9757 		array_u[i_u++] = cp;
9758 	    }
9759 	    count++;
9760 	}
9761 	else {
9762 	    count--;
9763 	    if (count == 0) {
9764 		array_u[i_u++] = cp;
9765 	    }
9766 	}
9767     }
9768 
9769 
9770     /* The loop above increments the index into exactly one of the input lists
9771      * each iteration, and ends when either index gets to its list end.  That
9772      * means the other index is lower than its end, and so something is
9773      * remaining in that one.  We decrement 'count', as explained below, if
9774      * that list is in its set.  (i_a and i_b each currently index the element
9775      * beyond the one we care about.) */
9776     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9777 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9778     {
9779 	count--;
9780     }
9781 
9782     /* Above we decremented 'count' if the list that had unexamined elements in
9783      * it was in its set.  This has made it so that 'count' being non-zero
9784      * means there isn't anything left to output; and 'count' equal to 0 means
9785      * that what is left to output is precisely that which is left in the
9786      * non-exhausted input list.
9787      *
9788      * To see why, note first that the exhausted input obviously has nothing
9789      * left to add to the union.  If it was in its set at its end, that means
9790      * the set extends from here to the platform's infinity, and hence so does
9791      * the union and the non-exhausted set is irrelevant.  The exhausted set
9792      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9793      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9794      * 'count' remains at 1.  This is consistent with the decremented 'count'
9795      * != 0 meaning there's nothing left to add to the union.
9796      *
9797      * But if the exhausted input wasn't in its set, it contributed 0 to
9798      * 'count', and the rest of the union will be whatever the other input is.
9799      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9800      * otherwise it gets decremented to 0.  This is consistent with 'count'
9801      * == 0 meaning the remainder of the union is whatever is left in the
9802      * non-exhausted list. */
9803     if (count != 0) {
9804         len_u = i_u;
9805     }
9806     else {
9807         IV copy_count = len_a - i_a;
9808         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9809 	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9810         }
9811         else { /* The non-exhausted input is b */
9812             copy_count = len_b - i_b;
9813 	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9814         }
9815         len_u = i_u + copy_count;
9816     }
9817 
9818     /* Set the result to the final length, which can change the pointer to
9819      * array_u, so re-find it.  (Note that it is unlikely that this will
9820      * change, as we are shrinking the space, not enlarging it) */
9821     if (len_u != _invlist_len(u)) {
9822 	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9823 	invlist_trim(u);
9824 	array_u = invlist_array(u);
9825     }
9826 
9827     if (*output == NULL) {  /* Simply return the new inversion list */
9828         *output = u;
9829     }
9830     else {
9831         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9832          * could instead free '*output', and then set it to 'u', but experience
9833          * has shown [perl #127392] that if the input is a mortal, we can get a
9834          * huge build-up of these during regex compilation before they get
9835          * freed. */
9836         invlist_replace_list_destroys_src(*output, u);
9837         SvREFCNT_dec_NN(u);
9838     }
9839 
9840     return;
9841 }
9842 
9843 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)9844 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9845                                                const bool complement_b, SV** i)
9846 {
9847     /* Take the intersection of two inversion lists and point '*i' to it.  On
9848      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9849      * even 'a' or 'b').  If to an inversion list, the contents of the original
9850      * list will be replaced by the intersection.  The first list, 'a', may be
9851      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9852      * TRUE, the result will be the intersection of 'a' and the complement (or
9853      * inversion) of 'b' instead of 'b' directly.
9854      *
9855      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9856      * Richard Gillam, published by Addison-Wesley, and explained at some
9857      * length there.  The preface says to incorporate its examples into your
9858      * code at your own risk.  In fact, it had bugs
9859      *
9860      * The algorithm is like a merge sort, and is essentially the same as the
9861      * union above
9862      */
9863 
9864     const UV* array_a;		/* a's array */
9865     const UV* array_b;
9866     UV len_a;	/* length of a's array */
9867     UV len_b;
9868 
9869     SV* r;		     /* the resulting intersection */
9870     UV* array_r;
9871     UV len_r = 0;
9872 
9873     UV i_a = 0;		    /* current index into a's array */
9874     UV i_b = 0;
9875     UV i_r = 0;
9876 
9877     /* running count of how many of the two inputs are postitioned at ranges
9878      * that are in their sets.  As explained in the algorithm source book,
9879      * items are stopped accumulating and are output when the count changes
9880      * to/from 2.  The count is incremented when we start a range that's in an
9881      * input's set, and decremented when we start a range that's not in a set.
9882      * Only when it is 2 are we in the intersection. */
9883     UV count = 0;
9884 
9885     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9886     assert(a != b);
9887     assert(*i == NULL || is_invlist(*i));
9888 
9889     /* Special case if either one is empty */
9890     len_a = (a == NULL) ? 0 : _invlist_len(a);
9891     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9892         if (len_a != 0 && complement_b) {
9893 
9894             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9895              * must be empty.  Here, also we are using 'b's complement, which
9896              * hence must be every possible code point.  Thus the intersection
9897              * is simply 'a'. */
9898 
9899             if (*i == a) {  /* No-op */
9900                 return;
9901             }
9902 
9903             if (*i == NULL) {
9904                 *i = invlist_clone(a, NULL);
9905                 return;
9906             }
9907 
9908             r = invlist_clone(a, NULL);
9909             invlist_replace_list_destroys_src(*i, r);
9910             SvREFCNT_dec_NN(r);
9911             return;
9912         }
9913 
9914         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9915          * intersection must be empty */
9916         if (*i == NULL) {
9917             *i = _new_invlist(0);
9918             return;
9919         }
9920 
9921         invlist_clear(*i);
9922 	return;
9923     }
9924 
9925     /* Here both lists exist and are non-empty */
9926     array_a = invlist_array(a);
9927     array_b = invlist_array(b);
9928 
9929     /* If are to take the intersection of 'a' with the complement of b, set it
9930      * up so are looking at b's complement. */
9931     if (complement_b) {
9932 
9933 	/* To complement, we invert: if the first element is 0, remove it.  To
9934 	 * do this, we just pretend the array starts one later */
9935         if (array_b[0] == 0) {
9936             array_b++;
9937             len_b--;
9938         }
9939         else {
9940 
9941             /* But if the first element is not zero, we pretend the list starts
9942              * at the 0 that is always stored immediately before the array. */
9943             array_b--;
9944             len_b++;
9945         }
9946     }
9947 
9948     /* Size the intersection for the worst case: that the intersection ends up
9949      * fragmenting everything to be completely disjoint */
9950     r= _new_invlist(len_a + len_b);
9951 
9952     /* Will contain U+0000 iff both components do */
9953     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9954                                      && len_b > 0 && array_b[0] == 0);
9955 
9956     /* Go through each list item by item, stopping when have exhausted one of
9957      * them */
9958     while (i_a < len_a && i_b < len_b) {
9959 	UV cp;	    /* The element to potentially add to the intersection's
9960 		       array */
9961 	bool cp_in_set;	/* Is it in the input list's set or not */
9962 
9963 	/* We need to take one or the other of the two inputs for the
9964 	 * intersection.  Since we are merging two sorted lists, we take the
9965          * smaller of the next items.  In case of a tie, we take first the one
9966          * that is not in its set (a difference from the union algorithm).  If
9967          * we first took the one in its set, it would increment the count,
9968          * possibly to 2 which would cause it to be output as starting a range
9969          * in the intersection, and the next time through we would take that
9970          * same number, and output it again as ending the set.  By doing the
9971          * opposite of this, there is no possibility that the count will be
9972          * momentarily incremented to 2.  (In a tie and both are in the set or
9973          * both not in the set, it doesn't matter which we take first.) */
9974 	if (       array_a[i_a] < array_b[i_b]
9975 	    || (   array_a[i_a] == array_b[i_b]
9976 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9977 	{
9978 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9979 	    cp = array_a[i_a++];
9980 	}
9981 	else {
9982 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9983 	    cp= array_b[i_b++];
9984 	}
9985 
9986 	/* Here, have chosen which of the two inputs to look at.  Only output
9987 	 * if the running count changes to/from 2, which marks the
9988 	 * beginning/end of a range that's in the intersection */
9989 	if (cp_in_set) {
9990 	    count++;
9991 	    if (count == 2) {
9992 		array_r[i_r++] = cp;
9993 	    }
9994 	}
9995 	else {
9996 	    if (count == 2) {
9997 		array_r[i_r++] = cp;
9998 	    }
9999 	    count--;
10000 	}
10001 
10002     }
10003 
10004     /* The loop above increments the index into exactly one of the input lists
10005      * each iteration, and ends when either index gets to its list end.  That
10006      * means the other index is lower than its end, and so something is
10007      * remaining in that one.  We increment 'count', as explained below, if the
10008      * exhausted list was in its set.  (i_a and i_b each currently index the
10009      * element beyond the one we care about.) */
10010     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10011         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10012     {
10013 	count++;
10014     }
10015 
10016     /* Above we incremented 'count' if the exhausted list was in its set.  This
10017      * has made it so that 'count' being below 2 means there is nothing left to
10018      * output; otheriwse what's left to add to the intersection is precisely
10019      * that which is left in the non-exhausted input list.
10020      *
10021      * To see why, note first that the exhausted input obviously has nothing
10022      * left to affect the intersection.  If it was in its set at its end, that
10023      * means the set extends from here to the platform's infinity, and hence
10024      * anything in the non-exhausted's list will be in the intersection, and
10025      * anything not in it won't be.  Hence, the rest of the intersection is
10026      * precisely what's in the non-exhausted list  The exhausted set also
10027      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
10028      * it means 'count' is now at least 2.  This is consistent with the
10029      * incremented 'count' being >= 2 means to add the non-exhausted list to
10030      * the intersection.
10031      *
10032      * But if the exhausted input wasn't in its set, it contributed 0 to
10033      * 'count', and the intersection can't include anything further; the
10034      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
10035      * incremented.  This is consistent with 'count' being < 2 meaning nothing
10036      * further to add to the intersection. */
10037     if (count < 2) { /* Nothing left to put in the intersection. */
10038         len_r = i_r;
10039     }
10040     else { /* copy the non-exhausted list, unchanged. */
10041         IV copy_count = len_a - i_a;
10042         if (copy_count > 0) {   /* a is the one with stuff left */
10043 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10044         }
10045         else {  /* b is the one with stuff left */
10046             copy_count = len_b - i_b;
10047 	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10048         }
10049         len_r = i_r + copy_count;
10050     }
10051 
10052     /* Set the result to the final length, which can change the pointer to
10053      * array_r, so re-find it.  (Note that it is unlikely that this will
10054      * change, as we are shrinking the space, not enlarging it) */
10055     if (len_r != _invlist_len(r)) {
10056 	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10057 	invlist_trim(r);
10058 	array_r = invlist_array(r);
10059     }
10060 
10061     if (*i == NULL) { /* Simply return the calculated intersection */
10062         *i = r;
10063     }
10064     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
10065               instead free '*i', and then set it to 'r', but experience has
10066               shown [perl #127392] that if the input is a mortal, we can get a
10067               huge build-up of these during regex compilation before they get
10068               freed. */
10069         if (len_r) {
10070             invlist_replace_list_destroys_src(*i, r);
10071         }
10072         else {
10073             invlist_clear(*i);
10074         }
10075         SvREFCNT_dec_NN(r);
10076     }
10077 
10078     return;
10079 }
10080 
10081 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)10082 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10083 {
10084     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10085      * set.  A pointer to the inversion list is returned.  This may actually be
10086      * a new list, in which case the passed in one has been destroyed.  The
10087      * passed-in inversion list can be NULL, in which case a new one is created
10088      * with just the one range in it.  The new list is not necessarily
10089      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10090      * result of this function.  The gain would not be large, and in many
10091      * cases, this is called multiple times on a single inversion list, so
10092      * anything freed may almost immediately be needed again.
10093      *
10094      * This used to mostly call the 'union' routine, but that is much more
10095      * heavyweight than really needed for a single range addition */
10096 
10097     UV* array;              /* The array implementing the inversion list */
10098     UV len;                 /* How many elements in 'array' */
10099     SSize_t i_s;            /* index into the invlist array where 'start'
10100                                should go */
10101     SSize_t i_e = 0;        /* And the index where 'end' should go */
10102     UV cur_highest;         /* The highest code point in the inversion list
10103                                upon entry to this function */
10104 
10105     /* This range becomes the whole inversion list if none already existed */
10106     if (invlist == NULL) {
10107 	invlist = _new_invlist(2);
10108         _append_range_to_invlist(invlist, start, end);
10109         return invlist;
10110     }
10111 
10112     /* Likewise, if the inversion list is currently empty */
10113     len = _invlist_len(invlist);
10114     if (len == 0) {
10115         _append_range_to_invlist(invlist, start, end);
10116         return invlist;
10117     }
10118 
10119     /* Starting here, we have to know the internals of the list */
10120     array = invlist_array(invlist);
10121 
10122     /* If the new range ends higher than the current highest ... */
10123     cur_highest = invlist_highest(invlist);
10124     if (end > cur_highest) {
10125 
10126         /* If the whole range is higher, we can just append it */
10127         if (start > cur_highest) {
10128             _append_range_to_invlist(invlist, start, end);
10129             return invlist;
10130         }
10131 
10132         /* Otherwise, add the portion that is higher ... */
10133         _append_range_to_invlist(invlist, cur_highest + 1, end);
10134 
10135         /* ... and continue on below to handle the rest.  As a result of the
10136          * above append, we know that the index of the end of the range is the
10137          * final even numbered one of the array.  Recall that the final element
10138          * always starts a range that extends to infinity.  If that range is in
10139          * the set (meaning the set goes from here to infinity), it will be an
10140          * even index, but if it isn't in the set, it's odd, and the final
10141          * range in the set is one less, which is even. */
10142         if (end == UV_MAX) {
10143             i_e = len;
10144         }
10145         else {
10146             i_e = len - 2;
10147         }
10148     }
10149 
10150     /* We have dealt with appending, now see about prepending.  If the new
10151      * range starts lower than the current lowest ... */
10152     if (start < array[0]) {
10153 
10154         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10155          * Let the union code handle it, rather than having to know the
10156          * trickiness in two code places.  */
10157         if (UNLIKELY(start == 0)) {
10158             SV* range_invlist;
10159 
10160             range_invlist = _new_invlist(2);
10161             _append_range_to_invlist(range_invlist, start, end);
10162 
10163             _invlist_union(invlist, range_invlist, &invlist);
10164 
10165             SvREFCNT_dec_NN(range_invlist);
10166 
10167             return invlist;
10168         }
10169 
10170         /* If the whole new range comes before the first entry, and doesn't
10171          * extend it, we have to insert it as an additional range */
10172         if (end < array[0] - 1) {
10173             i_s = i_e = -1;
10174             goto splice_in_new_range;
10175         }
10176 
10177         /* Here the new range adjoins the existing first range, extending it
10178          * downwards. */
10179         array[0] = start;
10180 
10181         /* And continue on below to handle the rest.  We know that the index of
10182          * the beginning of the range is the first one of the array */
10183         i_s = 0;
10184     }
10185     else { /* Not prepending any part of the new range to the existing list.
10186             * Find where in the list it should go.  This finds i_s, such that:
10187             *     invlist[i_s] <= start < array[i_s+1]
10188             */
10189         i_s = _invlist_search(invlist, start);
10190     }
10191 
10192     /* At this point, any extending before the beginning of the inversion list
10193      * and/or after the end has been done.  This has made it so that, in the
10194      * code below, each endpoint of the new range is either in a range that is
10195      * in the set, or is in a gap between two ranges that are.  This means we
10196      * don't have to worry about exceeding the array bounds.
10197      *
10198      * Find where in the list the new range ends (but we can skip this if we
10199      * have already determined what it is, or if it will be the same as i_s,
10200      * which we already have computed) */
10201     if (i_e == 0) {
10202         i_e = (start == end)
10203               ? i_s
10204               : _invlist_search(invlist, end);
10205     }
10206 
10207     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10208      * is a range that goes to infinity there is no element at invlist[i_e+1],
10209      * so only the first relation holds. */
10210 
10211     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10212 
10213         /* Here, the ranges on either side of the beginning of the new range
10214          * are in the set, and this range starts in the gap between them.
10215          *
10216          * The new range extends the range above it downwards if the new range
10217          * ends at or above that range's start */
10218         const bool extends_the_range_above = (   end == UV_MAX
10219                                               || end + 1 >= array[i_s+1]);
10220 
10221         /* The new range extends the range below it upwards if it begins just
10222          * after where that range ends */
10223         if (start == array[i_s]) {
10224 
10225             /* If the new range fills the entire gap between the other ranges,
10226              * they will get merged together.  Other ranges may also get
10227              * merged, depending on how many of them the new range spans.  In
10228              * the general case, we do the merge later, just once, after we
10229              * figure out how many to merge.  But in the case where the new
10230              * range exactly spans just this one gap (possibly extending into
10231              * the one above), we do the merge here, and an early exit.  This
10232              * is done here to avoid having to special case later. */
10233             if (i_e - i_s <= 1) {
10234 
10235                 /* If i_e - i_s == 1, it means that the new range terminates
10236                  * within the range above, and hence 'extends_the_range_above'
10237                  * must be true.  (If the range above it extends to infinity,
10238                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10239                  * will be 0, so no harm done.) */
10240                 if (extends_the_range_above) {
10241                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10242                     invlist_set_len(invlist,
10243                                     len - 2,
10244                                     *(get_invlist_offset_addr(invlist)));
10245                     return invlist;
10246                 }
10247 
10248                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10249                  * to the same range, and below we are about to decrement i_s
10250                  * */
10251                 i_e--;
10252             }
10253 
10254             /* Here, the new range is adjacent to the one below.  (It may also
10255              * span beyond the range above, but that will get resolved later.)
10256              * Extend the range below to include this one. */
10257             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10258             i_s--;
10259             start = array[i_s];
10260         }
10261         else if (extends_the_range_above) {
10262 
10263             /* Here the new range only extends the range above it, but not the
10264              * one below.  It merges with the one above.  Again, we keep i_e
10265              * and i_s in sync if they point to the same range */
10266             if (i_e == i_s) {
10267                 i_e++;
10268             }
10269             i_s++;
10270             array[i_s] = start;
10271         }
10272     }
10273 
10274     /* Here, we've dealt with the new range start extending any adjoining
10275      * existing ranges.
10276      *
10277      * If the new range extends to infinity, it is now the final one,
10278      * regardless of what was there before */
10279     if (UNLIKELY(end == UV_MAX)) {
10280         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10281         return invlist;
10282     }
10283 
10284     /* If i_e started as == i_s, it has also been dealt with,
10285      * and been updated to the new i_s, which will fail the following if */
10286     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10287 
10288         /* Here, the ranges on either side of the end of the new range are in
10289          * the set, and this range ends in the gap between them.
10290          *
10291          * If this range is adjacent to (hence extends) the range above it, it
10292          * becomes part of that range; likewise if it extends the range below,
10293          * it becomes part of that range */
10294         if (end + 1 == array[i_e+1]) {
10295             i_e++;
10296             array[i_e] = start;
10297         }
10298         else if (start <= array[i_e]) {
10299             array[i_e] = end + 1;
10300             i_e--;
10301         }
10302     }
10303 
10304     if (i_s == i_e) {
10305 
10306         /* If the range fits entirely in an existing range (as possibly already
10307          * extended above), it doesn't add anything new */
10308         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10309             return invlist;
10310         }
10311 
10312         /* Here, no part of the range is in the list.  Must add it.  It will
10313          * occupy 2 more slots */
10314       splice_in_new_range:
10315 
10316         invlist_extend(invlist, len + 2);
10317         array = invlist_array(invlist);
10318         /* Move the rest of the array down two slots. Don't include any
10319          * trailing NUL */
10320         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10321 
10322         /* Do the actual splice */
10323         array[i_e+1] = start;
10324         array[i_e+2] = end + 1;
10325         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10326         return invlist;
10327     }
10328 
10329     /* Here the new range crossed the boundaries of a pre-existing range.  The
10330      * code above has adjusted things so that both ends are in ranges that are
10331      * in the set.  This means everything in between must also be in the set.
10332      * Just squash things together */
10333     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10334     invlist_set_len(invlist,
10335                     len - i_e + i_s,
10336                     *(get_invlist_offset_addr(invlist)));
10337 
10338     return invlist;
10339 }
10340 
10341 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)10342 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10343                                  UV** other_elements_ptr)
10344 {
10345     /* Create and return an inversion list whose contents are to be populated
10346      * by the caller.  The caller gives the number of elements (in 'size') and
10347      * the very first element ('element0').  This function will set
10348      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10349      * are to be placed.
10350      *
10351      * Obviously there is some trust involved that the caller will properly
10352      * fill in the other elements of the array.
10353      *
10354      * (The first element needs to be passed in, as the underlying code does
10355      * things differently depending on whether it is zero or non-zero) */
10356 
10357     SV* invlist = _new_invlist(size);
10358     bool offset;
10359 
10360     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10361 
10362     invlist = add_cp_to_invlist(invlist, element0);
10363     offset = *get_invlist_offset_addr(invlist);
10364 
10365     invlist_set_len(invlist, size, offset);
10366     *other_elements_ptr = invlist_array(invlist) + 1;
10367     return invlist;
10368 }
10369 
10370 #endif
10371 
10372 #ifndef PERL_IN_XSUB_RE
10373 void
Perl__invlist_invert(pTHX_ SV * const invlist)10374 Perl__invlist_invert(pTHX_ SV* const invlist)
10375 {
10376     /* Complement the input inversion list.  This adds a 0 if the list didn't
10377      * have a zero; removes it otherwise.  As described above, the data
10378      * structure is set up so that this is very efficient */
10379 
10380     PERL_ARGS_ASSERT__INVLIST_INVERT;
10381 
10382     assert(! invlist_is_iterating(invlist));
10383 
10384     /* The inverse of matching nothing is matching everything */
10385     if (_invlist_len(invlist) == 0) {
10386 	_append_range_to_invlist(invlist, 0, UV_MAX);
10387 	return;
10388     }
10389 
10390     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10391 }
10392 
10393 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)10394 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10395 {
10396     /* Return a new inversion list that is a copy of the input one, which is
10397      * unchanged.  The new list will not be mortal even if the old one was. */
10398 
10399     const STRLEN nominal_length = _invlist_len(invlist);
10400     const STRLEN physical_length = SvCUR(invlist);
10401     const bool offset = *(get_invlist_offset_addr(invlist));
10402 
10403     PERL_ARGS_ASSERT_INVLIST_CLONE;
10404 
10405     if (new_invlist == NULL) {
10406         new_invlist = _new_invlist(nominal_length);
10407     }
10408     else {
10409         sv_upgrade(new_invlist, SVt_INVLIST);
10410         initialize_invlist_guts(new_invlist, nominal_length);
10411     }
10412 
10413     *(get_invlist_offset_addr(new_invlist)) = offset;
10414     invlist_set_len(new_invlist, nominal_length, offset);
10415     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10416 
10417     return new_invlist;
10418 }
10419 
10420 #endif
10421 
10422 PERL_STATIC_INLINE UV
S_invlist_lowest(SV * const invlist)10423 S_invlist_lowest(SV* const invlist)
10424 {
10425     /* Returns the lowest code point that matches an inversion list.  This API
10426      * has an ambiguity, as it returns 0 under either the lowest is actually
10427      * 0, or if the list is empty.  If this distinction matters to you, check
10428      * for emptiness before calling this function */
10429 
10430     UV len = _invlist_len(invlist);
10431     UV *array;
10432 
10433     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10434 
10435     if (len == 0) {
10436         return 0;
10437     }
10438 
10439     array = invlist_array(invlist);
10440 
10441     return array[0];
10442 }
10443 
10444 STATIC SV *
S_invlist_contents(pTHX_ SV * const invlist,const bool traditional_style)10445 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10446 {
10447     /* Get the contents of an inversion list into a string SV so that they can
10448      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10449      * traditionally done for debug tracing; otherwise it uses a format
10450      * suitable for just copying to the output, with blanks between ranges and
10451      * a dash between range components */
10452 
10453     UV start, end;
10454     SV* output;
10455     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10456     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10457 
10458     if (traditional_style) {
10459         output = newSVpvs("\n");
10460     }
10461     else {
10462         output = newSVpvs("");
10463     }
10464 
10465     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10466 
10467     assert(! invlist_is_iterating(invlist));
10468 
10469     invlist_iterinit(invlist);
10470     while (invlist_iternext(invlist, &start, &end)) {
10471 	if (end == UV_MAX) {
10472 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10473                                           start, intra_range_delimiter,
10474                                                  inter_range_delimiter);
10475 	}
10476 	else if (end != start) {
10477 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10478 		                          start,
10479                                                    intra_range_delimiter,
10480                                                   end, inter_range_delimiter);
10481 	}
10482 	else {
10483 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10484                                           start, inter_range_delimiter);
10485 	}
10486     }
10487 
10488     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10489         SvCUR_set(output, SvCUR(output) - 1);
10490     }
10491 
10492     return output;
10493 }
10494 
10495 #ifndef PERL_IN_XSUB_RE
10496 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)10497 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10498                          const char * const indent, SV* const invlist)
10499 {
10500     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10501      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10502      * the string 'indent'.  The output looks like this:
10503          [0] 0x000A .. 0x000D
10504          [2] 0x0085
10505          [4] 0x2028 .. 0x2029
10506          [6] 0x3104 .. INFTY
10507      * This means that the first range of code points matched by the list are
10508      * 0xA through 0xD; the second range contains only the single code point
10509      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10510      * are used to define each range (except if the final range extends to
10511      * infinity, only a single element is needed).  The array index of the
10512      * first element for the corresponding range is given in brackets. */
10513 
10514     UV start, end;
10515     STRLEN count = 0;
10516 
10517     PERL_ARGS_ASSERT__INVLIST_DUMP;
10518 
10519     if (invlist_is_iterating(invlist)) {
10520         Perl_dump_indent(aTHX_ level, file,
10521              "%sCan't dump inversion list because is in middle of iterating\n",
10522              indent);
10523         return;
10524     }
10525 
10526     invlist_iterinit(invlist);
10527     while (invlist_iternext(invlist, &start, &end)) {
10528 	if (end == UV_MAX) {
10529 	    Perl_dump_indent(aTHX_ level, file,
10530                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10531                                    indent, (UV)count, start);
10532 	}
10533 	else if (end != start) {
10534 	    Perl_dump_indent(aTHX_ level, file,
10535                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10536 		                indent, (UV)count, start,         end);
10537 	}
10538 	else {
10539 	    Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10540                                             indent, (UV)count, start);
10541 	}
10542         count += 2;
10543     }
10544 }
10545 
10546 #endif
10547 
10548 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10549 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)10550 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10551 {
10552     /* Return a boolean as to if the two passed in inversion lists are
10553      * identical.  The final argument, if TRUE, says to take the complement of
10554      * the second inversion list before doing the comparison */
10555 
10556     const UV len_a = _invlist_len(a);
10557     UV len_b = _invlist_len(b);
10558 
10559     const UV* array_a = NULL;
10560     const UV* array_b = NULL;
10561 
10562     PERL_ARGS_ASSERT__INVLISTEQ;
10563 
10564     /* This code avoids accessing the arrays unless it knows the length is
10565      * non-zero */
10566 
10567     if (len_a == 0) {
10568         if (len_b == 0) {
10569             return ! complement_b;
10570         }
10571     }
10572     else {
10573         array_a = invlist_array(a);
10574     }
10575 
10576     if (len_b != 0) {
10577         array_b = invlist_array(b);
10578     }
10579 
10580     /* If are to compare 'a' with the complement of b, set it
10581      * up so are looking at b's complement. */
10582     if (complement_b) {
10583 
10584         /* The complement of nothing is everything, so <a> would have to have
10585          * just one element, starting at zero (ending at infinity) */
10586         if (len_b == 0) {
10587             return (len_a == 1 && array_a[0] == 0);
10588         }
10589         if (array_b[0] == 0) {
10590 
10591             /* Otherwise, to complement, we invert.  Here, the first element is
10592              * 0, just remove it.  To do this, we just pretend the array starts
10593              * one later */
10594 
10595             array_b++;
10596             len_b--;
10597         }
10598         else {
10599 
10600             /* But if the first element is not zero, we pretend the list starts
10601              * at the 0 that is always stored immediately before the array. */
10602             array_b--;
10603             len_b++;
10604         }
10605     }
10606 
10607     return    len_a == len_b
10608            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10609 
10610 }
10611 #endif
10612 
10613 /*
10614  * As best we can, determine the characters that can match the start of
10615  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10616  * can be false positive matches
10617  *
10618  * Returns the invlist as a new SV*; it is the caller's responsibility to
10619  * call SvREFCNT_dec() when done with it.
10620  */
10621 STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)10622 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10623 {
10624     const U8 * s = (U8*)STRING(node);
10625     SSize_t bytelen = STR_LEN(node);
10626     UV uc;
10627     /* Start out big enough for 2 separate code points */
10628     SV* invlist = _new_invlist(4);
10629 
10630     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10631 
10632     if (! UTF) {
10633         uc = *s;
10634 
10635         /* We punt and assume can match anything if the node begins
10636          * with a multi-character fold.  Things are complicated.  For
10637          * example, /ffi/i could match any of:
10638          *  "\N{LATIN SMALL LIGATURE FFI}"
10639          *  "\N{LATIN SMALL LIGATURE FF}I"
10640          *  "F\N{LATIN SMALL LIGATURE FI}"
10641          *  plus several other things; and making sure we have all the
10642          *  possibilities is hard. */
10643         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10644             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10645         }
10646         else {
10647             /* Any Latin1 range character can potentially match any
10648              * other depending on the locale, and in Turkic locales, U+130 and
10649              * U+131 */
10650             if (OP(node) == EXACTFL) {
10651                 _invlist_union(invlist, PL_Latin1, &invlist);
10652                 invlist = add_cp_to_invlist(invlist,
10653                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10654                 invlist = add_cp_to_invlist(invlist,
10655                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10656             }
10657             else {
10658                 /* But otherwise, it matches at least itself.  We can
10659                  * quickly tell if it has a distinct fold, and if so,
10660                  * it matches that as well */
10661                 invlist = add_cp_to_invlist(invlist, uc);
10662                 if (IS_IN_SOME_FOLD_L1(uc))
10663                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10664             }
10665 
10666             /* Some characters match above-Latin1 ones under /i.  This
10667              * is true of EXACTFL ones when the locale is UTF-8 */
10668             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10669                 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
10670                                                          EXACTFAA_NO_TRIE)))
10671             {
10672                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10673             }
10674         }
10675     }
10676     else {  /* Pattern is UTF-8 */
10677         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10678         const U8* e = s + bytelen;
10679         IV fc;
10680 
10681         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10682 
10683         /* The only code points that aren't folded in a UTF EXACTFish
10684          * node are the problematic ones in EXACTFL nodes */
10685         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10686             /* We need to check for the possibility that this EXACTFL
10687              * node begins with a multi-char fold.  Therefore we fold
10688              * the first few characters of it so that we can make that
10689              * check */
10690             U8 *d = folded;
10691             int i;
10692 
10693             fc = -1;
10694             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10695                 if (isASCII(*s)) {
10696                     *(d++) = (U8) toFOLD(*s);
10697                     if (fc < 0) {       /* Save the first fold */
10698                         fc = *(d-1);
10699                     }
10700                     s++;
10701                 }
10702                 else {
10703                     STRLEN len;
10704                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10705                     if (fc < 0) {       /* Save the first fold */
10706                         fc = fold;
10707                     }
10708                     d += len;
10709                     s += UTF8SKIP(s);
10710                 }
10711             }
10712 
10713             /* And set up so the code below that looks in this folded
10714              * buffer instead of the node's string */
10715             e = d;
10716             s = folded;
10717         }
10718 
10719         /* When we reach here 's' points to the fold of the first
10720          * character(s) of the node; and 'e' points to far enough along
10721          * the folded string to be just past any possible multi-char
10722          * fold.
10723          *
10724          * Like the non-UTF case above, we punt if the node begins with a
10725          * multi-char fold  */
10726 
10727         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10728             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10729         }
10730         else {  /* Single char fold */
10731             unsigned int k;
10732             U32 first_fold;
10733             const U32 * remaining_folds;
10734             Size_t folds_count;
10735 
10736             /* It matches itself */
10737             invlist = add_cp_to_invlist(invlist, fc);
10738 
10739             /* ... plus all the things that fold to it, which are found in
10740              * PL_utf8_foldclosures */
10741             folds_count = _inverse_folds(fc, &first_fold,
10742                                                 &remaining_folds);
10743             for (k = 0; k < folds_count; k++) {
10744                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10745 
10746                 /* /aa doesn't allow folds between ASCII and non- */
10747                 if (   inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
10748                     && isASCII(c) != isASCII(fc))
10749                 {
10750                     continue;
10751                 }
10752 
10753                 invlist = add_cp_to_invlist(invlist, c);
10754             }
10755 
10756             if (OP(node) == EXACTFL) {
10757 
10758                 /* If either [iI] are present in an EXACTFL node the above code
10759                  * should have added its normal case pair, but under a Turkish
10760                  * locale they could match instead the case pairs from it.  Add
10761                  * those as potential matches as well */
10762                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10763                     invlist = add_cp_to_invlist(invlist,
10764                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10765                     invlist = add_cp_to_invlist(invlist,
10766                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10767                 }
10768                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10769                     invlist = add_cp_to_invlist(invlist, 'I');
10770                 }
10771                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10772                     invlist = add_cp_to_invlist(invlist, 'i');
10773                 }
10774             }
10775         }
10776     }
10777 
10778     return invlist;
10779 }
10780 
10781 #undef HEADER_LENGTH
10782 #undef TO_INTERNAL_SIZE
10783 #undef FROM_INTERNAL_SIZE
10784 #undef INVLIST_VERSION_ID
10785 
10786 /* End of inversion list object */
10787 
10788 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)10789 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10790 {
10791     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10792      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10793      * should point to the first flag; it is updated on output to point to the
10794      * final ')' or ':'.  There needs to be at least one flag, or this will
10795      * abort */
10796 
10797     /* for (?g), (?gc), and (?o) warnings; warning
10798        about (?c) will warn about (?g) -- japhy    */
10799 
10800 #define WASTED_O  0x01
10801 #define WASTED_G  0x02
10802 #define WASTED_C  0x04
10803 #define WASTED_GC (WASTED_G|WASTED_C)
10804     I32 wastedflags = 0x00;
10805     U32 posflags = 0, negflags = 0;
10806     U32 *flagsp = &posflags;
10807     char has_charset_modifier = '\0';
10808     regex_charset cs;
10809     bool has_use_defaults = FALSE;
10810     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10811     int x_mod_count = 0;
10812 
10813     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10814 
10815     /* '^' as an initial flag sets certain defaults */
10816     if (UCHARAT(RExC_parse) == '^') {
10817         RExC_parse++;
10818         has_use_defaults = TRUE;
10819         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10820         cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10821              ? REGEX_UNICODE_CHARSET
10822              : REGEX_DEPENDS_CHARSET;
10823         set_regex_charset(&RExC_flags, cs);
10824     }
10825     else {
10826         cs = get_regex_charset(RExC_flags);
10827         if (   cs == REGEX_DEPENDS_CHARSET
10828             && (toUSE_UNI_CHARSET_NOT_DEPENDS))
10829         {
10830             cs = REGEX_UNICODE_CHARSET;
10831         }
10832     }
10833 
10834     while (RExC_parse < RExC_end) {
10835         /* && memCHRs("iogcmsx", *RExC_parse) */
10836         /* (?g), (?gc) and (?o) are useless here
10837            and must be globally applied -- japhy */
10838         if ((RExC_pm_flags & PMf_WILDCARD)) {
10839             if (flagsp == & negflags) {
10840                 if (*RExC_parse == 'm') {
10841                     RExC_parse++;
10842                     /* diag_listed_as: Use of %s is not allowed in Unicode
10843                        property wildcard subpatterns in regex; marked by <--
10844                        HERE in m/%s/ */
10845                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
10846                           " property wildcard subpatterns");
10847                 }
10848             }
10849             else {
10850                 if (*RExC_parse == 's') {
10851                     goto modifier_illegal_in_wildcard;
10852                 }
10853             }
10854         }
10855 
10856         switch (*RExC_parse) {
10857 
10858             /* Code for the imsxn flags */
10859             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10860 
10861             case LOCALE_PAT_MOD:
10862                 if (has_charset_modifier) {
10863                     goto excess_modifier;
10864                 }
10865                 else if (flagsp == &negflags) {
10866                     goto neg_modifier;
10867                 }
10868                 cs = REGEX_LOCALE_CHARSET;
10869                 has_charset_modifier = LOCALE_PAT_MOD;
10870                 break;
10871             case UNICODE_PAT_MOD:
10872                 if (has_charset_modifier) {
10873                     goto excess_modifier;
10874                 }
10875                 else if (flagsp == &negflags) {
10876                     goto neg_modifier;
10877                 }
10878                 cs = REGEX_UNICODE_CHARSET;
10879                 has_charset_modifier = UNICODE_PAT_MOD;
10880                 break;
10881             case ASCII_RESTRICT_PAT_MOD:
10882                 if (flagsp == &negflags) {
10883                     goto neg_modifier;
10884                 }
10885                 if (has_charset_modifier) {
10886                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10887                         goto excess_modifier;
10888                     }
10889                     /* Doubled modifier implies more restricted */
10890                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10891                 }
10892                 else {
10893                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10894                 }
10895                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10896                 break;
10897             case DEPENDS_PAT_MOD:
10898                 if (has_use_defaults) {
10899                     goto fail_modifiers;
10900                 }
10901                 else if (flagsp == &negflags) {
10902                     goto neg_modifier;
10903                 }
10904                 else if (has_charset_modifier) {
10905                     goto excess_modifier;
10906                 }
10907 
10908                 /* The dual charset means unicode semantics if the
10909                  * pattern (or target, not known until runtime) are
10910                  * utf8, or something in the pattern indicates unicode
10911                  * semantics */
10912                 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10913                      ? REGEX_UNICODE_CHARSET
10914                      : REGEX_DEPENDS_CHARSET;
10915                 has_charset_modifier = DEPENDS_PAT_MOD;
10916                 break;
10917               excess_modifier:
10918                 RExC_parse++;
10919                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10920                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10921                 }
10922                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10923                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10924                                         *(RExC_parse - 1));
10925                 }
10926                 else {
10927                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10928                 }
10929                 NOT_REACHED; /*NOTREACHED*/
10930               neg_modifier:
10931                 RExC_parse++;
10932                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10933                                     *(RExC_parse - 1));
10934                 NOT_REACHED; /*NOTREACHED*/
10935             case GLOBAL_PAT_MOD: /* 'g' */
10936                 if (RExC_pm_flags & PMf_WILDCARD) {
10937                     goto modifier_illegal_in_wildcard;
10938                 }
10939                 /*FALLTHROUGH*/
10940             case ONCE_PAT_MOD: /* 'o' */
10941                 if (ckWARN(WARN_REGEXP)) {
10942                     const I32 wflagbit = *RExC_parse == 'o'
10943                                          ? WASTED_O
10944                                          : WASTED_G;
10945                     if (! (wastedflags & wflagbit) ) {
10946                         wastedflags |= wflagbit;
10947 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10948                         vWARN5(
10949                             RExC_parse + 1,
10950                             "Useless (%s%c) - %suse /%c modifier",
10951                             flagsp == &negflags ? "?-" : "?",
10952                             *RExC_parse,
10953                             flagsp == &negflags ? "don't " : "",
10954                             *RExC_parse
10955                         );
10956                     }
10957                 }
10958                 break;
10959 
10960             case CONTINUE_PAT_MOD: /* 'c' */
10961                 if (RExC_pm_flags & PMf_WILDCARD) {
10962                     goto modifier_illegal_in_wildcard;
10963                 }
10964                 if (ckWARN(WARN_REGEXP)) {
10965                     if (! (wastedflags & WASTED_C) ) {
10966                         wastedflags |= WASTED_GC;
10967 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10968                         vWARN3(
10969                             RExC_parse + 1,
10970                             "Useless (%sc) - %suse /gc modifier",
10971                             flagsp == &negflags ? "?-" : "?",
10972                             flagsp == &negflags ? "don't " : ""
10973                         );
10974                     }
10975                 }
10976                 break;
10977             case KEEPCOPY_PAT_MOD: /* 'p' */
10978                 if (RExC_pm_flags & PMf_WILDCARD) {
10979                     goto modifier_illegal_in_wildcard;
10980                 }
10981                 if (flagsp == &negflags) {
10982                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10983                 } else {
10984                     *flagsp |= RXf_PMf_KEEPCOPY;
10985                 }
10986                 break;
10987             case '-':
10988                 /* A flag is a default iff it is following a minus, so
10989                  * if there is a minus, it means will be trying to
10990                  * re-specify a default which is an error */
10991                 if (has_use_defaults || flagsp == &negflags) {
10992                     goto fail_modifiers;
10993                 }
10994                 flagsp = &negflags;
10995                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10996                 x_mod_count = 0;
10997                 break;
10998             case ':':
10999             case ')':
11000 
11001                 if (  (RExC_pm_flags & PMf_WILDCARD)
11002                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11003                 {
11004                     RExC_parse++;
11005                     /* diag_listed_as: Use of %s is not allowed in Unicode
11006                        property wildcard subpatterns in regex; marked by <--
11007                        HERE in m/%s/ */
11008                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11009                            " property wildcard subpatterns",
11010                            has_charset_modifier);
11011                 }
11012 
11013                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11014                     negflags |= RXf_PMf_EXTENDED_MORE;
11015                 }
11016                 RExC_flags |= posflags;
11017 
11018                 if (negflags & RXf_PMf_EXTENDED) {
11019                     negflags |= RXf_PMf_EXTENDED_MORE;
11020                 }
11021                 RExC_flags &= ~negflags;
11022                 set_regex_charset(&RExC_flags, cs);
11023 
11024                 return;
11025             default:
11026               fail_modifiers:
11027                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11028 		/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11029                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11030                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11031                 NOT_REACHED; /*NOTREACHED*/
11032         }
11033 
11034         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11035     }
11036 
11037     vFAIL("Sequence (?... not terminated");
11038 
11039   modifier_illegal_in_wildcard:
11040     RExC_parse++;
11041     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11042        subpatterns in regex; marked by <-- HERE in m/%s/ */
11043     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11044            " subpatterns", *(RExC_parse - 1));
11045 }
11046 
11047 /*
11048  - reg - regular expression, i.e. main body or parenthesized thing
11049  *
11050  * Caller must absorb opening parenthesis.
11051  *
11052  * Combining parenthesis handling with the base level of regular expression
11053  * is a trifle forced, but the need to tie the tails of the branches to what
11054  * follows makes it hard to avoid.
11055  */
11056 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11057 #ifdef DEBUGGING
11058 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11059 #else
11060 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11061 #endif
11062 
11063 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * parse_start,char ch)11064 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11065                              I32 *flagp,
11066                              char * parse_start,
11067                              char ch
11068                       )
11069 {
11070     regnode_offset ret;
11071     char* name_start = RExC_parse;
11072     U32 num = 0;
11073     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11074     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11075 
11076     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11077 
11078     if (RExC_parse != name_start && ch == '}') {
11079         while (isBLANK(*RExC_parse)) {
11080             RExC_parse++;
11081         }
11082     }
11083     if (RExC_parse == name_start || *RExC_parse != ch) {
11084         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11085         vFAIL2("Sequence %.3s... not terminated", parse_start);
11086     }
11087 
11088     if (sv_dat) {
11089         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11090         RExC_rxi->data->data[num]=(void*)sv_dat;
11091         SvREFCNT_inc_simple_void_NN(sv_dat);
11092     }
11093     RExC_sawback = 1;
11094     ret = reganode(pRExC_state,
11095                    ((! FOLD)
11096                      ? REFN
11097                      : (ASCII_FOLD_RESTRICTED)
11098                        ? REFFAN
11099                        : (AT_LEAST_UNI_SEMANTICS)
11100                          ? REFFUN
11101                          : (LOC)
11102                            ? REFFLN
11103                            : REFFN),
11104                     num);
11105     *flagp |= HASWIDTH;
11106 
11107     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11108     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11109 
11110     nextchar(pRExC_state);
11111     return ret;
11112 }
11113 
11114 /* On success, returns the offset at which any next node should be placed into
11115  * the regex engine program being compiled.
11116  *
11117  * Returns 0 otherwise, with *flagp set to indicate why:
11118  *  TRYAGAIN        at the end of (?) that only sets flags.
11119  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11120  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11121  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11122  *  happen.  */
11123 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)11124 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11125     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11126      * 2 is like 1, but indicates that nextchar() has been called to advance
11127      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11128      * this flag alerts us to the need to check for that */
11129 {
11130     regnode_offset ret = 0;    /* Will be the head of the group. */
11131     regnode_offset br;
11132     regnode_offset lastbr;
11133     regnode_offset ender = 0;
11134     I32 parno = 0;
11135     I32 flags;
11136     U32 oregflags = RExC_flags;
11137     bool have_branch = 0;
11138     bool is_open = 0;
11139     I32 freeze_paren = 0;
11140     I32 after_freeze = 0;
11141     I32 num; /* numeric backreferences */
11142     SV * max_open;  /* Max number of unclosed parens */
11143     I32 was_in_lookaround = RExC_in_lookaround;
11144 
11145     char * parse_start = RExC_parse; /* MJD */
11146     char * const oregcomp_parse = RExC_parse;
11147 
11148     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11149 
11150     PERL_ARGS_ASSERT_REG;
11151     DEBUG_PARSE("reg ");
11152 
11153     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11154     assert(max_open);
11155     if (!SvIOK(max_open)) {
11156         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11157     }
11158     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11159                                               open paren */
11160         vFAIL("Too many nested open parens");
11161     }
11162 
11163     *flagp = 0;				/* Initialize. */
11164 
11165     /* Having this true makes it feasible to have a lot fewer tests for the
11166      * parse pointer being in scope.  For example, we can write
11167      *      while(isFOO(*RExC_parse)) RExC_parse++;
11168      * instead of
11169      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11170      */
11171     assert(*RExC_end == '\0');
11172 
11173     /* Make an OPEN node, if parenthesized. */
11174     if (paren) {
11175 
11176         /* Under /x, space and comments can be gobbled up between the '(' and
11177          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11178          * intervening space, as the sequence is a token, and a token should be
11179          * indivisible */
11180         bool has_intervening_patws = (paren == 2)
11181                                   && *(RExC_parse - 1) != '(';
11182 
11183         if (RExC_parse >= RExC_end) {
11184 	    vFAIL("Unmatched (");
11185         }
11186 
11187         if (paren == 'r') {     /* Atomic script run */
11188             paren = '>';
11189             goto parse_rest;
11190         }
11191         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11192 	    char *start_verb = RExC_parse + 1;
11193 	    STRLEN verb_len;
11194 	    char *start_arg = NULL;
11195 	    unsigned char op = 0;
11196             int arg_required = 0;
11197             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11198             bool has_upper = FALSE;
11199 
11200             if (has_intervening_patws) {
11201                 RExC_parse++;   /* past the '*' */
11202 
11203                 /* For strict backwards compatibility, don't change the message
11204                  * now that we also have lowercase operands */
11205                 if (isUPPER(*RExC_parse)) {
11206                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11207                 }
11208                 else {
11209                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11210                 }
11211             }
11212 	    while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11213 	        if ( *RExC_parse == ':' ) {
11214 	            start_arg = RExC_parse + 1;
11215 	            break;
11216 	        }
11217                 else if (! UTF) {
11218                     if (isUPPER(*RExC_parse)) {
11219                         has_upper = TRUE;
11220                     }
11221                     RExC_parse++;
11222                 }
11223                 else {
11224                     RExC_parse += UTF8SKIP(RExC_parse);
11225                 }
11226 	    }
11227 	    verb_len = RExC_parse - start_verb;
11228 	    if ( start_arg ) {
11229                 if (RExC_parse >= RExC_end) {
11230                     goto unterminated_verb_pattern;
11231                 }
11232 
11233 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11234 	        while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11235                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11236                 }
11237 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11238                   unterminated_verb_pattern:
11239                     if (has_upper) {
11240                         vFAIL("Unterminated verb pattern argument");
11241                     }
11242                     else {
11243                         vFAIL("Unterminated '(*...' argument");
11244                     }
11245                 }
11246 	    } else {
11247 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11248                     if (has_upper) {
11249                         vFAIL("Unterminated verb pattern");
11250                     }
11251                     else {
11252                         vFAIL("Unterminated '(*...' construct");
11253                     }
11254                 }
11255 	    }
11256 
11257             /* Here, we know that RExC_parse < RExC_end */
11258 
11259 	    switch ( *start_verb ) {
11260             case 'A':  /* (*ACCEPT) */
11261                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11262 		    op = ACCEPT;
11263 		    internal_argval = RExC_nestroot;
11264 		}
11265 		break;
11266             case 'C':  /* (*COMMIT) */
11267                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11268                     op = COMMIT;
11269                 break;
11270             case 'F':  /* (*FAIL) */
11271                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11272 		    op = OPFAIL;
11273 		}
11274 		break;
11275             case ':':  /* (*:NAME) */
11276 	    case 'M':  /* (*MARK:NAME) */
11277 	        if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11278                     op = MARKPOINT;
11279                     arg_required = 1;
11280                 }
11281                 break;
11282             case 'P':  /* (*PRUNE) */
11283                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11284                     op = PRUNE;
11285                 break;
11286             case 'S':   /* (*SKIP) */
11287                 if ( memEQs(start_verb, verb_len,"SKIP") )
11288                     op = SKIP;
11289                 break;
11290             case 'T':  /* (*THEN) */
11291                 /* [19:06] <TimToady> :: is then */
11292                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11293                     op = CUTGROUP;
11294                     RExC_seen |= REG_CUTGROUP_SEEN;
11295                 }
11296                 break;
11297             case 'a':
11298                 if (   memEQs(start_verb, verb_len, "asr")
11299                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11300                 {
11301                     paren = 'r';        /* Mnemonic: recursed run */
11302                     goto script_run;
11303                 }
11304                 else if (memEQs(start_verb, verb_len, "atomic")) {
11305                     paren = 't';    /* AtOMIC */
11306                     goto alpha_assertions;
11307                 }
11308                 break;
11309             case 'p':
11310                 if (   memEQs(start_verb, verb_len, "plb")
11311                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11312                 {
11313                     paren = 'b';
11314                     goto lookbehind_alpha_assertions;
11315                 }
11316                 else if (   memEQs(start_verb, verb_len, "pla")
11317                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11318                 {
11319                     paren = 'a';
11320                     goto alpha_assertions;
11321                 }
11322                 break;
11323             case 'n':
11324                 if (   memEQs(start_verb, verb_len, "nlb")
11325                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11326                 {
11327                     paren = 'B';
11328                     goto lookbehind_alpha_assertions;
11329                 }
11330                 else if (   memEQs(start_verb, verb_len, "nla")
11331                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11332                 {
11333                     paren = 'A';
11334                     goto alpha_assertions;
11335                 }
11336                 break;
11337             case 's':
11338                 if (   memEQs(start_verb, verb_len, "sr")
11339                     || memEQs(start_verb, verb_len, "script_run"))
11340                 {
11341                     regnode_offset atomic;
11342 
11343                     paren = 's';
11344 
11345                    script_run:
11346 
11347                     /* This indicates Unicode rules. */
11348                     REQUIRE_UNI_RULES(flagp, 0);
11349 
11350                     if (! start_arg) {
11351                         goto no_colon;
11352                     }
11353 
11354                     RExC_parse = start_arg;
11355 
11356                     if (RExC_in_script_run) {
11357 
11358                         /*  Nested script runs are treated as no-ops, because
11359                          *  if the nested one fails, the outer one must as
11360                          *  well.  It could fail sooner, and avoid (??{} with
11361                          *  side effects, but that is explicitly documented as
11362                          *  undefined behavior. */
11363 
11364                         ret = 0;
11365 
11366                         if (paren == 's') {
11367                             paren = ':';
11368                             goto parse_rest;
11369                         }
11370 
11371                         /* But, the atomic part of a nested atomic script run
11372                          * isn't a no-op, but can be treated just like a '(?>'
11373                          * */
11374                         paren = '>';
11375                         goto parse_rest;
11376                     }
11377 
11378                     if (paren == 's') {
11379                         /* Here, we're starting a new regular script run */
11380                         ret = reg_node(pRExC_state, SROPEN);
11381                         RExC_in_script_run = 1;
11382                         is_open = 1;
11383                         goto parse_rest;
11384                     }
11385 
11386                     /* Here, we are starting an atomic script run.  This is
11387                      * handled by recursing to deal with the atomic portion
11388                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11389 
11390                     ret = reg_node(pRExC_state, SROPEN);
11391 
11392                     RExC_in_script_run = 1;
11393 
11394                     atomic = reg(pRExC_state, 'r', &flags, depth);
11395                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11396                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11397                         return 0;
11398                     }
11399 
11400                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11401                         REQUIRE_BRANCHJ(flagp, 0);
11402                     }
11403 
11404                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11405                                                                 SRCLOSE)))
11406                     {
11407                         REQUIRE_BRANCHJ(flagp, 0);
11408                     }
11409 
11410                     RExC_in_script_run = 0;
11411                     return ret;
11412                 }
11413 
11414                 break;
11415 
11416             lookbehind_alpha_assertions:
11417                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11418                 /*FALLTHROUGH*/
11419 
11420             alpha_assertions:
11421 
11422                 RExC_in_lookaround++;
11423                 RExC_seen_zerolen++;
11424 
11425                 if (! start_arg) {
11426                     goto no_colon;
11427                 }
11428 
11429                 /* An empty negative lookahead assertion simply is failure */
11430                 if (paren == 'A' && RExC_parse == start_arg) {
11431                     ret=reganode(pRExC_state, OPFAIL, 0);
11432                     nextchar(pRExC_state);
11433                     return ret;
11434 	        }
11435 
11436                 RExC_parse = start_arg;
11437                 goto parse_rest;
11438 
11439               no_colon:
11440                 vFAIL2utf8f(
11441                 "'(*%" UTF8f "' requires a terminating ':'",
11442                 UTF8fARG(UTF, verb_len, start_verb));
11443 		NOT_REACHED; /*NOTREACHED*/
11444 
11445 	    } /* End of switch */
11446 	    if ( ! op ) {
11447 	        RExC_parse += UTF
11448                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11449                               : 1;
11450                 if (has_upper || verb_len == 0) {
11451                     vFAIL2utf8f(
11452                     "Unknown verb pattern '%" UTF8f "'",
11453                     UTF8fARG(UTF, verb_len, start_verb));
11454                 }
11455                 else {
11456                     vFAIL2utf8f(
11457                     "Unknown '(*...)' construct '%" UTF8f "'",
11458                     UTF8fARG(UTF, verb_len, start_verb));
11459                 }
11460 	    }
11461             if ( RExC_parse == start_arg ) {
11462                 start_arg = NULL;
11463             }
11464             if ( arg_required && !start_arg ) {
11465                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11466                     (int) verb_len, start_verb);
11467             }
11468             if (internal_argval == -1) {
11469                 ret = reganode(pRExC_state, op, 0);
11470             } else {
11471                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11472             }
11473             RExC_seen |= REG_VERBARG_SEEN;
11474             if (start_arg) {
11475                 SV *sv = newSVpvn( start_arg,
11476                                     RExC_parse - start_arg);
11477                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11478                                         STR_WITH_LEN("S"));
11479                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11480                 FLAGS(REGNODE_p(ret)) = 1;
11481             } else {
11482                 FLAGS(REGNODE_p(ret)) = 0;
11483             }
11484             if ( internal_argval != -1 )
11485                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11486 	    nextchar(pRExC_state);
11487 	    return ret;
11488         }
11489         else if (*RExC_parse == '?') { /* (?...) */
11490 	    bool is_logical = 0;
11491 	    const char * const seqstart = RExC_parse;
11492             const char * endptr;
11493             const char non_existent_group_msg[]
11494                                             = "Reference to nonexistent group";
11495             const char impossible_group[] = "Invalid reference to group";
11496 
11497             if (has_intervening_patws) {
11498                 RExC_parse++;
11499                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11500             }
11501 
11502 	    RExC_parse++;           /* past the '?' */
11503             paren = *RExC_parse;    /* might be a trailing NUL, if not
11504                                        well-formed */
11505             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11506             if (RExC_parse > RExC_end) {
11507                 paren = '\0';
11508             }
11509 	    ret = 0;			/* For look-ahead/behind. */
11510 	    switch (paren) {
11511 
11512 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
11513 	        paren = *RExC_parse;
11514 		if ( paren == '<') {    /* (?P<...>) named capture */
11515                     RExC_parse++;
11516                     if (RExC_parse >= RExC_end) {
11517                         vFAIL("Sequence (?P<... not terminated");
11518                     }
11519 		    goto named_capture;
11520                 }
11521                 else if (paren == '>') {   /* (?P>name) named recursion */
11522                     RExC_parse++;
11523                     if (RExC_parse >= RExC_end) {
11524                         vFAIL("Sequence (?P>... not terminated");
11525                     }
11526                     goto named_recursion;
11527                 }
11528                 else if (paren == '=') {   /* (?P=...)  named backref */
11529                     RExC_parse++;
11530                     return handle_named_backref(pRExC_state, flagp,
11531                                                 parse_start, ')');
11532                 }
11533                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11534                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11535 		vFAIL3("Sequence (%.*s...) not recognized",
11536                                 (int) (RExC_parse - seqstart), seqstart);
11537 		NOT_REACHED; /*NOTREACHED*/
11538             case '<':           /* (?<...) */
11539                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11540 		if (*RExC_parse == '!')
11541 		    paren = ',';
11542 		else if (*RExC_parse != '=')
11543               named_capture:
11544 		{               /* (?<...>) */
11545 		    char *name_start;
11546 		    SV *svname;
11547 		    paren= '>';
11548                 /* FALLTHROUGH */
11549             case '\'':          /* (?'...') */
11550                     name_start = RExC_parse;
11551                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11552 		    if (   RExC_parse == name_start
11553                         || RExC_parse >= RExC_end
11554                         || *RExC_parse != paren)
11555                     {
11556 		        vFAIL2("Sequence (?%c... not terminated",
11557 		            paren=='>' ? '<' : (char) paren);
11558                     }
11559 		    {
11560 			HE *he_str;
11561 			SV *sv_dat = NULL;
11562                         if (!svname) /* shouldn't happen */
11563                             Perl_croak(aTHX_
11564                                 "panic: reg_scan_name returned NULL");
11565                         if (!RExC_paren_names) {
11566                             RExC_paren_names= newHV();
11567                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11568 #ifdef DEBUGGING
11569                             RExC_paren_name_list= newAV();
11570                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11571 #endif
11572                         }
11573                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11574                         if ( he_str )
11575                             sv_dat = HeVAL(he_str);
11576                         if ( ! sv_dat ) {
11577                             /* croak baby croak */
11578                             Perl_croak(aTHX_
11579                                 "panic: paren_name hash element allocation failed");
11580                         } else if ( SvPOK(sv_dat) ) {
11581                             /* (?|...) can mean we have dupes so scan to check
11582                                its already been stored. Maybe a flag indicating
11583                                we are inside such a construct would be useful,
11584                                but the arrays are likely to be quite small, so
11585                                for now we punt -- dmq */
11586                             IV count = SvIV(sv_dat);
11587                             I32 *pv = (I32*)SvPVX(sv_dat);
11588                             IV i;
11589                             for ( i = 0 ; i < count ; i++ ) {
11590                                 if ( pv[i] == RExC_npar ) {
11591                                     count = 0;
11592                                     break;
11593                                 }
11594                             }
11595                             if ( count ) {
11596                                 pv = (I32*)SvGROW(sv_dat,
11597                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11598                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11599                                 pv[count] = RExC_npar;
11600                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11601                             }
11602                         } else {
11603                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11604                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11605                                                                 sizeof(I32));
11606                             SvIOK_on(sv_dat);
11607                             SvIV_set(sv_dat, 1);
11608                         }
11609 #ifdef DEBUGGING
11610                         /* Yes this does cause a memory leak in debugging Perls
11611                          * */
11612                         if (!av_store(RExC_paren_name_list,
11613                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11614                             SvREFCNT_dec_NN(svname);
11615 #endif
11616 
11617                         /*sv_dump(sv_dat);*/
11618                     }
11619                     nextchar(pRExC_state);
11620 		    paren = 1;
11621 		    goto capturing_parens;
11622 		}
11623 
11624                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11625 		RExC_in_lookaround++;
11626 		RExC_parse++;
11627                 if (RExC_parse >= RExC_end) {
11628                     vFAIL("Sequence (?... not terminated");
11629                 }
11630                 RExC_seen_zerolen++;
11631                 break;
11632 	    case '=':           /* (?=...) */
11633 		RExC_seen_zerolen++;
11634                 RExC_in_lookaround++;
11635                 break;
11636 	    case '!':           /* (?!...) */
11637 		RExC_seen_zerolen++;
11638 		/* check if we're really just a "FAIL" assertion */
11639                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11640                                         FALSE /* Don't force to /x */ );
11641 	        if (*RExC_parse == ')') {
11642                     ret=reganode(pRExC_state, OPFAIL, 0);
11643 	            nextchar(pRExC_state);
11644 	            return ret;
11645 	        }
11646                 RExC_in_lookaround++;
11647 	        break;
11648 	    case '|':           /* (?|...) */
11649 	        /* branch reset, behave like a (?:...) except that
11650 	           buffers in alternations share the same numbers */
11651 	        paren = ':';
11652 	        after_freeze = freeze_paren = RExC_npar;
11653 
11654                 /* XXX This construct currently requires an extra pass.
11655                  * Investigation would be required to see if that could be
11656                  * changed */
11657                 REQUIRE_PARENS_PASS;
11658 	        break;
11659 	    case ':':           /* (?:...) */
11660 	    case '>':           /* (?>...) */
11661 		break;
11662 	    case '$':           /* (?$...) */
11663 	    case '@':           /* (?@...) */
11664 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11665 		break;
11666 	    case '0' :           /* (?0) */
11667 	    case 'R' :           /* (?R) */
11668                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11669 		    FAIL("Sequence (?R) not terminated");
11670                 num = 0;
11671                 RExC_seen |= REG_RECURSE_SEEN;
11672 
11673                 /* XXX These constructs currently require an extra pass.
11674                  * It probably could be changed */
11675                 REQUIRE_PARENS_PASS;
11676 
11677 		*flagp |= POSTPONED;
11678                 goto gen_recurse_regop;
11679 		/*notreached*/
11680             /* named and numeric backreferences */
11681             case '&':            /* (?&NAME) */
11682                 parse_start = RExC_parse - 1;
11683               named_recursion:
11684                 {
11685                     SV *sv_dat = reg_scan_name(pRExC_state,
11686                                                REG_RSN_RETURN_DATA);
11687                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11688                 }
11689                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11690                     vFAIL("Sequence (?&... not terminated");
11691                 goto gen_recurse_regop;
11692                 /* NOTREACHED */
11693             case '+':
11694                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11695                     RExC_parse++;
11696                     vFAIL("Illegal pattern");
11697                 }
11698                 goto parse_recursion;
11699                 /* NOTREACHED*/
11700             case '-': /* (?-1) */
11701                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11702                     RExC_parse--; /* rewind to let it be handled later */
11703                     goto parse_flags;
11704                 }
11705                 /* FALLTHROUGH */
11706             case '1': case '2': case '3': case '4': /* (?1) */
11707 	    case '5': case '6': case '7': case '8': case '9':
11708 	        RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11709               parse_recursion:
11710                 {
11711                     bool is_neg = FALSE;
11712                     UV unum;
11713                     parse_start = RExC_parse - 1; /* MJD */
11714                     if (*RExC_parse == '-') {
11715                         RExC_parse++;
11716                         is_neg = TRUE;
11717                     }
11718                     endptr = RExC_end;
11719                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11720                         && unum <= I32_MAX
11721                     ) {
11722                         num = (I32)unum;
11723                         RExC_parse = (char*)endptr;
11724                     }
11725                     else {  /* Overflow, or something like that.  Position
11726                                beyond all digits for the message */
11727                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
11728                             RExC_parse++;
11729                         }
11730                         vFAIL(impossible_group);
11731                     }
11732                     if (is_neg) {
11733                         /* -num is always representable on 1 and 2's complement
11734                          * machines */
11735                         num = -num;
11736                     }
11737                 }
11738 	        if (*RExC_parse!=')')
11739 	            vFAIL("Expecting close bracket");
11740 
11741               gen_recurse_regop:
11742                 if (paren == '-' || paren == '+') {
11743 
11744                     /* Don't overflow */
11745                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11746                         RExC_parse++;
11747                         vFAIL(impossible_group);
11748                     }
11749 
11750                     /*
11751                     Diagram of capture buffer numbering.
11752                     Top line is the normal capture buffer numbers
11753                     Bottom line is the negative indexing as from
11754                     the X (the (?-2))
11755 
11756                         1 2    3 4 5 X   Y      6 7
11757                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11758                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11759                     -   5 4    3 2 1 X   Y      x x
11760 
11761                     Resolve to absolute group.  Recall that RExC_npar is +1 of
11762                     the actual parenthesis group number.  For lookahead, we
11763                     have to compensate for that.  Using the above example, when
11764                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
11765                     want 7 for +2, and 4 for -2.
11766                     */
11767                     if ( paren == '+' ) {
11768                         num--;
11769                     }
11770 
11771                     num += RExC_npar;
11772 
11773                     if (paren == '-' && num < 1) {
11774                         RExC_parse++;
11775                         vFAIL(non_existent_group_msg);
11776                     }
11777                 }
11778 
11779                 if (num >= RExC_npar) {
11780 
11781                     /* It might be a forward reference; we can't fail until we
11782                      * know, by completing the parse to get all the groups, and
11783                      * then reparsing */
11784                     if (ALL_PARENS_COUNTED)  {
11785                         if (num >= RExC_total_parens) {
11786                             RExC_parse++;
11787                             vFAIL(non_existent_group_msg);
11788                         }
11789                     }
11790                     else {
11791                         REQUIRE_PARENS_PASS;
11792                     }
11793                 }
11794 
11795                 /* We keep track how many GOSUB items we have produced.
11796                    To start off the ARG2L() of the GOSUB holds its "id",
11797                    which is used later in conjunction with RExC_recurse
11798                    to calculate the offset we need to jump for the GOSUB,
11799                    which it will store in the final representation.
11800                    We have to defer the actual calculation until much later
11801                    as the regop may move.
11802                  */
11803                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11804                 RExC_recurse_count++;
11805                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11806                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11807                             22, "|    |", (int)(depth * 2 + 1), "",
11808                             (UV)ARG(REGNODE_p(ret)),
11809                             (IV)ARG2L(REGNODE_p(ret))));
11810                 RExC_seen |= REG_RECURSE_SEEN;
11811 
11812                 Set_Node_Length(REGNODE_p(ret),
11813                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11814 		Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11815 
11816                 *flagp |= POSTPONED;
11817                 assert(*RExC_parse == ')');
11818                 nextchar(pRExC_state);
11819                 return ret;
11820 
11821             /* NOTREACHED */
11822 
11823 	    case '?':           /* (??...) */
11824 		is_logical = 1;
11825 		if (*RExC_parse != '{') {
11826                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11827                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11828                     vFAIL2utf8f(
11829                         "Sequence (%" UTF8f "...) not recognized",
11830                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11831 		    NOT_REACHED; /*NOTREACHED*/
11832 		}
11833 		*flagp |= POSTPONED;
11834 		paren = '{';
11835                 RExC_parse++;
11836 		/* FALLTHROUGH */
11837 	    case '{':           /* (?{...}) */
11838 	    {
11839 		U32 n = 0;
11840 		struct reg_code_block *cb;
11841                 OP * o;
11842 
11843 		RExC_seen_zerolen++;
11844 
11845 		if (   !pRExC_state->code_blocks
11846 		    || pRExC_state->code_index
11847                                         >= pRExC_state->code_blocks->count
11848 		    || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11849 			!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11850 			    - RExC_start)
11851 		) {
11852 		    if (RExC_pm_flags & PMf_USE_RE_EVAL)
11853 			FAIL("panic: Sequence (?{...}): no code block found\n");
11854 		    FAIL("Eval-group not allowed at runtime, use re 'eval'");
11855 		}
11856 		/* this is a pre-compiled code block (?{...}) */
11857 		cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11858 		RExC_parse = RExC_start + cb->end;
11859 		o = cb->block;
11860                 if (cb->src_regex) {
11861                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11862                     RExC_rxi->data->data[n] =
11863                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11864                     RExC_rxi->data->data[n+1] = (void*)o;
11865                 }
11866                 else {
11867                     n = add_data(pRExC_state,
11868                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11869                     RExC_rxi->data->data[n] = (void*)o;
11870                 }
11871 		pRExC_state->code_index++;
11872 		nextchar(pRExC_state);
11873 
11874 		if (is_logical) {
11875                     regnode_offset eval;
11876 		    ret = reg_node(pRExC_state, LOGICAL);
11877 
11878                     eval = reg2Lanode(pRExC_state, EVAL,
11879                                        n,
11880 
11881                                        /* for later propagation into (??{})
11882                                         * return value */
11883                                        RExC_flags & RXf_PMf_COMPILETIME
11884                                       );
11885                     FLAGS(REGNODE_p(ret)) = 2;
11886                     if (! REGTAIL(pRExC_state, ret, eval)) {
11887                         REQUIRE_BRANCHJ(flagp, 0);
11888                     }
11889                     /* deal with the length of this later - MJD */
11890 		    return ret;
11891 		}
11892 		ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11893 		Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11894 		Set_Node_Offset(REGNODE_p(ret), parse_start);
11895 		return ret;
11896 	    }
11897 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
11898 	    {
11899 	        int is_define= 0;
11900                 const int DEFINE_len = sizeof("DEFINE") - 1;
11901 		if (    RExC_parse < RExC_end - 1
11902                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11903                             && (   RExC_parse[1] == '='
11904                                 || RExC_parse[1] == '!'
11905                                 || RExC_parse[1] == '<'
11906                                 || RExC_parse[1] == '{'))
11907 		        || (       RExC_parse[0] == '*'        /* (?(*...)) */
11908                             && (   memBEGINs(RExC_parse + 1,
11909                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11910                                          "pla:")
11911                                 || memBEGINs(RExC_parse + 1,
11912                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11913                                          "plb:")
11914                                 || memBEGINs(RExC_parse + 1,
11915                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11916                                          "nla:")
11917                                 || memBEGINs(RExC_parse + 1,
11918                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11919                                          "nlb:")
11920                                 || memBEGINs(RExC_parse + 1,
11921                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11922                                          "positive_lookahead:")
11923                                 || memBEGINs(RExC_parse + 1,
11924                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11925                                          "positive_lookbehind:")
11926                                 || memBEGINs(RExC_parse + 1,
11927                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11928                                          "negative_lookahead:")
11929                                 || memBEGINs(RExC_parse + 1,
11930                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11931                                          "negative_lookbehind:"))))
11932                 ) { /* Lookahead or eval. */
11933                     I32 flag;
11934                     regnode_offset tail;
11935 
11936                     ret = reg_node(pRExC_state, LOGICAL);
11937                     FLAGS(REGNODE_p(ret)) = 1;
11938 
11939                     tail = reg(pRExC_state, 1, &flag, depth+1);
11940                     RETURN_FAIL_ON_RESTART(flag, flagp);
11941                     if (! REGTAIL(pRExC_state, ret, tail)) {
11942                         REQUIRE_BRANCHJ(flagp, 0);
11943                     }
11944                     goto insert_if;
11945                 }
11946 		else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11947 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11948 	        {
11949 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
11950 	            char *name_start= RExC_parse++;
11951 	            U32 num = 0;
11952 	            SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11953 	            if (   RExC_parse == name_start
11954                         || RExC_parse >= RExC_end
11955                         || *RExC_parse != ch)
11956                     {
11957                         vFAIL2("Sequence (?(%c... not terminated",
11958                             (ch == '>' ? '<' : ch));
11959                     }
11960                     RExC_parse++;
11961                     if (sv_dat) {
11962                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11963                         RExC_rxi->data->data[num]=(void*)sv_dat;
11964                         SvREFCNT_inc_simple_void_NN(sv_dat);
11965                     }
11966                     ret = reganode(pRExC_state, GROUPPN, num);
11967                     goto insert_if_check_paren;
11968 		}
11969 		else if (memBEGINs(RExC_parse,
11970                                    (STRLEN) (RExC_end - RExC_parse),
11971                                    "DEFINE"))
11972                 {
11973 		    ret = reganode(pRExC_state, DEFINEP, 0);
11974 		    RExC_parse += DEFINE_len;
11975 		    is_define = 1;
11976 		    goto insert_if_check_paren;
11977 		}
11978 		else if (RExC_parse[0] == 'R') {
11979 		    RExC_parse++;
11980                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11981                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11982                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11983                      */
11984 		    parno = 0;
11985                     if (RExC_parse[0] == '0') {
11986                         parno = 1;
11987                         RExC_parse++;
11988                     }
11989                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11990                         UV uv;
11991                         endptr = RExC_end;
11992                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11993                             && uv <= I32_MAX
11994                         ) {
11995                             parno = (I32)uv + 1;
11996                             RExC_parse = (char*)endptr;
11997                         }
11998                         /* else "Switch condition not recognized" below */
11999 		    } else if (RExC_parse[0] == '&') {
12000 		        SV *sv_dat;
12001 		        RExC_parse++;
12002 		        sv_dat = reg_scan_name(pRExC_state,
12003                                                REG_RSN_RETURN_DATA);
12004                         if (sv_dat)
12005                             parno = 1 + *((I32 *)SvPVX(sv_dat));
12006 		    }
12007 		    ret = reganode(pRExC_state, INSUBP, parno);
12008 		    goto insert_if_check_paren;
12009 		}
12010                 else if (inRANGE(RExC_parse[0], '1', '9')) {
12011                     /* (?(1)...) */
12012 		    char c;
12013                     UV uv;
12014                     endptr = RExC_end;
12015                     if (grok_atoUV(RExC_parse, &uv, &endptr)
12016                         && uv <= I32_MAX
12017                     ) {
12018                         parno = (I32)uv;
12019                         RExC_parse = (char*)endptr;
12020                     }
12021                     else {
12022                         vFAIL("panic: grok_atoUV returned FALSE");
12023                     }
12024                     ret = reganode(pRExC_state, GROUPP, parno);
12025 
12026                  insert_if_check_paren:
12027 		    if (UCHARAT(RExC_parse) != ')') {
12028                         RExC_parse += UTF
12029                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12030                                       : 1;
12031 			vFAIL("Switch condition not recognized");
12032 		    }
12033 		    nextchar(pRExC_state);
12034 		  insert_if:
12035                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12036                                                              IFTHEN, 0)))
12037                     {
12038                         REQUIRE_BRANCHJ(flagp, 0);
12039                     }
12040                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12041 		    if (br == 0) {
12042                         RETURN_FAIL_ON_RESTART(flags,flagp);
12043                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12044                               (UV) flags);
12045                     } else
12046                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12047                                                              LONGJMP, 0)))
12048                     {
12049                         REQUIRE_BRANCHJ(flagp, 0);
12050                     }
12051 		    c = UCHARAT(RExC_parse);
12052                     nextchar(pRExC_state);
12053 		    if (flags&HASWIDTH)
12054 			*flagp |= HASWIDTH;
12055 		    if (c == '|') {
12056 		        if (is_define)
12057 		            vFAIL("(?(DEFINE)....) does not allow branches");
12058 
12059                         /* Fake one for optimizer.  */
12060                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12061 
12062                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12063                             RETURN_FAIL_ON_RESTART(flags, flagp);
12064                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12065                                   (UV) flags);
12066                         }
12067                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12068                             REQUIRE_BRANCHJ(flagp, 0);
12069                         }
12070                         if (flags&HASWIDTH)
12071 			    *flagp |= HASWIDTH;
12072                         c = UCHARAT(RExC_parse);
12073                         nextchar(pRExC_state);
12074 		    }
12075 		    else
12076 			lastbr = 0;
12077                     if (c != ')') {
12078                         if (RExC_parse >= RExC_end)
12079                             vFAIL("Switch (?(condition)... not terminated");
12080                         else
12081                             vFAIL("Switch (?(condition)... contains too many branches");
12082                     }
12083 		    ender = reg_node(pRExC_state, TAIL);
12084                     if (! REGTAIL(pRExC_state, br, ender)) {
12085                         REQUIRE_BRANCHJ(flagp, 0);
12086                     }
12087 		    if (lastbr) {
12088                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12089                             REQUIRE_BRANCHJ(flagp, 0);
12090                         }
12091                         if (! REGTAIL(pRExC_state,
12092                                       REGNODE_OFFSET(
12093                                                  NEXTOPER(
12094                                                  NEXTOPER(REGNODE_p(lastbr)))),
12095                                       ender))
12096                         {
12097                             REQUIRE_BRANCHJ(flagp, 0);
12098                         }
12099 		    }
12100 		    else
12101                         if (! REGTAIL(pRExC_state, ret, ender)) {
12102                             REQUIRE_BRANCHJ(flagp, 0);
12103                         }
12104 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12105                     RExC_size++; /* XXX WHY do we need this?!!
12106                                     For large programs it seems to be required
12107                                     but I can't figure out why. -- dmq*/
12108 #endif
12109 		    return ret;
12110 		}
12111                 RExC_parse += UTF
12112                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12113                               : 1;
12114                 vFAIL("Unknown switch condition (?(...))");
12115 	    }
12116 	    case '[':           /* (?[ ... ]) */
12117                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12118                                          oregcomp_parse);
12119             case 0: /* A NUL */
12120 		RExC_parse--; /* for vFAIL to print correctly */
12121                 vFAIL("Sequence (? incomplete");
12122                 break;
12123 
12124             case ')':
12125                 if (RExC_strict) {  /* [perl #132851] */
12126                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12127                 }
12128                 /* FALLTHROUGH */
12129             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12130 	    /* FALLTHROUGH */
12131 	    default: /* e.g., (?i) */
12132 	        RExC_parse = (char *) seqstart + 1;
12133               parse_flags:
12134 		parse_lparen_question_flags(pRExC_state);
12135                 if (UCHARAT(RExC_parse) != ':') {
12136                     if (RExC_parse < RExC_end)
12137                         nextchar(pRExC_state);
12138                     *flagp = TRYAGAIN;
12139                     return 0;
12140                 }
12141                 paren = ':';
12142                 nextchar(pRExC_state);
12143                 ret = 0;
12144                 goto parse_rest;
12145             } /* end switch */
12146 	}
12147         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12148 	  capturing_parens:
12149 	    parno = RExC_npar;
12150 	    RExC_npar++;
12151             if (! ALL_PARENS_COUNTED) {
12152                 /* If we are in our first pass through (and maybe only pass),
12153                  * we  need to allocate memory for the capturing parentheses
12154                  * data structures.
12155                  */
12156 
12157                 if (!RExC_parens_buf_size) {
12158                     /* first guess at number of parens we might encounter */
12159                     RExC_parens_buf_size = 10;
12160 
12161                     /* setup RExC_open_parens, which holds the address of each
12162                      * OPEN tag, and to make things simpler for the 0 index the
12163                      * start of the program - this is used later for offsets */
12164                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12165                             regnode_offset);
12166                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12167 
12168                     /* setup RExC_close_parens, which holds the address of each
12169                      * CLOSE tag, and to make things simpler for the 0 index
12170                      * the end of the program - this is used later for offsets
12171                      * */
12172                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12173                             regnode_offset);
12174                     /* we dont know where end op starts yet, so we dont need to
12175                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12176                      * above */
12177                 }
12178                 else if (RExC_npar > RExC_parens_buf_size) {
12179                     I32 old_size = RExC_parens_buf_size;
12180 
12181                     RExC_parens_buf_size *= 2;
12182 
12183                     Renew(RExC_open_parens, RExC_parens_buf_size,
12184                             regnode_offset);
12185                     Zero(RExC_open_parens + old_size,
12186                             RExC_parens_buf_size - old_size, regnode_offset);
12187 
12188                     Renew(RExC_close_parens, RExC_parens_buf_size,
12189                             regnode_offset);
12190                     Zero(RExC_close_parens + old_size,
12191                             RExC_parens_buf_size - old_size, regnode_offset);
12192                 }
12193             }
12194 
12195 	    ret = reganode(pRExC_state, OPEN, parno);
12196             if (!RExC_nestroot)
12197                 RExC_nestroot = parno;
12198             if (RExC_open_parens && !RExC_open_parens[parno])
12199             {
12200                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12201                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12202                     22, "|    |", (int)(depth * 2 + 1), "",
12203                     (IV)parno, ret));
12204                 RExC_open_parens[parno]= ret;
12205             }
12206 
12207             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12208             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12209 	    is_open = 1;
12210 	} else {
12211             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12212             paren = ':';
12213 	    ret = 0;
12214         }
12215     }
12216     else                        /* ! paren */
12217 	ret = 0;
12218 
12219    parse_rest:
12220     /* Pick up the branches, linking them together. */
12221     parse_start = RExC_parse;   /* MJD */
12222     br = regbranch(pRExC_state, &flags, 1, depth+1);
12223 
12224     /*     branch_len = (paren != 0); */
12225 
12226     if (br == 0) {
12227         RETURN_FAIL_ON_RESTART(flags, flagp);
12228         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12229     }
12230     if (*RExC_parse == '|') {
12231 	if (RExC_use_BRANCHJ) {
12232 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
12233 	}
12234 	else {                  /* MJD */
12235 	    reginsert(pRExC_state, BRANCH, br, depth+1);
12236             Set_Node_Length(REGNODE_p(br), paren != 0);
12237             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12238         }
12239 	have_branch = 1;
12240     }
12241     else if (paren == ':') {
12242 	*flagp |= flags&SIMPLE;
12243     }
12244     if (is_open) {				/* Starts with OPEN. */
12245         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12246             REQUIRE_BRANCHJ(flagp, 0);
12247         }
12248     }
12249     else if (paren != '?')		/* Not Conditional */
12250 	ret = br;
12251     *flagp |= flags & (HASWIDTH | POSTPONED);
12252     lastbr = br;
12253     while (*RExC_parse == '|') {
12254 	if (RExC_use_BRANCHJ) {
12255             bool shut_gcc_up;
12256 
12257 	    ender = reganode(pRExC_state, LONGJMP, 0);
12258 
12259             /* Append to the previous. */
12260             shut_gcc_up = REGTAIL(pRExC_state,
12261                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12262                          ender);
12263             PERL_UNUSED_VAR(shut_gcc_up);
12264 	}
12265 	nextchar(pRExC_state);
12266 	if (freeze_paren) {
12267 	    if (RExC_npar > after_freeze)
12268 	        after_freeze = RExC_npar;
12269             RExC_npar = freeze_paren;
12270         }
12271         br = regbranch(pRExC_state, &flags, 0, depth+1);
12272 
12273 	if (br == 0) {
12274             RETURN_FAIL_ON_RESTART(flags, flagp);
12275             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12276         }
12277         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12278             REQUIRE_BRANCHJ(flagp, 0);
12279         }
12280 	lastbr = br;
12281 	*flagp |= flags & (HASWIDTH | POSTPONED);
12282     }
12283 
12284     if (have_branch || paren != ':') {
12285         regnode * br;
12286 
12287 	/* Make a closing node, and hook it on the end. */
12288 	switch (paren) {
12289 	case ':':
12290 	    ender = reg_node(pRExC_state, TAIL);
12291 	    break;
12292 	case 1: case 2:
12293 	    ender = reganode(pRExC_state, CLOSE, parno);
12294             if ( RExC_close_parens ) {
12295                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12296                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12297                         22, "|    |", (int)(depth * 2 + 1), "",
12298                         (IV)parno, ender));
12299                 RExC_close_parens[parno]= ender;
12300 	        if (RExC_nestroot == parno)
12301 	            RExC_nestroot = 0;
12302 	    }
12303             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12304             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12305 	    break;
12306 	case 's':
12307 	    ender = reg_node(pRExC_state, SRCLOSE);
12308             RExC_in_script_run = 0;
12309 	    break;
12310 	case '<':
12311         case 'a':
12312         case 'A':
12313         case 'b':
12314         case 'B':
12315 	case ',':
12316 	case '=':
12317 	case '!':
12318 	    *flagp &= ~HASWIDTH;
12319 	    /* FALLTHROUGH */
12320         case 't':   /* aTomic */
12321 	case '>':
12322 	    ender = reg_node(pRExC_state, SUCCEED);
12323 	    break;
12324 	case 0:
12325 	    ender = reg_node(pRExC_state, END);
12326             assert(!RExC_end_op); /* there can only be one! */
12327             RExC_end_op = REGNODE_p(ender);
12328             if (RExC_close_parens) {
12329                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12330                     "%*s%*s Setting close paren #0 (END) to %zu\n",
12331                     22, "|    |", (int)(depth * 2 + 1), "",
12332                     ender));
12333 
12334                 RExC_close_parens[0]= ender;
12335             }
12336 	    break;
12337 	}
12338         DEBUG_PARSE_r({
12339             DEBUG_PARSE_MSG("lsbr");
12340             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12341             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12342             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12343                           SvPV_nolen_const(RExC_mysv1),
12344                           (IV)lastbr,
12345                           SvPV_nolen_const(RExC_mysv2),
12346                           (IV)ender,
12347                           (IV)(ender - lastbr)
12348             );
12349         });
12350         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12351             REQUIRE_BRANCHJ(flagp, 0);
12352         }
12353 
12354 	if (have_branch) {
12355             char is_nothing= 1;
12356 	    if (depth==1)
12357                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12358 
12359 	    /* Hook the tails of the branches to the closing node. */
12360 	    for (br = REGNODE_p(ret); br; br = regnext(br)) {
12361 		const U8 op = PL_regkind[OP(br)];
12362 		if (op == BRANCH) {
12363                     if (! REGTAIL_STUDY(pRExC_state,
12364                                         REGNODE_OFFSET(NEXTOPER(br)),
12365                                         ender))
12366                     {
12367                         REQUIRE_BRANCHJ(flagp, 0);
12368                     }
12369                     if ( OP(NEXTOPER(br)) != NOTHING
12370                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12371                         is_nothing= 0;
12372 		}
12373 		else if (op == BRANCHJ) {
12374                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12375                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12376                                         ender);
12377                     PERL_UNUSED_VAR(shut_gcc_up);
12378                     /* for now we always disable this optimisation * /
12379                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12380                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12381                     */
12382                         is_nothing= 0;
12383 		}
12384 	    }
12385             if (is_nothing) {
12386                 regnode * ret_as_regnode = REGNODE_p(ret);
12387                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12388                                ? regnext(ret_as_regnode)
12389                                : ret_as_regnode;
12390                 DEBUG_PARSE_r({
12391                     DEBUG_PARSE_MSG("NADA");
12392                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12393                                      NULL, pRExC_state);
12394                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12395                                      NULL, pRExC_state);
12396                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12397                                   SvPV_nolen_const(RExC_mysv1),
12398                                   (IV)REG_NODE_NUM(ret_as_regnode),
12399                                   SvPV_nolen_const(RExC_mysv2),
12400                                   (IV)ender,
12401                                   (IV)(ender - ret)
12402                     );
12403                 });
12404                 OP(br)= NOTHING;
12405                 if (OP(REGNODE_p(ender)) == TAIL) {
12406                     NEXT_OFF(br)= 0;
12407                     RExC_emit= REGNODE_OFFSET(br) + 1;
12408                 } else {
12409                     regnode *opt;
12410                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12411                         OP(opt)= OPTIMIZED;
12412                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12413                 }
12414             }
12415 	}
12416     }
12417 
12418     {
12419         const char *p;
12420          /* Even/odd or x=don't care: 010101x10x */
12421         static const char parens[] = "=!aA<,>Bbt";
12422          /* flag below is set to 0 up through 'A'; 1 for larger */
12423 
12424 	if (paren && (p = strchr(parens, paren))) {
12425 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12426 	    int flag = (p - parens) > 3;
12427 
12428 	    if (paren == '>' || paren == 't') {
12429 		node = SUSPEND, flag = 0;
12430             }
12431 
12432 	    reginsert(pRExC_state, node, ret, depth+1);
12433             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12434 	    Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12435 	    FLAGS(REGNODE_p(ret)) = flag;
12436             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12437             {
12438                 REQUIRE_BRANCHJ(flagp, 0);
12439             }
12440 	}
12441     }
12442 
12443     /* Check for proper termination. */
12444     if (paren) {
12445         /* restore original flags, but keep (?p) and, if we've encountered
12446          * something in the parse that changes /d rules into /u, keep the /u */
12447 	RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12448         if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12449             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12450         }
12451 	if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12452 	    RExC_parse = oregcomp_parse;
12453 	    vFAIL("Unmatched (");
12454 	}
12455 	nextchar(pRExC_state);
12456     }
12457     else if (!paren && RExC_parse < RExC_end) {
12458 	if (*RExC_parse == ')') {
12459 	    RExC_parse++;
12460 	    vFAIL("Unmatched )");
12461 	}
12462 	else
12463 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
12464 	NOT_REACHED; /* NOTREACHED */
12465     }
12466 
12467     if (after_freeze > RExC_npar)
12468         RExC_npar = after_freeze;
12469 
12470     RExC_in_lookaround = was_in_lookaround;
12471 
12472     return(ret);
12473 }
12474 
12475 /*
12476  - regbranch - one alternative of an | operator
12477  *
12478  * Implements the concatenation operator.
12479  *
12480  * On success, returns the offset at which any next node should be placed into
12481  * the regex engine program being compiled.
12482  *
12483  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12484  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12485  * UTF-8
12486  */
12487 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)12488 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12489 {
12490     regnode_offset ret;
12491     regnode_offset chain = 0;
12492     regnode_offset latest;
12493     I32 flags = 0, c = 0;
12494     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12495 
12496     PERL_ARGS_ASSERT_REGBRANCH;
12497 
12498     DEBUG_PARSE("brnc");
12499 
12500     if (first)
12501 	ret = 0;
12502     else {
12503 	if (RExC_use_BRANCHJ)
12504 	    ret = reganode(pRExC_state, BRANCHJ, 0);
12505 	else {
12506 	    ret = reg_node(pRExC_state, BRANCH);
12507             Set_Node_Length(REGNODE_p(ret), 1);
12508         }
12509     }
12510 
12511     *flagp = 0;			/* Initialize. */
12512 
12513     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12514                             FALSE /* Don't force to /x */ );
12515     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12516 	flags &= ~TRYAGAIN;
12517         latest = regpiece(pRExC_state, &flags, depth+1);
12518 	if (latest == 0) {
12519 	    if (flags & TRYAGAIN)
12520 		continue;
12521             RETURN_FAIL_ON_RESTART(flags, flagp);
12522             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12523 	}
12524 	else if (ret == 0)
12525             ret = latest;
12526 	*flagp |= flags&(HASWIDTH|POSTPONED);
12527 	if (chain != 0) {
12528 	    /* FIXME adding one for every branch after the first is probably
12529 	     * excessive now we have TRIE support. (hv) */
12530 	    MARK_NAUGHTY(1);
12531             if (! REGTAIL(pRExC_state, chain, latest)) {
12532                 /* XXX We could just redo this branch, but figuring out what
12533                  * bookkeeping needs to be reset is a pain, and it's likely
12534                  * that other branches that goto END will also be too large */
12535                 REQUIRE_BRANCHJ(flagp, 0);
12536             }
12537 	}
12538 	chain = latest;
12539 	c++;
12540     }
12541     if (chain == 0) {	/* Loop ran zero times. */
12542 	chain = reg_node(pRExC_state, NOTHING);
12543 	if (ret == 0)
12544 	    ret = chain;
12545     }
12546     if (c == 1) {
12547 	*flagp |= flags&SIMPLE;
12548     }
12549 
12550     return ret;
12551 }
12552 
12553 #define RBRACE  0
12554 #define MIN_S   1
12555 #define MIN_E   2
12556 #define MAX_S   3
12557 #define MAX_E   4
12558 
12559 #ifndef PERL_IN_XSUB_RE
12560 bool
Perl_regcurly(const char * s,const char * e,const char * result[5])12561 Perl_regcurly(const char *s, const char *e, const char * result[5])
12562 {
12563     /* This function matches a {m,n} quantifier.  When called with a NULL final
12564      * argument, it simply parses the input from 's' up through 'e-1', and
12565      * returns a boolean as to whether or not this input is syntactically a
12566      * {m,n} quantifier.
12567      *
12568      * When called with a non-NULL final parameter, and when the function
12569      * returns TRUE, it additionally stores information into the array
12570      * specified by that parameter about what it found in the parse.  The
12571      * parameter must be a pointer into a 5 element array of 'const char *'
12572      * elements.  The returned information is as follows:
12573      *   result[RBRACE]  points to the closing brace
12574      *   result[MIN_S]   points to the first byte of the lower bound
12575      *   result[MIN_E]   points to one beyond the final byte of the lower bound
12576      *   result[MAX_S]   points to the first byte of the upper bound
12577      *   result[MAX_E]   points to one beyond the final byte of the upper bound
12578      *
12579      * If the quantifier is of the form {m,} (meaning an infinite upper
12580      * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
12581      * to is irrelevant, just that it's the same place
12582      *
12583      * If instead the quantifier is of the form {m} there is actually only
12584      * one bound, and both the upper and lower result[] elements are set to
12585      * point to it.
12586      *
12587      * This function checks only for syntactic validity; it leaves checking for
12588      * semantic validity and raising any diagnostics to the caller.  This
12589      * function is called in multiple places to check for syntax, but only from
12590      * one for semantics.  It makes it as simple as possible for the
12591      * syntax-only callers, while furnishing just enough information for the
12592      * semantic caller.
12593      */
12594 
12595     const char * min_start = NULL;
12596     const char * max_start = NULL;
12597     const char * min_end = NULL;
12598     const char * max_end = NULL;
12599 
12600     bool has_comma = FALSE;
12601 
12602     PERL_ARGS_ASSERT_REGCURLY;
12603 
12604     if (s >= e || *s++ != '{')
12605 	return FALSE;
12606 
12607     while (s < e && isBLANK(*s)) {
12608         s++;
12609     }
12610 
12611     if isDIGIT(*s) {
12612         min_start = s;
12613         do {
12614             s++;
12615         } while (s < e && isDIGIT(*s));
12616         min_end = s;
12617     }
12618 
12619     while (s < e && isBLANK(*s)) {
12620         s++;
12621     }
12622 
12623     if (*s == ',') {
12624         has_comma = TRUE;
12625 	s++;
12626 
12627         while (s < e && isBLANK(*s)) {
12628             s++;
12629         }
12630 
12631         if isDIGIT(*s) {
12632             max_start = s;
12633             do {
12634                 s++;
12635             } while (s < e && isDIGIT(*s));
12636             max_end = s;
12637         }
12638     }
12639 
12640     while (s < e && isBLANK(*s)) {
12641         s++;
12642     }
12643                                /* Need at least one number */
12644     if (s >= e || *s != '}' || (! min_start && ! max_end)) {
12645         return FALSE;
12646     }
12647 
12648     if (result) {
12649 
12650         result[RBRACE] = s;
12651 
12652         result[MIN_S] = min_start;
12653         result[MIN_E] = min_end;
12654         if (has_comma) {
12655             if (max_start) {
12656                 result[MAX_S] = max_start;
12657                 result[MAX_E] = max_end;
12658             }
12659             else {
12660                 /* Having no value after the comma is signalled by setting
12661                  * start and end to the same value.  What that value is isn't
12662                  * relevant; NULL is chosen simply because it will fail if the
12663                  * caller mistakenly uses it */
12664                 result[MAX_S] = result[MAX_E] = NULL;
12665             }
12666         }
12667         else {  /* No comma means lower and upper bounds are the same */
12668             result[MAX_S] = min_start;
12669             result[MAX_E] = min_end;
12670         }
12671     }
12672 
12673     return TRUE;
12674 }
12675 #endif
12676 
12677 U32
S_get_quantifier_value(pTHX_ RExC_state_t * pRExC_state,const char * start,const char * end)12678 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
12679                        const char * start, const char * end)
12680 {
12681     /* This is a helper function for regpiece() to compute, given the
12682      * quantifier {m,n}, the value of either m or n, based on the starting
12683      * position 'start' in the string, through the byte 'end-1', returning it
12684      * if valid, and failing appropriately if not.  It knows the restrictions
12685      * imposed on quantifier values */
12686 
12687     UV uv;
12688     STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
12689 
12690     PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
12691 
12692     if (grok_atoUV(start, &uv, &end)) {
12693         if (uv < REG_INFTY) {   /* A valid, small-enough number */
12694             return (U32) uv;
12695         }
12696     }
12697     else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
12698                                  leading zeros or overflow */
12699         RExC_parse = (char * ) end;
12700 
12701         /* Perhaps too generic a msg for what is only failure from having
12702          * leading zeros, but this is how it's always behaved. */
12703         vFAIL("Invalid quantifier in {,}");
12704         NOT_REACHED; /*NOTREACHED*/
12705     }
12706 
12707     /* Here, found a quantifier, but was too large; either it overflowed or was
12708      * too big a legal number */
12709     RExC_parse = (char * ) end;
12710     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12711 
12712     NOT_REACHED; /*NOTREACHED*/
12713     return U32_MAX; /* Perhaps some compilers will be expecting a return */
12714 }
12715 
12716 /*
12717  - regpiece - something followed by possible quantifier * + ? {n,m}
12718  *
12719  * Note that the branching code sequences used for ? and the general cases
12720  * of * and + are somewhat optimized:  they use the same NOTHING node as
12721  * both the endmarker for their branch list and the body of the last branch.
12722  * It might seem that this node could be dispensed with entirely, but the
12723  * endmarker role is not redundant.
12724  *
12725  * On success, returns the offset at which any next node should be placed into
12726  * the regex engine program being compiled.
12727  *
12728  * Returns 0 otherwise, with *flagp set to indicate why:
12729  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12730  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12731  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12732  */
12733 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)12734 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12735 {
12736     regnode_offset ret;
12737     char op;
12738     I32 flags;
12739     const char * const origparse = RExC_parse;
12740     I32 min;
12741     I32 max = REG_INFTY;
12742 #ifdef RE_TRACK_PATTERN_OFFSETS
12743     char *parse_start;
12744 #endif
12745 
12746     /* Save the original in case we change the emitted regop to a FAIL. */
12747     const regnode_offset orig_emit = RExC_emit;
12748 
12749     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12750 
12751     PERL_ARGS_ASSERT_REGPIECE;
12752 
12753     DEBUG_PARSE("piec");
12754 
12755     ret = regatom(pRExC_state, &flags, depth+1);
12756     if (ret == 0) {
12757         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12758         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12759     }
12760 
12761 #ifdef RE_TRACK_PATTERN_OFFSETS
12762     parse_start = RExC_parse;
12763 #endif
12764 
12765     op = *RExC_parse;
12766     switch (op) {
12767         const char * regcurly_return[5];
12768 
12769       case '*':
12770         nextchar(pRExC_state);
12771         min = 0;
12772         break;
12773 
12774       case '+':
12775         nextchar(pRExC_state);
12776         min = 1;
12777         break;
12778 
12779       case '?':
12780         nextchar(pRExC_state);
12781         min = 0; max = 1;
12782         break;
12783 
12784       case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
12785                     to determine which */
12786         if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
12787             const char * min_start = regcurly_return[MIN_S];
12788             const char * min_end   = regcurly_return[MIN_E];
12789             const char * max_start = regcurly_return[MAX_S];
12790             const char * max_end   = regcurly_return[MAX_E];
12791 
12792             if (min_start) {
12793                 min = get_quantifier_value(pRExC_state, min_start, min_end);
12794             }
12795             else {
12796                 min = 0;
12797             }
12798 
12799             if (max_start == max_end) {     /* Was of the form {m,} */
12800                 max = REG_INFTY;
12801             }
12802             else if (max_start == min_start) {  /* Was of the form {m} */
12803                 max = min;
12804             }
12805             else {  /* Was of the form {m,n} */
12806                 assert(max_end >= max_start);
12807 
12808                 max = get_quantifier_value(pRExC_state, max_start, max_end);
12809             }
12810 
12811             RExC_parse = (char *) regcurly_return[RBRACE];
12812             nextchar(pRExC_state);
12813 
12814             if (max < min) {    /* If can't match, warn and optimize to fail
12815                                    unconditionally */
12816                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12817                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12818                 NEXT_OFF(REGNODE_p(orig_emit)) =
12819                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12820                 return ret;
12821             }
12822             else if (min == max && *RExC_parse == '?') {
12823                 ckWARN2reg(RExC_parse + 1,
12824                            "Useless use of greediness modifier '%c'",
12825                            *RExC_parse);
12826             }
12827 
12828             break;
12829         } /* End of is {m,n} */
12830 
12831         /* Here was a '{', but what followed it didn't form a quantifier. */
12832         /* FALLTHROUGH */
12833 
12834       default:
12835         *flagp = flags;
12836         return(ret);
12837         NOT_REACHED; /*NOTREACHED*/
12838     }
12839 
12840     /* Here we have a quantifier, and have calculated 'min' and 'max'.
12841      *
12842      * Check and possibly adjust a zero width operand */
12843     if (! (flags & (HASWIDTH|POSTPONED))) {
12844         if (max > REG_INFTY/3) {
12845             if (origparse[0] == '\\' && origparse[1] == 'K') {
12846                 vFAIL2utf8f(
12847                            "%" UTF8f " is forbidden - matches null string"
12848                            " many times",
12849                            UTF8fARG(UTF, (RExC_parse >= origparse
12850                                          ? RExC_parse - origparse
12851                                          : 0),
12852                            origparse));
12853             } else {
12854                 ckWARN2reg(RExC_parse,
12855                            "%" UTF8f " matches null string many times",
12856                            UTF8fARG(UTF, (RExC_parse >= origparse
12857                                          ? RExC_parse - origparse
12858                                          : 0),
12859                            origparse));
12860             }
12861         }
12862 
12863         /* There's no point in trying to match something 0 length more than
12864          * once except for extra side effects, which we don't have here since
12865          * not POSTPONED */
12866         if (max > 1) {
12867             max = 1;
12868             if (min > max) {
12869                 min = max;
12870             }
12871         }
12872     }
12873 
12874     /* If this is a code block pass it up */
12875     *flagp |= (flags & POSTPONED);
12876 
12877     if (max > 0) {
12878         *flagp |= (flags & HASWIDTH);
12879         if (max == REG_INFTY)
12880             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12881     }
12882 
12883     /* 'SIMPLE' operands don't require full generality */
12884     if ((flags&SIMPLE)) {
12885         if (max == REG_INFTY) {
12886             if (min == 0) {
12887                 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
12888                     goto min0_maxINF_wildcard_forbidden;
12889                 }
12890 
12891                 reginsert(pRExC_state, STAR, ret, depth+1);
12892                 MARK_NAUGHTY(4);
12893                 goto done_main_op;
12894             }
12895             else if (min == 1) {
12896                 reginsert(pRExC_state, PLUS, ret, depth+1);
12897                 MARK_NAUGHTY(3);
12898                 goto done_main_op;
12899             }
12900         }
12901 
12902         /* Here, SIMPLE, but not the '*' and '+' special cases */
12903 
12904         MARK_NAUGHTY_EXP(2, 2);
12905         reginsert(pRExC_state, CURLY, ret, depth+1);
12906         Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12907         Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12908     }
12909     else {  /* not SIMPLE */
12910         const regnode_offset w = reg_node(pRExC_state, WHILEM);
12911 
12912         FLAGS(REGNODE_p(w)) = 0;
12913         if (!  REGTAIL(pRExC_state, ret, w)) {
12914             REQUIRE_BRANCHJ(flagp, 0);
12915         }
12916         if (RExC_use_BRANCHJ) {
12917             reginsert(pRExC_state, LONGJMP, ret, depth+1);
12918             reginsert(pRExC_state, NOTHING, ret, depth+1);
12919             NEXT_OFF(REGNODE_p(ret)) = 3;        /* Go over LONGJMP. */
12920         }
12921         reginsert(pRExC_state, CURLYX, ret, depth+1);
12922                         /* MJD hk */
12923         Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12924         Set_Node_Length(REGNODE_p(ret),
12925                         op == '{' ? (RExC_parse - parse_start) : 1);
12926 
12927         if (RExC_use_BRANCHJ)
12928             NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12929                                                LONGJMP. */
12930         if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12931                                                   NOTHING)))
12932         {
12933             REQUIRE_BRANCHJ(flagp, 0);
12934         }
12935         RExC_whilem_seen++;
12936         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12937     }
12938 
12939     /* Finish up the CURLY/CURLYX case */
12940     FLAGS(REGNODE_p(ret)) = 0;
12941 
12942     ARG1_SET(REGNODE_p(ret), (U16)min);
12943     ARG2_SET(REGNODE_p(ret), (U16)max);
12944 
12945   done_main_op:
12946 
12947     /* Process any greediness modifiers */
12948     if (*RExC_parse == '?') {
12949         nextchar(pRExC_state);
12950         reginsert(pRExC_state, MINMOD, ret, depth+1);
12951         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12952             REQUIRE_BRANCHJ(flagp, 0);
12953         }
12954     }
12955     else if (*RExC_parse == '+') {
12956         regnode_offset ender;
12957         nextchar(pRExC_state);
12958         ender = reg_node(pRExC_state, SUCCEED);
12959         if (! REGTAIL(pRExC_state, ret, ender)) {
12960             REQUIRE_BRANCHJ(flagp, 0);
12961         }
12962         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12963         ender = reg_node(pRExC_state, TAIL);
12964         if (! REGTAIL(pRExC_state, ret, ender)) {
12965             REQUIRE_BRANCHJ(flagp, 0);
12966         }
12967     }
12968 
12969     /* Forbid extra quantifiers */
12970     if (isQUANTIFIER(RExC_parse, RExC_end)) {
12971         RExC_parse++;
12972         vFAIL("Nested quantifiers");
12973     }
12974 
12975     return(ret);
12976 
12977   min0_maxINF_wildcard_forbidden:
12978 
12979     /* Here we are in a wildcard match, and the minimum match length is 0, and
12980      * the max could be infinity.  This is currently forbidden.  The only
12981      * reason is to make it harder to write patterns that take a long long time
12982      * to halt, and because the use of this construct isn't necessary in
12983      * matching Unicode property values */
12984     RExC_parse++;
12985     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
12986        subpatterns in regex; marked by <-- HERE in m/%s/
12987      */
12988     vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
12989           " subpatterns");
12990 
12991     /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
12992      * legal at all in wildcards, so can't get this far */
12993 
12994     NOT_REACHED; /*NOTREACHED*/
12995 }
12996 
12997 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)12998 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12999                 regnode_offset * node_p,
13000                 UV * code_point_p,
13001                 int * cp_count,
13002                 I32 * flagp,
13003                 const bool strict,
13004                 const U32 depth
13005     )
13006 {
13007  /* This routine teases apart the various meanings of \N and returns
13008   * accordingly.  The input parameters constrain which meaning(s) is/are valid
13009   * in the current context.
13010   *
13011   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
13012   *
13013   * If <code_point_p> is not NULL, the context is expecting the result to be a
13014   * single code point.  If this \N instance turns out to a single code point,
13015   * the function returns TRUE and sets *code_point_p to that code point.
13016   *
13017   * If <node_p> is not NULL, the context is expecting the result to be one of
13018   * the things representable by a regnode.  If this \N instance turns out to be
13019   * one such, the function generates the regnode, returns TRUE and sets *node_p
13020   * to point to the offset of that regnode into the regex engine program being
13021   * compiled.
13022   *
13023   * If this instance of \N isn't legal in any context, this function will
13024   * generate a fatal error and not return.
13025   *
13026   * On input, RExC_parse should point to the first char following the \N at the
13027   * time of the call.  On successful return, RExC_parse will have been updated
13028   * to point to just after the sequence identified by this routine.  Also
13029   * *flagp has been updated as needed.
13030   *
13031   * When there is some problem with the current context and this \N instance,
13032   * the function returns FALSE, without advancing RExC_parse, nor setting
13033   * *node_p, nor *code_point_p, nor *flagp.
13034   *
13035   * If <cp_count> is not NULL, the caller wants to know the length (in code
13036   * points) that this \N sequence matches.  This is set, and the input is
13037   * parsed for errors, even if the function returns FALSE, as detailed below.
13038   *
13039   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
13040   *
13041   * Probably the most common case is for the \N to specify a single code point.
13042   * *cp_count will be set to 1, and *code_point_p will be set to that code
13043   * point.
13044   *
13045   * Another possibility is for the input to be an empty \N{}.  This is no
13046   * longer accepted, and will generate a fatal error.
13047   *
13048   * Another possibility is for a custom charnames handler to be in effect which
13049   * translates the input name to an empty string.  *cp_count will be set to 0.
13050   * *node_p will be set to a generated NOTHING node.
13051   *
13052   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
13053   * set to 0. *node_p will be set to a generated REG_ANY node.
13054   *
13055   * The fifth possibility is that \N resolves to a sequence of more than one
13056   * code points.  *cp_count will be set to the number of code points in the
13057   * sequence. *node_p will be set to a generated node returned by this
13058   * function calling S_reg().
13059   *
13060   * The sixth and final possibility is that it is premature to be calling this
13061   * function; the parse needs to be restarted.  This can happen when this
13062   * changes from /d to /u rules, or when the pattern needs to be upgraded to
13063   * UTF-8.  The latter occurs only when the fifth possibility would otherwise
13064   * be in effect, and is because one of those code points requires the pattern
13065   * to be recompiled as UTF-8.  The function returns FALSE, and sets the
13066   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
13067   * happens, the caller needs to desist from continuing parsing, and return
13068   * this information to its caller.  This is not set for when there is only one
13069   * code point, as this can be called as part of an ANYOF node, and they can
13070   * store above-Latin1 code points without the pattern having to be in UTF-8.
13071   *
13072   * For non-single-quoted regexes, the tokenizer has resolved character and
13073   * sequence names inside \N{...} into their Unicode values, normalizing the
13074   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
13075   * hex-represented code points in the sequence.  This is done there because
13076   * the names can vary based on what charnames pragma is in scope at the time,
13077   * so we need a way to take a snapshot of what they resolve to at the time of
13078   * the original parse. [perl #56444].
13079   *
13080   * That parsing is skipped for single-quoted regexes, so here we may get
13081   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
13082   * like '\N{U+41}', that code point is Unicode, and has to be translated into
13083   * the native character set for non-ASCII platforms.  The other possibilities
13084   * are already native, so no translation is done. */
13085 
13086     char * endbrace;    /* points to '}' following the name */
13087     char * e;           /* points to final non-blank before endbrace */
13088     char* p = RExC_parse; /* Temporary */
13089 
13090     SV * substitute_parse = NULL;
13091     char *orig_end;
13092     char *save_start;
13093     I32 flags;
13094 
13095     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13096 
13097     PERL_ARGS_ASSERT_GROK_BSLASH_N;
13098 
13099     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
13100     assert(! (node_p && cp_count));               /* At most 1 should be set */
13101 
13102     if (cp_count) {     /* Initialize return for the most common case */
13103         *cp_count = 1;
13104     }
13105 
13106     /* The [^\n] meaning of \N ignores spaces and comments under the /x
13107      * modifier.  The other meanings do not (except blanks adjacent to and
13108      * within the braces), so use a temporary until we find out which we are
13109      * being called with */
13110     skip_to_be_ignored_text(pRExC_state, &p,
13111                             FALSE /* Don't force to /x */ );
13112 
13113     /* Disambiguate between \N meaning a named character versus \N meaning
13114      * [^\n].  The latter is assumed when the {...} following the \N is a legal
13115      * quantifier, or if there is no '{' at all */
13116     if (*p != '{' || regcurly(p, RExC_end, NULL)) {
13117         RExC_parse = p;
13118         if (cp_count) {
13119             *cp_count = -1;
13120         }
13121 
13122         if (! node_p) {
13123             return FALSE;
13124         }
13125 
13126         *node_p = reg_node(pRExC_state, REG_ANY);
13127         *flagp |= HASWIDTH|SIMPLE;
13128         MARK_NAUGHTY(1);
13129         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
13130         return TRUE;
13131     }
13132 
13133     /* The test above made sure that the next real character is a '{', but
13134      * under the /x modifier, it could be separated by space (or a comment and
13135      * \n) and this is not allowed (for consistency with \x{...} and the
13136      * tokenizer handling of \N{NAME}). */
13137     if (*RExC_parse != '{') {
13138         vFAIL("Missing braces on \\N{}");
13139     }
13140 
13141     RExC_parse++;       /* Skip past the '{' */
13142 
13143     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13144     if (! endbrace) { /* no trailing brace */
13145         vFAIL2("Missing right brace on \\%c{}", 'N');
13146     }
13147 
13148     /* Here, we have decided it should be a named character or sequence.  These
13149      * imply Unicode semantics */
13150     REQUIRE_UNI_RULES(flagp, FALSE);
13151 
13152     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13153      * nothing at all (not allowed under strict) */
13154     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13155         RExC_parse = endbrace;
13156         if (strict) {
13157             RExC_parse++;   /* Position after the "}" */
13158             vFAIL("Zero length \\N{}");
13159         }
13160 
13161         if (cp_count) {
13162             *cp_count = 0;
13163         }
13164         nextchar(pRExC_state);
13165         if (! node_p) {
13166             return FALSE;
13167         }
13168 
13169         *node_p = reg_node(pRExC_state, NOTHING);
13170         return TRUE;
13171     }
13172 
13173     while (isBLANK(*RExC_parse)) {
13174         RExC_parse++;
13175     }
13176 
13177     e = endbrace;
13178     while (RExC_parse < e && isBLANK(*(e-1))) {
13179         e--;
13180     }
13181 
13182     if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13183 
13184         /* Here, the name isn't of the form  U+....  This can happen if the
13185          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
13186          * is the time to find out what the name means */
13187 
13188         const STRLEN name_len = e - RExC_parse;
13189         SV *  value_sv;     /* What does this name evaluate to */
13190         SV ** value_svp;
13191         const U8 * value;   /* string of name's value */
13192         STRLEN value_len;   /* and its length */
13193 
13194         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
13195          *  toke.c, and their values. Make sure is initialized */
13196         if (! RExC_unlexed_names) {
13197             RExC_unlexed_names = newHV();
13198         }
13199 
13200         /* If we have already seen this name in this pattern, use that.  This
13201          * allows us to only call the charnames handler once per name per
13202          * pattern.  A broken or malicious handler could return something
13203          * different each time, which could cause the results to vary depending
13204          * on if something gets added or subtracted from the pattern that
13205          * causes the number of passes to change, for example */
13206         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13207                                                       name_len, 0)))
13208         {
13209             value_sv = *value_svp;
13210         }
13211         else { /* Otherwise we have to go out and get the name */
13212             const char * error_msg = NULL;
13213             value_sv = get_and_check_backslash_N_name(RExC_parse, e,
13214                                                       UTF,
13215                                                       &error_msg);
13216             if (error_msg) {
13217                 RExC_parse = endbrace;
13218                 vFAIL(error_msg);
13219             }
13220 
13221             /* If no error message, should have gotten a valid return */
13222             assert (value_sv);
13223 
13224             /* Save the name's meaning for later use */
13225             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13226                            value_sv, 0))
13227             {
13228                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13229             }
13230         }
13231 
13232         /* Here, we have the value the name evaluates to in 'value_sv' */
13233         value = (U8 *) SvPV(value_sv, value_len);
13234 
13235         /* See if the result is one code point vs 0 or multiple */
13236         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13237                                   ? UTF8SKIP(value)
13238                                   : 1)))
13239         {
13240             /* Here, exactly one code point.  If that isn't what is wanted,
13241              * fail */
13242             if (! code_point_p) {
13243                 RExC_parse = p;
13244                 return FALSE;
13245             }
13246 
13247             /* Convert from string to numeric code point */
13248             *code_point_p = (SvUTF8(value_sv))
13249                             ? valid_utf8_to_uvchr(value, NULL)
13250                             : *value;
13251 
13252             /* Have parsed this entire single code point \N{...}.  *cp_count
13253              * has already been set to 1, so don't do it again. */
13254             RExC_parse = endbrace;
13255             nextchar(pRExC_state);
13256             return TRUE;
13257         } /* End of is a single code point */
13258 
13259         /* Count the code points, if caller desires.  The API says to do this
13260          * even if we will later return FALSE */
13261         if (cp_count) {
13262             *cp_count = 0;
13263 
13264             *cp_count = (SvUTF8(value_sv))
13265                         ? utf8_length(value, value + value_len)
13266                         : value_len;
13267         }
13268 
13269         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13270          * But don't back the pointer up if the caller wants to know how many
13271          * code points there are (they need to handle it themselves in this
13272          * case).  */
13273         if (! node_p) {
13274             if (! cp_count) {
13275                 RExC_parse = p;
13276             }
13277             return FALSE;
13278         }
13279 
13280         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13281          * reg recursively to parse it.  That way, it retains its atomicness,
13282          * while not having to worry about any special handling that some code
13283          * points may have. */
13284 
13285         substitute_parse = newSVpvs("?:");
13286         sv_catsv(substitute_parse, value_sv);
13287         sv_catpv(substitute_parse, ")");
13288 
13289         /* The value should already be native, so no need to convert on EBCDIC
13290          * platforms.*/
13291         assert(! RExC_recode_x_to_native);
13292 
13293     }
13294     else {   /* \N{U+...} */
13295         Size_t count = 0;   /* code point count kept internally */
13296 
13297         /* We can get to here when the input is \N{U+...} or when toke.c has
13298          * converted a name to the \N{U+...} form.  This include changing a
13299          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13300 
13301         RExC_parse += 2;    /* Skip past the 'U+' */
13302 
13303         /* Code points are separated by dots.  The '}' terminates the whole
13304          * thing. */
13305 
13306         do {    /* Loop until the ending brace */
13307             I32 flags = PERL_SCAN_SILENT_OVERFLOW
13308                       | PERL_SCAN_SILENT_ILLDIGIT
13309                       | PERL_SCAN_NOTIFY_ILLDIGIT
13310                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13311                       | PERL_SCAN_DISALLOW_PREFIX;
13312             STRLEN len = e - RExC_parse;
13313             NV overflow_value;
13314             char * start_digit = RExC_parse;
13315             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13316 
13317             if (len == 0) {
13318                 RExC_parse++;
13319               bad_NU:
13320                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13321             }
13322 
13323             RExC_parse += len;
13324 
13325             if (cp > MAX_LEGAL_CP) {
13326                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13327             }
13328 
13329             if (RExC_parse >= e) { /* Got to the closing '}' */
13330                 if (count) {
13331                     goto do_concat;
13332                 }
13333 
13334                 /* Here, is a single code point; fail if doesn't want that */
13335                 if (! code_point_p) {
13336                     RExC_parse = p;
13337                     return FALSE;
13338                 }
13339 
13340                 /* A single code point is easy to handle; just return it */
13341                 *code_point_p = UNI_TO_NATIVE(cp);
13342                 RExC_parse = endbrace;
13343                 nextchar(pRExC_state);
13344                 return TRUE;
13345             }
13346 
13347             /* Here, the parse stopped bfore the ending brace.  This is legal
13348              * only if that character is a dot separating code points, like a
13349              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13350              * So the next character must be a dot (and the one after that
13351              * can't be the ending brace, or we'd have something like
13352              * \N{U+100.} )
13353              * */
13354             if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
13355                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13356                               ? UTF8SKIP(RExC_parse)
13357                               : 1;
13358                 RExC_parse = MIN(e, RExC_parse);/* Guard against malformed utf8
13359                                                  */
13360                 goto bad_NU;
13361             }
13362 
13363             /* Here, looks like its really a multiple character sequence.  Fail
13364              * if that's not what the caller wants.  But continue with counting
13365              * and error checking if they still want a count */
13366             if (! node_p && ! cp_count) {
13367                 return FALSE;
13368             }
13369 
13370             /* What is done here is to convert this to a sub-pattern of the
13371              * form \x{char1}\x{char2}...  and then call reg recursively to
13372              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13373              * atomicness, while not having to worry about special handling
13374              * that some code points may have.  We don't create a subpattern,
13375              * but go through the motions of code point counting and error
13376              * checking, if the caller doesn't want a node returned. */
13377 
13378             if (node_p && ! substitute_parse) {
13379                 substitute_parse = newSVpvs("?:");
13380             }
13381 
13382           do_concat:
13383 
13384             if (node_p) {
13385                 /* Convert to notation the rest of the code understands */
13386                 sv_catpvs(substitute_parse, "\\x{");
13387                 sv_catpvn(substitute_parse, start_digit,
13388                                             RExC_parse - start_digit);
13389                 sv_catpvs(substitute_parse, "}");
13390             }
13391 
13392             /* Move to after the dot (or ending brace the final time through.)
13393              * */
13394             RExC_parse++;
13395             count++;
13396 
13397         } while (RExC_parse < e);
13398 
13399         if (! node_p) { /* Doesn't want the node */
13400             assert (cp_count);
13401 
13402             *cp_count = count;
13403             return FALSE;
13404         }
13405 
13406         sv_catpvs(substitute_parse, ")");
13407 
13408         /* The values are Unicode, and therefore have to be converted to native
13409          * on a non-Unicode (meaning non-ASCII) platform. */
13410         SET_recode_x_to_native(1);
13411     }
13412 
13413     /* Here, we have the string the name evaluates to, ready to be parsed,
13414      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13415      * constructs.  This can be called from within a substitute parse already.
13416      * The error reporting mechanism doesn't work for 2 levels of this, but the
13417      * code above has validated this new construct, so there should be no
13418      * errors generated by the below.  And this isn' an exact copy, so the
13419      * mechanism to seamlessly deal with this won't work, so turn off warnings
13420      * during it */
13421     save_start = RExC_start;
13422     orig_end = RExC_end;
13423 
13424     RExC_parse = RExC_start = SvPVX(substitute_parse);
13425     RExC_end = RExC_parse + SvCUR(substitute_parse);
13426     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13427 
13428     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13429 
13430     /* Restore the saved values */
13431     RESTORE_WARNINGS;
13432     RExC_start = save_start;
13433     RExC_parse = endbrace;
13434     RExC_end = orig_end;
13435     SET_recode_x_to_native(0);
13436 
13437     SvREFCNT_dec_NN(substitute_parse);
13438 
13439     if (! *node_p) {
13440         RETURN_FAIL_ON_RESTART(flags, flagp);
13441         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13442             (UV) flags);
13443     }
13444     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13445 
13446     nextchar(pRExC_state);
13447 
13448     return TRUE;
13449 }
13450 
13451 
13452 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)13453 S_compute_EXACTish(RExC_state_t *pRExC_state)
13454 {
13455     U8 op;
13456 
13457     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13458 
13459     if (! FOLD) {
13460         return (LOC)
13461                 ? EXACTL
13462                 : EXACT;
13463     }
13464 
13465     op = get_regex_charset(RExC_flags);
13466     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13467         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13468                  been, so there is no hole */
13469     }
13470 
13471     return op + EXACTF;
13472 }
13473 
13474 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13475  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13476 
13477 static I32
S_backref_value(char * p,char * e)13478 S_backref_value(char *p, char *e)
13479 {
13480     const char* endptr = e;
13481     UV val;
13482     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13483         return (I32)val;
13484     return I32_MAX;
13485 }
13486 
13487 
13488 /*
13489  - regatom - the lowest level
13490 
13491    Try to identify anything special at the start of the current parse position.
13492    If there is, then handle it as required. This may involve generating a
13493    single regop, such as for an assertion; or it may involve recursing, such as
13494    to handle a () structure.
13495 
13496    If the string doesn't start with something special then we gobble up
13497    as much literal text as we can.  If we encounter a quantifier, we have to
13498    back off the final literal character, as that quantifier applies to just it
13499    and not to the whole string of literals.
13500 
13501    Once we have been able to handle whatever type of thing started the
13502    sequence, we return the offset into the regex engine program being compiled
13503    at which any  next regnode should be placed.
13504 
13505    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13506    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13507    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13508    Otherwise does not return 0.
13509 
13510    Note: we have to be careful with escapes, as they can be both literal
13511    and special, and in the case of \10 and friends, context determines which.
13512 
13513    A summary of the code structure is:
13514 
13515    switch (first_byte) {
13516 	cases for each special:
13517 	    handle this special;
13518 	    break;
13519 	case '\\':
13520 	    switch (2nd byte) {
13521 		cases for each unambiguous special:
13522 		    handle this special;
13523 		    break;
13524 		cases for each ambigous special/literal:
13525 		    disambiguate;
13526 		    if (special)  handle here
13527 		    else goto defchar;
13528 		default: // unambiguously literal:
13529 		    goto defchar;
13530 	    }
13531 	default:  // is a literal char
13532 	    // FALL THROUGH
13533 	defchar:
13534 	    create EXACTish node for literal;
13535 	    while (more input and node isn't full) {
13536 		switch (input_byte) {
13537 		   cases for each special;
13538                        make sure parse pointer is set so that the next call to
13539                            regatom will see this special first
13540                        goto loopdone; // EXACTish node terminated by prev. char
13541 		   default:
13542 		       append char to EXACTISH node;
13543 		}
13544 	        get next input byte;
13545 	    }
13546         loopdone:
13547    }
13548    return the generated node;
13549 
13550    Specifically there are two separate switches for handling
13551    escape sequences, with the one for handling literal escapes requiring
13552    a dummy entry for all of the special escapes that are actually handled
13553    by the other.
13554 
13555 */
13556 
13557 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)13558 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13559 {
13560     regnode_offset ret = 0;
13561     I32 flags = 0;
13562     char *parse_start;
13563     U8 op;
13564     int invert = 0;
13565 
13566     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13567 
13568     *flagp = 0;		/* Initialize. */
13569 
13570     DEBUG_PARSE("atom");
13571 
13572     PERL_ARGS_ASSERT_REGATOM;
13573 
13574   tryagain:
13575     parse_start = RExC_parse;
13576     assert(RExC_parse < RExC_end);
13577     switch ((U8)*RExC_parse) {
13578     case '^':
13579 	RExC_seen_zerolen++;
13580 	nextchar(pRExC_state);
13581 	if (RExC_flags & RXf_PMf_MULTILINE)
13582 	    ret = reg_node(pRExC_state, MBOL);
13583 	else
13584 	    ret = reg_node(pRExC_state, SBOL);
13585         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13586 	break;
13587     case '$':
13588 	nextchar(pRExC_state);
13589 	if (*RExC_parse)
13590 	    RExC_seen_zerolen++;
13591 	if (RExC_flags & RXf_PMf_MULTILINE)
13592 	    ret = reg_node(pRExC_state, MEOL);
13593 	else
13594 	    ret = reg_node(pRExC_state, SEOL);
13595         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13596 	break;
13597     case '.':
13598 	nextchar(pRExC_state);
13599 	if (RExC_flags & RXf_PMf_SINGLELINE)
13600 	    ret = reg_node(pRExC_state, SANY);
13601 	else
13602 	    ret = reg_node(pRExC_state, REG_ANY);
13603 	*flagp |= HASWIDTH|SIMPLE;
13604 	MARK_NAUGHTY(1);
13605         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13606 	break;
13607     case '[':
13608     {
13609 	char * const oregcomp_parse = ++RExC_parse;
13610         ret = regclass(pRExC_state, flagp, depth+1,
13611                        FALSE, /* means parse the whole char class */
13612                        TRUE, /* allow multi-char folds */
13613                        FALSE, /* don't silence non-portable warnings. */
13614                        (bool) RExC_strict,
13615                        TRUE, /* Allow an optimized regnode result */
13616                        NULL);
13617         if (ret == 0) {
13618             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13619             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13620                   (UV) *flagp);
13621         }
13622 	if (*RExC_parse != ']') {
13623 	    RExC_parse = oregcomp_parse;
13624 	    vFAIL("Unmatched [");
13625 	}
13626 	nextchar(pRExC_state);
13627         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13628 	break;
13629     }
13630     case '(':
13631 	nextchar(pRExC_state);
13632         ret = reg(pRExC_state, 2, &flags, depth+1);
13633 	if (ret == 0) {
13634 		if (flags & TRYAGAIN) {
13635 		    if (RExC_parse >= RExC_end) {
13636 			 /* Make parent create an empty node if needed. */
13637 			*flagp |= TRYAGAIN;
13638 			return(0);
13639 		    }
13640 		    goto tryagain;
13641 		}
13642                 RETURN_FAIL_ON_RESTART(flags, flagp);
13643                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13644                                                                  (UV) flags);
13645 	}
13646 	*flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13647 	break;
13648     case '|':
13649     case ')':
13650 	if (flags & TRYAGAIN) {
13651 	    *flagp |= TRYAGAIN;
13652 	    return 0;
13653 	}
13654 	vFAIL("Internal urp");
13655 				/* Supposed to be caught earlier. */
13656 	break;
13657     case '?':
13658     case '+':
13659     case '*':
13660 	RExC_parse++;
13661 	vFAIL("Quantifier follows nothing");
13662 	break;
13663     case '\\':
13664 	/* Special Escapes
13665 
13666 	   This switch handles escape sequences that resolve to some kind
13667 	   of special regop and not to literal text. Escape sequences that
13668 	   resolve to literal text are handled below in the switch marked
13669 	   "Literal Escapes".
13670 
13671 	   Every entry in this switch *must* have a corresponding entry
13672 	   in the literal escape switch. However, the opposite is not
13673 	   required, as the default for this switch is to jump to the
13674 	   literal text handling code.
13675 	*/
13676 	RExC_parse++;
13677 	switch ((U8)*RExC_parse) {
13678 	/* Special Escapes */
13679 	case 'A':
13680 	    RExC_seen_zerolen++;
13681             /* Under wildcards, this is changed to match \n; should be
13682              * invisible to the user, as they have to compile under /m */
13683             if (RExC_pm_flags & PMf_WILDCARD) {
13684                 ret = reg_node(pRExC_state, MBOL);
13685             }
13686             else {
13687                 ret = reg_node(pRExC_state, SBOL);
13688                 /* SBOL is shared with /^/ so we set the flags so we can tell
13689                  * /\A/ from /^/ in split. */
13690                 FLAGS(REGNODE_p(ret)) = 1;
13691             }
13692 	    goto finish_meta_pat;
13693 	case 'G':
13694             if (RExC_pm_flags & PMf_WILDCARD) {
13695                 RExC_parse++;
13696                 /* diag_listed_as: Use of %s is not allowed in Unicode property
13697                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13698                  */
13699                 vFAIL("Use of '\\G' is not allowed in Unicode property"
13700                       " wildcard subpatterns");
13701             }
13702 	    ret = reg_node(pRExC_state, GPOS);
13703             RExC_seen |= REG_GPOS_SEEN;
13704 	    goto finish_meta_pat;
13705 	case 'K':
13706             if (!RExC_in_lookaround) {
13707                 RExC_seen_zerolen++;
13708                 ret = reg_node(pRExC_state, KEEPS);
13709                 /* XXX:dmq : disabling in-place substitution seems to
13710                  * be necessary here to avoid cases of memory corruption, as
13711                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13712                  */
13713                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13714                 goto finish_meta_pat;
13715             }
13716             else {
13717                 ++RExC_parse; /* advance past the 'K' */
13718                 vFAIL("\\K not permitted in lookahead/lookbehind");
13719             }
13720 	case 'Z':
13721             if (RExC_pm_flags & PMf_WILDCARD) {
13722                 /* See comment under \A above */
13723                 ret = reg_node(pRExC_state, MEOL);
13724             }
13725             else {
13726                 ret = reg_node(pRExC_state, SEOL);
13727             }
13728 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13729 	    goto finish_meta_pat;
13730 	case 'z':
13731             if (RExC_pm_flags & PMf_WILDCARD) {
13732                 /* See comment under \A above */
13733                 ret = reg_node(pRExC_state, MEOL);
13734             }
13735             else {
13736                 ret = reg_node(pRExC_state, EOS);
13737             }
13738 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13739 	    goto finish_meta_pat;
13740 	case 'C':
13741 	    vFAIL("\\C no longer supported");
13742 	case 'X':
13743 	    ret = reg_node(pRExC_state, CLUMP);
13744 	    *flagp |= HASWIDTH;
13745 	    goto finish_meta_pat;
13746 
13747 	case 'B':
13748             invert = 1;
13749             /* FALLTHROUGH */
13750 	case 'b':
13751           {
13752             U8 flags = 0;
13753 	    regex_charset charset = get_regex_charset(RExC_flags);
13754 
13755 	    RExC_seen_zerolen++;
13756             RExC_seen |= REG_LOOKBEHIND_SEEN;
13757 	    op = BOUND + charset;
13758 
13759 	    if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13760                 flags = TRADITIONAL_BOUND;
13761                 if (op > BOUNDA) {  /* /aa is same as /a */
13762                     op = BOUNDA;
13763                 }
13764             }
13765             else {
13766                 STRLEN length;
13767                 char name = *RExC_parse;
13768                 char * endbrace =  (char *) memchr(RExC_parse, '}',
13769                                                    RExC_end - RExC_parse);
13770                 char * e = endbrace;
13771 
13772                 RExC_parse += 2;
13773 
13774                 if (! endbrace) {
13775                     vFAIL2("Missing right brace on \\%c{}", name);
13776                 }
13777 
13778                 while (isBLANK(*RExC_parse)) {
13779                     RExC_parse++;
13780                 }
13781 
13782                 while (RExC_parse < e && isBLANK(*(e - 1))) {
13783                     e--;
13784                 }
13785 
13786                 if (e == RExC_parse) {
13787                     RExC_parse = endbrace + 1;  /* After the '}' */
13788                     vFAIL2("Empty \\%c{}", name);
13789                 }
13790 
13791                 length = e - RExC_parse;
13792 
13793                 switch (*RExC_parse) {
13794                     case 'g':
13795                         if (    length != 1
13796                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13797                         {
13798                             goto bad_bound_type;
13799                         }
13800                         flags = GCB_BOUND;
13801                         break;
13802                     case 'l':
13803                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13804                             goto bad_bound_type;
13805                         }
13806                         flags = LB_BOUND;
13807                         break;
13808                     case 's':
13809                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13810                             goto bad_bound_type;
13811                         }
13812                         flags = SB_BOUND;
13813                         break;
13814                     case 'w':
13815                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13816                             goto bad_bound_type;
13817                         }
13818                         flags = WB_BOUND;
13819                         break;
13820                     default:
13821                       bad_bound_type:
13822                         RExC_parse = e;
13823 			vFAIL2utf8f(
13824                             "'%" UTF8f "' is an unknown bound type",
13825 			    UTF8fARG(UTF, length, e - length));
13826                         NOT_REACHED; /*NOTREACHED*/
13827                 }
13828                 RExC_parse = endbrace;
13829                 REQUIRE_UNI_RULES(flagp, 0);
13830 
13831                 if (op == BOUND) {
13832                     op = BOUNDU;
13833                 }
13834                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13835                     op = BOUNDU;
13836                     length += 4;
13837 
13838                     /* Don't have to worry about UTF-8, in this message because
13839                      * to get here the contents of the \b must be ASCII */
13840                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13841                               "Using /u for '%.*s' instead of /%s",
13842                               (unsigned) length,
13843                               endbrace - length + 1,
13844                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13845                               ? ASCII_RESTRICT_PAT_MODS
13846                               : ASCII_MORE_RESTRICT_PAT_MODS);
13847                 }
13848 	    }
13849 
13850             if (op == BOUND) {
13851                 RExC_seen_d_op = TRUE;
13852             }
13853             else if (op == BOUNDL) {
13854                 RExC_contains_locale = 1;
13855             }
13856 
13857             if (invert) {
13858                 op += NBOUND - BOUND;
13859             }
13860 
13861 	    ret = reg_node(pRExC_state, op);
13862             FLAGS(REGNODE_p(ret)) = flags;
13863 
13864 	    goto finish_meta_pat;
13865           }
13866 
13867 	case 'R':
13868 	    ret = reg_node(pRExC_state, LNBREAK);
13869 	    *flagp |= HASWIDTH|SIMPLE;
13870 	    goto finish_meta_pat;
13871 
13872 	case 'd':
13873 	case 'D':
13874 	case 'h':
13875 	case 'H':
13876 	case 'p':
13877 	case 'P':
13878 	case 's':
13879 	case 'S':
13880 	case 'v':
13881 	case 'V':
13882 	case 'w':
13883 	case 'W':
13884             /* These all have the same meaning inside [brackets], and it knows
13885              * how to do the best optimizations for them.  So, pretend we found
13886              * these within brackets, and let it do the work */
13887             RExC_parse--;
13888 
13889             ret = regclass(pRExC_state, flagp, depth+1,
13890                            TRUE, /* means just parse this element */
13891                            FALSE, /* don't allow multi-char folds */
13892                            FALSE, /* don't silence non-portable warnings.  It
13893                                      would be a bug if these returned
13894                                      non-portables */
13895                            (bool) RExC_strict,
13896                            TRUE, /* Allow an optimized regnode result */
13897                            NULL);
13898             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13899             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13900              * multi-char folds are allowed.  */
13901             if (!ret)
13902                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13903                       (UV) *flagp);
13904 
13905             RExC_parse--;   /* regclass() leaves this one too far ahead */
13906 
13907           finish_meta_pat:
13908                    /* The escapes above that don't take a parameter can't be
13909                     * followed by a '{'.  But 'pX', 'p{foo}' and
13910                     * correspondingly 'P' can be */
13911             if (   RExC_parse - parse_start == 1
13912                 && UCHARAT(RExC_parse + 1) == '{'
13913                 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
13914             {
13915                 RExC_parse += 2;
13916                 vFAIL("Unescaped left brace in regex is illegal here");
13917             }
13918             Set_Node_Offset(REGNODE_p(ret), parse_start);
13919             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13920             nextchar(pRExC_state);
13921 	    break;
13922         case 'N':
13923             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13924              * \N{...} evaluates to a sequence of more than one code points).
13925              * The function call below returns a regnode, which is our result.
13926              * The parameters cause it to fail if the \N{} evaluates to a
13927              * single code point; we handle those like any other literal.  The
13928              * reason that the multicharacter case is handled here and not as
13929              * part of the EXACtish code is because of quantifiers.  In
13930              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13931              * this way makes that Just Happen. dmq.
13932              * join_exact() will join this up with adjacent EXACTish nodes
13933              * later on, if appropriate. */
13934             ++RExC_parse;
13935             if (grok_bslash_N(pRExC_state,
13936                               &ret,     /* Want a regnode returned */
13937                               NULL,     /* Fail if evaluates to a single code
13938                                            point */
13939                               NULL,     /* Don't need a count of how many code
13940                                            points */
13941                               flagp,
13942                               RExC_strict,
13943                               depth)
13944             ) {
13945                 break;
13946             }
13947 
13948             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13949 
13950             /* Here, evaluates to a single code point.  Go get that */
13951             RExC_parse = parse_start;
13952             goto defchar;
13953 
13954 	case 'k':    /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
13955       parse_named_seq:  /* Also handle non-numeric \g{...} */
13956         {
13957             char ch;
13958             if (   RExC_parse >= RExC_end - 1
13959                 || ((   ch = RExC_parse[1]) != '<'
13960                                       && ch != '\''
13961                                       && ch != '{'))
13962             {
13963 	        RExC_parse++;
13964 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13965 	        vFAIL2("Sequence %.2s... not terminated", parse_start);
13966 	    } else {
13967 		RExC_parse += 2;
13968                 if (ch == '{') {
13969                     while (isBLANK(*RExC_parse)) {
13970                         RExC_parse++;
13971                     }
13972                 }
13973                 ret = handle_named_backref(pRExC_state,
13974                                            flagp,
13975                                            parse_start,
13976                                            (ch == '<')
13977                                            ? '>'
13978                                            : (ch == '{')
13979                                              ? '}'
13980                                              : '\'');
13981             }
13982             break;
13983 	}
13984 	case 'g':
13985 	case '1': case '2': case '3': case '4':
13986 	case '5': case '6': case '7': case '8': case '9':
13987 	    {
13988 		I32 num;
13989 		char * endbrace = NULL;
13990                 char * s = RExC_parse;
13991                 char * e = RExC_end;
13992 
13993 		if (*s == 'g') {
13994                     bool isrel = 0;
13995 
13996 		    s++;
13997 		    if (*s == '{') {
13998                         endbrace = (char *) memchr(s, '}', RExC_end - s);
13999                         if (! endbrace ) {
14000 
14001                             /* Missing '}'.  Position after the number to give
14002                              * a better indication to the user of where the
14003                              * problem is. */
14004                             s++;
14005                             if (*s == '-') {
14006                                 s++;
14007                             }
14008 
14009                             /* If it looks to be a name and not a number, go
14010                              * handle it there */
14011                             if (! isDIGIT(*s)) {
14012                                 goto parse_named_seq;
14013                             }
14014 
14015                             do {
14016                                 s++;
14017                             } while isDIGIT(*s);
14018 
14019                             RExC_parse = s;
14020                             vFAIL("Unterminated \\g{...} pattern");
14021                         }
14022 
14023 		        s++;    /* Past the '{' */
14024 
14025                         while (isBLANK(*s)) {
14026                             s++;
14027                         }
14028 
14029                         /* Ignore trailing blanks */
14030                         e = endbrace;
14031                         while (s < e && isBLANK(*(e - 1))) {
14032                             e--;
14033                         }
14034 		    }
14035 
14036                     /* Here, have isolated the meat of the construct from any
14037                      * surrounding braces */
14038 
14039 		    if (*s == '-') {
14040 		        isrel = 1;
14041 		        s++;
14042 		    }
14043 
14044 		    if (endbrace && !isDIGIT(*s)) {
14045 		        goto parse_named_seq;
14046                     }
14047 
14048                     RExC_parse = s;
14049                     num = S_backref_value(RExC_parse, RExC_end);
14050                     if (num == 0)
14051                         vFAIL("Reference to invalid group 0");
14052                     else if (num == I32_MAX) {
14053                          if (isDIGIT(*RExC_parse))
14054 			    vFAIL("Reference to nonexistent group");
14055                         else
14056                             vFAIL("Unterminated \\g... pattern");
14057                     }
14058 
14059                     if (isrel) {
14060                         num = RExC_npar - num;
14061                         if (num < 1)
14062                             vFAIL("Reference to nonexistent or unclosed group");
14063                     }
14064                 }
14065                 else {
14066                     num = S_backref_value(RExC_parse, RExC_end);
14067                     /* bare \NNN might be backref or octal - if it is larger
14068                      * than or equal RExC_npar then it is assumed to be an
14069                      * octal escape. Note RExC_npar is +1 from the actual
14070                      * number of parens. */
14071                     /* Note we do NOT check if num == I32_MAX here, as that is
14072                      * handled by the RExC_npar check */
14073 
14074                     if (    /* any numeric escape < 10 is always a backref */
14075                            num > 9
14076                             /* any numeric escape < RExC_npar is a backref */
14077                         && num >= RExC_npar
14078                             /* cannot be an octal escape if it starts with [89]
14079                              * */
14080                         && ! inRANGE(*RExC_parse, '8', '9')
14081                     ) {
14082                         /* Probably not meant to be a backref, instead likely
14083                          * to be an octal character escape, e.g. \35 or \777.
14084                          * The above logic should make it obvious why using
14085                          * octal escapes in patterns is problematic. - Yves */
14086                         RExC_parse = parse_start;
14087                         goto defchar;
14088                     }
14089                 }
14090 
14091                 /* At this point RExC_parse points at a numeric escape like
14092                  * \12 or \88 or the digits in \g{34} or \g34 or something
14093                  * similar, which we should NOT treat as an octal escape. It
14094                  * may or may not be a valid backref escape. For instance
14095                  * \88888888 is unlikely to be a valid backref.
14096                  *
14097                  * We've already figured out what value the digits represent.
14098                  * Now, move the parse to beyond them. */
14099                 if (endbrace) {
14100                     RExC_parse = endbrace + 1;
14101                 }
14102                 else while (isDIGIT(*RExC_parse)) {
14103                     RExC_parse++;
14104                 }
14105 
14106                 if (num >= (I32)RExC_npar) {
14107 
14108                     /* It might be a forward reference; we can't fail until we
14109                      * know, by completing the parse to get all the groups, and
14110                      * then reparsing */
14111                     if (ALL_PARENS_COUNTED)  {
14112                         if (num >= RExC_total_parens)  {
14113                             vFAIL("Reference to nonexistent group");
14114                         }
14115                     }
14116                     else {
14117                         REQUIRE_PARENS_PASS;
14118                     }
14119                 }
14120                 RExC_sawback = 1;
14121                 ret = reganode(pRExC_state,
14122                                ((! FOLD)
14123                                  ? REF
14124                                  : (ASCII_FOLD_RESTRICTED)
14125                                    ? REFFA
14126                                    : (AT_LEAST_UNI_SEMANTICS)
14127                                      ? REFFU
14128                                      : (LOC)
14129                                        ? REFFL
14130                                        : REFF),
14131                                 num);
14132                 if (OP(REGNODE_p(ret)) == REFF) {
14133                     RExC_seen_d_op = TRUE;
14134                 }
14135                 *flagp |= HASWIDTH;
14136 
14137                 /* override incorrect value set in reganode MJD */
14138                 Set_Node_Offset(REGNODE_p(ret), parse_start);
14139                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
14140                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14141                                         FALSE /* Don't force to /x */ );
14142 	    }
14143 	    break;
14144 	case '\0':
14145 	    if (RExC_parse >= RExC_end)
14146 		FAIL("Trailing \\");
14147 	    /* FALLTHROUGH */
14148 	default:
14149 	    /* Do not generate "unrecognized" warnings here, we fall
14150 	       back into the quick-grab loop below */
14151             RExC_parse = parse_start;
14152 	    goto defchar;
14153 	} /* end of switch on a \foo sequence */
14154 	break;
14155 
14156     case '#':
14157 
14158         /* '#' comments should have been spaced over before this function was
14159          * called */
14160         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14161 	/*
14162         if (RExC_flags & RXf_PMf_EXTENDED) {
14163 	    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14164 	    if (RExC_parse < RExC_end)
14165 		goto tryagain;
14166 	}
14167         */
14168 
14169 	/* FALLTHROUGH */
14170 
14171     default:
14172 	  defchar: {
14173 
14174             /* Here, we have determined that the next thing is probably a
14175              * literal character.  RExC_parse points to the first byte of its
14176              * definition.  (It still may be an escape sequence that evaluates
14177              * to a single character) */
14178 
14179 	    STRLEN len = 0;
14180 	    UV ender = 0;
14181 	    char *p;
14182 	    char *s, *old_s = NULL, *old_old_s = NULL;
14183 	    char *s0;
14184             U32 max_string_len = 255;
14185 
14186             /* We may have to reparse the node, artificially stopping filling
14187              * it early, based on info gleaned in the first parse.  This
14188              * variable gives where we stop.  Make it above the normal stopping
14189              * place first time through; otherwise it would stop too early */
14190             U32 upper_fill = max_string_len + 1;
14191 
14192             /* We start out as an EXACT node, even if under /i, until we find a
14193              * character which is in a fold.  The algorithm now segregates into
14194              * separate nodes, characters that fold from those that don't under
14195              * /i.  (This hopefully will create nodes that are fixed strings
14196              * even under /i, giving the optimizer something to grab on to.)
14197              * So, if a node has something in it and the next character is in
14198              * the opposite category, that node is closed up, and the function
14199              * returns.  Then regatom is called again, and a new node is
14200              * created for the new category. */
14201             U8 node_type = EXACT;
14202 
14203             /* Assume the node will be fully used; the excess is given back at
14204              * the end.  Under /i, we may need to temporarily add the fold of
14205              * an extra character or two at the end to check for splitting
14206              * multi-char folds, so allocate extra space for that.   We can't
14207              * make any other length assumptions, as a byte input sequence
14208              * could shrink down. */
14209             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14210                                                  + ((! FOLD)
14211                                                     ? 0
14212                                                     : 2 * ((UTF)
14213                                                            ? UTF8_MAXBYTES_CASE
14214                         /* Max non-UTF-8 expansion is 2 */ : 2)));
14215 
14216             bool next_is_quantifier;
14217             char * oldp = NULL;
14218 
14219             /* We can convert EXACTF nodes to EXACTFU if they contain only
14220              * characters that match identically regardless of the target
14221              * string's UTF8ness.  The reason to do this is that EXACTF is not
14222              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14223              * runtime.
14224              *
14225              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14226              * contain only above-Latin1 characters (hence must be in UTF8),
14227              * which don't participate in folds with Latin1-range characters,
14228              * as the latter's folds aren't known until runtime. */
14229             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14230 
14231             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14232              * allows us to override this as encountered */
14233             U8 maybe_SIMPLE = SIMPLE;
14234 
14235             /* Does this node contain something that can't match unless the
14236              * target string is (also) in UTF-8 */
14237             bool requires_utf8_target = FALSE;
14238 
14239             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14240             bool has_ss = FALSE;
14241 
14242             /* So is the MICRO SIGN */
14243             bool has_micro_sign = FALSE;
14244 
14245             /* Set when we fill up the current node and there is still more
14246              * text to process */
14247             bool overflowed;
14248 
14249             /* Allocate an EXACT node.  The node_type may change below to
14250              * another EXACTish node, but since the size of the node doesn't
14251              * change, it works */
14252             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14253                                                                     "exact");
14254             FILL_NODE(ret, node_type);
14255             RExC_emit++;
14256 
14257 	    s = STRING(REGNODE_p(ret));
14258 
14259             s0 = s;
14260 
14261 	  reparse:
14262 
14263             p = RExC_parse;
14264             len = 0;
14265             s = s0;
14266             node_type = EXACT;
14267             oldp = NULL;
14268             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14269             maybe_SIMPLE = SIMPLE;
14270             requires_utf8_target = FALSE;
14271             has_ss = FALSE;
14272             has_micro_sign = FALSE;
14273 
14274           continue_parse:
14275 
14276             /* This breaks under rare circumstances.  If folding, we do not
14277              * want to split a node at a character that is a non-final in a
14278              * multi-char fold, as an input string could just happen to want to
14279              * match across the node boundary.  The code at the end of the loop
14280              * looks for this, and backs off until it finds not such a
14281              * character, but it is possible (though extremely, extremely
14282              * unlikely) for all characters in the node to be non-final fold
14283              * ones, in which case we just leave the node fully filled, and
14284              * hope that it doesn't match the string in just the wrong place */
14285 
14286             assert( ! UTF     /* Is at the beginning of a character */
14287                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14288                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14289 
14290             overflowed = FALSE;
14291 
14292             /* Here, we have a literal character.  Find the maximal string of
14293              * them in the input that we can fit into a single EXACTish node.
14294              * We quit at the first non-literal or when the node gets full, or
14295              * under /i the categorization of folding/non-folding character
14296              * changes */
14297             while (p < RExC_end && len < upper_fill) {
14298 
14299                 /* In most cases each iteration adds one byte to the output.
14300                  * The exceptions override this */
14301                 Size_t added_len = 1;
14302 
14303 		oldp = p;
14304                 old_old_s = old_s;
14305                 old_s = s;
14306 
14307                 /* White space has already been ignored */
14308                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14309                        || ! is_PATWS_safe((p), RExC_end, UTF));
14310 
14311 		switch ((U8)*p) {
14312                   const char* message;
14313                   U32 packed_warn;
14314                   U8 grok_c_char;
14315 
14316 		case '^':
14317 		case '$':
14318 		case '.':
14319 		case '[':
14320 		case '(':
14321 		case ')':
14322 		case '|':
14323 		    goto loopdone;
14324 		case '\\':
14325 		    /* Literal Escapes Switch
14326 
14327 		       This switch is meant to handle escape sequences that
14328 		       resolve to a literal character.
14329 
14330 		       Every escape sequence that represents something
14331 		       else, like an assertion or a char class, is handled
14332 		       in the switch marked 'Special Escapes' above in this
14333 		       routine, but also has an entry here as anything that
14334 		       isn't explicitly mentioned here will be treated as
14335 		       an unescaped equivalent literal.
14336 		    */
14337 
14338 		    switch ((U8)*++p) {
14339 
14340 		    /* These are all the special escapes. */
14341 		    case 'A':             /* Start assertion */
14342 		    case 'b': case 'B':   /* Word-boundary assertion*/
14343 		    case 'C':             /* Single char !DANGEROUS! */
14344 		    case 'd': case 'D':   /* digit class */
14345 		    case 'g': case 'G':   /* generic-backref, pos assertion */
14346 		    case 'h': case 'H':   /* HORIZWS */
14347 		    case 'k': case 'K':   /* named backref, keep marker */
14348 		    case 'p': case 'P':   /* Unicode property */
14349 		              case 'R':   /* LNBREAK */
14350 		    case 's': case 'S':   /* space class */
14351 		    case 'v': case 'V':   /* VERTWS */
14352 		    case 'w': case 'W':   /* word class */
14353                     case 'X':             /* eXtended Unicode "combining
14354                                              character sequence" */
14355 		    case 'z': case 'Z':   /* End of line/string assertion */
14356 			--p;
14357 			goto loopdone;
14358 
14359 	            /* Anything after here is an escape that resolves to a
14360 	               literal. (Except digits, which may or may not)
14361 	             */
14362 		    case 'n':
14363 			ender = '\n';
14364 			p++;
14365 			break;
14366 		    case 'N': /* Handle a single-code point named character. */
14367                         RExC_parse = p + 1;
14368                         if (! grok_bslash_N(pRExC_state,
14369                                             NULL,   /* Fail if evaluates to
14370                                                        anything other than a
14371                                                        single code point */
14372                                             &ender, /* The returned single code
14373                                                        point */
14374                                             NULL,   /* Don't need a count of
14375                                                        how many code points */
14376                                             flagp,
14377                                             RExC_strict,
14378                                             depth)
14379                         ) {
14380                             if (*flagp & NEED_UTF8)
14381                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14382                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14383 
14384                             /* Here, it wasn't a single code point.  Go close
14385                              * up this EXACTish node.  The switch() prior to
14386                              * this switch handles the other cases */
14387                             RExC_parse = p = oldp;
14388                             goto loopdone;
14389                         }
14390                         p = RExC_parse;
14391                         RExC_parse = parse_start;
14392 
14393                         /* The \N{} means the pattern, if previously /d,
14394                          * becomes /u.  That means it can't be an EXACTF node,
14395                          * but an EXACTFU */
14396                         if (node_type == EXACTF) {
14397                             node_type = EXACTFU;
14398 
14399                             /* If the node already contains something that
14400                              * differs between EXACTF and EXACTFU, reparse it
14401                              * as EXACTFU */
14402                             if (! maybe_exactfu) {
14403                                 len = 0;
14404                                 s = s0;
14405                                 goto reparse;
14406                             }
14407                         }
14408 
14409                         break;
14410 		    case 'r':
14411 			ender = '\r';
14412 			p++;
14413 			break;
14414 		    case 't':
14415 			ender = '\t';
14416 			p++;
14417 			break;
14418 		    case 'f':
14419 			ender = '\f';
14420 			p++;
14421 			break;
14422 		    case 'e':
14423 			ender = ESC_NATIVE;
14424 			p++;
14425 			break;
14426 		    case 'a':
14427 			ender = '\a';
14428 			p++;
14429 			break;
14430 		    case 'o':
14431                         if (! grok_bslash_o(&p,
14432                                             RExC_end,
14433                                             &ender,
14434                                             &message,
14435                                             &packed_warn,
14436                                             (bool) RExC_strict,
14437                                             FALSE, /* No illegal cp's */
14438                                             UTF))
14439                         {
14440                             RExC_parse = p; /* going to die anyway; point to
14441                                                exact spot of failure */
14442                             vFAIL(message);
14443                         }
14444 
14445                         if (message && TO_OUTPUT_WARNINGS(p)) {
14446                             warn_non_literal_string(p, packed_warn, message);
14447                         }
14448                         break;
14449 		    case 'x':
14450                         if (! grok_bslash_x(&p,
14451                                             RExC_end,
14452                                             &ender,
14453                                             &message,
14454                                             &packed_warn,
14455                                             (bool) RExC_strict,
14456                                             FALSE, /* No illegal cp's */
14457                                             UTF))
14458                         {
14459                             RExC_parse = p;	/* going to die anyway; point
14460                                                    to exact spot of failure */
14461                             vFAIL(message);
14462                         }
14463 
14464                         if (message && TO_OUTPUT_WARNINGS(p)) {
14465                             warn_non_literal_string(p, packed_warn, message);
14466                         }
14467 
14468 #ifdef EBCDIC
14469                         if (ender < 0x100) {
14470                             if (RExC_recode_x_to_native) {
14471                                 ender = LATIN1_TO_NATIVE(ender);
14472                             }
14473                         }
14474 #endif
14475                         break;
14476 		    case 'c':
14477                         p++;
14478                         if (! grok_bslash_c(*p, &grok_c_char,
14479                                             &message, &packed_warn))
14480                         {
14481                             /* going to die anyway; point to exact spot of
14482                              * failure */
14483                             RExC_parse = p + ((UTF)
14484                                               ? UTF8_SAFE_SKIP(p, RExC_end)
14485                                               : 1);
14486                             vFAIL(message);
14487                         }
14488 
14489                         ender = grok_c_char;
14490                         p++;
14491                         if (message && TO_OUTPUT_WARNINGS(p)) {
14492                             warn_non_literal_string(p, packed_warn, message);
14493                         }
14494 
14495 			break;
14496                     case '8': case '9': /* must be a backreference */
14497                         --p;
14498                         /* we have an escape like \8 which cannot be an octal escape
14499                          * so we exit the loop, and let the outer loop handle this
14500                          * escape which may or may not be a legitimate backref. */
14501                         goto loopdone;
14502                     case '1': case '2': case '3':case '4':
14503 		    case '5': case '6': case '7':
14504 
14505                         /* When we parse backslash escapes there is ambiguity
14506                          * between backreferences and octal escapes. Any escape
14507                          * from \1 - \9 is a backreference, any multi-digit
14508                          * escape which does not start with 0 and which when
14509                          * evaluated as decimal could refer to an already
14510                          * parsed capture buffer is a back reference. Anything
14511                          * else is octal.
14512                          *
14513                          * Note this implies that \118 could be interpreted as
14514                          * 118 OR as "\11" . "8" depending on whether there
14515                          * were 118 capture buffers defined already in the
14516                          * pattern.  */
14517 
14518                         /* NOTE, RExC_npar is 1 more than the actual number of
14519                          * parens we have seen so far, hence the "<" as opposed
14520                          * to "<=" */
14521                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14522                         {  /* Not to be treated as an octal constant, go
14523                                    find backref */
14524                             p = oldp;
14525                             goto loopdone;
14526                         }
14527                         /* FALLTHROUGH */
14528                     case '0':
14529 			{
14530 			    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14531                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
14532 			    STRLEN numlen = 3;
14533 			    ender = grok_oct(p, &numlen, &flags, NULL);
14534 			    p += numlen;
14535                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14536                                 && isDIGIT(*p)  /* like \08, \178 */
14537                                 && ckWARN(WARN_REGEXP))
14538                             {
14539 				reg_warn_non_literal_string(
14540                                      p + 1,
14541                                      form_alien_digit_msg(8, numlen, p,
14542                                                         RExC_end, UTF, FALSE));
14543                             }
14544 			}
14545 			break;
14546 		    case '\0':
14547 			if (p >= RExC_end)
14548 			    FAIL("Trailing \\");
14549 			/* FALLTHROUGH */
14550 		    default:
14551 			if (isALPHANUMERIC(*p)) {
14552                             /* An alpha followed by '{' is going to fail next
14553                              * iteration, so don't output this warning in that
14554                              * case */
14555                             if (! isALPHA(*p) || *(p + 1) != '{') {
14556                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14557                                                   " passed through", p);
14558                             }
14559 			}
14560 			goto normal_default;
14561 		    } /* End of switch on '\' */
14562 		    break;
14563 		case '{':
14564                     /* Trying to gain new uses for '{' without breaking too
14565                      * much existing code is hard.  The solution currently
14566                      * adopted is:
14567                      *  1)  If there is no ambiguity that a '{' should always
14568                      *      be taken literally, at the start of a construct, we
14569                      *      just do so.
14570                      *  2)  If the literal '{' conflicts with our desired use
14571                      *      of it as a metacharacter, we die.  The deprecation
14572                      *      cycles for this have come and gone.
14573                      *  3)  If there is ambiguity, we raise a simple warning.
14574                      *      This could happen, for example, if the user
14575                      *      intended it to introduce a quantifier, but slightly
14576                      *      misspelled the quantifier.  Without this warning,
14577                      *      the quantifier would silently be taken as a literal
14578                      *      string of characters instead of a meta construct */
14579 		    if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14580                         if (      RExC_strict
14581                             || (  p > parse_start + 1
14582                                 && isALPHA_A(*(p - 1))
14583                                 && *(p - 2) == '\\'))
14584                         {
14585                             RExC_parse = p + 1;
14586                             vFAIL("Unescaped left brace in regex is "
14587                                   "illegal here");
14588                         }
14589                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14590                                          " passed through");
14591 		    }
14592 		    goto normal_default;
14593                 case '}':
14594                 case ']':
14595                     if (p > RExC_parse && RExC_strict) {
14596                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14597                     }
14598 		    /*FALLTHROUGH*/
14599 		default:    /* A literal character */
14600 		  normal_default:
14601 		    if (! UTF8_IS_INVARIANT(*p) && UTF) {
14602 			STRLEN numlen;
14603 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14604 					       &numlen, UTF8_ALLOW_DEFAULT);
14605 			p += numlen;
14606 		    }
14607 		    else
14608 			ender = (U8) *p++;
14609 		    break;
14610 		} /* End of switch on the literal */
14611 
14612 		/* Here, have looked at the literal character, and <ender>
14613                  * contains its ordinal; <p> points to the character after it.
14614                  * */
14615 
14616                 if (ender > 255) {
14617                     REQUIRE_UTF8(flagp);
14618                     if (   UNICODE_IS_PERL_EXTENDED(ender)
14619                         && TO_OUTPUT_WARNINGS(p))
14620                     {
14621                         ckWARN2_non_literal_string(p,
14622                                                    packWARN(WARN_PORTABLE),
14623                                                    PL_extended_cp_format,
14624                                                    ender);
14625                     }
14626                 }
14627 
14628                 /* We need to check if the next non-ignored thing is a
14629                  * quantifier.  Move <p> to after anything that should be
14630                  * ignored, which, as a side effect, positions <p> for the next
14631                  * loop iteration */
14632                 skip_to_be_ignored_text(pRExC_state, &p,
14633                                         FALSE /* Don't force to /x */ );
14634 
14635                 /* If the next thing is a quantifier, it applies to this
14636                  * character only, which means that this character has to be in
14637                  * its own node and can't just be appended to the string in an
14638                  * existing node, so if there are already other characters in
14639                  * the node, close the node with just them, and set up to do
14640                  * this character again next time through, when it will be the
14641                  * only thing in its new node */
14642 
14643                 next_is_quantifier =    LIKELY(p < RExC_end)
14644                                      && UNLIKELY(isQUANTIFIER(p, RExC_end));
14645 
14646                 if (next_is_quantifier && LIKELY(len)) {
14647                     p = oldp;
14648                     goto loopdone;
14649                 }
14650 
14651                 /* Ready to add 'ender' to the node */
14652 
14653                 if (! FOLD) {  /* The simple case, just append the literal */
14654                   not_fold_common:
14655 
14656                     /* Don't output if it would overflow */
14657                     if (UNLIKELY(len > max_string_len - ((UTF)
14658                                                       ? UVCHR_SKIP(ender)
14659                                                       : 1)))
14660                     {
14661                         overflowed = TRUE;
14662                         break;
14663                     }
14664 
14665                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14666                         *(s++) = (char) ender;
14667                     }
14668                     else {
14669                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14670                         added_len = (char *) new_s - s;
14671                         s = (char *) new_s;
14672 
14673                         if (ender > 255)  {
14674                             requires_utf8_target = TRUE;
14675                         }
14676                     }
14677                 }
14678                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14679 
14680                     /* Here are folding under /l, and the code point is
14681                      * problematic.  If this is the first character in the
14682                      * node, change the node type to folding.   Otherwise, if
14683                      * this is the first problematic character, close up the
14684                      * existing node, so can start a new node with this one */
14685                     if (! len) {
14686                         node_type = EXACTFL;
14687                         RExC_contains_locale = 1;
14688                     }
14689                     else if (node_type == EXACT) {
14690                         p = oldp;
14691                         goto loopdone;
14692                     }
14693 
14694                     /* This problematic code point means we can't simplify
14695                      * things */
14696                     maybe_exactfu = FALSE;
14697 
14698                     /* Although these two characters have folds that are
14699                      * locale-problematic, they also have folds to above Latin1
14700                      * that aren't a problem.  Doing these now helps at
14701                      * runtime. */
14702                     if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
14703                                  || ender == LATIN_CAPITAL_LETTER_SHARP_S))
14704                     {
14705                         goto fold_anyway;
14706                     }
14707 
14708                     /* Here, we are adding a problematic fold character.
14709                      * "Problematic" in this context means that its fold isn't
14710                      * known until runtime.  (The non-problematic code points
14711                      * are the above-Latin1 ones that fold to also all
14712                      * above-Latin1.  Their folds don't vary no matter what the
14713                      * locale is.) But here we have characters whose fold
14714                      * depends on the locale.  We just add in the unfolded
14715                      * character, and wait until runtime to fold it */
14716                     goto not_fold_common;
14717                 }
14718                 else /* regular fold; see if actually is in a fold */
14719                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14720                          || (ender > 255
14721                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14722                 {
14723                     /* Here, folding, but the character isn't in a fold.
14724                      *
14725                      * Start a new node if previous characters in the node were
14726                      * folded */
14727                     if (len && node_type != EXACT) {
14728                         p = oldp;
14729                         goto loopdone;
14730                     }
14731 
14732                     /* Here, continuing a node with non-folded characters.  Add
14733                      * this one */
14734                     goto not_fold_common;
14735                 }
14736                 else {  /* Here, does participate in some fold */
14737 
14738                     /* If this is the first character in the node, change its
14739                      * type to folding.  Otherwise, if this is the first
14740                      * folding character in the node, close up the existing
14741                      * node, so can start a new node with this one.  */
14742                     if (! len) {
14743                         node_type = compute_EXACTish(pRExC_state);
14744                     }
14745                     else if (node_type == EXACT) {
14746                         p = oldp;
14747                         goto loopdone;
14748                     }
14749 
14750                     if (UTF) {  /* Alway use the folded value for UTF-8
14751                                    patterns */
14752                         if (UVCHR_IS_INVARIANT(ender)) {
14753                             if (UNLIKELY(len + 1 > max_string_len)) {
14754                                 overflowed = TRUE;
14755                                 break;
14756                             }
14757 
14758                             *(s)++ = (U8) toFOLD(ender);
14759                         }
14760                         else {
14761                             UV folded;
14762 
14763                           fold_anyway:
14764                             folded = _to_uni_fold_flags(
14765                                     ender,
14766                                     (U8 *) s,  /* We have allocated extra space
14767                                                   in 's' so can't run off the
14768                                                   end */
14769                                     &added_len,
14770                                     FOLD_FLAGS_FULL
14771                                   | ((   ASCII_FOLD_RESTRICTED
14772                                       || node_type == EXACTFL)
14773                                     ? FOLD_FLAGS_NOMIX_ASCII
14774                                     : 0));
14775                             if (UNLIKELY(len + added_len > max_string_len)) {
14776                                 overflowed = TRUE;
14777                                 break;
14778                             }
14779 
14780                             s += added_len;
14781 
14782                             if (   folded > 255
14783                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14784                             {
14785                                 /* U+B5 folds to the MU, so its possible for a
14786                                  * non-UTF-8 target to match it */
14787                                 requires_utf8_target = TRUE;
14788                             }
14789                         }
14790                     }
14791                     else { /* Here is non-UTF8. */
14792 
14793                         /* The fold will be one or (rarely) two characters.
14794                          * Check that there's room for at least a single one
14795                          * before setting any flags, etc.  Because otherwise an
14796                          * overflowing character could cause a flag to be set
14797                          * even though it doesn't end up in this node.  (For
14798                          * the two character fold, we check again, before
14799                          * setting any flags) */
14800                         if (UNLIKELY(len + 1 > max_string_len)) {
14801                             overflowed = TRUE;
14802                             break;
14803                         }
14804 
14805 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14806    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14807                                       || UNICODE_DOT_DOT_VERSION > 0)
14808 
14809                         /* On non-ancient Unicodes, check for the only possible
14810                          * multi-char fold  */
14811                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14812 
14813                             /* This potential multi-char fold means the node
14814                              * can't be simple (because it could match more
14815                              * than a single char).  And in some cases it will
14816                              * match 'ss', so set that flag */
14817                             maybe_SIMPLE = 0;
14818                             has_ss = TRUE;
14819 
14820                             /* It can't change to be an EXACTFU (unless already
14821                              * is one).  We fold it iff under /u rules. */
14822                             if (node_type != EXACTFU) {
14823                                 maybe_exactfu = FALSE;
14824                             }
14825                             else {
14826                                 if (UNLIKELY(len + 2 > max_string_len)) {
14827                                     overflowed = TRUE;
14828                                     break;
14829                                 }
14830 
14831                                 *(s++) = 's';
14832                                 *(s++) = 's';
14833                                 added_len = 2;
14834 
14835                                 goto done_with_this_char;
14836                             }
14837                         }
14838                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14839                                  && LIKELY(len > 0)
14840                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14841                         {
14842                             /* Also, the sequence 'ss' is special when not
14843                              * under /u.  If the target string is UTF-8, it
14844                              * should match SHARP S; otherwise it won't.  So,
14845                              * here we have to exclude the possibility of this
14846                              * node moving to /u.*/
14847                             has_ss = TRUE;
14848                             maybe_exactfu = FALSE;
14849                         }
14850 #endif
14851                         /* Here, the fold will be a single character */
14852 
14853                         if (UNLIKELY(ender == MICRO_SIGN)) {
14854                             has_micro_sign = TRUE;
14855                         }
14856                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14857 
14858                             /* If the character's fold differs between /d and
14859                              * /u, this can't change to be an EXACTFU node */
14860                             maybe_exactfu = FALSE;
14861                         }
14862 
14863                         *(s++) = (DEPENDS_SEMANTICS)
14864                                  ? (char) toFOLD(ender)
14865 
14866                                    /* Under /u, the fold of any character in
14867                                     * the 0-255 range happens to be its
14868                                     * lowercase equivalent, except for LATIN
14869                                     * SMALL LETTER SHARP S, which was handled
14870                                     * above, and the MICRO SIGN, whose fold
14871                                     * requires UTF-8 to represent.  */
14872                                  : (char) toLOWER_L1(ender);
14873                     }
14874 		} /* End of adding current character to the node */
14875 
14876               done_with_this_char:
14877 
14878                 len += added_len;
14879 
14880 		if (next_is_quantifier) {
14881 
14882                     /* Here, the next input is a quantifier, and to get here,
14883                      * the current character is the only one in the node. */
14884                     goto loopdone;
14885 		}
14886 
14887 	    } /* End of loop through literal characters */
14888 
14889             /* Here we have either exhausted the input or run out of room in
14890              * the node.  If the former, we are done.  (If we encountered a
14891              * character that can't be in the node, transfer is made directly
14892              * to <loopdone>, and so we wouldn't have fallen off the end of the
14893              * loop.)  */
14894             if (LIKELY(! overflowed)) {
14895                 goto loopdone;
14896             }
14897 
14898             /* Here we have run out of room.  We can grow plain EXACT and
14899              * LEXACT nodes.  If the pattern is gigantic enough, though,
14900              * eventually we'll have to artificially chunk the pattern into
14901              * multiple nodes. */
14902             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14903                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14904                 Size_t overhead_expansion = 0;
14905                 char temp[256];
14906                 Size_t max_nodes_for_string;
14907                 Size_t achievable;
14908                 SSize_t delta;
14909 
14910                 /* Here we couldn't fit the final character in the current
14911                  * node, so it will have to be reparsed, no matter what else we
14912                  * do */
14913                 p = oldp;
14914 
14915                 /* If would have overflowed a regular EXACT node, switch
14916                  * instead to an LEXACT.  The code below is structured so that
14917                  * the actual growing code is common to changing from an EXACT
14918                  * or just increasing the LEXACT size.  This means that we have
14919                  * to save the string in the EXACT case before growing, and
14920                  * then copy it afterwards to its new location */
14921                 if (node_type == EXACT) {
14922                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14923                     RExC_emit += overhead_expansion;
14924                     Copy(s0, temp, len, char);
14925                 }
14926 
14927                 /* Ready to grow.  If it was a plain EXACT, the string was
14928                  * saved, and the first few bytes of it overwritten by adding
14929                  * an argument field.  We assume, as we do elsewhere in this
14930                  * file, that one byte of remaining input will translate into
14931                  * one byte of output, and if that's too small, we grow again,
14932                  * if too large the excess memory is freed at the end */
14933 
14934                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14935                 achievable = MIN(max_nodes_for_string,
14936                                  current_string_nodes + STR_SZ(RExC_end - p));
14937                 delta = achievable - current_string_nodes;
14938 
14939                 /* If there is just no more room, go finish up this chunk of
14940                  * the pattern. */
14941                 if (delta <= 0) {
14942                     goto loopdone;
14943                 }
14944 
14945                 change_engine_size(pRExC_state, delta + overhead_expansion);
14946                 current_string_nodes += delta;
14947                 max_string_len
14948                            = sizeof(struct regnode) * current_string_nodes;
14949                 upper_fill = max_string_len + 1;
14950 
14951                 /* If the length was small, we know this was originally an
14952                  * EXACT node now converted to LEXACT, and the string has to be
14953                  * restored.  Otherwise the string was untouched.  260 is just
14954                  * a number safely above 255 so don't have to worry about
14955                  * getting it precise */
14956                 if (len < 260) {
14957                     node_type = LEXACT;
14958                     FILL_NODE(ret, node_type);
14959                     s0 = STRING(REGNODE_p(ret));
14960                     Copy(temp, s0, len, char);
14961                     s = s0 + len;
14962                 }
14963 
14964                 goto continue_parse;
14965             }
14966             else if (FOLD) {
14967                 bool splittable = FALSE;
14968                 bool backed_up = FALSE;
14969                 char * e;       /* should this be U8? */
14970                 char * s_start; /* should this be U8? */
14971 
14972                 /* Here is /i.  Running out of room creates a problem if we are
14973                  * folding, and the split happens in the middle of a
14974                  * multi-character fold, as a match that should have occurred,
14975                  * won't, due to the way nodes are matched, and our artificial
14976                  * boundary.  So back off until we aren't splitting such a
14977                  * fold.  If there is no such place to back off to, we end up
14978                  * taking the entire node as-is.  This can happen if the node
14979                  * consists entirely of 'f' or entirely of 's' characters (or
14980                  * things that fold to them) as 'ff' and 'ss' are
14981                  * multi-character folds.
14982                  *
14983                  * The Unicode standard says that multi character folds consist
14984                  * of either two or three characters.  That means we would be
14985                  * splitting one if the final character in the node is at the
14986                  * beginning of either type, or is the second of a three
14987                  * character fold.
14988                  *
14989                  * At this point:
14990                  *  ender     is the code point of the character that won't fit
14991                  *            in the node
14992                  *  s         points to just beyond the final byte in the node.
14993                  *            It's where we would place ender if there were
14994                  *            room, and where in fact we do place ender's fold
14995                  *            in the code below, as we've over-allocated space
14996                  *            for s0 (hence s) to allow for this
14997                  *  e         starts at 's' and advances as we append things.
14998                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14999                  *            have been advanced to beyond it).
15000                  *  old_old_s points to the beginning byte of the final
15001                  *            character in the node
15002                  *  p         points to the beginning byte in the input of the
15003                  *            character beyond 'ender'.
15004                  *  oldp      points to the beginning byte in the input of
15005                  *            'ender'.
15006                  *
15007                  * In the case of /il, we haven't folded anything that could be
15008                  * affected by the locale.  That means only above-Latin1
15009                  * characters that fold to other above-latin1 characters get
15010                  * folded at compile time.  To check where a good place to
15011                  * split nodes is, everything in it will have to be folded.
15012                  * The boolean 'maybe_exactfu' keeps track in /il if there are
15013                  * any unfolded characters in the node. */
15014                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
15015 
15016                 /* If we do need to fold the node, we need a place to store the
15017                  * folded copy, and a way to map back to the unfolded original
15018                  * */
15019                 char * locfold_buf = NULL;
15020                 Size_t * loc_correspondence = NULL;
15021 
15022                 if (! need_to_fold_loc) {   /* The normal case.  Just
15023                                                initialize to the actual node */
15024                     e = s;
15025                     s_start = s0;
15026                     s = old_old_s;  /* Point to the beginning of the final char
15027                                        that fits in the node */
15028                 }
15029                 else {
15030 
15031                     /* Here, we have filled a /il node, and there are unfolded
15032                      * characters in it.  If the runtime locale turns out to be
15033                      * UTF-8, there are possible multi-character folds, just
15034                      * like when not under /l.  The node hence can't terminate
15035                      * in the middle of such a fold.  To determine this, we
15036                      * have to create a folded copy of this node.  That means
15037                      * reparsing the node, folding everything assuming a UTF-8
15038                      * locale.  (If at runtime it isn't such a locale, the
15039                      * actions here wouldn't have been necessary, but we have
15040                      * to assume the worst case.)  If we find we need to back
15041                      * off the folded string, we do so, and then map that
15042                      * position back to the original unfolded node, which then
15043                      * gets output, truncated at that spot */
15044 
15045                     char * redo_p = RExC_parse;
15046                     char * redo_e;
15047                     char * old_redo_e;
15048 
15049                     /* Allow enough space assuming a single byte input folds to
15050                      * a single byte output, plus assume that the two unparsed
15051                      * characters (that we may need) fold to the largest number
15052                      * of bytes possible, plus extra for one more worst case
15053                      * scenario.  In the loop below, if we start eating into
15054                      * that final spare space, we enlarge this initial space */
15055                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
15056 
15057                     Newxz(locfold_buf, size, char);
15058                     Newxz(loc_correspondence, size, Size_t);
15059 
15060                     /* Redo this node's parse, folding into 'locfold_buf' */
15061                     redo_p = RExC_parse;
15062                     old_redo_e = redo_e = locfold_buf;
15063                     while (redo_p <= oldp) {
15064 
15065                         old_redo_e = redo_e;
15066                         loc_correspondence[redo_e - locfold_buf]
15067                                                         = redo_p - RExC_parse;
15068 
15069                         if (UTF) {
15070                             Size_t added_len;
15071 
15072                             (void) _to_utf8_fold_flags((U8 *) redo_p,
15073                                                        (U8 *) RExC_end,
15074                                                        (U8 *) redo_e,
15075                                                        &added_len,
15076                                                        FOLD_FLAGS_FULL);
15077                             redo_e += added_len;
15078                             redo_p += UTF8SKIP(redo_p);
15079                         }
15080                         else {
15081 
15082                             /* Note that if this code is run on some ancient
15083                              * Unicode versions, SHARP S doesn't fold to 'ss',
15084                              * but rather than clutter the code with #ifdef's,
15085                              * as is done above, we ignore that possibility.
15086                              * This is ok because this code doesn't affect what
15087                              * gets matched, but merely where the node gets
15088                              * split */
15089                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
15090                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
15091                             }
15092                             else {
15093                                 *redo_e++ = 's';
15094                                 *redo_e++ = 's';
15095                             }
15096                             redo_p++;
15097                         }
15098 
15099 
15100                         /* If we're getting so close to the end that a
15101                          * worst-case fold in the next character would cause us
15102                          * to overflow, increase, assuming one byte output byte
15103                          * per one byte input one, plus room for another worst
15104                          * case fold */
15105                         if (   redo_p <= oldp
15106                             && redo_e > locfold_buf + size
15107                                                     - (UTF8_MAXBYTES_CASE + 1))
15108                         {
15109                             Size_t new_size = size
15110                                             + (oldp - redo_p)
15111                                             + UTF8_MAXBYTES_CASE + 1;
15112                             Ptrdiff_t e_offset = redo_e - locfold_buf;
15113 
15114                             Renew(locfold_buf, new_size, char);
15115                             Renew(loc_correspondence, new_size, Size_t);
15116                             size = new_size;
15117 
15118                             redo_e = locfold_buf + e_offset;
15119                         }
15120                     }
15121 
15122                     /* Set so that things are in terms of the folded, temporary
15123                      * string */
15124                     s = old_redo_e;
15125                     s_start = locfold_buf;
15126                     e = redo_e;
15127 
15128                 }
15129 
15130                 /* Here, we have 's', 's_start' and 'e' set up to point to the
15131                  * input that goes into the node, folded.
15132                  *
15133                  * If the final character of the node and the fold of ender
15134                  * form the first two characters of a three character fold, we
15135                  * need to peek ahead at the next (unparsed) character in the
15136                  * input to determine if the three actually do form such a
15137                  * fold.  Just looking at that character is not generally
15138                  * sufficient, as it could be, for example, an escape sequence
15139                  * that evaluates to something else, and it needs to be folded.
15140                  *
15141                  * khw originally thought to just go through the parse loop one
15142                  * extra time, but that doesn't work easily as that iteration
15143                  * could cause things to think that the parse is over and to
15144                  * goto loopdone.  The character could be a '$' for example, or
15145                  * the character beyond could be a quantifier, and other
15146                  * glitches as well.
15147                  *
15148                  * The solution used here for peeking ahead is to look at that
15149                  * next character.  If it isn't ASCII punctuation, then it will
15150                  * be something that would continue on in an EXACTish node if
15151                  * there were space.  We append the fold of it to s, having
15152                  * reserved enough room in s0 for the purpose.  If we can't
15153                  * reasonably peek ahead, we instead assume the worst case:
15154                  * that it is something that would form the completion of a
15155                  * multi-char fold.
15156                  *
15157                  * If we can't split between s and ender, we work backwards
15158                  * character-by-character down to s0.  At each current point
15159                  * see if we are at the beginning of a multi-char fold.  If so,
15160                  * that means we would be splitting the fold across nodes, and
15161                  * so we back up one and try again.
15162                  *
15163                  * If we're not at the beginning, we still could be at the
15164                  * final two characters of a (rare) three character fold.  We
15165                  * check if the sequence starting at the character before the
15166                  * current position (and including the current and next
15167                  * characters) is a three character fold.  If not, the node can
15168                  * be split here.  If it is, we have to backup two characters
15169                  * and try again.
15170                  *
15171                  * Otherwise, the node can be split at the current position.
15172                  *
15173                  * The same logic is used for UTF-8 patterns and not */
15174                 if (UTF) {
15175                     Size_t added_len;
15176 
15177                     /* Append the fold of ender */
15178                     (void) _to_uni_fold_flags(
15179                         ender,
15180                         (U8 *) e,
15181                         &added_len,
15182                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15183                                         ? FOLD_FLAGS_NOMIX_ASCII
15184                                         : 0));
15185                     e += added_len;
15186 
15187                     /* 's' and the character folded to by ender may be the
15188                      * first two of a three-character fold, in which case the
15189                      * node should not be split here.  That may mean examining
15190                      * the so-far unparsed character starting at 'p'.  But if
15191                      * ender folded to more than one character, we already have
15192                      * three characters to look at.  Also, we first check if
15193                      * the sequence consisting of s and the next character form
15194                      * the first two of some three character fold.  If not,
15195                      * there's no need to peek ahead. */
15196                     if (   added_len <= UTF8SKIP(e - added_len)
15197                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15198                     {
15199                         /* Here, the two do form the beginning of a potential
15200                          * three character fold.  The unexamined character may
15201                          * or may not complete it.  Peek at it.  It might be
15202                          * something that ends the node or an escape sequence,
15203                          * in which case we don't know without a lot of work
15204                          * what it evaluates to, so we have to assume the worst
15205                          * case: that it does complete the fold, and so we
15206                          * can't split here.  All such instances  will have
15207                          * that character be an ASCII punctuation character,
15208                          * like a backslash.  So, for that case, backup one and
15209                          * drop down to try at that position */
15210                         if (isPUNCT(*p)) {
15211                             s = (char *) utf8_hop_back((U8 *) s, -1,
15212                                        (U8 *) s_start);
15213                             backed_up = TRUE;
15214                         }
15215                         else {
15216                             /* Here, since it's not punctuation, it must be a
15217                              * real character, and we can append its fold to
15218                              * 'e' (having deliberately reserved enough space
15219                              * for this eventuality) and drop down to check if
15220                              * the three actually do form a folded sequence */
15221                             (void) _to_utf8_fold_flags(
15222                                 (U8 *) p, (U8 *) RExC_end,
15223                                 (U8 *) e,
15224                                 &added_len,
15225                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15226                                                 ? FOLD_FLAGS_NOMIX_ASCII
15227                                                 : 0));
15228                             e += added_len;
15229                         }
15230                     }
15231 
15232                     /* Here, we either have three characters available in
15233                      * sequence starting at 's', or we have two characters and
15234                      * know that the following one can't possibly be part of a
15235                      * three character fold.  We go through the node backwards
15236                      * until we find a place where we can split it without
15237                      * breaking apart a multi-character fold.  At any given
15238                      * point we have to worry about if such a fold begins at
15239                      * the current 's', and also if a three-character fold
15240                      * begins at s-1, (containing s and s+1).  Splitting in
15241                      * either case would break apart a fold */
15242                     do {
15243                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15244                                                             (U8 *) s_start);
15245 
15246                         /* If is a multi-char fold, can't split here.  Backup
15247                          * one char and try again */
15248                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15249                             s = prev_s;
15250                             backed_up = TRUE;
15251                             continue;
15252                         }
15253 
15254                         /* If the two characters beginning at 's' are part of a
15255                          * three character fold starting at the character
15256                          * before s, we can't split either before or after s.
15257                          * Backup two chars and try again */
15258                         if (   LIKELY(s > s_start)
15259                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15260                         {
15261                             s = prev_s;
15262                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15263                             backed_up = TRUE;
15264                             continue;
15265                         }
15266 
15267                         /* Here there's no multi-char fold between s and the
15268                          * next character following it.  We can split */
15269                         splittable = TRUE;
15270                         break;
15271 
15272                     } while (s > s_start); /* End of loops backing up through the node */
15273 
15274                     /* Here we either couldn't find a place to split the node,
15275                      * or else we broke out of the loop setting 'splittable' to
15276                      * true.  In the latter case, the place to split is between
15277                      * the first and second characters in the sequence starting
15278                      * at 's' */
15279                     if (splittable) {
15280                         s += UTF8SKIP(s);
15281                     }
15282                 }
15283                 else {  /* Pattern not UTF-8 */
15284                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
15285                         || ASCII_FOLD_RESTRICTED)
15286                     {
15287                         assert( toLOWER_L1(ender) < 256 );
15288                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15289                     }
15290                     else {
15291                         *e++ = 's';
15292                         *e++ = 's';
15293                     }
15294 
15295                     if (   e - s  <= 1
15296                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15297                     {
15298                         if (isPUNCT(*p)) {
15299                             s--;
15300                             backed_up = TRUE;
15301                         }
15302                         else {
15303                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15304                                 || ASCII_FOLD_RESTRICTED)
15305                             {
15306                                 assert( toLOWER_L1(ender) < 256 );
15307                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15308                             }
15309                             else {
15310                                 *e++ = 's';
15311                                 *e++ = 's';
15312                             }
15313                         }
15314                     }
15315 
15316                     do {
15317                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15318                             s--;
15319                             backed_up = TRUE;
15320                             continue;
15321                         }
15322 
15323                         if (   LIKELY(s > s_start)
15324                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15325                         {
15326                             s -= 2;
15327                             backed_up = TRUE;
15328                             continue;
15329                         }
15330 
15331                         splittable = TRUE;
15332                         break;
15333 
15334                     } while (s > s_start);
15335 
15336                     if (splittable) {
15337                         s++;
15338                     }
15339                 }
15340 
15341                 /* Here, we are done backing up.  If we didn't backup at all
15342                  * (the likely case), just proceed */
15343                 if (backed_up) {
15344 
15345                    /* If we did find a place to split, reparse the entire node
15346                     * stopping where we have calculated. */
15347                     if (splittable) {
15348 
15349                        /* If we created a temporary folded string under /l, we
15350                         * have to map that back to the original */
15351                         if (need_to_fold_loc) {
15352                             upper_fill = loc_correspondence[s - s_start];
15353                             if (upper_fill == 0) {
15354                                 FAIL2("panic: loc_correspondence[%d] is 0",
15355                                       (int) (s - s_start));
15356                             }
15357                             Safefree(locfold_buf);
15358                             Safefree(loc_correspondence);
15359                         }
15360                         else {
15361                             upper_fill = s - s0;
15362                         }
15363                         goto reparse;
15364                     }
15365 
15366                     /* Here the node consists entirely of non-final multi-char
15367                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
15368                      * decent place to split it, so give up and just take the
15369                      * whole thing */
15370                     len = old_s - s0;
15371                 }
15372 
15373                 if (need_to_fold_loc) {
15374                     Safefree(locfold_buf);
15375                     Safefree(loc_correspondence);
15376                 }
15377 	    }   /* End of verifying node ends with an appropriate char */
15378 
15379             /* We need to start the next node at the character that didn't fit
15380              * in this one */
15381             p = oldp;
15382 
15383           loopdone:   /* Jumped to when encounters something that shouldn't be
15384                          in the node */
15385 
15386             /* Free up any over-allocated space; cast is to silence bogus
15387              * warning in MS VC */
15388             change_engine_size(pRExC_state,
15389                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15390 
15391             /* I (khw) don't know if you can get here with zero length, but the
15392              * old code handled this situation by creating a zero-length EXACT
15393              * node.  Might as well be NOTHING instead */
15394             if (len == 0) {
15395                 OP(REGNODE_p(ret)) = NOTHING;
15396             }
15397             else {
15398 
15399                 /* If the node type is EXACT here, check to see if it
15400                  * should be EXACTL, or EXACT_REQ8. */
15401                 if (node_type == EXACT) {
15402                     if (LOC) {
15403                         node_type = EXACTL;
15404                     }
15405                     else if (requires_utf8_target) {
15406                         node_type = EXACT_REQ8;
15407                     }
15408                 }
15409                 else if (node_type == LEXACT) {
15410                     if (requires_utf8_target) {
15411                         node_type = LEXACT_REQ8;
15412                     }
15413                 }
15414                 else if (FOLD) {
15415                     if (    UNLIKELY(has_micro_sign || has_ss)
15416                         && (node_type == EXACTFU || (   node_type == EXACTF
15417                                                      && maybe_exactfu)))
15418                     {   /* These two conditions are problematic in non-UTF-8
15419                            EXACTFU nodes. */
15420                         assert(! UTF);
15421                         node_type = EXACTFUP;
15422                     }
15423                     else if (node_type == EXACTFL) {
15424 
15425                         /* 'maybe_exactfu' is deliberately set above to
15426                          * indicate this node type, where all code points in it
15427                          * are above 255 */
15428                         if (maybe_exactfu) {
15429                             node_type = EXACTFLU8;
15430                         }
15431                         else if (UNLIKELY(
15432                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15433                         {
15434                             /* A character that folds to more than one will
15435                              * match multiple characters, so can't be SIMPLE.
15436                              * We don't have to worry about this with EXACTFLU8
15437                              * nodes just above, as they have already been
15438                              * folded (since the fold doesn't vary at run
15439                              * time).  Here, if the final character in the node
15440                              * folds to multiple, it can't be simple.  (This
15441                              * only has an effect if the node has only a single
15442                              * character, hence the final one, as elsewhere we
15443                              * turn off simple for nodes whose length > 1 */
15444                             maybe_SIMPLE = 0;
15445                         }
15446                     }
15447                     else if (node_type == EXACTF) {  /* Means is /di */
15448 
15449                         /* This intermediate variable is needed solely because
15450                          * the asserts in the macro where used exceed Win32's
15451                          * literal string capacity */
15452                         char first_char = * STRING(REGNODE_p(ret));
15453 
15454                         /* If 'maybe_exactfu' is clear, then we need to stay
15455                          * /di.  If it is set, it means there are no code
15456                          * points that match differently depending on UTF8ness
15457                          * of the target string, so it can become an EXACTFU
15458                          * node */
15459                         if (! maybe_exactfu) {
15460                             RExC_seen_d_op = TRUE;
15461                         }
15462                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15463                                  || isALPHA_FOLD_EQ(ender, 's'))
15464                         {
15465                             /* But, if the node begins or ends in an 's' we
15466                              * have to defer changing it into an EXACTFU, as
15467                              * the node could later get joined with another one
15468                              * that ends or begins with 's' creating an 'ss'
15469                              * sequence which would then wrongly match the
15470                              * sharp s without the target being UTF-8.  We
15471                              * create a special node that we resolve later when
15472                              * we join nodes together */
15473 
15474                             node_type = EXACTFU_S_EDGE;
15475                         }
15476                         else {
15477                             node_type = EXACTFU;
15478                         }
15479                     }
15480 
15481                     if (requires_utf8_target && node_type == EXACTFU) {
15482                         node_type = EXACTFU_REQ8;
15483                     }
15484                 }
15485 
15486                 OP(REGNODE_p(ret)) = node_type;
15487                 setSTR_LEN(REGNODE_p(ret), len);
15488                 RExC_emit += STR_SZ(len);
15489 
15490                 /* If the node isn't a single character, it can't be SIMPLE */
15491                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15492                     maybe_SIMPLE = 0;
15493                 }
15494 
15495                 *flagp |= HASWIDTH | maybe_SIMPLE;
15496             }
15497 
15498             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15499             RExC_parse = p;
15500 
15501 	    {
15502 		/* len is STRLEN which is unsigned, need to copy to signed */
15503 		IV iv = len;
15504 		if (iv < 0)
15505 		    vFAIL("Internal disaster");
15506 	    }
15507 
15508 	} /* End of label 'defchar:' */
15509 	break;
15510     } /* End of giant switch on input character */
15511 
15512     /* Position parse to next real character */
15513     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15514                                             FALSE /* Don't force to /x */ );
15515     if (   *RExC_parse == '{'
15516         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
15517     {
15518         if (RExC_strict) {
15519             RExC_parse++;
15520             vFAIL("Unescaped left brace in regex is illegal here");
15521         }
15522         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15523                                   " passed through");
15524     }
15525 
15526     return(ret);
15527 }
15528 
15529 
15530 STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)15531 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15532 {
15533     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15534      * sets up the bitmap and any flags, removing those code points from the
15535      * inversion list, setting it to NULL should it become completely empty */
15536 
15537 
15538     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15539     assert(PL_regkind[OP(node)] == ANYOF);
15540 
15541     /* There is no bitmap for this node type */
15542     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15543         return;
15544     }
15545 
15546     ANYOF_BITMAP_ZERO(node);
15547     if (*invlist_ptr) {
15548 
15549 	/* This gets set if we actually need to modify things */
15550 	bool change_invlist = FALSE;
15551 
15552 	UV start, end;
15553 
15554 	/* Start looking through *invlist_ptr */
15555 	invlist_iterinit(*invlist_ptr);
15556 	while (invlist_iternext(*invlist_ptr, &start, &end)) {
15557 	    UV high;
15558 	    int i;
15559 
15560             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15561                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15562             }
15563 
15564 	    /* Quit if are above what we should change */
15565 	    if (start >= NUM_ANYOF_CODE_POINTS) {
15566 		break;
15567 	    }
15568 
15569 	    change_invlist = TRUE;
15570 
15571 	    /* Set all the bits in the range, up to the max that we are doing */
15572 	    high = (end < NUM_ANYOF_CODE_POINTS - 1)
15573                    ? end
15574                    : NUM_ANYOF_CODE_POINTS - 1;
15575 	    for (i = start; i <= (int) high; i++) {
15576                 ANYOF_BITMAP_SET(node, i);
15577 	    }
15578 	}
15579 	invlist_iterfinish(*invlist_ptr);
15580 
15581         /* Done with loop; remove any code points that are in the bitmap from
15582          * *invlist_ptr; similarly for code points above the bitmap if we have
15583          * a flag to match all of them anyways */
15584 	if (change_invlist) {
15585 	    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15586 	}
15587         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15588 	    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15589 	}
15590 
15591 	/* If have completely emptied it, remove it completely */
15592 	if (_invlist_len(*invlist_ptr) == 0) {
15593 	    SvREFCNT_dec_NN(*invlist_ptr);
15594 	    *invlist_ptr = NULL;
15595 	}
15596     }
15597 }
15598 
15599 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15600    Character classes ([:foo:]) can also be negated ([:^foo:]).
15601    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15602    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15603    but trigger failures because they are currently unimplemented. */
15604 
15605 #define POSIXCC_DONE(c)   ((c) == ':')
15606 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15607 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15608 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15609 
15610 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15611 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15612 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15613 
15614 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15615 
15616 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15617  * routine. q.v. */
15618 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15619         if (posix_warnings) {                                               \
15620             if (! RExC_warn_text ) RExC_warn_text =                         \
15621                                          (AV *) sv_2mortal((SV *) newAV()); \
15622             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15623                                              WARNING_PREFIX                 \
15624                                              text                           \
15625                                              REPORT_LOCATION,               \
15626                                              REPORT_LOCATION_ARGS(p)));     \
15627         }                                                                   \
15628     } STMT_END
15629 #define CLEAR_POSIX_WARNINGS()                                              \
15630     STMT_START {                                                            \
15631         if (posix_warnings && RExC_warn_text)                               \
15632             av_clear(RExC_warn_text);                                       \
15633     } STMT_END
15634 
15635 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15636     STMT_START {                                                            \
15637         CLEAR_POSIX_WARNINGS();                                             \
15638         return ret;                                                         \
15639     } STMT_END
15640 
15641 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)15642 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15643 
15644     const char * const s,      /* Where the putative posix class begins.
15645                                   Normally, this is one past the '['.  This
15646                                   parameter exists so it can be somewhere
15647                                   besides RExC_parse. */
15648     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15649                                   NULL */
15650     AV ** posix_warnings,      /* Where to place any generated warnings, or
15651                                   NULL */
15652     const bool check_only      /* Don't die if error */
15653 )
15654 {
15655     /* This parses what the caller thinks may be one of the three POSIX
15656      * constructs:
15657      *  1) a character class, like [:blank:]
15658      *  2) a collating symbol, like [. .]
15659      *  3) an equivalence class, like [= =]
15660      * In the latter two cases, it croaks if it finds a syntactically legal
15661      * one, as these are not handled by Perl.
15662      *
15663      * The main purpose is to look for a POSIX character class.  It returns:
15664      *  a) the class number
15665      *      if it is a completely syntactically and semantically legal class.
15666      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15667      *      closing ']' of the class
15668      *  b) OOB_NAMEDCLASS
15669      *      if it appears that one of the three POSIX constructs was meant, but
15670      *      its specification was somehow defective.  'updated_parse_ptr', if
15671      *      not NULL, is set to point to the character just after the end
15672      *      character of the class.  See below for handling of warnings.
15673      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15674      *      if it  doesn't appear that a POSIX construct was intended.
15675      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15676      *      raised.
15677      *
15678      * In b) there may be errors or warnings generated.  If 'check_only' is
15679      * TRUE, then any errors are discarded.  Warnings are returned to the
15680      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15681      * instead it is NULL, warnings are suppressed.
15682      *
15683      * The reason for this function, and its complexity is that a bracketed
15684      * character class can contain just about anything.  But it's easy to
15685      * mistype the very specific posix class syntax but yielding a valid
15686      * regular bracketed class, so it silently gets compiled into something
15687      * quite unintended.
15688      *
15689      * The solution adopted here maintains backward compatibility except that
15690      * it adds a warning if it looks like a posix class was intended but
15691      * improperly specified.  The warning is not raised unless what is input
15692      * very closely resembles one of the 14 legal posix classes.  To do this,
15693      * it uses fuzzy parsing.  It calculates how many single-character edits it
15694      * would take to transform what was input into a legal posix class.  Only
15695      * if that number is quite small does it think that the intention was a
15696      * posix class.  Obviously these are heuristics, and there will be cases
15697      * where it errs on one side or another, and they can be tweaked as
15698      * experience informs.
15699      *
15700      * The syntax for a legal posix class is:
15701      *
15702      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15703      *
15704      * What this routine considers syntactically to be an intended posix class
15705      * is this (the comments indicate some restrictions that the pattern
15706      * doesn't show):
15707      *
15708      *  qr/(?x: \[?                         # The left bracket, possibly
15709      *                                      # omitted
15710      *          \h*                         # possibly followed by blanks
15711      *          (?: \^ \h* )?               # possibly a misplaced caret
15712      *          [:;]?                       # The opening class character,
15713      *                                      # possibly omitted.  A typo
15714      *                                      # semi-colon can also be used.
15715      *          \h*
15716      *          \^?                         # possibly a correctly placed
15717      *                                      # caret, but not if there was also
15718      *                                      # a misplaced one
15719      *          \h*
15720      *          .{3,15}                     # The class name.  If there are
15721      *                                      # deviations from the legal syntax,
15722      *                                      # its edit distance must be close
15723      *                                      # to a real class name in order
15724      *                                      # for it to be considered to be
15725      *                                      # an intended posix class.
15726      *          \h*
15727      *          [[:punct:]]?                # The closing class character,
15728      *                                      # possibly omitted.  If not a colon
15729      *                                      # nor semi colon, the class name
15730      *                                      # must be even closer to a valid
15731      *                                      # one
15732      *          \h*
15733      *          \]?                         # The right bracket, possibly
15734      *                                      # omitted.
15735      *     )/
15736      *
15737      * In the above, \h must be ASCII-only.
15738      *
15739      * These are heuristics, and can be tweaked as field experience dictates.
15740      * There will be cases when someone didn't intend to specify a posix class
15741      * that this warns as being so.  The goal is to minimize these, while
15742      * maximizing the catching of things intended to be a posix class that
15743      * aren't parsed as such.
15744      */
15745 
15746     const char* p             = s;
15747     const char * const e      = RExC_end;
15748     unsigned complement       = 0;      /* If to complement the class */
15749     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15750     bool has_opening_bracket  = FALSE;
15751     bool has_opening_colon    = FALSE;
15752     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15753                                                    valid class */
15754     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15755     const char* name_start;             /* ptr to class name first char */
15756 
15757     /* If the number of single-character typos the input name is away from a
15758      * legal name is no more than this number, it is considered to have meant
15759      * the legal name */
15760     int max_distance          = 2;
15761 
15762     /* to store the name.  The size determines the maximum length before we
15763      * decide that no posix class was intended.  Should be at least
15764      * sizeof("alphanumeric") */
15765     UV input_text[15];
15766     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15767 
15768     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15769 
15770     CLEAR_POSIX_WARNINGS();
15771 
15772     if (p >= e) {
15773         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15774     }
15775 
15776     if (*(p - 1) != '[') {
15777         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15778         found_problem = TRUE;
15779     }
15780     else {
15781         has_opening_bracket = TRUE;
15782     }
15783 
15784     /* They could be confused and think you can put spaces between the
15785      * components */
15786     if (isBLANK(*p)) {
15787         found_problem = TRUE;
15788 
15789         do {
15790             p++;
15791         } while (p < e && isBLANK(*p));
15792 
15793         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15794     }
15795 
15796     /* For [. .] and [= =].  These are quite different internally from [: :],
15797      * so they are handled separately.  */
15798     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15799                                             and 1 for at least one char in it
15800                                           */
15801     {
15802         const char open_char  = *p;
15803         const char * temp_ptr = p + 1;
15804 
15805         /* These two constructs are not handled by perl, and if we find a
15806          * syntactically valid one, we croak.  khw, who wrote this code, finds
15807          * this explanation of them very unclear:
15808          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15809          * And searching the rest of the internet wasn't very helpful either.
15810          * It looks like just about any byte can be in these constructs,
15811          * depending on the locale.  But unless the pattern is being compiled
15812          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15813          * In that case, it looks like [= =] isn't allowed at all, and that
15814          * [. .] could be any single code point, but for longer strings the
15815          * constituent characters would have to be the ASCII alphabetics plus
15816          * the minus-hyphen.  Any sensible locale definition would limit itself
15817          * to these.  And any portable one definitely should.  Trying to parse
15818          * the general case is a nightmare (see [perl #127604]).  So, this code
15819          * looks only for interiors of these constructs that match:
15820          *      qr/.|[-\w]{2,}/
15821          * Using \w relaxes the apparent rules a little, without adding much
15822          * danger of mistaking something else for one of these constructs.
15823          *
15824          * [. .] in some implementations described on the internet is usable to
15825          * escape a character that otherwise is special in bracketed character
15826          * classes.  For example [.].] means a literal right bracket instead of
15827          * the ending of the class
15828          *
15829          * [= =] can legitimately contain a [. .] construct, but we don't
15830          * handle this case, as that [. .] construct will later get parsed
15831          * itself and croak then.  And [= =] is checked for even when not under
15832          * /l, as Perl has long done so.
15833          *
15834          * The code below relies on there being a trailing NUL, so it doesn't
15835          * have to keep checking if the parse ptr < e.
15836          */
15837         if (temp_ptr[1] == open_char) {
15838             temp_ptr++;
15839         }
15840         else while (    temp_ptr < e
15841                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15842         {
15843             temp_ptr++;
15844         }
15845 
15846         if (*temp_ptr == open_char) {
15847             temp_ptr++;
15848             if (*temp_ptr == ']') {
15849                 temp_ptr++;
15850                 if (! found_problem && ! check_only) {
15851                     RExC_parse = (char *) temp_ptr;
15852                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15853                             "extensions", open_char, open_char);
15854                 }
15855 
15856                 /* Here, the syntax wasn't completely valid, or else the call
15857                  * is to check-only */
15858                 if (updated_parse_ptr) {
15859                     *updated_parse_ptr = (char *) temp_ptr;
15860                 }
15861 
15862                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15863             }
15864         }
15865 
15866         /* If we find something that started out to look like one of these
15867          * constructs, but isn't, we continue below so that it can be checked
15868          * for being a class name with a typo of '.' or '=' instead of a colon.
15869          * */
15870     }
15871 
15872     /* Here, we think there is a possibility that a [: :] class was meant, and
15873      * we have the first real character.  It could be they think the '^' comes
15874      * first */
15875     if (*p == '^') {
15876         found_problem = TRUE;
15877         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15878         complement = 1;
15879         p++;
15880 
15881         if (isBLANK(*p)) {
15882             found_problem = TRUE;
15883 
15884             do {
15885                 p++;
15886             } while (p < e && isBLANK(*p));
15887 
15888             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15889         }
15890     }
15891 
15892     /* But the first character should be a colon, which they could have easily
15893      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15894      * distinguish from a colon, so treat that as a colon).  */
15895     if (*p == ':') {
15896         p++;
15897         has_opening_colon = TRUE;
15898     }
15899     else if (*p == ';') {
15900         found_problem = TRUE;
15901         p++;
15902         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15903         has_opening_colon = TRUE;
15904     }
15905     else {
15906         found_problem = TRUE;
15907         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15908 
15909         /* Consider an initial punctuation (not one of the recognized ones) to
15910          * be a left terminator */
15911         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15912             p++;
15913         }
15914     }
15915 
15916     /* They may think that you can put spaces between the components */
15917     if (isBLANK(*p)) {
15918         found_problem = TRUE;
15919 
15920         do {
15921             p++;
15922         } while (p < e && isBLANK(*p));
15923 
15924         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15925     }
15926 
15927     if (*p == '^') {
15928 
15929         /* We consider something like [^:^alnum:]] to not have been intended to
15930          * be a posix class, but XXX maybe we should */
15931         if (complement) {
15932             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15933         }
15934 
15935         complement = 1;
15936         p++;
15937     }
15938 
15939     /* Again, they may think that you can put spaces between the components */
15940     if (isBLANK(*p)) {
15941         found_problem = TRUE;
15942 
15943         do {
15944             p++;
15945         } while (p < e && isBLANK(*p));
15946 
15947         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15948     }
15949 
15950     if (*p == ']') {
15951 
15952         /* XXX This ']' may be a typo, and something else was meant.  But
15953          * treating it as such creates enough complications, that that
15954          * possibility isn't currently considered here.  So we assume that the
15955          * ']' is what is intended, and if we've already found an initial '[',
15956          * this leaves this construct looking like [:] or [:^], which almost
15957          * certainly weren't intended to be posix classes */
15958         if (has_opening_bracket) {
15959             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15960         }
15961 
15962         /* But this function can be called when we parse the colon for
15963          * something like qr/[alpha:]]/, so we back up to look for the
15964          * beginning */
15965         p--;
15966 
15967         if (*p == ';') {
15968             found_problem = TRUE;
15969             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15970         }
15971         else if (*p != ':') {
15972 
15973             /* XXX We are currently very restrictive here, so this code doesn't
15974              * consider the possibility that, say, /[alpha.]]/ was intended to
15975              * be a posix class. */
15976             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15977         }
15978 
15979         /* Here we have something like 'foo:]'.  There was no initial colon,
15980          * and we back up over 'foo.  XXX Unlike the going forward case, we
15981          * don't handle typos of non-word chars in the middle */
15982         has_opening_colon = FALSE;
15983         p--;
15984 
15985         while (p > RExC_start && isWORDCHAR(*p)) {
15986             p--;
15987         }
15988         p++;
15989 
15990         /* Here, we have positioned ourselves to where we think the first
15991          * character in the potential class is */
15992     }
15993 
15994     /* Now the interior really starts.  There are certain key characters that
15995      * can end the interior, or these could just be typos.  To catch both
15996      * cases, we may have to do two passes.  In the first pass, we keep on
15997      * going unless we come to a sequence that matches
15998      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15999      * This means it takes a sequence to end the pass, so two typos in a row if
16000      * that wasn't what was intended.  If the class is perfectly formed, just
16001      * this one pass is needed.  We also stop if there are too many characters
16002      * being accumulated, but this number is deliberately set higher than any
16003      * real class.  It is set high enough so that someone who thinks that
16004      * 'alphanumeric' is a correct name would get warned that it wasn't.
16005      * While doing the pass, we keep track of where the key characters were in
16006      * it.  If we don't find an end to the class, and one of the key characters
16007      * was found, we redo the pass, but stop when we get to that character.
16008      * Thus the key character was considered a typo in the first pass, but a
16009      * terminator in the second.  If two key characters are found, we stop at
16010      * the second one in the first pass.  Again this can miss two typos, but
16011      * catches a single one
16012      *
16013      * In the first pass, 'possible_end' starts as NULL, and then gets set to
16014      * point to the first key character.  For the second pass, it starts as -1.
16015      * */
16016 
16017     name_start = p;
16018   parse_name:
16019     {
16020         bool has_blank               = FALSE;
16021         bool has_upper               = FALSE;
16022         bool has_terminating_colon   = FALSE;
16023         bool has_terminating_bracket = FALSE;
16024         bool has_semi_colon          = FALSE;
16025         unsigned int name_len        = 0;
16026         int punct_count              = 0;
16027 
16028         while (p < e) {
16029 
16030             /* Squeeze out blanks when looking up the class name below */
16031             if (isBLANK(*p) ) {
16032                 has_blank = TRUE;
16033                 found_problem = TRUE;
16034                 p++;
16035                 continue;
16036             }
16037 
16038             /* The name will end with a punctuation */
16039             if (isPUNCT(*p)) {
16040                 const char * peek = p + 1;
16041 
16042                 /* Treat any non-']' punctuation followed by a ']' (possibly
16043                  * with intervening blanks) as trying to terminate the class.
16044                  * ']]' is very likely to mean a class was intended (but
16045                  * missing the colon), but the warning message that gets
16046                  * generated shows the error position better if we exit the
16047                  * loop at the bottom (eventually), so skip it here. */
16048                 if (*p != ']') {
16049                     if (peek < e && isBLANK(*peek)) {
16050                         has_blank = TRUE;
16051                         found_problem = TRUE;
16052                         do {
16053                             peek++;
16054                         } while (peek < e && isBLANK(*peek));
16055                     }
16056 
16057                     if (peek < e && *peek == ']') {
16058                         has_terminating_bracket = TRUE;
16059                         if (*p == ':') {
16060                             has_terminating_colon = TRUE;
16061                         }
16062                         else if (*p == ';') {
16063                             has_semi_colon = TRUE;
16064                             has_terminating_colon = TRUE;
16065                         }
16066                         else {
16067                             found_problem = TRUE;
16068                         }
16069                         p = peek + 1;
16070                         goto try_posix;
16071                     }
16072                 }
16073 
16074                 /* Here we have punctuation we thought didn't end the class.
16075                  * Keep track of the position of the key characters that are
16076                  * more likely to have been class-enders */
16077                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
16078 
16079                     /* Allow just one such possible class-ender not actually
16080                      * ending the class. */
16081                     if (possible_end) {
16082                         break;
16083                     }
16084                     possible_end = p;
16085                 }
16086 
16087                 /* If we have too many punctuation characters, no use in
16088                  * keeping going */
16089                 if (++punct_count > max_distance) {
16090                     break;
16091                 }
16092 
16093                 /* Treat the punctuation as a typo. */
16094                 input_text[name_len++] = *p;
16095                 p++;
16096             }
16097             else if (isUPPER(*p)) { /* Use lowercase for lookup */
16098                 input_text[name_len++] = toLOWER(*p);
16099                 has_upper = TRUE;
16100                 found_problem = TRUE;
16101                 p++;
16102             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
16103                 input_text[name_len++] = *p;
16104                 p++;
16105             }
16106             else {
16107                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
16108                 p+= UTF8SKIP(p);
16109             }
16110 
16111             /* The declaration of 'input_text' is how long we allow a potential
16112              * class name to be, before saying they didn't mean a class name at
16113              * all */
16114             if (name_len >= C_ARRAY_LENGTH(input_text)) {
16115                 break;
16116             }
16117         }
16118 
16119         /* We get to here when the possible class name hasn't been properly
16120          * terminated before:
16121          *   1) we ran off the end of the pattern; or
16122          *   2) found two characters, each of which might have been intended to
16123          *      be the name's terminator
16124          *   3) found so many punctuation characters in the purported name,
16125          *      that the edit distance to a valid one is exceeded
16126          *   4) we decided it was more characters than anyone could have
16127          *      intended to be one. */
16128 
16129         found_problem = TRUE;
16130 
16131         /* In the final two cases, we know that looking up what we've
16132          * accumulated won't lead to a match, even a fuzzy one. */
16133         if (   name_len >= C_ARRAY_LENGTH(input_text)
16134             || punct_count > max_distance)
16135         {
16136             /* If there was an intermediate key character that could have been
16137              * an intended end, redo the parse, but stop there */
16138             if (possible_end && possible_end != (char *) -1) {
16139                 possible_end = (char *) -1; /* Special signal value to say
16140                                                we've done a first pass */
16141                 p = name_start;
16142                 goto parse_name;
16143             }
16144 
16145             /* Otherwise, it can't have meant to have been a class */
16146             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16147         }
16148 
16149         /* If we ran off the end, and the final character was a punctuation
16150          * one, back up one, to look at that final one just below.  Later, we
16151          * will restore the parse pointer if appropriate */
16152         if (name_len && p == e && isPUNCT(*(p-1))) {
16153             p--;
16154             name_len--;
16155         }
16156 
16157         if (p < e && isPUNCT(*p)) {
16158             if (*p == ']') {
16159                 has_terminating_bracket = TRUE;
16160 
16161                 /* If this is a 2nd ']', and the first one is just below this
16162                  * one, consider that to be the real terminator.  This gives a
16163                  * uniform and better positioning for the warning message  */
16164                 if (   possible_end
16165                     && possible_end != (char *) -1
16166                     && *possible_end == ']'
16167                     && name_len && input_text[name_len - 1] == ']')
16168                 {
16169                     name_len--;
16170                     p = possible_end;
16171 
16172                     /* And this is actually equivalent to having done the 2nd
16173                      * pass now, so set it to not try again */
16174                     possible_end = (char *) -1;
16175                 }
16176             }
16177             else {
16178                 if (*p == ':') {
16179                     has_terminating_colon = TRUE;
16180                 }
16181                 else if (*p == ';') {
16182                     has_semi_colon = TRUE;
16183                     has_terminating_colon = TRUE;
16184                 }
16185                 p++;
16186             }
16187         }
16188 
16189     try_posix:
16190 
16191         /* Here, we have a class name to look up.  We can short circuit the
16192          * stuff below for short names that can't possibly be meant to be a
16193          * class name.  (We can do this on the first pass, as any second pass
16194          * will yield an even shorter name) */
16195         if (name_len < 3) {
16196             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16197         }
16198 
16199         /* Find which class it is.  Initially switch on the length of the name.
16200          * */
16201         switch (name_len) {
16202             case 4:
16203                 if (memEQs(name_start, 4, "word")) {
16204                     /* this is not POSIX, this is the Perl \w */
16205                     class_number = ANYOF_WORDCHAR;
16206                 }
16207                 break;
16208             case 5:
16209                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16210                  *                        graph lower print punct space upper
16211                  * Offset 4 gives the best switch position.  */
16212                 switch (name_start[4]) {
16213                     case 'a':
16214                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
16215                             class_number = ANYOF_ALPHA;
16216                         break;
16217                     case 'e':
16218                         if (memBEGINs(name_start, 5, "spac")) /* space */
16219                             class_number = ANYOF_SPACE;
16220                         break;
16221                     case 'h':
16222                         if (memBEGINs(name_start, 5, "grap")) /* graph */
16223                             class_number = ANYOF_GRAPH;
16224                         break;
16225                     case 'i':
16226                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
16227                             class_number = ANYOF_ASCII;
16228                         break;
16229                     case 'k':
16230                         if (memBEGINs(name_start, 5, "blan")) /* blank */
16231                             class_number = ANYOF_BLANK;
16232                         break;
16233                     case 'l':
16234                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16235                             class_number = ANYOF_CNTRL;
16236                         break;
16237                     case 'm':
16238                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16239                             class_number = ANYOF_ALPHANUMERIC;
16240                         break;
16241                     case 'r':
16242                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
16243                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16244                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16245                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16246                         break;
16247                     case 't':
16248                         if (memBEGINs(name_start, 5, "digi")) /* digit */
16249                             class_number = ANYOF_DIGIT;
16250                         else if (memBEGINs(name_start, 5, "prin")) /* print */
16251                             class_number = ANYOF_PRINT;
16252                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
16253                             class_number = ANYOF_PUNCT;
16254                         break;
16255                 }
16256                 break;
16257             case 6:
16258                 if (memEQs(name_start, 6, "xdigit"))
16259                     class_number = ANYOF_XDIGIT;
16260                 break;
16261         }
16262 
16263         /* If the name exactly matches a posix class name the class number will
16264          * here be set to it, and the input almost certainly was meant to be a
16265          * posix class, so we can skip further checking.  If instead the syntax
16266          * is exactly correct, but the name isn't one of the legal ones, we
16267          * will return that as an error below.  But if neither of these apply,
16268          * it could be that no posix class was intended at all, or that one
16269          * was, but there was a typo.  We tease these apart by doing fuzzy
16270          * matching on the name */
16271         if (class_number == OOB_NAMEDCLASS && found_problem) {
16272             const UV posix_names[][6] = {
16273                                                 { 'a', 'l', 'n', 'u', 'm' },
16274                                                 { 'a', 'l', 'p', 'h', 'a' },
16275                                                 { 'a', 's', 'c', 'i', 'i' },
16276                                                 { 'b', 'l', 'a', 'n', 'k' },
16277                                                 { 'c', 'n', 't', 'r', 'l' },
16278                                                 { 'd', 'i', 'g', 'i', 't' },
16279                                                 { 'g', 'r', 'a', 'p', 'h' },
16280                                                 { 'l', 'o', 'w', 'e', 'r' },
16281                                                 { 'p', 'r', 'i', 'n', 't' },
16282                                                 { 'p', 'u', 'n', 'c', 't' },
16283                                                 { 's', 'p', 'a', 'c', 'e' },
16284                                                 { 'u', 'p', 'p', 'e', 'r' },
16285                                                 { 'w', 'o', 'r', 'd' },
16286                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
16287                                             };
16288             /* The names of the above all have added NULs to make them the same
16289              * size, so we need to also have the real lengths */
16290             const UV posix_name_lengths[] = {
16291                                                 sizeof("alnum") - 1,
16292                                                 sizeof("alpha") - 1,
16293                                                 sizeof("ascii") - 1,
16294                                                 sizeof("blank") - 1,
16295                                                 sizeof("cntrl") - 1,
16296                                                 sizeof("digit") - 1,
16297                                                 sizeof("graph") - 1,
16298                                                 sizeof("lower") - 1,
16299                                                 sizeof("print") - 1,
16300                                                 sizeof("punct") - 1,
16301                                                 sizeof("space") - 1,
16302                                                 sizeof("upper") - 1,
16303                                                 sizeof("word")  - 1,
16304                                                 sizeof("xdigit")- 1
16305                                             };
16306             unsigned int i;
16307             int temp_max = max_distance;    /* Use a temporary, so if we
16308                                                reparse, we haven't changed the
16309                                                outer one */
16310 
16311             /* Use a smaller max edit distance if we are missing one of the
16312              * delimiters */
16313             if (   has_opening_bracket + has_opening_colon < 2
16314                 || has_terminating_bracket + has_terminating_colon < 2)
16315             {
16316                 temp_max--;
16317             }
16318 
16319             /* See if the input name is close to a legal one */
16320             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16321 
16322                 /* Short circuit call if the lengths are too far apart to be
16323                  * able to match */
16324                 if (abs( (int) (name_len - posix_name_lengths[i]))
16325                     > temp_max)
16326                 {
16327                     continue;
16328                 }
16329 
16330                 if (edit_distance(input_text,
16331                                   posix_names[i],
16332                                   name_len,
16333                                   posix_name_lengths[i],
16334                                   temp_max
16335                                  )
16336                     > -1)
16337                 { /* If it is close, it probably was intended to be a class */
16338                     goto probably_meant_to_be;
16339                 }
16340             }
16341 
16342             /* Here the input name is not close enough to a valid class name
16343              * for us to consider it to be intended to be a posix class.  If
16344              * we haven't already done so, and the parse found a character that
16345              * could have been terminators for the name, but which we absorbed
16346              * as typos during the first pass, repeat the parse, signalling it
16347              * to stop at that character */
16348             if (possible_end && possible_end != (char *) -1) {
16349                 possible_end = (char *) -1;
16350                 p = name_start;
16351                 goto parse_name;
16352             }
16353 
16354             /* Here neither pass found a close-enough class name */
16355             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16356         }
16357 
16358     probably_meant_to_be:
16359 
16360         /* Here we think that a posix specification was intended.  Update any
16361          * parse pointer */
16362         if (updated_parse_ptr) {
16363             *updated_parse_ptr = (char *) p;
16364         }
16365 
16366         /* If a posix class name was intended but incorrectly specified, we
16367          * output or return the warnings */
16368         if (found_problem) {
16369 
16370             /* We set flags for these issues in the parse loop above instead of
16371              * adding them to the list of warnings, because we can parse it
16372              * twice, and we only want one warning instance */
16373             if (has_upper) {
16374                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16375             }
16376             if (has_blank) {
16377                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16378             }
16379             if (has_semi_colon) {
16380                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16381             }
16382             else if (! has_terminating_colon) {
16383                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16384             }
16385             if (! has_terminating_bracket) {
16386                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16387             }
16388 
16389             if (   posix_warnings
16390                 && RExC_warn_text
16391                 && av_count(RExC_warn_text) > 0)
16392             {
16393                 *posix_warnings = RExC_warn_text;
16394             }
16395         }
16396         else if (class_number != OOB_NAMEDCLASS) {
16397             /* If it is a known class, return the class.  The class number
16398              * #defines are structured so each complement is +1 to the normal
16399              * one */
16400             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16401         }
16402         else if (! check_only) {
16403 
16404             /* Here, it is an unrecognized class.  This is an error (unless the
16405             * call is to check only, which we've already handled above) */
16406             const char * const complement_string = (complement)
16407                                                    ? "^"
16408                                                    : "";
16409             RExC_parse = (char *) p;
16410             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16411                         complement_string,
16412                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16413         }
16414     }
16415 
16416     return OOB_NAMEDCLASS;
16417 }
16418 #undef ADD_POSIX_WARNING
16419 
16420 STATIC unsigned  int
S_regex_set_precedence(const U8 my_operator)16421 S_regex_set_precedence(const U8 my_operator) {
16422 
16423     /* Returns the precedence in the (?[...]) construct of the input operator,
16424      * specified by its character representation.  The precedence follows
16425      * general Perl rules, but it extends this so that ')' and ']' have (low)
16426      * precedence even though they aren't really operators */
16427 
16428     switch (my_operator) {
16429         case '!':
16430             return 5;
16431         case '&':
16432             return 4;
16433         case '^':
16434         case '|':
16435         case '+':
16436         case '-':
16437             return 3;
16438         case ')':
16439             return 2;
16440         case ']':
16441             return 1;
16442     }
16443 
16444     NOT_REACHED; /* NOTREACHED */
16445     return 0;   /* Silence compiler warning */
16446 }
16447 
16448 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth,char * const oregcomp_parse)16449 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16450                     I32 *flagp, U32 depth,
16451                     char * const oregcomp_parse)
16452 {
16453     /* Handle the (?[...]) construct to do set operations */
16454 
16455     U8 curchar;                     /* Current character being parsed */
16456     UV start, end;	            /* End points of code point ranges */
16457     SV* final = NULL;               /* The end result inversion list */
16458     SV* result_string;              /* 'final' stringified */
16459     AV* stack;                      /* stack of operators and operands not yet
16460                                        resolved */
16461     AV* fence_stack = NULL;         /* A stack containing the positions in
16462                                        'stack' of where the undealt-with left
16463                                        parens would be if they were actually
16464                                        put there */
16465     /* The 'volatile' is a workaround for an optimiser bug
16466      * in Solaris Studio 12.3. See RT #127455 */
16467     volatile IV fence = 0;          /* Position of where most recent undealt-
16468                                        with left paren in stack is; -1 if none.
16469                                      */
16470     STRLEN len;                     /* Temporary */
16471     regnode_offset node;            /* Temporary, and final regnode returned by
16472                                        this function */
16473     const bool save_fold = FOLD;    /* Temporary */
16474     char *save_end, *save_parse;    /* Temporaries */
16475     const bool in_locale = LOC;     /* we turn off /l during processing */
16476 
16477     DECLARE_AND_GET_RE_DEBUG_FLAGS;
16478 
16479     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16480     PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16481 
16482     DEBUG_PARSE("xcls");
16483 
16484     if (in_locale) {
16485         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16486     }
16487 
16488     /* The use of this operator implies /u.  This is required so that the
16489      * compile time values are valid in all runtime cases */
16490     REQUIRE_UNI_RULES(flagp, 0);
16491 
16492     ckWARNexperimental(RExC_parse,
16493                        WARN_EXPERIMENTAL__REGEX_SETS,
16494                        "The regex_sets feature is experimental");
16495 
16496     /* Everything in this construct is a metacharacter.  Operands begin with
16497      * either a '\' (for an escape sequence), or a '[' for a bracketed
16498      * character class.  Any other character should be an operator, or
16499      * parenthesis for grouping.  Both types of operands are handled by calling
16500      * regclass() to parse them.  It is called with a parameter to indicate to
16501      * return the computed inversion list.  The parsing here is implemented via
16502      * a stack.  Each entry on the stack is a single character representing one
16503      * of the operators; or else a pointer to an operand inversion list. */
16504 
16505 #define IS_OPERATOR(a) SvIOK(a)
16506 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16507 
16508     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16509      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16510      * with pronouncing it called it Reverse Polish instead, but now that YOU
16511      * know how to pronounce it you can use the correct term, thus giving due
16512      * credit to the person who invented it, and impressing your geek friends.
16513      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16514      * it is now more like an English initial W (as in wonk) than an L.)
16515      *
16516      * This means that, for example, 'a | b & c' is stored on the stack as
16517      *
16518      * c  [4]
16519      * b  [3]
16520      * &  [2]
16521      * a  [1]
16522      * |  [0]
16523      *
16524      * where the numbers in brackets give the stack [array] element number.
16525      * In this implementation, parentheses are not stored on the stack.
16526      * Instead a '(' creates a "fence" so that the part of the stack below the
16527      * fence is invisible except to the corresponding ')' (this allows us to
16528      * replace testing for parens, by using instead subtraction of the fence
16529      * position).  As new operands are processed they are pushed onto the stack
16530      * (except as noted in the next paragraph).  New operators of higher
16531      * precedence than the current final one are inserted on the stack before
16532      * the lhs operand (so that when the rhs is pushed next, everything will be
16533      * in the correct positions shown above.  When an operator of equal or
16534      * lower precedence is encountered in parsing, all the stacked operations
16535      * of equal or higher precedence are evaluated, leaving the result as the
16536      * top entry on the stack.  This makes higher precedence operations
16537      * evaluate before lower precedence ones, and causes operations of equal
16538      * precedence to left associate.
16539      *
16540      * The only unary operator '!' is immediately pushed onto the stack when
16541      * encountered.  When an operand is encountered, if the top of the stack is
16542      * a '!", the complement is immediately performed, and the '!' popped.  The
16543      * resulting value is treated as a new operand, and the logic in the
16544      * previous paragraph is executed.  Thus in the expression
16545      *      [a] + ! [b]
16546      * the stack looks like
16547      *
16548      * !
16549      * a
16550      * +
16551      *
16552      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16553      * becomes
16554      *
16555      * !b
16556      * a
16557      * +
16558      *
16559      * A ')' is treated as an operator with lower precedence than all the
16560      * aforementioned ones, which causes all operations on the stack above the
16561      * corresponding '(' to be evaluated down to a single resultant operand.
16562      * Then the fence for the '(' is removed, and the operand goes through the
16563      * algorithm above, without the fence.
16564      *
16565      * A separate stack is kept of the fence positions, so that the position of
16566      * the latest so-far unbalanced '(' is at the top of it.
16567      *
16568      * The ']' ending the construct is treated as the lowest operator of all,
16569      * so that everything gets evaluated down to a single operand, which is the
16570      * result */
16571 
16572     sv_2mortal((SV *)(stack = newAV()));
16573     sv_2mortal((SV *)(fence_stack = newAV()));
16574 
16575     while (RExC_parse < RExC_end) {
16576         I32 top_index;              /* Index of top-most element in 'stack' */
16577         SV** top_ptr;               /* Pointer to top 'stack' element */
16578         SV* current = NULL;         /* To contain the current inversion list
16579                                        operand */
16580         SV* only_to_avoid_leaks;
16581 
16582         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16583                                 TRUE /* Force /x */ );
16584         if (RExC_parse >= RExC_end) {   /* Fail */
16585             break;
16586         }
16587 
16588         curchar = UCHARAT(RExC_parse);
16589 
16590 redo_curchar:
16591 
16592 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16593                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16594         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16595                                            stack, fence, fence_stack));
16596 #endif
16597 
16598         top_index = av_tindex_skip_len_mg(stack);
16599 
16600         switch (curchar) {
16601             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16602             char stacked_operator;  /* The topmost operator on the 'stack'. */
16603             SV* lhs;                /* Operand to the left of the operator */
16604             SV* rhs;                /* Operand to the right of the operator */
16605             SV* fence_ptr;          /* Pointer to top element of the fence
16606                                        stack */
16607             case '(':
16608 
16609                 if (   RExC_parse < RExC_end - 2
16610                     && UCHARAT(RExC_parse + 1) == '?'
16611                     && UCHARAT(RExC_parse + 2) == '^')
16612                 {
16613                     const regnode_offset orig_emit = RExC_emit;
16614                     SV * resultant_invlist;
16615 
16616                     /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16617                      * This happens when we have some thing like
16618                      *
16619                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16620                      *   ...
16621                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16622                      *
16623                      * Here we would be handling the interpolated
16624                      * '$thai_or_lao'.  We handle this by a recursive call to
16625                      * reg which returns the inversion list the
16626                      * interpolated expression evaluates to.  Actually, the
16627                      * return is a special regnode containing a pointer to that
16628                      * inversion list.  If the return isn't that regnode alone,
16629                      * we know that this wasn't such an interpolation, which is
16630                      * an error: we need to get a single inversion list back
16631                      * from the recursion */
16632 
16633                     RExC_parse++;
16634                     RExC_sets_depth++;
16635 
16636 	            node = reg(pRExC_state, 2, flagp, depth+1);
16637                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16638 
16639                     if (   OP(REGNODE_p(node)) != REGEX_SET
16640                            /* If more than a single node returned, the nested
16641                             * parens evaluated to more than just a (?[...]),
16642                             * which isn't legal */
16643                         || RExC_emit != orig_emit
16644                                       + NODE_STEP_REGNODE
16645                                       + regarglen[REGEX_SET])
16646                     {
16647                         vFAIL("Expecting interpolated extended charclass");
16648                     }
16649                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16650                     current = invlist_clone(resultant_invlist, NULL);
16651                     SvREFCNT_dec(resultant_invlist);
16652 
16653                     RExC_sets_depth--;
16654                     RExC_emit = orig_emit;
16655                     goto handle_operand;
16656                 }
16657 
16658                 /* A regular '('.  Look behind for illegal syntax */
16659                 if (top_index - fence >= 0) {
16660                     /* If the top entry on the stack is an operator, it had
16661                      * better be a '!', otherwise the entry below the top
16662                      * operand should be an operator */
16663                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16664                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16665                         || (   IS_OPERAND(*top_ptr)
16666                             && (   top_index - fence < 1
16667                                 || ! (stacked_ptr = av_fetch(stack,
16668                                                              top_index - 1,
16669                                                              FALSE))
16670                                 || ! IS_OPERATOR(*stacked_ptr))))
16671                     {
16672                         RExC_parse++;
16673                         vFAIL("Unexpected '(' with no preceding operator");
16674                     }
16675                 }
16676 
16677                 /* Stack the position of this undealt-with left paren */
16678                 av_push(fence_stack, newSViv(fence));
16679                 fence = top_index + 1;
16680                 break;
16681 
16682             case '\\':
16683                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16684                  * multi-char folds are allowed.  */
16685                 if (!regclass(pRExC_state, flagp, depth+1,
16686                               TRUE, /* means parse just the next thing */
16687                               FALSE, /* don't allow multi-char folds */
16688                               FALSE, /* don't silence non-portable warnings.  */
16689                               TRUE,  /* strict */
16690                               FALSE, /* Require return to be an ANYOF */
16691                               &current))
16692                 {
16693                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16694                     goto regclass_failed;
16695                 }
16696 
16697                 assert(current);
16698 
16699                 /* regclass() will return with parsing just the \ sequence,
16700                  * leaving the parse pointer at the next thing to parse */
16701                 RExC_parse--;
16702                 goto handle_operand;
16703 
16704             case '[':   /* Is a bracketed character class */
16705             {
16706                 /* See if this is a [:posix:] class. */
16707                 bool is_posix_class = (OOB_NAMEDCLASS
16708                             < handle_possible_posix(pRExC_state,
16709                                                 RExC_parse + 1,
16710                                                 NULL,
16711                                                 NULL,
16712                                                 TRUE /* checking only */));
16713                 /* If it is a posix class, leave the parse pointer at the '['
16714                  * to fool regclass() into thinking it is part of a
16715                  * '[[:posix:]]'. */
16716                 if (! is_posix_class) {
16717                     RExC_parse++;
16718                 }
16719 
16720                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16721                  * multi-char folds are allowed.  */
16722                 if (!regclass(pRExC_state, flagp, depth+1,
16723                                 is_posix_class, /* parse the whole char
16724                                                     class only if not a
16725                                                     posix class */
16726                                 FALSE, /* don't allow multi-char folds */
16727                                 TRUE, /* silence non-portable warnings. */
16728                                 TRUE, /* strict */
16729                                 FALSE, /* Require return to be an ANYOF */
16730                                 &current))
16731                 {
16732                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16733                     goto regclass_failed;
16734                 }
16735 
16736                 assert(current);
16737 
16738                 /* function call leaves parse pointing to the ']', except if we
16739                  * faked it */
16740                 if (is_posix_class) {
16741                     RExC_parse--;
16742                 }
16743 
16744                 goto handle_operand;
16745             }
16746 
16747             case ']':
16748                 if (top_index >= 1) {
16749                     goto join_operators;
16750                 }
16751 
16752                 /* Only a single operand on the stack: are done */
16753                 goto done;
16754 
16755             case ')':
16756                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16757                     if (UCHARAT(RExC_parse - 1) == ']')  {
16758                         break;
16759                     }
16760                     RExC_parse++;
16761                     vFAIL("Unexpected ')'");
16762                 }
16763 
16764                 /* If nothing after the fence, is missing an operand */
16765                 if (top_index - fence < 0) {
16766                     RExC_parse++;
16767                     goto bad_syntax;
16768                 }
16769                 /* If at least two things on the stack, treat this as an
16770                   * operator */
16771                 if (top_index - fence >= 1) {
16772                     goto join_operators;
16773                 }
16774 
16775                 /* Here only a single thing on the fenced stack, and there is a
16776                  * fence.  Get rid of it */
16777                 fence_ptr = av_pop(fence_stack);
16778                 assert(fence_ptr);
16779                 fence = SvIV(fence_ptr);
16780                 SvREFCNT_dec_NN(fence_ptr);
16781                 fence_ptr = NULL;
16782 
16783                 if (fence < 0) {
16784                     fence = 0;
16785                 }
16786 
16787                 /* Having gotten rid of the fence, we pop the operand at the
16788                  * stack top and process it as a newly encountered operand */
16789                 current = av_pop(stack);
16790                 if (IS_OPERAND(current)) {
16791                     goto handle_operand;
16792                 }
16793 
16794                 RExC_parse++;
16795                 goto bad_syntax;
16796 
16797             case '&':
16798             case '|':
16799             case '+':
16800             case '-':
16801             case '^':
16802 
16803                 /* These binary operators should have a left operand already
16804                  * parsed */
16805                 if (   top_index - fence < 0
16806                     || top_index - fence == 1
16807                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16808                     || ! IS_OPERAND(*top_ptr))
16809                 {
16810                     goto unexpected_binary;
16811                 }
16812 
16813                 /* If only the one operand is on the part of the stack visible
16814                  * to us, we just place this operator in the proper position */
16815                 if (top_index - fence < 2) {
16816 
16817                     /* Place the operator before the operand */
16818 
16819                     SV* lhs = av_pop(stack);
16820                     av_push(stack, newSVuv(curchar));
16821                     av_push(stack, lhs);
16822                     break;
16823                 }
16824 
16825                 /* But if there is something else on the stack, we need to
16826                  * process it before this new operator if and only if the
16827                  * stacked operation has equal or higher precedence than the
16828                  * new one */
16829 
16830              join_operators:
16831 
16832                 /* The operator on the stack is supposed to be below both its
16833                  * operands */
16834                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16835                     || IS_OPERAND(*stacked_ptr))
16836                 {
16837                     /* But if not, it's legal and indicates we are completely
16838                      * done if and only if we're currently processing a ']',
16839                      * which should be the final thing in the expression */
16840                     if (curchar == ']') {
16841                         goto done;
16842                     }
16843 
16844                   unexpected_binary:
16845                     RExC_parse++;
16846                     vFAIL2("Unexpected binary operator '%c' with no "
16847                            "preceding operand", curchar);
16848                 }
16849                 stacked_operator = (char) SvUV(*stacked_ptr);
16850 
16851                 if (regex_set_precedence(curchar)
16852                     > regex_set_precedence(stacked_operator))
16853                 {
16854                     /* Here, the new operator has higher precedence than the
16855                      * stacked one.  This means we need to add the new one to
16856                      * the stack to await its rhs operand (and maybe more
16857                      * stuff).  We put it before the lhs operand, leaving
16858                      * untouched the stacked operator and everything below it
16859                      * */
16860                     lhs = av_pop(stack);
16861                     assert(IS_OPERAND(lhs));
16862 
16863                     av_push(stack, newSVuv(curchar));
16864                     av_push(stack, lhs);
16865                     break;
16866                 }
16867 
16868                 /* Here, the new operator has equal or lower precedence than
16869                  * what's already there.  This means the operation already
16870                  * there should be performed now, before the new one. */
16871 
16872                 rhs = av_pop(stack);
16873                 if (! IS_OPERAND(rhs)) {
16874 
16875                     /* This can happen when a ! is not followed by an operand,
16876                      * like in /(?[\t &!])/ */
16877                     goto bad_syntax;
16878                 }
16879 
16880                 lhs = av_pop(stack);
16881 
16882                 if (! IS_OPERAND(lhs)) {
16883 
16884                     /* This can happen when there is an empty (), like in
16885                      * /(?[[0]+()+])/ */
16886                     goto bad_syntax;
16887                 }
16888 
16889                 switch (stacked_operator) {
16890                     case '&':
16891                         _invlist_intersection(lhs, rhs, &rhs);
16892                         break;
16893 
16894                     case '|':
16895                     case '+':
16896                         _invlist_union(lhs, rhs, &rhs);
16897                         break;
16898 
16899                     case '-':
16900                         _invlist_subtract(lhs, rhs, &rhs);
16901                         break;
16902 
16903                     case '^':   /* The union minus the intersection */
16904                     {
16905                         SV* i = NULL;
16906                         SV* u = NULL;
16907 
16908                         _invlist_union(lhs, rhs, &u);
16909                         _invlist_intersection(lhs, rhs, &i);
16910                         _invlist_subtract(u, i, &rhs);
16911                         SvREFCNT_dec_NN(i);
16912                         SvREFCNT_dec_NN(u);
16913                         break;
16914                     }
16915                 }
16916                 SvREFCNT_dec(lhs);
16917 
16918                 /* Here, the higher precedence operation has been done, and the
16919                  * result is in 'rhs'.  We overwrite the stacked operator with
16920                  * the result.  Then we redo this code to either push the new
16921                  * operator onto the stack or perform any higher precedence
16922                  * stacked operation */
16923                 only_to_avoid_leaks = av_pop(stack);
16924                 SvREFCNT_dec(only_to_avoid_leaks);
16925                 av_push(stack, rhs);
16926                 goto redo_curchar;
16927 
16928             case '!':   /* Highest priority, right associative */
16929 
16930                 /* If what's already at the top of the stack is another '!",
16931                  * they just cancel each other out */
16932                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16933                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16934                 {
16935                     only_to_avoid_leaks = av_pop(stack);
16936                     SvREFCNT_dec(only_to_avoid_leaks);
16937                 }
16938                 else { /* Otherwise, since it's right associative, just push
16939                           onto the stack */
16940                     av_push(stack, newSVuv(curchar));
16941                 }
16942                 break;
16943 
16944             default:
16945                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16946                 if (RExC_parse >= RExC_end) {
16947                     break;
16948                 }
16949                 vFAIL("Unexpected character");
16950 
16951           handle_operand:
16952 
16953             /* Here 'current' is the operand.  If something is already on the
16954              * stack, we have to check if it is a !.  But first, the code above
16955              * may have altered the stack in the time since we earlier set
16956              * 'top_index'.  */
16957 
16958             top_index = av_tindex_skip_len_mg(stack);
16959             if (top_index - fence >= 0) {
16960                 /* If the top entry on the stack is an operator, it had better
16961                  * be a '!', otherwise the entry below the top operand should
16962                  * be an operator */
16963                 top_ptr = av_fetch(stack, top_index, FALSE);
16964                 assert(top_ptr);
16965                 if (IS_OPERATOR(*top_ptr)) {
16966 
16967                     /* The only permissible operator at the top of the stack is
16968                      * '!', which is applied immediately to this operand. */
16969                     curchar = (char) SvUV(*top_ptr);
16970                     if (curchar != '!') {
16971                         SvREFCNT_dec(current);
16972                         vFAIL2("Unexpected binary operator '%c' with no "
16973                                 "preceding operand", curchar);
16974                     }
16975 
16976                     _invlist_invert(current);
16977 
16978                     only_to_avoid_leaks = av_pop(stack);
16979                     SvREFCNT_dec(only_to_avoid_leaks);
16980 
16981                     /* And we redo with the inverted operand.  This allows
16982                      * handling multiple ! in a row */
16983                     goto handle_operand;
16984                 }
16985                           /* Single operand is ok only for the non-binary ')'
16986                            * operator */
16987                 else if ((top_index - fence == 0 && curchar != ')')
16988                          || (top_index - fence > 0
16989                              && (! (stacked_ptr = av_fetch(stack,
16990                                                            top_index - 1,
16991                                                            FALSE))
16992                                  || IS_OPERAND(*stacked_ptr))))
16993                 {
16994                     SvREFCNT_dec(current);
16995                     vFAIL("Operand with no preceding operator");
16996                 }
16997             }
16998 
16999             /* Here there was nothing on the stack or the top element was
17000              * another operand.  Just add this new one */
17001             av_push(stack, current);
17002 
17003         } /* End of switch on next parse token */
17004 
17005         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
17006     } /* End of loop parsing through the construct */
17007 
17008     vFAIL("Syntax error in (?[...])");
17009 
17010   done:
17011 
17012     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
17013         if (RExC_parse < RExC_end) {
17014             RExC_parse++;
17015         }
17016 
17017         vFAIL("Unexpected ']' with no following ')' in (?[...");
17018     }
17019 
17020     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
17021         vFAIL("Unmatched (");
17022     }
17023 
17024     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
17025         || ((final = av_pop(stack)) == NULL)
17026         || ! IS_OPERAND(final)
17027         || ! is_invlist(final)
17028         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
17029     {
17030       bad_syntax:
17031         SvREFCNT_dec(final);
17032         vFAIL("Incomplete expression within '(?[ ])'");
17033     }
17034 
17035     /* Here, 'final' is the resultant inversion list from evaluating the
17036      * expression.  Return it if so requested */
17037     if (return_invlist) {
17038         *return_invlist = final;
17039         return END;
17040     }
17041 
17042     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
17043                                regnode */
17044         RExC_parse++;
17045         node = regpnode(pRExC_state, REGEX_SET, final);
17046     }
17047     else {
17048 
17049         /* Otherwise generate a resultant node, based on 'final'.  regclass()
17050          * is expecting a string of ranges and individual code points */
17051         invlist_iterinit(final);
17052         result_string = newSVpvs("");
17053         while (invlist_iternext(final, &start, &end)) {
17054             if (start == end) {
17055                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
17056             }
17057             else {
17058                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
17059                                                         UVXf "}", start, end);
17060             }
17061         }
17062 
17063         /* About to generate an ANYOF (or similar) node from the inversion list
17064          * we have calculated */
17065         save_parse = RExC_parse;
17066         RExC_parse = SvPV(result_string, len);
17067         save_end = RExC_end;
17068         RExC_end = RExC_parse + len;
17069         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
17070 
17071         /* We turn off folding around the call, as the class we have
17072          * constructed already has all folding taken into consideration, and we
17073          * don't want regclass() to add to that */
17074         RExC_flags &= ~RXf_PMf_FOLD;
17075         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
17076          * folds are allowed.  */
17077         node = regclass(pRExC_state, flagp, depth+1,
17078                         FALSE, /* means parse the whole char class */
17079                         FALSE, /* don't allow multi-char folds */
17080                         TRUE, /* silence non-portable warnings.  The above may
17081                                  very well have generated non-portable code
17082                                  points, but they're valid on this machine */
17083                         FALSE, /* similarly, no need for strict */
17084 
17085                         /* We can optimize into something besides an ANYOF,
17086                          * except under /l, which needs to be ANYOF because of
17087                          * runtime checks for locale sanity, etc */
17088                     ! in_locale,
17089                         NULL
17090                     );
17091 
17092         RESTORE_WARNINGS;
17093         RExC_parse = save_parse + 1;
17094         RExC_end = save_end;
17095         SvREFCNT_dec_NN(final);
17096         SvREFCNT_dec_NN(result_string);
17097 
17098         if (save_fold) {
17099             RExC_flags |= RXf_PMf_FOLD;
17100         }
17101 
17102         if (!node) {
17103             RETURN_FAIL_ON_RESTART(*flagp, flagp);
17104             goto regclass_failed;
17105         }
17106 
17107         /* Fix up the node type if we are in locale.  (We have pretended we are
17108          * under /u for the purposes of regclass(), as this construct will only
17109          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
17110          * (so as to cause any warnings about bad locales to be output in
17111          * regexec.c), and add the flag that indicates to check if not in a
17112          * UTF-8 locale.  The reason we above forbid optimization into
17113          * something other than an ANYOF node is simply to minimize the number
17114          * of code changes in regexec.c.  Otherwise we would have to create new
17115          * EXACTish node types and deal with them.  This decision could be
17116          * revisited should this construct become popular.
17117          *
17118          * (One might think we could look at the resulting ANYOF node and
17119          * suppress the flag if everything is above 255, as those would be
17120          * UTF-8 only, but this isn't true, as the components that led to that
17121          * result could have been locale-affected, and just happen to cancel
17122          * each other out under UTF-8 locales.) */
17123         if (in_locale) {
17124             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
17125 
17126             assert(OP(REGNODE_p(node)) == ANYOF);
17127 
17128             OP(REGNODE_p(node)) = ANYOFL;
17129             ANYOF_FLAGS(REGNODE_p(node))
17130                     |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17131         }
17132     }
17133 
17134     nextchar(pRExC_state);
17135     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
17136     return node;
17137 
17138   regclass_failed:
17139     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17140                                                                 (UV) *flagp);
17141 }
17142 
17143 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17144 
17145 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)17146 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17147                              AV * stack, const IV fence, AV * fence_stack)
17148 {   /* Dumps the stacks in handle_regex_sets() */
17149 
17150     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17151     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17152     SSize_t i;
17153 
17154     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17155 
17156     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17157 
17158     if (stack_top < 0) {
17159         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17160     }
17161     else {
17162         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17163         for (i = stack_top; i >= 0; i--) {
17164             SV ** element_ptr = av_fetch(stack, i, FALSE);
17165             if (! element_ptr) {
17166             }
17167 
17168             if (IS_OPERATOR(*element_ptr)) {
17169                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17170                                             (int) i, (int) SvIV(*element_ptr));
17171             }
17172             else {
17173                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17174                 sv_dump(*element_ptr);
17175             }
17176         }
17177     }
17178 
17179     if (fence_stack_top < 0) {
17180         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17181     }
17182     else {
17183         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17184         for (i = fence_stack_top; i >= 0; i--) {
17185             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17186             if (! element_ptr) {
17187             }
17188 
17189             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17190                                             (int) i, (int) SvIV(*element_ptr));
17191         }
17192     }
17193 }
17194 
17195 #endif
17196 
17197 #undef IS_OPERATOR
17198 #undef IS_OPERAND
17199 
17200 STATIC void
S_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)17201 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17202 {
17203     /* This adds the Latin1/above-Latin1 folding rules.
17204      *
17205      * This should be called only for a Latin1-range code points, cp, which is
17206      * known to be involved in a simple fold with other code points above
17207      * Latin1.  It would give false results if /aa has been specified.
17208      * Multi-char folds are outside the scope of this, and must be handled
17209      * specially. */
17210 
17211     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17212 
17213     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17214 
17215     /* The rules that are valid for all Unicode versions are hard-coded in */
17216     switch (cp) {
17217         case 'k':
17218         case 'K':
17219           *invlist =
17220              add_cp_to_invlist(*invlist, KELVIN_SIGN);
17221             break;
17222         case 's':
17223         case 'S':
17224           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17225             break;
17226         case MICRO_SIGN:
17227           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17228           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17229             break;
17230         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17231         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17232           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17233             break;
17234         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17235           *invlist = add_cp_to_invlist(*invlist,
17236                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17237             break;
17238 
17239         default:    /* Other code points are checked against the data for the
17240                        current Unicode version */
17241           {
17242             Size_t folds_count;
17243             U32 first_fold;
17244             const U32 * remaining_folds;
17245             UV folded_cp;
17246 
17247             if (isASCII(cp)) {
17248                 folded_cp = toFOLD(cp);
17249             }
17250             else {
17251                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17252                 Size_t dummy_len;
17253                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17254             }
17255 
17256             if (folded_cp > 255) {
17257                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17258             }
17259 
17260             folds_count = _inverse_folds(folded_cp, &first_fold,
17261                                                     &remaining_folds);
17262             if (folds_count == 0) {
17263 
17264                 /* Use deprecated warning to increase the chances of this being
17265                  * output */
17266                 ckWARN2reg_d(RExC_parse,
17267                         "Perl folding rules are not up-to-date for 0x%02X;"
17268                         " please use the perlbug utility to report;", cp);
17269             }
17270             else {
17271                 unsigned int i;
17272 
17273                 if (first_fold > 255) {
17274                     *invlist = add_cp_to_invlist(*invlist, first_fold);
17275                 }
17276                 for (i = 0; i < folds_count - 1; i++) {
17277                     if (remaining_folds[i] > 255) {
17278                         *invlist = add_cp_to_invlist(*invlist,
17279                                                     remaining_folds[i]);
17280                     }
17281                 }
17282             }
17283             break;
17284          }
17285     }
17286 }
17287 
17288 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)17289 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17290 {
17291     /* Output the elements of the array given by '*posix_warnings' as REGEXP
17292      * warnings. */
17293 
17294     SV * msg;
17295     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17296 
17297     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17298 
17299     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17300         CLEAR_POSIX_WARNINGS();
17301         return;
17302     }
17303 
17304     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17305         if (first_is_fatal) {           /* Avoid leaking this */
17306             av_undef(posix_warnings);   /* This isn't necessary if the
17307                                             array is mortal, but is a
17308                                             fail-safe */
17309             (void) sv_2mortal(msg);
17310             PREPARE_TO_DIE;
17311         }
17312         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17313         SvREFCNT_dec_NN(msg);
17314     }
17315 
17316     UPDATE_WARNINGS_LOC(RExC_parse);
17317 }
17318 
17319 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)17320 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17321 {
17322     const U8 * const start = s1;
17323     const U8 * const send = start + max;
17324 
17325     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17326 
17327     while (s1 < send && *s1  == *s2) {
17328         s1++; s2++;
17329     }
17330 
17331     return s1 - start;
17332 }
17333 
17334 
17335 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)17336 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17337 {
17338     /* This adds the string scalar <multi_string> to the array
17339      * <multi_char_matches>.  <multi_string> is known to have exactly
17340      * <cp_count> code points in it.  This is used when constructing a
17341      * bracketed character class and we find something that needs to match more
17342      * than a single character.
17343      *
17344      * <multi_char_matches> is actually an array of arrays.  Each top-level
17345      * element is an array that contains all the strings known so far that are
17346      * the same length.  And that length (in number of code points) is the same
17347      * as the index of the top-level array.  Hence, the [2] element is an
17348      * array, each element thereof is a string containing TWO code points;
17349      * while element [3] is for strings of THREE characters, and so on.  Since
17350      * this is for multi-char strings there can never be a [0] nor [1] element.
17351      *
17352      * When we rewrite the character class below, we will do so such that the
17353      * longest strings are written first, so that it prefers the longest
17354      * matching strings first.  This is done even if it turns out that any
17355      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
17356      * Christiansen has agreed that this is ok.  This makes the test for the
17357      * ligature 'ffi' come before the test for 'ff', for example */
17358 
17359     AV* this_array;
17360     AV** this_array_ptr;
17361 
17362     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17363 
17364     if (! multi_char_matches) {
17365         multi_char_matches = newAV();
17366     }
17367 
17368     if (av_exists(multi_char_matches, cp_count)) {
17369         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17370         this_array = *this_array_ptr;
17371     }
17372     else {
17373         this_array = newAV();
17374         av_store(multi_char_matches, cp_count,
17375                  (SV*) this_array);
17376     }
17377     av_push(this_array, multi_string);
17378 
17379     return multi_char_matches;
17380 }
17381 
17382 /* The names of properties whose definitions are not known at compile time are
17383  * stored in this SV, after a constant heading.  So if the length has been
17384  * changed since initialization, then there is a run-time definition. */
17385 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
17386                                         (SvCUR(listsv) != initial_listsv_len)
17387 
17388 /* There is a restricted set of white space characters that are legal when
17389  * ignoring white space in a bracketed character class.  This generates the
17390  * code to skip them.
17391  *
17392  * There is a line below that uses the same white space criteria but is outside
17393  * this macro.  Both here and there must use the same definition */
17394 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
17395     STMT_START {                                                        \
17396         if (do_skip) {                                                  \
17397             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
17398             {                                                           \
17399                 p++;                                                    \
17400             }                                                           \
17401         }                                                               \
17402     } STMT_END
17403 
17404 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)17405 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17406                  const bool stop_at_1,  /* Just parse the next thing, don't
17407                                            look for a full character class */
17408                  bool allow_mutiple_chars,
17409                  const bool silence_non_portable,   /* Don't output warnings
17410                                                        about too large
17411                                                        characters */
17412                  const bool strict,
17413                  bool optimizable,                  /* ? Allow a non-ANYOF return
17414                                                        node */
17415                  SV** ret_invlist  /* Return an inversion list, not a node */
17416           )
17417 {
17418     /* parse a bracketed class specification.  Most of these will produce an
17419      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17420      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17421      * under /i with multi-character folds: it will be rewritten following the
17422      * paradigm of this example, where the <multi-fold>s are characters which
17423      * fold to multiple character sequences:
17424      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17425      * gets effectively rewritten as:
17426      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17427      * reg() gets called (recursively) on the rewritten version, and this
17428      * function will return what it constructs.  (Actually the <multi-fold>s
17429      * aren't physically removed from the [abcdefghi], it's just that they are
17430      * ignored in the recursion by means of a flag:
17431      * <RExC_in_multi_char_class>.)
17432      *
17433      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17434      * characters, with the corresponding bit set if that character is in the
17435      * list.  For characters above this, an inversion list is used.  There
17436      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17437      * determinable at compile time
17438      *
17439      * On success, returns the offset at which any next node should be placed
17440      * into the regex engine program being compiled.
17441      *
17442      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17443      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17444      * UTF-8
17445      */
17446 
17447     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17448     IV range = 0;
17449     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17450     regnode_offset ret = -1;    /* Initialized to an illegal value */
17451     STRLEN numlen;
17452     int namedclass = OOB_NAMEDCLASS;
17453     char *rangebegin = NULL;
17454     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17455                                aren't available at the time this was called */
17456     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17457 				      than just initialized.  */
17458     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17459     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17460                                extended beyond the Latin1 range.  These have to
17461                                be kept separate from other code points for much
17462                                of this function because their handling  is
17463                                different under /i, and for most classes under
17464                                /d as well */
17465     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17466                                separate for a while from the non-complemented
17467                                versions because of complications with /d
17468                                matching */
17469     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17470                                   treated more simply than the general case,
17471                                   leading to less compilation and execution
17472                                   work */
17473     UV element_count = 0;   /* Number of distinct elements in the class.
17474 			       Optimizations may be possible if this is tiny */
17475     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17476                                        character; used under /i */
17477     UV n;
17478     char * stop_ptr = RExC_end;    /* where to stop parsing */
17479 
17480     /* ignore unescaped whitespace? */
17481     const bool skip_white = cBOOL(   ret_invlist
17482                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17483 
17484     /* inversion list of code points this node matches only when the target
17485      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17486      * /d) */
17487     SV* upper_latin1_only_utf8_matches = NULL;
17488 
17489     /* Inversion list of code points this node matches regardless of things
17490      * like locale, folding, utf8ness of the target string */
17491     SV* cp_list = NULL;
17492 
17493     /* Like cp_list, but code points on this list need to be checked for things
17494      * that fold to/from them under /i */
17495     SV* cp_foldable_list = NULL;
17496 
17497     /* Like cp_list, but code points on this list are valid only when the
17498      * runtime locale is UTF-8 */
17499     SV* only_utf8_locale_list = NULL;
17500 
17501     /* In a range, if one of the endpoints is non-character-set portable,
17502      * meaning that it hard-codes a code point that may mean a different
17503      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17504      * mnemonic '\t' which each mean the same character no matter which
17505      * character set the platform is on. */
17506     unsigned int non_portable_endpoint = 0;
17507 
17508     /* Is the range unicode? which means on a platform that isn't 1-1 native
17509      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17510      * to be a Unicode value.  */
17511     bool unicode_range = FALSE;
17512     bool invert = FALSE;    /* Is this class to be complemented */
17513 
17514     bool warn_super = ALWAYS_WARN_SUPER;
17515 
17516     const char * orig_parse = RExC_parse;
17517 
17518     /* This variable is used to mark where the end in the input is of something
17519      * that looks like a POSIX construct but isn't.  During the parse, when
17520      * something looks like it could be such a construct is encountered, it is
17521      * checked for being one, but not if we've already checked this area of the
17522      * input.  Only after this position is reached do we check again */
17523     char *not_posix_region_end = RExC_parse - 1;
17524 
17525     AV* posix_warnings = NULL;
17526     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17527     U8 op = END;    /* The returned node-type, initialized to an impossible
17528                        one.  */
17529     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17530     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17531 
17532 
17533 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17534  * mutually exclusive.) */
17535 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17536                                             haven't been defined as of yet */
17537 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17538                                             UTF-8 or not */
17539 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17540                                             what gets folded */
17541     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17542 
17543     DECLARE_AND_GET_RE_DEBUG_FLAGS;
17544 
17545     PERL_ARGS_ASSERT_REGCLASS;
17546 #ifndef DEBUGGING
17547     PERL_UNUSED_ARG(depth);
17548 #endif
17549 
17550     assert(! (ret_invlist && allow_mutiple_chars));
17551 
17552     /* If wants an inversion list returned, we can't optimize to something
17553      * else. */
17554     if (ret_invlist) {
17555         optimizable = FALSE;
17556     }
17557 
17558     DEBUG_PARSE("clas");
17559 
17560 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17561     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17562                                    && UNICODE_DOT_DOT_VERSION == 0)
17563     allow_mutiple_chars = FALSE;
17564 #endif
17565 
17566     /* We include the /i status at the beginning of this so that we can
17567      * know it at runtime */
17568     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17569     initial_listsv_len = SvCUR(listsv);
17570     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17571 
17572     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17573 
17574     assert(RExC_parse <= RExC_end);
17575 
17576     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
17577 	RExC_parse++;
17578         invert = TRUE;
17579         allow_mutiple_chars = FALSE;
17580         MARK_NAUGHTY(1);
17581         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17582     }
17583 
17584     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17585     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17586         int maybe_class = handle_possible_posix(pRExC_state,
17587                                                 RExC_parse,
17588                                                 &not_posix_region_end,
17589                                                 NULL,
17590                                                 TRUE /* checking only */);
17591         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17592             ckWARN4reg(not_posix_region_end,
17593                     "POSIX syntax [%c %c] belongs inside character classes%s",
17594                     *RExC_parse, *RExC_parse,
17595                     (maybe_class == OOB_NAMEDCLASS)
17596                     ? ((POSIXCC_NOTYET(*RExC_parse))
17597                         ? " (but this one isn't implemented)"
17598                         : " (but this one isn't fully valid)")
17599                     : ""
17600                     );
17601         }
17602     }
17603 
17604     /* If the caller wants us to just parse a single element, accomplish this
17605      * by faking the loop ending condition */
17606     if (stop_at_1 && RExC_end > RExC_parse) {
17607         stop_ptr = RExC_parse + 1;
17608     }
17609 
17610     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17611     if (UCHARAT(RExC_parse) == ']')
17612 	goto charclassloop;
17613 
17614     while (1) {
17615 
17616         if (   posix_warnings
17617             && av_tindex_skip_len_mg(posix_warnings) >= 0
17618             && RExC_parse > not_posix_region_end)
17619         {
17620             /* Warnings about posix class issues are considered tentative until
17621              * we are far enough along in the parse that we can no longer
17622              * change our mind, at which point we output them.  This is done
17623              * each time through the loop so that a later class won't zap them
17624              * before they have been dealt with. */
17625             output_posix_warnings(pRExC_state, posix_warnings);
17626         }
17627 
17628         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17629 
17630         if  (RExC_parse >= stop_ptr) {
17631             break;
17632         }
17633 
17634         if  (UCHARAT(RExC_parse) == ']') {
17635             break;
17636         }
17637 
17638       charclassloop:
17639 
17640 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17641         save_value = value;
17642         save_prevvalue = prevvalue;
17643 
17644 	if (!range) {
17645 	    rangebegin = RExC_parse;
17646 	    element_count++;
17647             non_portable_endpoint = 0;
17648 	}
17649 	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17650 	    value = utf8n_to_uvchr((U8*)RExC_parse,
17651 				   RExC_end - RExC_parse,
17652 				   &numlen, UTF8_ALLOW_DEFAULT);
17653 	    RExC_parse += numlen;
17654 	}
17655 	else
17656 	    value = UCHARAT(RExC_parse++);
17657 
17658         if (value == '[') {
17659             char * posix_class_end;
17660             namedclass = handle_possible_posix(pRExC_state,
17661                                                RExC_parse,
17662                                                &posix_class_end,
17663                                                do_posix_warnings ? &posix_warnings : NULL,
17664                                                FALSE    /* die if error */);
17665             if (namedclass > OOB_NAMEDCLASS) {
17666 
17667                 /* If there was an earlier attempt to parse this particular
17668                  * posix class, and it failed, it was a false alarm, as this
17669                  * successful one proves */
17670                 if (   posix_warnings
17671                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17672                     && not_posix_region_end >= RExC_parse
17673                     && not_posix_region_end <= posix_class_end)
17674                 {
17675                     av_undef(posix_warnings);
17676                 }
17677 
17678                 RExC_parse = posix_class_end;
17679             }
17680             else if (namedclass == OOB_NAMEDCLASS) {
17681                 not_posix_region_end = posix_class_end;
17682             }
17683             else {
17684                 namedclass = OOB_NAMEDCLASS;
17685             }
17686         }
17687         else if (   RExC_parse - 1 > not_posix_region_end
17688                  && MAYBE_POSIXCC(value))
17689         {
17690             (void) handle_possible_posix(
17691                         pRExC_state,
17692                         RExC_parse - 1,  /* -1 because parse has already been
17693                                             advanced */
17694                         &not_posix_region_end,
17695                         do_posix_warnings ? &posix_warnings : NULL,
17696                         TRUE /* checking only */);
17697         }
17698         else if (  strict && ! skip_white
17699                  && (   _generic_isCC(value, _CC_VERTSPACE)
17700                      || is_VERTWS_cp_high(value)))
17701         {
17702             vFAIL("Literal vertical space in [] is illegal except under /x");
17703         }
17704         else if (value == '\\') {
17705             /* Is a backslash; get the code point of the char after it */
17706 
17707             if (RExC_parse >= RExC_end) {
17708                 vFAIL("Unmatched [");
17709             }
17710 
17711 	    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17712 		value = utf8n_to_uvchr((U8*)RExC_parse,
17713 				   RExC_end - RExC_parse,
17714 				   &numlen, UTF8_ALLOW_DEFAULT);
17715 		RExC_parse += numlen;
17716 	    }
17717 	    else
17718 		value = UCHARAT(RExC_parse++);
17719 
17720 	    /* Some compilers cannot handle switching on 64-bit integer
17721 	     * values, therefore value cannot be an UV.  Yes, this will
17722 	     * be a problem later if we want switch on Unicode.
17723 	     * A similar issue a little bit later when switching on
17724 	     * namedclass. --jhi */
17725 
17726             /* If the \ is escaping white space when white space is being
17727              * skipped, it means that that white space is wanted literally, and
17728              * is already in 'value'.  Otherwise, need to translate the escape
17729              * into what it signifies. */
17730             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17731                 const char * message;
17732                 U32 packed_warn;
17733                 U8 grok_c_char;
17734 
17735 	    case 'w':	namedclass = ANYOF_WORDCHAR;	break;
17736 	    case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
17737 	    case 's':	namedclass = ANYOF_SPACE;	break;
17738 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
17739 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
17740 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
17741 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
17742 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
17743 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
17744 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
17745             case 'N':  /* Handle \N{NAME} in class */
17746                 {
17747                     const char * const backslash_N_beg = RExC_parse - 2;
17748                     int cp_count;
17749 
17750                     if (! grok_bslash_N(pRExC_state,
17751                                         NULL,      /* No regnode */
17752                                         &value,    /* Yes single value */
17753                                         &cp_count, /* Multiple code pt count */
17754                                         flagp,
17755                                         strict,
17756                                         depth)
17757                     ) {
17758 
17759                         if (*flagp & NEED_UTF8)
17760                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17761 
17762                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17763 
17764                         if (cp_count < 0) {
17765                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17766                         }
17767                         else if (cp_count == 0) {
17768                             ckWARNreg(RExC_parse,
17769                               "Ignoring zero length \\N{} in character class");
17770                         }
17771                         else { /* cp_count > 1 */
17772                             assert(cp_count > 1);
17773                             if (! RExC_in_multi_char_class) {
17774                                 if ( ! allow_mutiple_chars
17775                                     || invert
17776                                     || range
17777                                     || *RExC_parse == '-')
17778                                 {
17779                                     if (strict) {
17780                                         RExC_parse--;
17781                                         vFAIL("\\N{} here is restricted to one character");
17782                                     }
17783                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17784                                     break; /* <value> contains the first code
17785                                               point. Drop out of the switch to
17786                                               process it */
17787                                 }
17788                                 else {
17789                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17790                                                  RExC_parse - backslash_N_beg);
17791                                     multi_char_matches
17792                                         = add_multi_match(multi_char_matches,
17793                                                           multi_char_N,
17794                                                           cp_count);
17795                                 }
17796                             }
17797                         } /* End of cp_count != 1 */
17798 
17799                         /* This element should not be processed further in this
17800                          * class */
17801                         element_count--;
17802                         value = save_value;
17803                         prevvalue = save_prevvalue;
17804                         continue;   /* Back to top of loop to get next char */
17805                     }
17806 
17807                     /* Here, is a single code point, and <value> contains it */
17808                     unicode_range = TRUE;   /* \N{} are Unicode */
17809                 }
17810                 break;
17811 	    case 'p':
17812 	    case 'P':
17813 		{
17814 		char *e;
17815 
17816                 if (RExC_pm_flags & PMf_WILDCARD) {
17817                     RExC_parse++;
17818                     /* diag_listed_as: Use of %s is not allowed in Unicode
17819                        property wildcard subpatterns in regex; marked by <--
17820                        HERE in m/%s/ */
17821                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17822                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17823                 }
17824 
17825 		/* \p means they want Unicode semantics */
17826 		REQUIRE_UNI_RULES(flagp, 0);
17827 
17828 		if (RExC_parse >= RExC_end)
17829 		    vFAIL2("Empty \\%c", (U8)value);
17830 		if (*RExC_parse == '{') {
17831 		    const U8 c = (U8)value;
17832 		    e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17833                     if (!e) {
17834                         RExC_parse++;
17835                         vFAIL2("Missing right brace on \\%c{}", c);
17836                     }
17837 
17838                     RExC_parse++;
17839 
17840                     /* White space is allowed adjacent to the braces and after
17841                      * any '^', even when not under /x */
17842                     while (isSPACE(*RExC_parse)) {
17843                          RExC_parse++;
17844 		    }
17845 
17846 		    if (UCHARAT(RExC_parse) == '^') {
17847 
17848                         /* toggle.  (The rhs xor gets the single bit that
17849                          * differs between P and p; the other xor inverts just
17850                          * that bit) */
17851                         value ^= 'P' ^ 'p';
17852 
17853                         RExC_parse++;
17854                         while (isSPACE(*RExC_parse)) {
17855                             RExC_parse++;
17856                         }
17857                     }
17858 
17859                     if (e == RExC_parse)
17860                         vFAIL2("Empty \\%c{}", c);
17861 
17862 		    n = e - RExC_parse;
17863 		    while (isSPACE(*(RExC_parse + n - 1)))
17864 		        n--;
17865 
17866 		}   /* The \p isn't immediately followed by a '{' */
17867 		else if (! isALPHA(*RExC_parse)) {
17868                     RExC_parse += (UTF)
17869                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17870                                   : 1;
17871                     vFAIL2("Character following \\%c must be '{' or a "
17872                            "single-character Unicode property name",
17873                            (U8) value);
17874                 }
17875                 else {
17876 		    e = RExC_parse;
17877 		    n = 1;
17878 		}
17879 		{
17880                     char* name = RExC_parse;
17881 
17882                     /* Any message returned about expanding the definition */
17883                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17884 
17885                     /* If set TRUE, the property is user-defined as opposed to
17886                      * official Unicode */
17887                     bool user_defined = FALSE;
17888                     AV * strings = NULL;
17889 
17890                     SV * prop_definition = parse_uniprop_string(
17891                                             name, n, UTF, FOLD,
17892                                             FALSE, /* This is compile-time */
17893 
17894                                             /* We can't defer this defn when
17895                                              * the full result is required in
17896                                              * this call */
17897                                             ! cBOOL(ret_invlist),
17898 
17899                                             &strings,
17900                                             &user_defined,
17901                                             msg,
17902                                             0 /* Base level */
17903                                            );
17904                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17905                         assert(prop_definition == NULL);
17906                         RExC_parse = e + 1;
17907                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17908                                                thing so, or else the display is
17909                                                mojibake */
17910                             RExC_utf8 = TRUE;
17911                         }
17912 			/* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17913                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17914                                     SvCUR(msg), SvPVX(msg)));
17915                     }
17916 
17917                     assert(prop_definition || strings);
17918 
17919                     if (strings) {
17920                         if (ret_invlist) {
17921                             if (! prop_definition) {
17922                                 RExC_parse = e + 1;
17923                                 vFAIL("Unicode string properties are not implemented in (?[...])");
17924                             }
17925                             else {
17926                                 ckWARNreg(e + 1,
17927                                     "Using just the single character results"
17928                                     " returned by \\p{} in (?[...])");
17929                             }
17930                         }
17931                         else if (! RExC_in_multi_char_class) {
17932                             if (invert ^ (value == 'P')) {
17933                                 RExC_parse = e + 1;
17934                                 vFAIL("Inverting a character class which contains"
17935                                     " a multi-character sequence is illegal");
17936                             }
17937 
17938                             /* For each multi-character string ... */
17939                             while (av_count(strings) > 0) {
17940                                 /* ... Each entry is itself an array of code
17941                                 * points. */
17942                                 AV * this_string = (AV *) av_shift( strings);
17943                                 STRLEN cp_count = av_count(this_string);
17944                                 SV * final = newSV(cp_count * 4);
17945                                 SvPVCLEAR(final);
17946 
17947                                 /* Create another string of sequences of \x{...} */
17948                                 while (av_count(this_string) > 0) {
17949                                     SV * character = av_shift(this_string);
17950                                     UV cp = SvUV(character);
17951 
17952                                     if (cp > 255) {
17953                                         REQUIRE_UTF8(flagp);
17954                                     }
17955                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17956                                                                         cp);
17957                                     SvREFCNT_dec_NN(character);
17958                                 }
17959                                 SvREFCNT_dec_NN(this_string);
17960 
17961                                 /* And add that to the list of such things */
17962                                 multi_char_matches
17963                                             = add_multi_match(multi_char_matches,
17964                                                             final,
17965                                                             cp_count);
17966                             }
17967                         }
17968                         SvREFCNT_dec_NN(strings);
17969                     }
17970 
17971                     if (! prop_definition) {    /* If we got only a string,
17972                                                    this iteration didn't really
17973                                                    find a character */
17974                         element_count--;
17975                     }
17976                     else if (! is_invlist(prop_definition)) {
17977 
17978                         /* Here, the definition isn't known, so we have gotten
17979                          * returned a string that will be evaluated if and when
17980                          * encountered at runtime.  We add it to the list of
17981                          * such properties, along with whether it should be
17982                          * complemented or not */
17983                         if (value == 'P') {
17984                             sv_catpvs(listsv, "!");
17985                         }
17986                         else {
17987                             sv_catpvs(listsv, "+");
17988                         }
17989                         sv_catsv(listsv, prop_definition);
17990 
17991                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17992 
17993                         /* We don't know yet what this matches, so have to flag
17994                          * it */
17995                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17996                     }
17997                     else {
17998                         assert (prop_definition && is_invlist(prop_definition));
17999 
18000                         /* Here we do have the complete property definition
18001                          *
18002                          * Temporary workaround for [perl #133136].  For this
18003                          * precise input that is in the .t that is failing,
18004                          * load utf8.pm, which is what the test wants, so that
18005                          * that .t passes */
18006                         if (     memEQs(RExC_start, e + 1 - RExC_start,
18007                                         "foo\\p{Alnum}")
18008                             && ! hv_common(GvHVn(PL_incgv),
18009                                            NULL,
18010                                            "utf8.pm", sizeof("utf8.pm") - 1,
18011                                            0, HV_FETCH_ISEXISTS, NULL, 0))
18012                         {
18013                             require_pv("utf8.pm");
18014                         }
18015 
18016                         if (! user_defined &&
18017                             /* We warn on matching an above-Unicode code point
18018                              * if the match would return true, except don't
18019                              * warn for \p{All}, which has exactly one element
18020                              * = 0 */
18021                             (_invlist_contains_cp(prop_definition, 0x110000)
18022                                 && (! (_invlist_len(prop_definition) == 1
18023                                        && *invlist_array(prop_definition) == 0))))
18024                         {
18025                             warn_super = TRUE;
18026                         }
18027 
18028                         /* Invert if asking for the complement */
18029                         if (value == 'P') {
18030 			    _invlist_union_complement_2nd(properties,
18031                                                           prop_definition,
18032                                                           &properties);
18033                         }
18034                         else {
18035                             _invlist_union(properties, prop_definition, &properties);
18036 			}
18037                     }
18038                 }
18039 
18040 		RExC_parse = e + 1;
18041                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
18042                                                 named */
18043 		}
18044 		break;
18045 	    case 'n':	value = '\n';			break;
18046 	    case 'r':	value = '\r';			break;
18047 	    case 't':	value = '\t';			break;
18048 	    case 'f':	value = '\f';			break;
18049 	    case 'b':	value = '\b';			break;
18050 	    case 'e':	value = ESC_NATIVE;             break;
18051 	    case 'a':	value = '\a';                   break;
18052 	    case 'o':
18053 		RExC_parse--;	/* function expects to be pointed at the 'o' */
18054                 if (! grok_bslash_o(&RExC_parse,
18055                                             RExC_end,
18056                                             &value,
18057                                             &message,
18058                                             &packed_warn,
18059                                             strict,
18060                                             cBOOL(range), /* MAX_UV allowed for range
18061                                                       upper limit */
18062                                             UTF))
18063                 {
18064                     vFAIL(message);
18065                 }
18066                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18067                     warn_non_literal_string(RExC_parse, packed_warn, message);
18068                 }
18069 
18070                 if (value < 256) {
18071                     non_portable_endpoint++;
18072                 }
18073 		break;
18074 	    case 'x':
18075 		RExC_parse--;	/* function expects to be pointed at the 'x' */
18076                 if (!  grok_bslash_x(&RExC_parse,
18077                                             RExC_end,
18078                                             &value,
18079                                             &message,
18080                                             &packed_warn,
18081                                             strict,
18082                                             cBOOL(range), /* MAX_UV allowed for range
18083                                                       upper limit */
18084                                             UTF))
18085                 {
18086                     vFAIL(message);
18087                 }
18088                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18089                     warn_non_literal_string(RExC_parse, packed_warn, message);
18090                 }
18091 
18092                 if (value < 256) {
18093                     non_portable_endpoint++;
18094                 }
18095 		break;
18096 	    case 'c':
18097                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
18098                                                                 &packed_warn))
18099                 {
18100                     /* going to die anyway; point to exact spot of
18101                         * failure */
18102                     RExC_parse += (UTF)
18103                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
18104                                   : 1;
18105                     vFAIL(message);
18106                 }
18107 
18108                 value = grok_c_char;
18109                 RExC_parse++;
18110                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18111                     warn_non_literal_string(RExC_parse, packed_warn, message);
18112                 }
18113 
18114                 non_portable_endpoint++;
18115 		break;
18116 	    case '0': case '1': case '2': case '3': case '4':
18117 	    case '5': case '6': case '7':
18118 		{
18119 		    /* Take 1-3 octal digits */
18120 		    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
18121                               | PERL_SCAN_NOTIFY_ILLDIGIT;
18122                     numlen = (strict) ? 4 : 3;
18123                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
18124 		    RExC_parse += numlen;
18125                     if (numlen != 3) {
18126                         if (strict) {
18127                             RExC_parse += (UTF)
18128                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
18129                                           : 1;
18130                             vFAIL("Need exactly 3 octal digits");
18131                         }
18132                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
18133                                  && RExC_parse < RExC_end
18134                                  && isDIGIT(*RExC_parse)
18135                                  && ckWARN(WARN_REGEXP))
18136                         {
18137                             reg_warn_non_literal_string(
18138                                  RExC_parse + 1,
18139                                  form_alien_digit_msg(8, numlen, RExC_parse,
18140                                                         RExC_end, UTF, FALSE));
18141                         }
18142                     }
18143                     if (value < 256) {
18144                         non_portable_endpoint++;
18145                     }
18146 		    break;
18147 		}
18148 	    default:
18149 		/* Allow \_ to not give an error */
18150 		if (isWORDCHAR(value) && value != '_') {
18151                     if (strict) {
18152                         vFAIL2("Unrecognized escape \\%c in character class",
18153                                (int)value);
18154                     }
18155                     else {
18156                         ckWARN2reg(RExC_parse,
18157                             "Unrecognized escape \\%c in character class passed through",
18158                             (int)value);
18159                     }
18160 		}
18161 		break;
18162 	    }   /* End of switch on char following backslash */
18163 	} /* end of handling backslash escape sequences */
18164 
18165         /* Here, we have the current token in 'value' */
18166 
18167 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18168             U8 classnum;
18169 
18170 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
18171 	     * literal, as is the character that began the false range, i.e.
18172 	     * the 'a' in the examples */
18173 	    if (range) {
18174                 const int w = (RExC_parse >= rangebegin)
18175                                 ? RExC_parse - rangebegin
18176                                 : 0;
18177                 if (strict) {
18178                     vFAIL2utf8f(
18179                         "False [] range \"%" UTF8f "\"",
18180                         UTF8fARG(UTF, w, rangebegin));
18181                 }
18182                 else {
18183                     ckWARN2reg(RExC_parse,
18184                         "False [] range \"%" UTF8f "\"",
18185                         UTF8fARG(UTF, w, rangebegin));
18186                     cp_list = add_cp_to_invlist(cp_list, '-');
18187                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18188                                                             prevvalue);
18189                 }
18190 
18191 		range = 0; /* this was not a true range */
18192                 element_count += 2; /* So counts for three values */
18193 	    }
18194 
18195             classnum = namedclass_to_classnum(namedclass);
18196 
18197 	    if (LOC && namedclass < ANYOF_POSIXL_MAX
18198 #ifndef HAS_ISASCII
18199                 && classnum != _CC_ASCII
18200 #endif
18201             ) {
18202                 SV* scratch_list = NULL;
18203 
18204                 /* What the Posix classes (like \w, [:space:]) match isn't
18205                  * generally knowable under locale until actual match time.  A
18206                  * special node is used for these which has extra space for a
18207                  * bitmap, with a bit reserved for each named class that is to
18208                  * be matched against.  (This isn't needed for \p{} and
18209                  * pseudo-classes, as they are not affected by locale, and
18210                  * hence are dealt with separately.)  However, if a named class
18211                  * and its complement are both present, then it matches
18212                  * everything, and there is no runtime dependency.  Odd numbers
18213                  * are the complements of the next lower number, so xor works.
18214                  * (Note that something like [\w\D] should match everything,
18215                  * because \d should be a proper subset of \w.  But rather than
18216                  * trust that the locale is well behaved, we leave this to
18217                  * runtime to sort out) */
18218                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18219                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18220                     POSIXL_ZERO(posixl);
18221                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18222                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18223                     continue;   /* We could ignore the rest of the class, but
18224                                    best to parse it for any errors */
18225                 }
18226                 else { /* Here, isn't the complement of any already parsed
18227                           class */
18228                     POSIXL_SET(posixl, namedclass);
18229                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18230                     anyof_flags |= ANYOF_MATCHES_POSIXL;
18231 
18232                     /* The above-Latin1 characters are not subject to locale
18233                      * rules.  Just add them to the unconditionally-matched
18234                      * list */
18235 
18236                     /* Get the list of the above-Latin1 code points this
18237                      * matches */
18238                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18239                                             PL_XPosix_ptrs[classnum],
18240 
18241                                             /* Odd numbers are complements,
18242                                              * like NDIGIT, NASCII, ... */
18243                                             namedclass % 2 != 0,
18244                                             &scratch_list);
18245                     /* Checking if 'cp_list' is NULL first saves an extra
18246                      * clone.  Its reference count will be decremented at the
18247                      * next union, etc, or if this is the only instance, at the
18248                      * end of the routine */
18249                     if (! cp_list) {
18250                         cp_list = scratch_list;
18251                     }
18252                     else {
18253                         _invlist_union(cp_list, scratch_list, &cp_list);
18254                         SvREFCNT_dec_NN(scratch_list);
18255                     }
18256                     continue;   /* Go get next character */
18257                 }
18258             }
18259             else {
18260 
18261                 /* Here, is not /l, or is a POSIX class for which /l doesn't
18262                  * matter (or is a Unicode property, which is skipped here). */
18263                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
18264                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18265 
18266                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
18267                          * nor /l make a difference in what these match,
18268                          * therefore we just add what they match to cp_list. */
18269                         if (classnum != _CC_VERTSPACE) {
18270                             assert(   namedclass == ANYOF_HORIZWS
18271                                    || namedclass == ANYOF_NHORIZWS);
18272 
18273                             /* It turns out that \h is just a synonym for
18274                              * XPosixBlank */
18275                             classnum = _CC_BLANK;
18276                         }
18277 
18278                         _invlist_union_maybe_complement_2nd(
18279                                 cp_list,
18280                                 PL_XPosix_ptrs[classnum],
18281                                 namedclass % 2 != 0,    /* Complement if odd
18282                                                           (NHORIZWS, NVERTWS)
18283                                                         */
18284                                 &cp_list);
18285                     }
18286                 }
18287                 else if (   AT_LEAST_UNI_SEMANTICS
18288                          || classnum == _CC_ASCII
18289                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
18290                                                    || classnum == _CC_XDIGIT)))
18291                 {
18292                     /* We usually have to worry about /d affecting what POSIX
18293                      * classes match, with special code needed because we won't
18294                      * know until runtime what all matches.  But there is no
18295                      * extra work needed under /u and /a; and [:ascii:] is
18296                      * unaffected by /d; and :digit: and :xdigit: don't have
18297                      * runtime differences under /d.  So we can special case
18298                      * these, and avoid some extra work below, and at runtime.
18299                      * */
18300                     _invlist_union_maybe_complement_2nd(
18301                                                      simple_posixes,
18302                                                       ((AT_LEAST_ASCII_RESTRICTED)
18303                                                        ? PL_Posix_ptrs[classnum]
18304                                                        : PL_XPosix_ptrs[classnum]),
18305                                                      namedclass % 2 != 0,
18306                                                      &simple_posixes);
18307                 }
18308                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
18309                            complement and use nposixes */
18310                     SV** posixes_ptr = namedclass % 2 == 0
18311                                        ? &posixes
18312                                        : &nposixes;
18313                     _invlist_union_maybe_complement_2nd(
18314                                                      *posixes_ptr,
18315                                                      PL_XPosix_ptrs[classnum],
18316                                                      namedclass % 2 != 0,
18317                                                      posixes_ptr);
18318                 }
18319 	    }
18320 	} /* end of namedclass \blah */
18321 
18322         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18323 
18324         /* If 'range' is set, 'value' is the ending of a range--check its
18325          * validity.  (If value isn't a single code point in the case of a
18326          * range, we should have figured that out above in the code that
18327          * catches false ranges).  Later, we will handle each individual code
18328          * point in the range.  If 'range' isn't set, this could be the
18329          * beginning of a range, so check for that by looking ahead to see if
18330          * the next real character to be processed is the range indicator--the
18331          * minus sign */
18332 
18333 	if (range) {
18334 #ifdef EBCDIC
18335             /* For unicode ranges, we have to test that the Unicode as opposed
18336              * to the native values are not decreasing.  (Above 255, there is
18337              * no difference between native and Unicode) */
18338 	    if (unicode_range && prevvalue < 255 && value < 255) {
18339                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18340                     goto backwards_range;
18341                 }
18342             }
18343             else
18344 #endif
18345 	    if (prevvalue > value) /* b-a */ {
18346 		int w;
18347 #ifdef EBCDIC
18348               backwards_range:
18349 #endif
18350                 w = RExC_parse - rangebegin;
18351                 vFAIL2utf8f(
18352                     "Invalid [] range \"%" UTF8f "\"",
18353                     UTF8fARG(UTF, w, rangebegin));
18354                 NOT_REACHED; /* NOTREACHED */
18355 	    }
18356 	}
18357 	else {
18358             prevvalue = value; /* save the beginning of the potential range */
18359             if (! stop_at_1     /* Can't be a range if parsing just one thing */
18360                 && *RExC_parse == '-')
18361             {
18362                 char* next_char_ptr = RExC_parse + 1;
18363 
18364                 /* Get the next real char after the '-' */
18365                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18366 
18367                 /* If the '-' is at the end of the class (just before the ']',
18368                  * it is a literal minus; otherwise it is a range */
18369                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18370                     RExC_parse = next_char_ptr;
18371 
18372                     /* a bad range like \w-, [:word:]- ? */
18373                     if (namedclass > OOB_NAMEDCLASS) {
18374                         if (strict || ckWARN(WARN_REGEXP)) {
18375                             const int w = RExC_parse >= rangebegin
18376                                           ?  RExC_parse - rangebegin
18377                                           : 0;
18378                             if (strict) {
18379                                 vFAIL4("False [] range \"%*.*s\"",
18380                                     w, w, rangebegin);
18381                             }
18382                             else {
18383                                 vWARN4(RExC_parse,
18384                                     "False [] range \"%*.*s\"",
18385                                     w, w, rangebegin);
18386                             }
18387                         }
18388                         cp_list = add_cp_to_invlist(cp_list, '-');
18389                         element_count++;
18390                     } else
18391                         range = 1;	/* yeah, it's a range! */
18392                     continue;	/* but do it the next time */
18393                 }
18394 	    }
18395 	}
18396 
18397         if (namedclass > OOB_NAMEDCLASS) {
18398             continue;
18399         }
18400 
18401         /* Here, we have a single value this time through the loop, and
18402          * <prevvalue> is the beginning of the range, if any; or <value> if
18403          * not. */
18404 
18405 	/* non-Latin1 code point implies unicode semantics. */
18406 	if (value > 255) {
18407             if (value > MAX_LEGAL_CP && (   value != UV_MAX
18408                                          || prevvalue > MAX_LEGAL_CP))
18409             {
18410                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18411             }
18412             REQUIRE_UNI_RULES(flagp, 0);
18413             if (  ! silence_non_portable
18414                 &&  UNICODE_IS_PERL_EXTENDED(value)
18415                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
18416             {
18417                 ckWARN2_non_literal_string(RExC_parse,
18418                                            packWARN(WARN_PORTABLE),
18419                                            PL_extended_cp_format,
18420                                            value);
18421             }
18422 	}
18423 
18424         /* Ready to process either the single value, or the completed range.
18425          * For single-valued non-inverted ranges, we consider the possibility
18426          * of multi-char folds.  (We made a conscious decision to not do this
18427          * for the other cases because it can often lead to non-intuitive
18428          * results.  For example, you have the peculiar case that:
18429          *  "s s" =~ /^[^\xDF]+$/i => Y
18430          *  "ss"  =~ /^[^\xDF]+$/i => N
18431          *
18432          * See [perl #89750] */
18433         if (FOLD && allow_mutiple_chars && value == prevvalue) {
18434             if (    value == LATIN_SMALL_LETTER_SHARP_S
18435                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18436                                                         value)))
18437             {
18438                 /* Here <value> is indeed a multi-char fold.  Get what it is */
18439 
18440                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18441                 STRLEN foldlen;
18442 
18443                 UV folded = _to_uni_fold_flags(
18444                                 value,
18445                                 foldbuf,
18446                                 &foldlen,
18447                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18448                                                    ? FOLD_FLAGS_NOMIX_ASCII
18449                                                    : 0)
18450                                 );
18451 
18452                 /* Here, <folded> should be the first character of the
18453                  * multi-char fold of <value>, with <foldbuf> containing the
18454                  * whole thing.  But, if this fold is not allowed (because of
18455                  * the flags), <fold> will be the same as <value>, and should
18456                  * be processed like any other character, so skip the special
18457                  * handling */
18458                 if (folded != value) {
18459 
18460                     /* Skip if we are recursed, currently parsing the class
18461                      * again.  Otherwise add this character to the list of
18462                      * multi-char folds. */
18463                     if (! RExC_in_multi_char_class) {
18464                         STRLEN cp_count = utf8_length(foldbuf,
18465                                                       foldbuf + foldlen);
18466                         SV* multi_fold = sv_2mortal(newSVpvs(""));
18467 
18468                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18469 
18470                         multi_char_matches
18471                                         = add_multi_match(multi_char_matches,
18472                                                           multi_fold,
18473                                                           cp_count);
18474 
18475                     }
18476 
18477                     /* This element should not be processed further in this
18478                      * class */
18479                     element_count--;
18480                     value = save_value;
18481                     prevvalue = save_prevvalue;
18482                     continue;
18483                 }
18484             }
18485         }
18486 
18487         if (strict && ckWARN(WARN_REGEXP)) {
18488             if (range) {
18489 
18490                 /* If the range starts above 255, everything is portable and
18491                  * likely to be so for any forseeable character set, so don't
18492                  * warn. */
18493                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18494                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18495                 }
18496                 else if (prevvalue != value) {
18497 
18498                     /* Under strict, ranges that stop and/or end in an ASCII
18499                      * printable should have each end point be a portable value
18500                      * for it (preferably like 'A', but we don't warn if it is
18501                      * a (portable) Unicode name or code point), and the range
18502                      * must be all digits or all letters of the same case.
18503                      * Otherwise, the range is non-portable and unclear as to
18504                      * what it contains */
18505                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18506                         && (          non_portable_endpoint
18507                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18508                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18509                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18510                     ))) {
18511                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18512                                           " be some subset of \"0-9\","
18513                                           " \"A-Z\", or \"a-z\"");
18514                     }
18515                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18516                         SSize_t index_start;
18517                         SSize_t index_final;
18518 
18519                         /* But the nature of Unicode and languages mean we
18520                          * can't do the same checks for above-ASCII ranges,
18521                          * except in the case of digit ones.  These should
18522                          * contain only digits from the same group of 10.  The
18523                          * ASCII case is handled just above.  Hence here, the
18524                          * range could be a range of digits.  First some
18525                          * unlikely special cases.  Grandfather in that a range
18526                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18527                          * if its starting value is one of the 10 digits prior
18528                          * to it.  This is because it is an alternate way of
18529                          * writing 19D1, and some people may expect it to be in
18530                          * that group.  But it is bad, because it won't give
18531                          * the expected results.  In Unicode 5.2 it was
18532                          * considered to be in that group (of 11, hence), but
18533                          * this was fixed in the next version */
18534 
18535                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18536                             goto warn_bad_digit_range;
18537                         }
18538                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18539                                           &&     value <= 0x1D7FF))
18540                         {
18541                             /* This is the only other case currently in Unicode
18542                              * where the algorithm below fails.  The code
18543                              * points just above are the end points of a single
18544                              * range containing only decimal digits.  It is 5
18545                              * different series of 0-9.  All other ranges of
18546                              * digits currently in Unicode are just a single
18547                              * series.  (And mktables will notify us if a later
18548                              * Unicode version breaks this.)
18549                              *
18550                              * If the range being checked is at most 9 long,
18551                              * and the digit values represented are in
18552                              * numerical order, they are from the same series.
18553                              * */
18554                             if (         value - prevvalue > 9
18555                                 ||    (((    value - 0x1D7CE) % 10)
18556                                      <= (prevvalue - 0x1D7CE) % 10))
18557                             {
18558                                 goto warn_bad_digit_range;
18559                             }
18560                         }
18561                         else {
18562 
18563                             /* For all other ranges of digits in Unicode, the
18564                              * algorithm is just to check if both end points
18565                              * are in the same series, which is the same range.
18566                              * */
18567                             index_start = _invlist_search(
18568                                                     PL_XPosix_ptrs[_CC_DIGIT],
18569                                                     prevvalue);
18570 
18571                             /* Warn if the range starts and ends with a digit,
18572                              * and they are not in the same group of 10. */
18573                             if (   index_start >= 0
18574                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18575                                 && (index_final =
18576                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18577                                                     value)) != index_start
18578                                 && index_final >= 0
18579                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18580                             {
18581                               warn_bad_digit_range:
18582                                 vWARN(RExC_parse, "Ranges of digits should be"
18583                                                   " from the same group of"
18584                                                   " 10");
18585                             }
18586                         }
18587                     }
18588                 }
18589             }
18590             if ((! range || prevvalue == value) && non_portable_endpoint) {
18591                 if (isPRINT_A(value)) {
18592                     char literal[3];
18593                     unsigned d = 0;
18594                     if (isBACKSLASHED_PUNCT(value)) {
18595                         literal[d++] = '\\';
18596                     }
18597                     literal[d++] = (char) value;
18598                     literal[d++] = '\0';
18599 
18600                     vWARN4(RExC_parse,
18601                            "\"%.*s\" is more clearly written simply as \"%s\"",
18602                            (int) (RExC_parse - rangebegin),
18603                            rangebegin,
18604                            literal
18605                         );
18606                 }
18607                 else if (isMNEMONIC_CNTRL(value)) {
18608                     vWARN4(RExC_parse,
18609                            "\"%.*s\" is more clearly written simply as \"%s\"",
18610                            (int) (RExC_parse - rangebegin),
18611                            rangebegin,
18612                            cntrl_to_mnemonic((U8) value)
18613                         );
18614                 }
18615             }
18616         }
18617 
18618         /* Deal with this element of the class */
18619 
18620 #ifndef EBCDIC
18621         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18622                                                     prevvalue, value);
18623 #else
18624         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18625          * that don't require special handling, we can just add the range like
18626          * we do for ASCII platforms */
18627         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18628             || ! (prevvalue < 256
18629                     && (unicode_range
18630                         || (! non_portable_endpoint
18631                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18632                                 || (isUPPER_A(prevvalue)
18633                                     && isUPPER_A(value)))))))
18634         {
18635             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18636                                                         prevvalue, value);
18637         }
18638         else {
18639             /* Here, requires special handling.  This can be because it is a
18640              * range whose code points are considered to be Unicode, and so
18641              * must be individually translated into native, or because its a
18642              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18643              * EBCDIC, but we have defined them to include only the "expected"
18644              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18645              * the same in native and Unicode, so can be added as a range */
18646             U8 start = NATIVE_TO_LATIN1(prevvalue);
18647             unsigned j;
18648             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18649             for (j = start; j <= end; j++) {
18650                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18651             }
18652             if (value > 255) {
18653                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18654                                                             256, value);
18655             }
18656         }
18657 #endif
18658 
18659 	range = 0; /* this range (if it was one) is done now */
18660     } /* End of loop through all the text within the brackets */
18661 
18662     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18663         output_posix_warnings(pRExC_state, posix_warnings);
18664     }
18665 
18666     /* If anything in the class expands to more than one character, we have to
18667      * deal with them by building up a substitute parse string, and recursively
18668      * calling reg() on it, instead of proceeding */
18669     if (multi_char_matches) {
18670 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18671         I32 cp_count;
18672 	STRLEN len;
18673 	char *save_end = RExC_end;
18674 	char *save_parse = RExC_parse;
18675 	char *save_start = RExC_start;
18676         Size_t constructed_prefix_len = 0; /* This gives the length of the
18677                                               constructed portion of the
18678                                               substitute parse. */
18679         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18680                                        a "|" */
18681         I32 reg_flags;
18682 
18683         assert(! invert);
18684         /* Only one level of recursion allowed */
18685         assert(RExC_copy_start_in_constructed == RExC_precomp);
18686 
18687 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18688            because too confusing */
18689         if (invert) {
18690             sv_catpvs(substitute_parse, "(?:");
18691         }
18692 #endif
18693 
18694         /* Look at the longest strings first */
18695         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18696                         cp_count > 0;
18697                         cp_count--)
18698         {
18699 
18700             if (av_exists(multi_char_matches, cp_count)) {
18701                 AV** this_array_ptr;
18702                 SV* this_sequence;
18703 
18704                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18705                                                  cp_count, FALSE);
18706                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18707                                                                 &PL_sv_undef)
18708                 {
18709                     if (! first_time) {
18710                         sv_catpvs(substitute_parse, "|");
18711                     }
18712                     first_time = FALSE;
18713 
18714                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18715                 }
18716             }
18717         }
18718 
18719         /* If the character class contains anything else besides these
18720          * multi-character strings, have to include it in recursive parsing */
18721         if (element_count) {
18722             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18723 
18724             sv_catpvs(substitute_parse, "|");
18725             if (has_l_bracket) {    /* Add an [ if the original had one */
18726                 sv_catpvs(substitute_parse, "[");
18727             }
18728             constructed_prefix_len = SvCUR(substitute_parse);
18729             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18730 
18731             /* Put in a closing ']' to match any opening one, but not if going
18732              * off the end, as otherwise we are adding something that really
18733              * isn't there */
18734             if (has_l_bracket && RExC_parse < RExC_end) {
18735                 sv_catpvs(substitute_parse, "]");
18736             }
18737         }
18738 
18739         sv_catpvs(substitute_parse, ")");
18740 #if 0
18741         if (invert) {
18742             /* This is a way to get the parse to skip forward a whole named
18743              * sequence instead of matching the 2nd character when it fails the
18744              * first */
18745             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18746         }
18747 #endif
18748 
18749         /* Set up the data structure so that any errors will be properly
18750          * reported.  See the comments at the definition of
18751          * REPORT_LOCATION_ARGS for details */
18752         RExC_copy_start_in_input = (char *) orig_parse;
18753 	RExC_start = RExC_parse = SvPV(substitute_parse, len);
18754         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18755 	RExC_end = RExC_parse + len;
18756         RExC_in_multi_char_class = 1;
18757 
18758 	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18759 
18760         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18761 
18762         /* And restore so can parse the rest of the pattern */
18763         RExC_parse = save_parse;
18764 	RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18765 	RExC_end = save_end;
18766 	RExC_in_multi_char_class = 0;
18767         SvREFCNT_dec_NN(multi_char_matches);
18768         SvREFCNT_dec(properties);
18769         SvREFCNT_dec(cp_list);
18770         SvREFCNT_dec(simple_posixes);
18771         SvREFCNT_dec(posixes);
18772         SvREFCNT_dec(nposixes);
18773         SvREFCNT_dec(cp_foldable_list);
18774         return ret;
18775     }
18776 
18777     /* If folding, we calculate all characters that could fold to or from the
18778      * ones already on the list */
18779     if (cp_foldable_list) {
18780         if (FOLD) {
18781             UV start, end;	/* End points of code point ranges */
18782 
18783             SV* fold_intersection = NULL;
18784             SV** use_list;
18785 
18786             /* Our calculated list will be for Unicode rules.  For locale
18787              * matching, we have to keep a separate list that is consulted at
18788              * runtime only when the locale indicates Unicode rules (and we
18789              * don't include potential matches in the ASCII/Latin1 range, as
18790              * any code point could fold to any other, based on the run-time
18791              * locale).   For non-locale, we just use the general list */
18792             if (LOC) {
18793                 use_list = &only_utf8_locale_list;
18794             }
18795             else {
18796                 use_list = &cp_list;
18797             }
18798 
18799             /* Only the characters in this class that participate in folds need
18800              * be checked.  Get the intersection of this class and all the
18801              * possible characters that are foldable.  This can quickly narrow
18802              * down a large class */
18803             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18804                                   &fold_intersection);
18805 
18806             /* Now look at the foldable characters in this class individually */
18807             invlist_iterinit(fold_intersection);
18808             while (invlist_iternext(fold_intersection, &start, &end)) {
18809                 UV j;
18810                 UV folded;
18811 
18812                 /* Look at every character in the range */
18813                 for (j = start; j <= end; j++) {
18814                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18815                     STRLEN foldlen;
18816                     unsigned int k;
18817                     Size_t folds_count;
18818                     U32 first_fold;
18819                     const U32 * remaining_folds;
18820 
18821                     if (j < 256) {
18822 
18823                         /* Under /l, we don't know what code points below 256
18824                          * fold to, except we do know the MICRO SIGN folds to
18825                          * an above-255 character if the locale is UTF-8, so we
18826                          * add it to the special list (in *use_list)  Otherwise
18827                          * we know now what things can match, though some folds
18828                          * are valid under /d only if the target is UTF-8.
18829                          * Those go in a separate list */
18830                         if (      IS_IN_SOME_FOLD_L1(j)
18831                             && ! (LOC && j != MICRO_SIGN))
18832                         {
18833 
18834                             /* ASCII is always matched; non-ASCII is matched
18835                              * only under Unicode rules (which could happen
18836                              * under /l if the locale is a UTF-8 one */
18837                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18838                                 *use_list = add_cp_to_invlist(*use_list,
18839                                                             PL_fold_latin1[j]);
18840                             }
18841                             else if (j != PL_fold_latin1[j]) {
18842                                 upper_latin1_only_utf8_matches
18843                                         = add_cp_to_invlist(
18844                                                 upper_latin1_only_utf8_matches,
18845                                                 PL_fold_latin1[j]);
18846                             }
18847                         }
18848 
18849                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18850                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18851                         {
18852                             add_above_Latin1_folds(pRExC_state,
18853                                                    (U8) j,
18854                                                    use_list);
18855                         }
18856                         continue;
18857                     }
18858 
18859                     /* Here is an above Latin1 character.  We don't have the
18860                      * rules hard-coded for it.  First, get its fold.  This is
18861                      * the simple fold, as the multi-character folds have been
18862                      * handled earlier and separated out */
18863                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18864                                                         (ASCII_FOLD_RESTRICTED)
18865                                                         ? FOLD_FLAGS_NOMIX_ASCII
18866                                                         : 0);
18867 
18868                     /* Single character fold of above Latin1.  Add everything
18869                      * in its fold closure to the list that this node should
18870                      * match. */
18871                     folds_count = _inverse_folds(folded, &first_fold,
18872                                                     &remaining_folds);
18873                     for (k = 0; k <= folds_count; k++) {
18874                         UV c = (k == 0)     /* First time through use itself */
18875                                 ? folded
18876                                 : (k == 1)  /* 2nd time use, the first fold */
18877                                    ? first_fold
18878 
18879                                      /* Then the remaining ones */
18880                                    : remaining_folds[k-2];
18881 
18882                         /* /aa doesn't allow folds between ASCII and non- */
18883                         if ((   ASCII_FOLD_RESTRICTED
18884                             && (isASCII(c) != isASCII(j))))
18885                         {
18886                             continue;
18887                         }
18888 
18889                         /* Folds under /l which cross the 255/256 boundary are
18890                          * added to a separate list.  (These are valid only
18891                          * when the locale is UTF-8.) */
18892                         if (c < 256 && LOC) {
18893                             *use_list = add_cp_to_invlist(*use_list, c);
18894                             continue;
18895                         }
18896 
18897                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18898                         {
18899                             cp_list = add_cp_to_invlist(cp_list, c);
18900                         }
18901                         else {
18902                             /* Similarly folds involving non-ascii Latin1
18903                              * characters under /d are added to their list */
18904                             upper_latin1_only_utf8_matches
18905                                     = add_cp_to_invlist(
18906                                                 upper_latin1_only_utf8_matches,
18907                                                 c);
18908                         }
18909                     }
18910                 }
18911             }
18912             SvREFCNT_dec_NN(fold_intersection);
18913         }
18914 
18915         /* Now that we have finished adding all the folds, there is no reason
18916          * to keep the foldable list separate */
18917         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18918 	SvREFCNT_dec_NN(cp_foldable_list);
18919     }
18920 
18921     /* And combine the result (if any) with any inversion lists from posix
18922      * classes.  The lists are kept separate up to now because we don't want to
18923      * fold the classes */
18924     if (simple_posixes) {   /* These are the classes known to be unaffected by
18925                                /a, /aa, and /d */
18926         if (cp_list) {
18927             _invlist_union(cp_list, simple_posixes, &cp_list);
18928             SvREFCNT_dec_NN(simple_posixes);
18929         }
18930         else {
18931             cp_list = simple_posixes;
18932         }
18933     }
18934     if (posixes || nposixes) {
18935         if (! DEPENDS_SEMANTICS) {
18936 
18937             /* For everything but /d, we can just add the current 'posixes' and
18938              * 'nposixes' to the main list */
18939             if (posixes) {
18940                 if (cp_list) {
18941                     _invlist_union(cp_list, posixes, &cp_list);
18942                     SvREFCNT_dec_NN(posixes);
18943                 }
18944                 else {
18945                     cp_list = posixes;
18946                 }
18947             }
18948             if (nposixes) {
18949                 if (cp_list) {
18950                     _invlist_union(cp_list, nposixes, &cp_list);
18951                     SvREFCNT_dec_NN(nposixes);
18952                 }
18953                 else {
18954                     cp_list = nposixes;
18955                 }
18956             }
18957         }
18958         else {
18959             /* Under /d, things like \w match upper Latin1 characters only if
18960              * the target string is in UTF-8.  But things like \W match all the
18961              * upper Latin1 characters if the target string is not in UTF-8.
18962              *
18963              * Handle the case with something like \W separately */
18964             if (nposixes) {
18965                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18966 
18967                 /* A complemented posix class matches all upper Latin1
18968                  * characters if not in UTF-8.  And it matches just certain
18969                  * ones when in UTF-8.  That means those certain ones are
18970                  * matched regardless, so can just be added to the
18971                  * unconditional list */
18972                 if (cp_list) {
18973                     _invlist_union(cp_list, nposixes, &cp_list);
18974                     SvREFCNT_dec_NN(nposixes);
18975                     nposixes = NULL;
18976                 }
18977                 else {
18978                     cp_list = nposixes;
18979                 }
18980 
18981                 /* Likewise for 'posixes' */
18982                 _invlist_union(posixes, cp_list, &cp_list);
18983                 SvREFCNT_dec(posixes);
18984 
18985                 /* Likewise for anything else in the range that matched only
18986                  * under UTF-8 */
18987                 if (upper_latin1_only_utf8_matches) {
18988                     _invlist_union(cp_list,
18989                                    upper_latin1_only_utf8_matches,
18990                                    &cp_list);
18991                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18992                     upper_latin1_only_utf8_matches = NULL;
18993                 }
18994 
18995                 /* If we don't match all the upper Latin1 characters regardless
18996                  * of UTF-8ness, we have to set a flag to match the rest when
18997                  * not in UTF-8 */
18998                 _invlist_subtract(only_non_utf8_list, cp_list,
18999                                   &only_non_utf8_list);
19000                 if (_invlist_len(only_non_utf8_list) != 0) {
19001                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
19002                 }
19003                 SvREFCNT_dec_NN(only_non_utf8_list);
19004             }
19005             else {
19006                 /* Here there were no complemented posix classes.  That means
19007                  * the upper Latin1 characters in 'posixes' match only when the
19008                  * target string is in UTF-8.  So we have to add them to the
19009                  * list of those types of code points, while adding the
19010                  * remainder to the unconditional list.
19011                  *
19012                  * First calculate what they are */
19013                 SV* nonascii_but_latin1_properties = NULL;
19014                 _invlist_intersection(posixes, PL_UpperLatin1,
19015                                       &nonascii_but_latin1_properties);
19016 
19017                 /* And add them to the final list of such characters. */
19018                 _invlist_union(upper_latin1_only_utf8_matches,
19019                                nonascii_but_latin1_properties,
19020                                &upper_latin1_only_utf8_matches);
19021 
19022                 /* Remove them from what now becomes the unconditional list */
19023                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
19024                                   &posixes);
19025 
19026                 /* And add those unconditional ones to the final list */
19027                 if (cp_list) {
19028                     _invlist_union(cp_list, posixes, &cp_list);
19029                     SvREFCNT_dec_NN(posixes);
19030                     posixes = NULL;
19031                 }
19032                 else {
19033                     cp_list = posixes;
19034                 }
19035 
19036                 SvREFCNT_dec(nonascii_but_latin1_properties);
19037 
19038                 /* Get rid of any characters from the conditional list that we
19039                  * now know are matched unconditionally, which may make that
19040                  * list empty */
19041                 _invlist_subtract(upper_latin1_only_utf8_matches,
19042                                   cp_list,
19043                                   &upper_latin1_only_utf8_matches);
19044                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
19045                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19046                     upper_latin1_only_utf8_matches = NULL;
19047                 }
19048             }
19049         }
19050     }
19051 
19052     /* And combine the result (if any) with any inversion list from properties.
19053      * The lists are kept separate up to now so that we can distinguish the two
19054      * in regards to matching above-Unicode.  A run-time warning is generated
19055      * if a Unicode property is matched against a non-Unicode code point. But,
19056      * we allow user-defined properties to match anything, without any warning,
19057      * and we also suppress the warning if there is a portion of the character
19058      * class that isn't a Unicode property, and which matches above Unicode, \W
19059      * or [\x{110000}] for example.
19060      * (Note that in this case, unlike the Posix one above, there is no
19061      * <upper_latin1_only_utf8_matches>, because having a Unicode property
19062      * forces Unicode semantics */
19063     if (properties) {
19064         if (cp_list) {
19065 
19066             /* If it matters to the final outcome, see if a non-property
19067              * component of the class matches above Unicode.  If so, the
19068              * warning gets suppressed.  This is true even if just a single
19069              * such code point is specified, as, though not strictly correct if
19070              * another such code point is matched against, the fact that they
19071              * are using above-Unicode code points indicates they should know
19072              * the issues involved */
19073             if (warn_super) {
19074                 warn_super = ! (invert
19075                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
19076             }
19077 
19078             _invlist_union(properties, cp_list, &cp_list);
19079             SvREFCNT_dec_NN(properties);
19080         }
19081         else {
19082             cp_list = properties;
19083         }
19084 
19085         if (warn_super) {
19086             anyof_flags
19087              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
19088 
19089             /* Because an ANYOF node is the only one that warns, this node
19090              * can't be optimized into something else */
19091             optimizable = FALSE;
19092         }
19093     }
19094 
19095     /* Here, we have calculated what code points should be in the character
19096      * class.
19097      *
19098      * Now we can see about various optimizations.  Fold calculation (which we
19099      * did above) needs to take place before inversion.  Otherwise /[^k]/i
19100      * would invert to include K, which under /i would match k, which it
19101      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
19102      * folded until runtime */
19103 
19104     /* If we didn't do folding, it's because some information isn't available
19105      * until runtime; set the run-time fold flag for these  We know to set the
19106      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
19107      * at least one 0-255 range code point */
19108     if (LOC && FOLD) {
19109 
19110         /* Some things on the list might be unconditionally included because of
19111          * other components.  Remove them, and clean up the list if it goes to
19112          * 0 elements */
19113         if (only_utf8_locale_list && cp_list) {
19114             _invlist_subtract(only_utf8_locale_list, cp_list,
19115                               &only_utf8_locale_list);
19116 
19117             if (_invlist_len(only_utf8_locale_list) == 0) {
19118                 SvREFCNT_dec_NN(only_utf8_locale_list);
19119                 only_utf8_locale_list = NULL;
19120             }
19121         }
19122         if (    only_utf8_locale_list
19123             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
19124                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
19125         {
19126             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19127             anyof_flags
19128                  |= ANYOFL_FOLD
19129                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
19130         }
19131         else if (cp_list && invlist_lowest(cp_list) < 256) {
19132             /* If nothing is below 256, has no locale dependency; otherwise it
19133              * does */
19134             anyof_flags |= ANYOFL_FOLD;
19135             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19136         }
19137     }
19138     else if (   DEPENDS_SEMANTICS
19139              && (    upper_latin1_only_utf8_matches
19140                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
19141     {
19142         RExC_seen_d_op = TRUE;
19143         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
19144     }
19145 
19146     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19147      * compile time. */
19148     if (     cp_list
19149         &&   invert
19150         && ! has_runtime_dependency)
19151     {
19152         _invlist_invert(cp_list);
19153 
19154 	/* Clear the invert flag since have just done it here */
19155 	invert = FALSE;
19156     }
19157 
19158     /* All possible optimizations below still have these characteristics.
19159      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19160      * routine) */
19161     *flagp |= HASWIDTH|SIMPLE;
19162 
19163     if (ret_invlist) {
19164         *ret_invlist = cp_list;
19165 
19166         return (cp_list) ? RExC_emit : 0;
19167     }
19168 
19169     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19170         RExC_contains_locale = 1;
19171     }
19172 
19173     /* Some character classes are equivalent to other nodes.  Such nodes take
19174      * up less room, and some nodes require fewer operations to execute, than
19175      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
19176      * improve efficiency. */
19177 
19178     if (optimizable) {
19179         PERL_UINT_FAST8_T i;
19180         UV partial_cp_count = 0;
19181         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19182         UV   end[MAX_FOLD_FROMS+1] = { 0 };
19183         bool single_range = FALSE;
19184 
19185         if (cp_list) { /* Count the code points in enough ranges that we would
19186                           see all the ones possible in any fold in this version
19187                           of Unicode */
19188 
19189             invlist_iterinit(cp_list);
19190             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19191                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19192                     break;
19193                 }
19194                 partial_cp_count += end[i] - start[i] + 1;
19195             }
19196 
19197             if (i == 1) {
19198                 single_range = TRUE;
19199             }
19200             invlist_iterfinish(cp_list);
19201         }
19202 
19203         /* If we know at compile time that this matches every possible code
19204          * point, any run-time dependencies don't matter */
19205         if (start[0] == 0 && end[0] == UV_MAX) {
19206             if (invert) {
19207                 ret = reganode(pRExC_state, OPFAIL, 0);
19208             }
19209             else {
19210                 ret = reg_node(pRExC_state, SANY);
19211                 MARK_NAUGHTY(1);
19212             }
19213             goto not_anyof;
19214         }
19215 
19216         /* Similarly, for /l posix classes, if both a class and its
19217          * complement match, any run-time dependencies don't matter */
19218         if (posixl) {
19219             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19220                                                         namedclass += 2)
19221             {
19222                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
19223                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19224                 {
19225                     if (invert) {
19226                         ret = reganode(pRExC_state, OPFAIL, 0);
19227                     }
19228                     else {
19229                         ret = reg_node(pRExC_state, SANY);
19230                         MARK_NAUGHTY(1);
19231                     }
19232                     goto not_anyof;
19233                 }
19234             }
19235 
19236             /* For well-behaved locales, some classes are subsets of others,
19237              * so complementing the subset and including the non-complemented
19238              * superset should match everything, like [\D[:alnum:]], and
19239              * [[:^alpha:][:alnum:]], but some implementations of locales are
19240              * buggy, and khw thinks its a bad idea to have optimization change
19241              * behavior, even if it avoids an OS bug in a given case */
19242 
19243 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19244 
19245             /* If is a single posix /l class, can optimize to just that op.
19246              * Such a node will not match anything in the Latin1 range, as that
19247              * is not determinable until runtime, but will match whatever the
19248              * class does outside that range.  (Note that some classes won't
19249              * match anything outside the range, like [:ascii:]) */
19250             if (    isSINGLE_BIT_SET(posixl)
19251                 && (partial_cp_count == 0 || start[0] > 255))
19252             {
19253                 U8 classnum;
19254                 SV * class_above_latin1 = NULL;
19255                 bool already_inverted;
19256                 bool are_equivalent;
19257 
19258                 /* Compute which bit is set, which is the same thing as, e.g.,
19259                  * ANYOF_CNTRL.  From
19260                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19261                  * */
19262                 static const int MultiplyDeBruijnBitPosition2[32] =
19263                     {
19264                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19265                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19266                     };
19267 
19268                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19269                                                           * 0x077CB531U) >> 27];
19270                 classnum = namedclass_to_classnum(namedclass);
19271 
19272                 /* The named classes are such that the inverted number is one
19273                  * larger than the non-inverted one */
19274                 already_inverted = namedclass
19275                                  - classnum_to_namedclass(classnum);
19276 
19277                 /* Create an inversion list of the official property, inverted
19278                  * if the constructed node list is inverted, and restricted to
19279                  * only the above latin1 code points, which are the only ones
19280                  * known at compile time */
19281                 _invlist_intersection_maybe_complement_2nd(
19282                                                     PL_AboveLatin1,
19283                                                     PL_XPosix_ptrs[classnum],
19284                                                     already_inverted,
19285                                                     &class_above_latin1);
19286                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19287                                                                         FALSE);
19288                 SvREFCNT_dec_NN(class_above_latin1);
19289 
19290                 if (are_equivalent) {
19291 
19292                     /* Resolve the run-time inversion flag with this possibly
19293                      * inverted class */
19294                     invert = invert ^ already_inverted;
19295 
19296                     ret = reg_node(pRExC_state,
19297                                    POSIXL + invert * (NPOSIXL - POSIXL));
19298                     FLAGS(REGNODE_p(ret)) = classnum;
19299                     goto not_anyof;
19300                 }
19301             }
19302         }
19303 
19304         /* khw can't think of any other possible transformation involving
19305          * these. */
19306         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19307             goto is_anyof;
19308         }
19309 
19310         if (! has_runtime_dependency) {
19311 
19312             /* If the list is empty, nothing matches.  This happens, for
19313              * example, when a Unicode property that doesn't match anything is
19314              * the only element in the character class (perluniprops.pod notes
19315              * such properties). */
19316             if (partial_cp_count == 0) {
19317                 if (invert) {
19318                     ret = reg_node(pRExC_state, SANY);
19319                 }
19320                 else {
19321                     ret = reganode(pRExC_state, OPFAIL, 0);
19322                 }
19323 
19324                 goto not_anyof;
19325             }
19326 
19327             /* If matches everything but \n */
19328             if (   start[0] == 0 && end[0] == '\n' - 1
19329                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19330             {
19331                 assert (! invert);
19332                 ret = reg_node(pRExC_state, REG_ANY);
19333                 MARK_NAUGHTY(1);
19334                 goto not_anyof;
19335             }
19336         }
19337 
19338         /* Next see if can optimize classes that contain just a few code points
19339          * into an EXACTish node.  The reason to do this is to let the
19340          * optimizer join this node with adjacent EXACTish ones, and ANYOF
19341          * nodes require conversion to code point from UTF-8.
19342          *
19343          * An EXACTFish node can be generated even if not under /i, and vice
19344          * versa.  But care must be taken.  An EXACTFish node has to be such
19345          * that it only matches precisely the code points in the class, but we
19346          * want to generate the least restrictive one that does that, to
19347          * increase the odds of being able to join with an adjacent node.  For
19348          * example, if the class contains [kK], we have to make it an EXACTFAA
19349          * node to prevent the KELVIN SIGN from matching.  Whether we are under
19350          * /i or not is irrelevant in this case.  Less obvious is the pattern
19351          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
19352          * supposed to match the single character U+0149 LATIN SMALL LETTER N
19353          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
19354          * that includes \X{02BC}, there is a multi-char fold that does, and so
19355          * the node generated for it must be an EXACTFish one.  On the other
19356          * hand qr/:/i should generate a plain EXACT node since the colon
19357          * participates in no fold whatsoever, and having it EXACT tells the
19358          * optimizer the target string cannot match unless it has a colon in
19359          * it.
19360          */
19361         if (   ! posixl
19362             && ! invert
19363 
19364                 /* Only try if there are no more code points in the class than
19365                  * in the max possible fold */
19366             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19367         {
19368             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19369             {
19370                 /* We can always make a single code point class into an
19371                  * EXACTish node. */
19372 
19373                 if (LOC) {
19374 
19375                     /* Here is /l:  Use EXACTL, except if there is a fold not
19376                      * known until runtime so shows as only a single code point
19377                      * here.  For code points above 255, we know which can
19378                      * cause problems by having a potential fold to the Latin1
19379                      * range. */
19380                     if (  ! FOLD
19381                         || (     start[0] > 255
19382                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19383                     {
19384                         op = EXACTL;
19385                     }
19386                     else {
19387                         op = EXACTFL;
19388                     }
19389                 }
19390                 else if (! FOLD) { /* Not /l and not /i */
19391                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19392                 }
19393                 else if (start[0] < 256) { /* /i, not /l, and the code point is
19394                                               small */
19395 
19396                     /* Under /i, it gets a little tricky.  A code point that
19397                      * doesn't participate in a fold should be an EXACT node.
19398                      * We know this one isn't the result of a simple fold, or
19399                      * there'd be more than one code point in the list, but it
19400                      * could be part of a multi- character fold.  In that case
19401                      * we better not create an EXACT node, as we would wrongly
19402                      * be telling the optimizer that this code point must be in
19403                      * the target string, and that is wrong.  This is because
19404                      * if the sequence around this code point forms a
19405                      * multi-char fold, what needs to be in the string could be
19406                      * the code point that folds to the sequence.
19407                      *
19408                      * This handles the case of below-255 code points, as we
19409                      * have an easy look up for those.  The next clause handles
19410                      * the above-256 one */
19411                     op = IS_IN_SOME_FOLD_L1(start[0])
19412                          ? EXACTFU
19413                          : EXACT;
19414                 }
19415                 else {  /* /i, larger code point.  Since we are under /i, and
19416                            have just this code point, we know that it can't
19417                            fold to something else, so PL_InMultiCharFold
19418                            applies to it */
19419                     op = _invlist_contains_cp(PL_InMultiCharFold,
19420                                               start[0])
19421                          ? EXACTFU_REQ8
19422                          : EXACT_REQ8;
19423                 }
19424 
19425                 value = start[0];
19426             }
19427             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19428                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
19429             {
19430                 /* Here, the only runtime dependency, if any, is from /d, and
19431                  * the class matches more than one code point, and the lowest
19432                  * code point participates in some fold.  It might be that the
19433                  * other code points are /i equivalent to this one, and hence
19434                  * they would representable by an EXACTFish node.  Above, we
19435                  * eliminated classes that contain too many code points to be
19436                  * EXACTFish, with the test for MAX_FOLD_FROMS
19437                  *
19438                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
19439                  * We do this because we have EXACTFAA at our disposal for the
19440                  * ASCII range */
19441                 if (partial_cp_count == 2 && isASCII(start[0])) {
19442 
19443                     /* The only ASCII characters that participate in folds are
19444                      * alphabetics */
19445                     assert(isALPHA(start[0]));
19446                     if (   end[0] == start[0]   /* First range is a single
19447                                                    character, so 2nd exists */
19448                         && isALPHA_FOLD_EQ(start[0], start[1]))
19449                     {
19450 
19451                         /* Here, is part of an ASCII fold pair */
19452 
19453                         if (   ASCII_FOLD_RESTRICTED
19454                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19455                         {
19456                             /* If the second clause just above was true, it
19457                              * means we can't be under /i, or else the list
19458                              * would have included more than this fold pair.
19459                              * Therefore we have to exclude the possibility of
19460                              * whatever else it is that folds to these, by
19461                              * using EXACTFAA */
19462                             op = EXACTFAA;
19463                         }
19464                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19465 
19466                             /* Here, there's no simple fold that start[0] is part
19467                              * of, but there is a multi-character one.  If we
19468                              * are not under /i, we want to exclude that
19469                              * possibility; if under /i, we want to include it
19470                              * */
19471                             op = (FOLD) ? EXACTFU : EXACTFAA;
19472                         }
19473                         else {
19474 
19475                             /* Here, the only possible fold start[0] particpates in
19476                              * is with start[1].  /i or not isn't relevant */
19477                             op = EXACTFU;
19478                         }
19479 
19480                         value = toFOLD(start[0]);
19481                     }
19482                 }
19483                 else if (  ! upper_latin1_only_utf8_matches
19484                          || (   _invlist_len(upper_latin1_only_utf8_matches)
19485                                                                           == 2
19486                              && PL_fold_latin1[
19487                                invlist_highest(upper_latin1_only_utf8_matches)]
19488                              == start[0]))
19489                 {
19490                     /* Here, the smallest character is non-ascii or there are
19491                      * more than 2 code points matched by this node.  Also, we
19492                      * either don't have /d UTF-8 dependent matches, or if we
19493                      * do, they look like they could be a single character that
19494                      * is the fold of the lowest one in the always-match list.
19495                      * This test quickly excludes most of the false positives
19496                      * when there are /d UTF-8 depdendent matches.  These are
19497                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19498                      * SMALL LETTER A WITH GRAVE iff the target string is
19499                      * UTF-8.  (We don't have to worry above about exceeding
19500                      * the array bounds of PL_fold_latin1[] because any code
19501                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
19502                      *
19503                      * EXACTFAA would apply only to pairs (hence exactly 2 code
19504                      * points) in the ASCII range, so we can't use it here to
19505                      * artificially restrict the fold domain, so we check if
19506                      * the class does or does not match some EXACTFish node.
19507                      * Further, if we aren't under /i, and the folded-to
19508                      * character is part of a multi-character fold, we can't do
19509                      * this optimization, as the sequence around it could be
19510                      * that multi-character fold, and we don't here know the
19511                      * context, so we have to assume it is that multi-char
19512                      * fold, to prevent potential bugs.
19513                      *
19514                      * To do the general case, we first find the fold of the
19515                      * lowest code point (which may be higher than the lowest
19516                      * one), then find everything that folds to it.  (The data
19517                      * structure we have only maps from the folded code points,
19518                      * so we have to do the earlier step.) */
19519 
19520                     Size_t foldlen;
19521                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19522                     UV folded = _to_uni_fold_flags(start[0],
19523                                                         foldbuf, &foldlen, 0);
19524                     U32 first_fold;
19525                     const U32 * remaining_folds;
19526                     Size_t folds_to_this_cp_count = _inverse_folds(
19527                                                             folded,
19528                                                             &first_fold,
19529                                                             &remaining_folds);
19530                     Size_t folds_count = folds_to_this_cp_count + 1;
19531                     SV * fold_list = _new_invlist(folds_count);
19532                     unsigned int i;
19533 
19534                     /* If there are UTF-8 dependent matches, create a temporary
19535                      * list of what this node matches, including them. */
19536                     SV * all_cp_list = NULL;
19537                     SV ** use_this_list = &cp_list;
19538 
19539                     if (upper_latin1_only_utf8_matches) {
19540                         all_cp_list = _new_invlist(0);
19541                         use_this_list = &all_cp_list;
19542                         _invlist_union(cp_list,
19543                                        upper_latin1_only_utf8_matches,
19544                                        use_this_list);
19545                     }
19546 
19547                     /* Having gotten everything that participates in the fold
19548                      * containing the lowest code point, we turn that into an
19549                      * inversion list, making sure everything is included. */
19550                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19551                     fold_list = add_cp_to_invlist(fold_list, folded);
19552                     if (folds_to_this_cp_count > 0) {
19553                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19554                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19555                             fold_list = add_cp_to_invlist(fold_list,
19556                                                         remaining_folds[i]);
19557                         }
19558                     }
19559 
19560                     /* If the fold list is identical to what's in this ANYOF
19561                      * node, the node can be represented by an EXACTFish one
19562                      * instead */
19563                     if (_invlistEQ(*use_this_list, fold_list,
19564                                    0 /* Don't complement */ )
19565                     ) {
19566 
19567                         /* But, we have to be careful, as mentioned above.
19568                          * Just the right sequence of characters could match
19569                          * this if it is part of a multi-character fold.  That
19570                          * IS what we want if we are under /i.  But it ISN'T
19571                          * what we want if not under /i, as it could match when
19572                          * it shouldn't.  So, when we aren't under /i and this
19573                          * character participates in a multi-char fold, we
19574                          * don't optimize into an EXACTFish node.  So, for each
19575                          * case below we have to check if we are folding
19576                          * and if not, if it is not part of a multi-char fold.
19577                          * */
19578                         if (start[0] > 255) {    /* Highish code point */
19579                             if (FOLD || ! _invlist_contains_cp(
19580                                             PL_InMultiCharFold, folded))
19581                             {
19582                                 op = (LOC)
19583                                      ? EXACTFLU8
19584                                      : (ASCII_FOLD_RESTRICTED)
19585                                        ? EXACTFAA
19586                                        : EXACTFU_REQ8;
19587                                 value = folded;
19588                             }
19589                         }   /* Below, the lowest code point < 256 */
19590                         else if (    FOLD
19591                                  &&  folded == 's'
19592                                  &&  DEPENDS_SEMANTICS)
19593                         {   /* An EXACTF node containing a single character
19594                                 's', can be an EXACTFU if it doesn't get
19595                                 joined with an adjacent 's' */
19596                             op = EXACTFU_S_EDGE;
19597                             value = folded;
19598                         }
19599                         else if (    FOLD
19600                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19601                         {
19602                             if (upper_latin1_only_utf8_matches) {
19603                                 op = EXACTF;
19604 
19605                                 /* We can't use the fold, as that only matches
19606                                  * under UTF-8 */
19607                                 value = start[0];
19608                             }
19609                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19610                                      && ! UTF)
19611                             {   /* EXACTFUP is a special node for this
19612                                    character */
19613                                 op = (ASCII_FOLD_RESTRICTED)
19614                                      ? EXACTFAA
19615                                      : EXACTFUP;
19616                                 value = MICRO_SIGN;
19617                             }
19618                             else if (     ASCII_FOLD_RESTRICTED
19619                                      && ! isASCII(start[0]))
19620                             {   /* For ASCII under /iaa, we can use EXACTFU
19621                                    below */
19622                                 op = EXACTFAA;
19623                                 value = folded;
19624                             }
19625                             else {
19626                                 op = EXACTFU;
19627                                 value = folded;
19628                             }
19629                         }
19630                     }
19631 
19632                     SvREFCNT_dec_NN(fold_list);
19633                     SvREFCNT_dec(all_cp_list);
19634                 }
19635             }
19636 
19637             if (op != END) {
19638                 U8 len;
19639 
19640                 /* Here, we have calculated what EXACTish node to use.  Have to
19641                  * convert to UTF-8 if not already there */
19642                 if (value > 255) {
19643                     if (! UTF) {
19644                         SvREFCNT_dec(cp_list);;
19645                         REQUIRE_UTF8(flagp);
19646                     }
19647 
19648                     /* This is a kludge to the special casing issues with this
19649                      * ligature under /aa.  FB05 should fold to FB06, but the
19650                      * call above to _to_uni_fold_flags() didn't find this, as
19651                      * it didn't use the /aa restriction in order to not miss
19652                      * other folds that would be affected.  This is the only
19653                      * instance likely to ever be a problem in all of Unicode.
19654                      * So special case it. */
19655                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19656                         && ASCII_FOLD_RESTRICTED)
19657                     {
19658                         value = LATIN_SMALL_LIGATURE_ST;
19659                     }
19660                 }
19661 
19662                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19663 
19664                 ret = regnode_guts(pRExC_state, op, len, "exact");
19665                 FILL_NODE(ret, op);
19666                 RExC_emit += 1 + STR_SZ(len);
19667                 setSTR_LEN(REGNODE_p(ret), len);
19668                 if (len == 1) {
19669                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19670                 }
19671                 else {
19672                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19673                 }
19674                 goto not_anyof;
19675             }
19676         }
19677 
19678         if (! has_runtime_dependency) {
19679 
19680             /* See if this can be turned into an ANYOFM node.  Think about the
19681              * bit patterns in two different bytes.  In some positions, the
19682              * bits in each will be 1; and in other positions both will be 0;
19683              * and in some positions the bit will be 1 in one byte, and 0 in
19684              * the other.  Let 'n' be the number of positions where the bits
19685              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19686              * a position where the two bytes differ.  Now take the set of all
19687              * bytes that when ANDed with the mask yield the same result.  That
19688              * set has 2**n elements, and is representable by just two 8 bit
19689              * numbers: the result and the mask.  Importantly, matching the set
19690              * can be vectorized by creating a word full of the result bytes,
19691              * and a word full of the mask bytes, yielding a significant speed
19692              * up.  Here, see if this node matches such a set.  As a concrete
19693              * example consider [01], and the byte representing '0' which is
19694              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19695              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19696              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19697              * which is a common usage, is optimizable into ANYOFM, and can
19698              * benefit from the speed up.  We can only do this on UTF-8
19699              * invariant bytes, because they have the same bit patterns under
19700              * UTF-8 as not. */
19701             PERL_UINT_FAST8_T inverted = 0;
19702 #ifdef EBCDIC
19703             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19704 #else
19705             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19706 #endif
19707             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19708              * If that works we will instead later generate an NANYOFM, and
19709              * invert back when through */
19710             if (invlist_highest(cp_list) > max_permissible) {
19711                 _invlist_invert(cp_list);
19712                 inverted = 1;
19713             }
19714 
19715             if (invlist_highest(cp_list) <= max_permissible) {
19716                 UV this_start, this_end;
19717                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19718                 U8 bits_differing = 0;
19719                 Size_t full_cp_count = 0;
19720                 bool first_time = TRUE;
19721 
19722                 /* Go through the bytes and find the bit positions that differ
19723                  * */
19724                 invlist_iterinit(cp_list);
19725                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19726                     unsigned int i = this_start;
19727 
19728                     if (first_time) {
19729                         if (! UVCHR_IS_INVARIANT(i)) {
19730                             goto done_anyofm;
19731                         }
19732 
19733                         first_time = FALSE;
19734                         lowest_cp = this_start;
19735 
19736                         /* We have set up the code point to compare with.
19737                          * Don't compare it with itself */
19738                         i++;
19739                     }
19740 
19741                     /* Find the bit positions that differ from the lowest code
19742                      * point in the node.  Keep track of all such positions by
19743                      * OR'ing */
19744                     for (; i <= this_end; i++) {
19745                         if (! UVCHR_IS_INVARIANT(i)) {
19746                             goto done_anyofm;
19747                         }
19748 
19749                         bits_differing  |= i ^ lowest_cp;
19750                     }
19751 
19752                     full_cp_count += this_end - this_start + 1;
19753                 }
19754 
19755                 /* At the end of the loop, we count how many bits differ from
19756                  * the bits in lowest code point, call the count 'd'.  If the
19757                  * set we found contains 2**d elements, it is the closure of
19758                  * all code points that differ only in those bit positions.  To
19759                  * convince yourself of that, first note that the number in the
19760                  * closure must be a power of 2, which we test for.  The only
19761                  * way we could have that count and it be some differing set,
19762                  * is if we got some code points that don't differ from the
19763                  * lowest code point in any position, but do differ from each
19764                  * other in some other position.  That means one code point has
19765                  * a 1 in that position, and another has a 0.  But that would
19766                  * mean that one of them differs from the lowest code point in
19767                  * that position, which possibility we've already excluded.  */
19768                 if (  (inverted || full_cp_count > 1)
19769                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19770                 {
19771                     U8 ANYOFM_mask;
19772 
19773                     op = ANYOFM + inverted;;
19774 
19775                     /* We need to make the bits that differ be 0's */
19776                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19777 
19778                     /* The argument is the lowest code point */
19779                     ret = reganode(pRExC_state, op, lowest_cp);
19780                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19781                 }
19782 
19783               done_anyofm:
19784                 invlist_iterfinish(cp_list);
19785             }
19786 
19787             if (inverted) {
19788                 _invlist_invert(cp_list);
19789             }
19790 
19791             if (op != END) {
19792                 goto not_anyof;
19793             }
19794 
19795             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19796              * all were invariants, it wasn't inverted, and there is a single
19797              * range.  This would be faster than some of the posix nodes we
19798              * create below like /\d/a, but would be twice the size.  Without
19799              * having actually measured the gain, khw doesn't think the
19800              * tradeoff is really worth it */
19801         }
19802 
19803         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19804             PERL_UINT_FAST8_T type;
19805             SV * intersection = NULL;
19806             SV* d_invlist = NULL;
19807 
19808             /* See if this matches any of the POSIX classes.  The POSIXA and
19809              * POSIXD ones are about the same speed as ANYOF ops, but take less
19810              * room; the ones that have above-Latin1 code point matches are
19811              * somewhat faster than ANYOF.  */
19812 
19813             for (type = POSIXA; type >= POSIXD; type--) {
19814                 int posix_class;
19815 
19816                 if (type == POSIXL) {   /* But not /l posix classes */
19817                     continue;
19818                 }
19819 
19820                 for (posix_class = 0;
19821                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19822                      posix_class++)
19823                 {
19824                     SV** our_code_points = &cp_list;
19825                     SV** official_code_points;
19826                     int try_inverted;
19827 
19828                     if (type == POSIXA) {
19829                         official_code_points = &PL_Posix_ptrs[posix_class];
19830                     }
19831                     else {
19832                         official_code_points = &PL_XPosix_ptrs[posix_class];
19833                     }
19834 
19835                     /* Skip non-existent classes of this type.  e.g. \v only
19836                      * has an entry in PL_XPosix_ptrs */
19837                     if (! *official_code_points) {
19838                         continue;
19839                     }
19840 
19841                     /* Try both the regular class, and its inversion */
19842                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19843                         bool this_inverted = invert ^ try_inverted;
19844 
19845                         if (type != POSIXD) {
19846 
19847                             /* This class that isn't /d can't match if we have
19848                              * /d dependencies */
19849                             if (has_runtime_dependency
19850                                                     & HAS_D_RUNTIME_DEPENDENCY)
19851                             {
19852                                 continue;
19853                             }
19854                         }
19855                         else /* is /d */ if (! this_inverted) {
19856 
19857                             /* /d classes don't match anything non-ASCII below
19858                              * 256 unconditionally (which cp_list contains) */
19859                             _invlist_intersection(cp_list, PL_UpperLatin1,
19860                                                            &intersection);
19861                             if (_invlist_len(intersection) != 0) {
19862                                 continue;
19863                             }
19864 
19865                             SvREFCNT_dec(d_invlist);
19866                             d_invlist = invlist_clone(cp_list, NULL);
19867 
19868                             /* But under UTF-8 it turns into using /u rules.
19869                              * Add the things it matches under these conditions
19870                              * so that we check below that these are identical
19871                              * to what the tested class should match */
19872                             if (upper_latin1_only_utf8_matches) {
19873                                 _invlist_union(
19874                                             d_invlist,
19875                                             upper_latin1_only_utf8_matches,
19876                                             &d_invlist);
19877                             }
19878                             our_code_points = &d_invlist;
19879                         }
19880                         else {  /* POSIXD, inverted.  If this doesn't have this
19881                                    flag set, it isn't /d. */
19882                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19883                             {
19884                                 continue;
19885                             }
19886                             our_code_points = &cp_list;
19887                         }
19888 
19889                         /* Here, have weeded out some things.  We want to see
19890                          * if the list of characters this node contains
19891                          * ('*our_code_points') precisely matches those of the
19892                          * class we are currently checking against
19893                          * ('*official_code_points'). */
19894                         if (_invlistEQ(*our_code_points,
19895                                        *official_code_points,
19896                                        try_inverted))
19897                         {
19898                             /* Here, they precisely match.  Optimize this ANYOF
19899                              * node into its equivalent POSIX one of the
19900                              * correct type, possibly inverted */
19901                             ret = reg_node(pRExC_state, (try_inverted)
19902                                                         ? type + NPOSIXA
19903                                                                 - POSIXA
19904                                                         : type);
19905                             FLAGS(REGNODE_p(ret)) = posix_class;
19906                             SvREFCNT_dec(d_invlist);
19907                             SvREFCNT_dec(intersection);
19908                             goto not_anyof;
19909                         }
19910                     }
19911                 }
19912             }
19913             SvREFCNT_dec(d_invlist);
19914             SvREFCNT_dec(intersection);
19915         }
19916 
19917         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19918          * both in size and speed.  Currently, a 20 bit range base (smallest
19919          * code point in the range), and a 12 bit maximum delta are packed into
19920          * a 32 bit word.  This allows for using it on all of the Unicode code
19921          * points except for the highest plane, which is only for private use
19922          * code points.  khw doubts that a bigger delta is likely in real world
19923          * applications */
19924         if (     single_range
19925             && ! has_runtime_dependency
19926             &&   anyof_flags == 0
19927             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19928             &&   end[0] - start[0]
19929                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19930                                    * CHARBITS - ANYOFR_BASE_BITS))))
19931 
19932         {
19933             U8 low_utf8[UTF8_MAXBYTES+1];
19934             U8 high_utf8[UTF8_MAXBYTES+1];
19935 
19936             ret = reganode(pRExC_state, ANYOFR,
19937                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19938 
19939             /* Place the lowest UTF-8 start byte in the flags field, so as to
19940              * allow efficient ruling out at run time of many possible inputs.
19941              * */
19942             (void) uvchr_to_utf8(low_utf8, start[0]);
19943             (void) uvchr_to_utf8(high_utf8, end[0]);
19944 
19945             /* If all code points share the same first byte, this can be an
19946              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19947              * quickly rule out many inputs at run-time without having to
19948              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19949              * not doing that transformation would not rule out nearly so many
19950              * things */
19951             if (low_utf8[0] == high_utf8[0]) {
19952                 OP(REGNODE_p(ret)) = ANYOFRb;
19953                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19954             }
19955             else {
19956                 ANYOF_FLAGS(REGNODE_p(ret))
19957                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19958             }
19959 
19960             goto not_anyof;
19961         }
19962 
19963         /* If didn't find an optimization and there is no need for a bitmap,
19964          * optimize to indicate that */
19965         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19966             && ! LOC
19967             && ! upper_latin1_only_utf8_matches
19968             &&   anyof_flags == 0)
19969         {
19970             U8 low_utf8[UTF8_MAXBYTES+1];
19971             UV highest_cp = invlist_highest(cp_list);
19972 
19973             /* Currently the maximum allowed code point by the system is
19974              * IV_MAX.  Higher ones are reserved for future internal use.  This
19975              * particular regnode can be used for higher ones, but we can't
19976              * calculate the code point of those.  IV_MAX suffices though, as
19977              * it will be a large first byte */
19978             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19979                            - low_utf8;
19980 
19981             /* We store the lowest possible first byte of the UTF-8
19982              * representation, using the flags field.  This allows for quick
19983              * ruling out of some inputs without having to convert from UTF-8
19984              * to code point.  For EBCDIC, we use I8, as not doing that
19985              * transformation would not rule out nearly so many things */
19986             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19987 
19988             op = ANYOFH;
19989 
19990             /* If the first UTF-8 start byte for the highest code point in the
19991              * range is suitably small, we may be able to get an upper bound as
19992              * well */
19993             if (highest_cp <= IV_MAX) {
19994                 U8 high_utf8[UTF8_MAXBYTES+1];
19995                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19996                                 - high_utf8;
19997 
19998                 /* If the lowest and highest are the same, we can get an exact
19999                  * first byte instead of a just minimum or even a sequence of
20000                  * exact leading bytes.  We signal these with different
20001                  * regnodes */
20002                 if (low_utf8[0] == high_utf8[0]) {
20003                     Size_t len = find_first_differing_byte_pos(low_utf8,
20004                                                                high_utf8,
20005                                                        MIN(low_len, high_len));
20006 
20007                     if (len == 1) {
20008 
20009                         /* No need to convert to I8 for EBCDIC as this is an
20010                          * exact match */
20011                         anyof_flags = low_utf8[0];
20012                         op = ANYOFHb;
20013                     }
20014                     else {
20015                         op = ANYOFHs;
20016                         ret = regnode_guts(pRExC_state, op,
20017                                            regarglen[op] + STR_SZ(len),
20018                                            "anyofhs");
20019                         FILL_NODE(ret, op);
20020                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
20021                                                                         = len;
20022                         Copy(low_utf8,  /* Add the common bytes */
20023                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
20024                            len, U8);
20025                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
20026                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
20027                                                   NULL, only_utf8_locale_list);
20028                         goto not_anyof;
20029                     }
20030                 }
20031                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
20032                 {
20033 
20034                     /* Here, the high byte is not the same as the low, but is
20035                      * small enough that its reasonable to have a loose upper
20036                      * bound, which is packed in with the strict lower bound.
20037                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
20038                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
20039                      * is the same thing as UTF-8 */
20040 
20041                     U8 bits = 0;
20042                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
20043                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
20044                                   - anyof_flags;
20045 
20046                     if (range_diff <= max_range_diff / 8) {
20047                         bits = 3;
20048                     }
20049                     else if (range_diff <= max_range_diff / 4) {
20050                         bits = 2;
20051                     }
20052                     else if (range_diff <= max_range_diff / 2) {
20053                         bits = 1;
20054                     }
20055                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
20056                     op = ANYOFHr;
20057                 }
20058             }
20059 
20060             goto done_finding_op;
20061         }
20062     }   /* End of seeing if can optimize it into a different node */
20063 
20064   is_anyof: /* It's going to be an ANYOF node. */
20065     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
20066          ? ANYOFD
20067          : ((posixl)
20068             ? ANYOFPOSIXL
20069             : ((LOC)
20070                ? ANYOFL
20071                : ANYOF));
20072 
20073   done_finding_op:
20074 
20075     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
20076     FILL_NODE(ret, op);        /* We set the argument later */
20077     RExC_emit += 1 + regarglen[op];
20078     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
20079 
20080     /* Here, <cp_list> contains all the code points we can determine at
20081      * compile time that match under all conditions.  Go through it, and
20082      * for things that belong in the bitmap, put them there, and delete from
20083      * <cp_list>.  While we are at it, see if everything above 255 is in the
20084      * list, and if so, set a flag to speed up execution */
20085 
20086     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
20087 
20088     if (posixl) {
20089         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
20090     }
20091 
20092     if (invert) {
20093         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
20094     }
20095 
20096     /* Here, the bitmap has been populated with all the Latin1 code points that
20097      * always match.  Can now add to the overall list those that match only
20098      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
20099      * */
20100     if (upper_latin1_only_utf8_matches) {
20101 	if (cp_list) {
20102 	    _invlist_union(cp_list,
20103                            upper_latin1_only_utf8_matches,
20104                            &cp_list);
20105 	    SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
20106 	}
20107 	else {
20108 	    cp_list = upper_latin1_only_utf8_matches;
20109 	}
20110         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
20111     }
20112 
20113     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
20114                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
20115                    ? listsv
20116                    : NULL,
20117                   only_utf8_locale_list);
20118     SvREFCNT_dec(cp_list);;
20119     SvREFCNT_dec(only_utf8_locale_list);
20120     return ret;
20121 
20122   not_anyof:
20123 
20124     /* Here, the node is getting optimized into something that's not an ANYOF
20125      * one.  Finish up. */
20126 
20127     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
20128                                            RExC_parse - orig_parse);;
20129     SvREFCNT_dec(cp_list);;
20130     SvREFCNT_dec(only_utf8_locale_list);
20131     SvREFCNT_dec(upper_latin1_only_utf8_matches);
20132     return ret;
20133 }
20134 
20135 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
20136 
20137 STATIC void
S_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)20138 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
20139                 regnode* const node,
20140                 SV* const cp_list,
20141                 SV* const runtime_defns,
20142                 SV* const only_utf8_locale_list)
20143 {
20144     /* Sets the arg field of an ANYOF-type node 'node', using information about
20145      * the node passed-in.  If there is nothing outside the node's bitmap, the
20146      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
20147      * the count returned by add_data(), having allocated and stored an array,
20148      * av, as follows:
20149      *
20150      *  av[0] stores the inversion list defining this class as far as known at
20151      *        this time, or PL_sv_undef if nothing definite is now known.
20152      *  av[1] stores the inversion list of code points that match only if the
20153      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
20154      *        av[2], or no entry otherwise.
20155      *  av[2] stores the list of user-defined properties whose subroutine
20156      *        definitions aren't known at this time, or no entry if none. */
20157 
20158     UV n;
20159 
20160     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20161 
20162     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
20163         assert(! (ANYOF_FLAGS(node)
20164                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
20165 	ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
20166     }
20167     else {
20168 	AV * const av = newAV();
20169 	SV *rv;
20170 
20171         if (cp_list) {
20172             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20173         }
20174 
20175         /* (Note that if any of this changes, the size calculations in
20176          * S_optimize_regclass() might need to be updated.) */
20177 
20178         if (only_utf8_locale_list) {
20179             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20180                                      SvREFCNT_inc_NN(only_utf8_locale_list));
20181         }
20182 
20183         if (runtime_defns) {
20184             av_store(av, DEFERRED_USER_DEFINED_INDEX,
20185                          SvREFCNT_inc_NN(runtime_defns));
20186         }
20187 
20188 	rv = newRV_noinc(MUTABLE_SV(av));
20189 	n = add_data(pRExC_state, STR_WITH_LEN("s"));
20190 	RExC_rxi->data->data[n] = (void*)rv;
20191 	ARG_SET(node, n);
20192     }
20193 }
20194 
20195 SV *
20196 
20197 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_nonbitmap_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)20198 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20199 #else
20200 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20201 #endif
20202 
20203 {
20204     /* For internal core use only.
20205      * Returns the inversion list for the input 'node' in the regex 'prog'.
20206      * If <doinit> is 'true', will attempt to create the inversion list if not
20207      *    already done.
20208      * If <listsvp> is non-null, will return the printable contents of the
20209      *    property definition.  This can be used to get debugging information
20210      *    even before the inversion list exists, by calling this function with
20211      *    'doinit' set to false, in which case the components that will be used
20212      *    to eventually create the inversion list are returned  (in a printable
20213      *    form).
20214      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20215      *    store an inversion list of code points that should match only if the
20216      *    execution-time locale is a UTF-8 one.
20217      * If <output_invlist> is not NULL, it is where this routine is to store an
20218      *    inversion list of the code points that would be instead returned in
20219      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
20220      *    when this parameter is used, is just the non-code point data that
20221      *    will go into creating the inversion list.  This currently should be just
20222      *    user-defined properties whose definitions were not known at compile
20223      *    time.  Using this parameter allows for easier manipulation of the
20224      *    inversion list's data by the caller.  It is illegal to call this
20225      *    function with this parameter set, but not <listsvp>
20226      *
20227      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
20228      * that, in spite of this function's name, the inversion list it returns
20229      * may include the bitmap data as well */
20230 
20231     SV *si  = NULL;         /* Input initialization string */
20232     SV* invlist = NULL;
20233 
20234     RXi_GET_DECL(prog, progi);
20235     const struct reg_data * const data = prog ? progi->data : NULL;
20236 
20237 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20238     PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20239 #else
20240     PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20241 #endif
20242     assert(! output_invlist || listsvp);
20243 
20244     if (data && data->count) {
20245 	const U32 n = ARG(node);
20246 
20247 	if (data->what[n] == 's') {
20248 	    SV * const rv = MUTABLE_SV(data->data[n]);
20249 	    AV * const av = MUTABLE_AV(SvRV(rv));
20250 	    SV **const ary = AvARRAY(av);
20251 
20252             invlist = ary[INVLIST_INDEX];
20253 
20254             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20255                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20256             }
20257 
20258             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20259                 si = ary[DEFERRED_USER_DEFINED_INDEX];
20260             }
20261 
20262 	    if (doinit && (si || invlist)) {
20263                 if (si) {
20264                     bool user_defined;
20265                     SV * msg = newSVpvs_flags("", SVs_TEMP);
20266 
20267                     SV * prop_definition = handle_user_defined_property(
20268                             "", 0, FALSE,   /* There is no \p{}, \P{} */
20269                             SvPVX_const(si)[1] - '0',   /* /i or not has been
20270                                                            stored here for just
20271                                                            this occasion */
20272                             TRUE,           /* run time */
20273                             FALSE,          /* This call must find the defn */
20274                             si,             /* The property definition  */
20275                             &user_defined,
20276                             msg,
20277                             0               /* base level call */
20278                            );
20279 
20280                     if (SvCUR(msg)) {
20281                         assert(prop_definition == NULL);
20282 
20283                         Perl_croak(aTHX_ "%" UTF8f,
20284                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20285                     }
20286 
20287                     if (invlist) {
20288                         _invlist_union(invlist, prop_definition, &invlist);
20289                         SvREFCNT_dec_NN(prop_definition);
20290                     }
20291                     else {
20292                         invlist = prop_definition;
20293                     }
20294 
20295                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20296                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20297 
20298                     ary[INVLIST_INDEX] = invlist;
20299                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20300                                  ? ONLY_LOCALE_MATCHES_INDEX
20301                                  : INVLIST_INDEX);
20302                     si = NULL;
20303                 }
20304 	    }
20305 	}
20306     }
20307 
20308     /* If requested, return a printable version of what this ANYOF node matches
20309      * */
20310     if (listsvp) {
20311 	SV* matches_string = NULL;
20312 
20313         /* This function can be called at compile-time, before everything gets
20314          * resolved, in which case we return the currently best available
20315          * information, which is the string that will eventually be used to do
20316          * that resolving, 'si' */
20317 	if (si) {
20318             /* Here, we only have 'si' (and possibly some passed-in data in
20319              * 'invlist', which is handled below)  If the caller only wants
20320              * 'si', use that.  */
20321             if (! output_invlist) {
20322                 matches_string = newSVsv(si);
20323             }
20324             else {
20325                 /* But if the caller wants an inversion list of the node, we
20326                  * need to parse 'si' and place as much as possible in the
20327                  * desired output inversion list, making 'matches_string' only
20328                  * contain the currently unresolvable things */
20329                 const char *si_string = SvPVX(si);
20330                 STRLEN remaining = SvCUR(si);
20331                 UV prev_cp = 0;
20332                 U8 count = 0;
20333 
20334                 /* Ignore everything before and including the first new-line */
20335                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20336                 assert (si_string != NULL);
20337                 si_string++;
20338                 remaining = SvPVX(si) + SvCUR(si) - si_string;
20339 
20340                 while (remaining > 0) {
20341 
20342                     /* The data consists of just strings defining user-defined
20343                      * property names, but in prior incarnations, and perhaps
20344                      * somehow from pluggable regex engines, it could still
20345                      * hold hex code point definitions, all of which should be
20346                      * legal (or it wouldn't have gotten this far).  Each
20347                      * component of a range would be separated by a tab, and
20348                      * each range by a new-line.  If these are found, instead
20349                      * add them to the inversion list */
20350                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
20351                                      |PERL_SCAN_SILENT_NON_PORTABLE;
20352                     STRLEN len = remaining;
20353                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20354 
20355                     /* If the hex decode routine found something, it should go
20356                      * up to the next \n */
20357                     if (   *(si_string + len) == '\n') {
20358                         if (count) {    /* 2nd code point on line */
20359                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20360                         }
20361                         else {
20362                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20363                         }
20364                         count = 0;
20365                         goto prepare_for_next_iteration;
20366                     }
20367 
20368                     /* If the hex decode was instead for the lower range limit,
20369                      * save it, and go parse the upper range limit */
20370                     if (*(si_string + len) == '\t') {
20371                         assert(count == 0);
20372 
20373                         prev_cp = cp;
20374                         count = 1;
20375                       prepare_for_next_iteration:
20376                         si_string += len + 1;
20377                         remaining -= len + 1;
20378                         continue;
20379                     }
20380 
20381                     /* Here, didn't find a legal hex number.  Just add the text
20382                      * from here up to the next \n, omitting any trailing
20383                      * markers. */
20384 
20385                     remaining -= len;
20386                     len = strcspn(si_string,
20387                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20388                     remaining -= len;
20389                     if (matches_string) {
20390                         sv_catpvn(matches_string, si_string, len);
20391                     }
20392                     else {
20393                         matches_string = newSVpvn(si_string, len);
20394                     }
20395                     sv_catpvs(matches_string, " ");
20396 
20397                     si_string += len;
20398                     if (   remaining
20399                         && UCHARAT(si_string)
20400                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20401                     {
20402                         si_string++;
20403                         remaining--;
20404                     }
20405                     if (remaining && UCHARAT(si_string) == '\n') {
20406                         si_string++;
20407                         remaining--;
20408                     }
20409                 } /* end of loop through the text */
20410 
20411                 assert(matches_string);
20412                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
20413                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20414                 }
20415             } /* end of has an 'si' */
20416 	}
20417 
20418         /* Add the stuff that's already known */
20419         if (invlist) {
20420 
20421             /* Again, if the caller doesn't want the output inversion list, put
20422              * everything in 'matches-string' */
20423             if (! output_invlist) {
20424                 if ( ! matches_string) {
20425                     matches_string = newSVpvs("\n");
20426                 }
20427                 sv_catsv(matches_string, invlist_contents(invlist,
20428                                                   TRUE /* traditional style */
20429                                                   ));
20430             }
20431             else if (! *output_invlist) {
20432                 *output_invlist = invlist_clone(invlist, NULL);
20433             }
20434             else {
20435                 _invlist_union(*output_invlist, invlist, output_invlist);
20436             }
20437         }
20438 
20439 	*listsvp = matches_string;
20440     }
20441 
20442     return invlist;
20443 }
20444 
20445 /* reg_skipcomment()
20446 
20447    Absorbs an /x style # comment from the input stream,
20448    returning a pointer to the first character beyond the comment, or if the
20449    comment terminates the pattern without anything following it, this returns
20450    one past the final character of the pattern (in other words, RExC_end) and
20451    sets the REG_RUN_ON_COMMENT_SEEN flag.
20452 
20453    Note it's the callers responsibility to ensure that we are
20454    actually in /x mode
20455 
20456 */
20457 
20458 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)20459 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20460 {
20461     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20462 
20463     assert(*p == '#');
20464 
20465     while (p < RExC_end) {
20466         if (*(++p) == '\n') {
20467             return p+1;
20468         }
20469     }
20470 
20471     /* we ran off the end of the pattern without ending the comment, so we have
20472      * to add an \n when wrapping */
20473     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20474     return p;
20475 }
20476 
20477 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)20478 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20479                                 char ** p,
20480                                 const bool force_to_xmod
20481                          )
20482 {
20483     /* If the text at the current parse position '*p' is a '(?#...)' comment,
20484      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20485      * is /x whitespace, advance '*p' so that on exit it points to the first
20486      * byte past all such white space and comments */
20487 
20488     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20489 
20490     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20491 
20492     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20493 
20494     for (;;) {
20495 	if (RExC_end - (*p) >= 3
20496 	    && *(*p)     == '('
20497 	    && *(*p + 1) == '?'
20498 	    && *(*p + 2) == '#')
20499 	{
20500 	    while (*(*p) != ')') {
20501 		if ((*p) == RExC_end)
20502 		    FAIL("Sequence (?#... not terminated");
20503 		(*p)++;
20504 	    }
20505 	    (*p)++;
20506 	    continue;
20507 	}
20508 
20509 	if (use_xmod) {
20510             const char * save_p = *p;
20511             while ((*p) < RExC_end) {
20512                 STRLEN len;
20513                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20514                     (*p) += len;
20515                 }
20516                 else if (*(*p) == '#') {
20517                     (*p) = reg_skipcomment(pRExC_state, (*p));
20518                 }
20519                 else {
20520                     break;
20521                 }
20522             }
20523             if (*p != save_p) {
20524                 continue;
20525             }
20526 	}
20527 
20528         break;
20529     }
20530 
20531     return;
20532 }
20533 
20534 /* nextchar()
20535 
20536    Advances the parse position by one byte, unless that byte is the beginning
20537    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20538    those two cases, the parse position is advanced beyond all such comments and
20539    white space.
20540 
20541    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20542 */
20543 
20544 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)20545 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20546 {
20547     PERL_ARGS_ASSERT_NEXTCHAR;
20548 
20549     if (RExC_parse < RExC_end) {
20550         assert(   ! UTF
20551                || UTF8_IS_INVARIANT(*RExC_parse)
20552                || UTF8_IS_START(*RExC_parse));
20553 
20554         RExC_parse += (UTF)
20555                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20556                       : 1;
20557 
20558         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20559                                 FALSE /* Don't force /x */ );
20560     }
20561 }
20562 
20563 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)20564 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20565 {
20566     /* 'size' is the delta number of smallest regnode equivalents to add or
20567      * subtract from the current memory allocated to the regex engine being
20568      * constructed. */
20569 
20570     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20571 
20572     RExC_size += size;
20573 
20574     Renewc(RExC_rxi,
20575            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20576                                                 /* +1 for REG_MAGIC */
20577            char,
20578            regexp_internal);
20579     if ( RExC_rxi == NULL )
20580 	FAIL("Regexp out of space");
20581     RXi_SET(RExC_rx, RExC_rxi);
20582 
20583     RExC_emit_start = RExC_rxi->program;
20584     if (size > 0) {
20585         Zero(REGNODE_p(RExC_emit), size, regnode);
20586     }
20587 
20588 #ifdef RE_TRACK_PATTERN_OFFSETS
20589     Renew(RExC_offsets, 2*RExC_size+1, U32);
20590     if (size > 0) {
20591         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20592     }
20593     RExC_offsets[0] = RExC_size;
20594 #endif
20595 }
20596 
20597 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size,const char * const name)20598 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20599 {
20600     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20601      * equivalents space.  It aligns and increments RExC_size
20602      *
20603      * It returns the regnode's offset into the regex engine program */
20604 
20605     const regnode_offset ret = RExC_emit;
20606 
20607     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20608 
20609     PERL_ARGS_ASSERT_REGNODE_GUTS;
20610 
20611     SIZE_ALIGN(RExC_size);
20612     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20613     NODE_ALIGN_FILL(REGNODE_p(ret));
20614 #ifndef RE_TRACK_PATTERN_OFFSETS
20615     PERL_UNUSED_ARG(name);
20616     PERL_UNUSED_ARG(op);
20617 #else
20618     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20619 
20620     if (RExC_offsets) {         /* MJD */
20621 	MJD_OFFSET_DEBUG(
20622               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20623               name, __LINE__,
20624               PL_reg_name[op],
20625               (UV)(RExC_emit) > RExC_offsets[0]
20626 		? "Overwriting end of array!\n" : "OK",
20627               (UV)(RExC_emit),
20628               (UV)(RExC_parse - RExC_start),
20629               (UV)RExC_offsets[0]));
20630 	Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20631     }
20632 #endif
20633     return(ret);
20634 }
20635 
20636 /*
20637 - reg_node - emit a node
20638 */
20639 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)20640 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20641 {
20642     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20643     regnode_offset ptr = ret;
20644 
20645     PERL_ARGS_ASSERT_REG_NODE;
20646 
20647     assert(regarglen[op] == 0);
20648 
20649     FILL_ADVANCE_NODE(ptr, op);
20650     RExC_emit = ptr;
20651     return(ret);
20652 }
20653 
20654 /*
20655 - reganode - emit a node with an argument
20656 */
20657 STATIC regnode_offset /* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)20658 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20659 {
20660     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20661     regnode_offset ptr = ret;
20662 
20663     PERL_ARGS_ASSERT_REGANODE;
20664 
20665     /* ANYOF are special cased to allow non-length 1 args */
20666     assert(regarglen[op] == 1);
20667 
20668     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20669     RExC_emit = ptr;
20670     return(ret);
20671 }
20672 
20673 /*
20674 - regpnode - emit a temporary node with a SV* argument
20675 */
20676 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)20677 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20678 {
20679     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20680     regnode_offset ptr = ret;
20681 
20682     PERL_ARGS_ASSERT_REGPNODE;
20683 
20684     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20685     RExC_emit = ptr;
20686     return(ret);
20687 }
20688 
20689 STATIC regnode_offset
S_reg2Lanode(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)20690 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20691 {
20692     /* emit a node with U32 and I32 arguments */
20693 
20694     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20695     regnode_offset ptr = ret;
20696 
20697     PERL_ARGS_ASSERT_REG2LANODE;
20698 
20699     assert(regarglen[op] == 2);
20700 
20701     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20702     RExC_emit = ptr;
20703     return(ret);
20704 }
20705 
20706 /*
20707 - reginsert - insert an operator in front of already-emitted operand
20708 *
20709 * That means that on exit 'operand' is the offset of the newly inserted
20710 * operator, and the original operand has been relocated.
20711 *
20712 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20713 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20714 *
20715 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20716 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20717 *
20718 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20719 */
20720 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)20721 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20722                   const regnode_offset operand, const U32 depth)
20723 {
20724     regnode *src;
20725     regnode *dst;
20726     regnode *place;
20727     const int offset = regarglen[(U8)op];
20728     const int size = NODE_STEP_REGNODE + offset;
20729     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20730 
20731     PERL_ARGS_ASSERT_REGINSERT;
20732     PERL_UNUSED_CONTEXT;
20733     PERL_UNUSED_ARG(depth);
20734 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20735     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20736     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20737                                     studying. If this is wrong then we need to adjust RExC_recurse
20738                                     below like we do with RExC_open_parens/RExC_close_parens. */
20739     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20740     src = REGNODE_p(RExC_emit);
20741     RExC_emit += size;
20742     dst = REGNODE_p(RExC_emit);
20743 
20744     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20745      * and [perl #133871] shows this can lead to problems, so skip this
20746      * realignment of parens until a later pass when they are reliable */
20747     if (! IN_PARENS_PASS && RExC_open_parens) {
20748         int paren;
20749         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20750         /* remember that RExC_npar is rex->nparens + 1,
20751          * iow it is 1 more than the number of parens seen in
20752          * the pattern so far. */
20753         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20754             /* note, RExC_open_parens[0] is the start of the
20755              * regex, it can't move. RExC_close_parens[0] is the end
20756              * of the regex, it *can* move. */
20757             if ( paren && RExC_open_parens[paren] >= operand ) {
20758                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20759                 RExC_open_parens[paren] += size;
20760             } else {
20761                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20762             }
20763             if ( RExC_close_parens[paren] >= operand ) {
20764                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20765                 RExC_close_parens[paren] += size;
20766             } else {
20767                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20768             }
20769         }
20770     }
20771     if (RExC_end_op)
20772         RExC_end_op += size;
20773 
20774     while (src > REGNODE_p(operand)) {
20775 	StructCopy(--src, --dst, regnode);
20776 #ifdef RE_TRACK_PATTERN_OFFSETS
20777         if (RExC_offsets) {     /* MJD 20010112 */
20778 	    MJD_OFFSET_DEBUG(
20779                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20780                   "reginsert",
20781 		  __LINE__,
20782 		  PL_reg_name[op],
20783                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20784 		    ? "Overwriting end of array!\n" : "OK",
20785                   (UV)REGNODE_OFFSET(src),
20786                   (UV)REGNODE_OFFSET(dst),
20787                   (UV)RExC_offsets[0]));
20788 	    Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20789 	    Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20790         }
20791 #endif
20792     }
20793 
20794     place = REGNODE_p(operand);	/* Op node, where operand used to be. */
20795 #ifdef RE_TRACK_PATTERN_OFFSETS
20796     if (RExC_offsets) {         /* MJD */
20797 	MJD_OFFSET_DEBUG(
20798               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20799               "reginsert",
20800 	      __LINE__,
20801 	      PL_reg_name[op],
20802               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20803               ? "Overwriting end of array!\n" : "OK",
20804               (UV)REGNODE_OFFSET(place),
20805               (UV)(RExC_parse - RExC_start),
20806               (UV)RExC_offsets[0]));
20807 	Set_Node_Offset(place, RExC_parse);
20808 	Set_Node_Length(place, 1);
20809     }
20810 #endif
20811     src = NEXTOPER(place);
20812     FLAGS(place) = 0;
20813     FILL_NODE(operand, op);
20814 
20815     /* Zero out any arguments in the new node */
20816     Zero(src, offset, regnode);
20817 }
20818 
20819 /*
20820 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20821             that value won't fit in the space available, instead returns FALSE.
20822             (Except asserts if we can't fit in the largest space the regex
20823             engine is designed for.)
20824 - SEE ALSO: regtail_study
20825 */
20826 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)20827 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20828                 const regnode_offset p,
20829                 const regnode_offset val,
20830                 const U32 depth)
20831 {
20832     regnode_offset scan;
20833     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20834 
20835     PERL_ARGS_ASSERT_REGTAIL;
20836 #ifndef DEBUGGING
20837     PERL_UNUSED_ARG(depth);
20838 #endif
20839 
20840     /* The final node in the chain is the first one with a nonzero next pointer
20841      * */
20842     scan = (regnode_offset) p;
20843     for (;;) {
20844 	regnode * const temp = regnext(REGNODE_p(scan));
20845         DEBUG_PARSE_r({
20846             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20847             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20848             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
20849                 SvPV_nolen_const(RExC_mysv), scan,
20850                     (temp == NULL ? "->" : ""),
20851                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20852             );
20853         });
20854         if (temp == NULL)
20855             break;
20856         scan = REGNODE_OFFSET(temp);
20857     }
20858 
20859     /* Populate this node's next pointer */
20860     assert(val >= scan);
20861     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20862         assert((UV) (val - scan) <= U32_MAX);
20863         ARG_SET(REGNODE_p(scan), val - scan);
20864     }
20865     else {
20866         if (val - scan > U16_MAX) {
20867             /* Populate this with something that won't loop and will likely
20868              * lead to a crash if the caller ignores the failure return, and
20869              * execution continues */
20870             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20871             return FALSE;
20872         }
20873         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20874     }
20875 
20876     return TRUE;
20877 }
20878 
20879 #ifdef DEBUGGING
20880 /*
20881 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20882 - Look for optimizable sequences at the same time.
20883 - currently only looks for EXACT chains.
20884 
20885 This is experimental code. The idea is to use this routine to perform
20886 in place optimizations on branches and groups as they are constructed,
20887 with the long term intention of removing optimization from study_chunk so
20888 that it is purely analytical.
20889 
20890 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20891 to control which is which.
20892 
20893 This used to return a value that was ignored.  It was a problem that it is
20894 #ifdef'd to be another function that didn't return a value.  khw has changed it
20895 so both currently return a pass/fail return.
20896 
20897 */
20898 /* TODO: All four parms should be const */
20899 
20900 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)20901 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20902                       const regnode_offset val, U32 depth)
20903 {
20904     regnode_offset scan;
20905     U8 exact = PSEUDO;
20906 #ifdef EXPERIMENTAL_INPLACESCAN
20907     I32 min = 0;
20908 #endif
20909     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20910 
20911     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20912 
20913 
20914     /* Find last node. */
20915 
20916     scan = p;
20917     for (;;) {
20918         regnode * const temp = regnext(REGNODE_p(scan));
20919 #ifdef EXPERIMENTAL_INPLACESCAN
20920         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20921 	    bool unfolded_multi_char;	/* Unexamined in this routine */
20922             if (join_exact(pRExC_state, scan, &min,
20923                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20924                 return TRUE; /* Was return EXACT */
20925 	}
20926 #endif
20927         if ( exact ) {
20928             if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20929                 if (exact == PSEUDO )
20930                     exact= OP(REGNODE_p(scan));
20931                 else if (exact != OP(REGNODE_p(scan)) )
20932                     exact= 0;
20933             }
20934             else if (OP(REGNODE_p(scan)) != NOTHING) {
20935                 exact= 0;
20936             }
20937         }
20938         DEBUG_PARSE_r({
20939             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20940             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20941             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
20942                 SvPV_nolen_const(RExC_mysv),
20943                 scan,
20944                 PL_reg_name[exact]);
20945         });
20946 	if (temp == NULL)
20947 	    break;
20948 	scan = REGNODE_OFFSET(temp);
20949     }
20950     DEBUG_PARSE_r({
20951         DEBUG_PARSE_MSG("");
20952         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20953         Perl_re_printf( aTHX_
20954                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20955 		      SvPV_nolen_const(RExC_mysv),
20956 		      (IV)val,
20957 		      (IV)(val - scan)
20958         );
20959     });
20960     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20961         assert((UV) (val - scan) <= U32_MAX);
20962 	ARG_SET(REGNODE_p(scan), val - scan);
20963     }
20964     else {
20965         if (val - scan > U16_MAX) {
20966             /* Populate this with something that won't loop and will likely
20967              * lead to a crash if the caller ignores the failure return, and
20968              * execution continues */
20969             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20970             return FALSE;
20971         }
20972 	NEXT_OFF(REGNODE_p(scan)) = val - scan;
20973     }
20974 
20975     return TRUE; /* Was 'return exact' */
20976 }
20977 #endif
20978 
20979 STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n)20980 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20981 
20982     /* Returns an inversion list of all the code points matched by the
20983      * ANYOFM/NANYOFM node 'n' */
20984 
20985     SV * cp_list = _new_invlist(-1);
20986     const U8 lowest = (U8) ARG(n);
20987     unsigned int i;
20988     U8 count = 0;
20989     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20990 
20991     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20992 
20993     /* Starting with the lowest code point, any code point that ANDed with the
20994      * mask yields the lowest code point is in the set */
20995     for (i = lowest; i <= 0xFF; i++) {
20996         if ((i & FLAGS(n)) == ARG(n)) {
20997             cp_list = add_cp_to_invlist(cp_list, i);
20998             count++;
20999 
21000             /* We know how many code points (a power of two) that are in the
21001              * set.  No use looking once we've got that number */
21002             if (count >= needed) break;
21003         }
21004     }
21005 
21006     if (OP(n) == NANYOFM) {
21007         _invlist_invert(cp_list);
21008     }
21009     return cp_list;
21010 }
21011 
21012 /*
21013  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
21014  */
21015 #ifdef DEBUGGING
21016 
21017 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)21018 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
21019 {
21020     int bit;
21021     int set=0;
21022 
21023     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21024 
21025     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
21026         if (flags & (1<<bit)) {
21027             if (!set++ && lead)
21028                 Perl_re_printf( aTHX_  "%s", lead);
21029             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
21030         }
21031     }
21032     if (lead)  {
21033         if (set)
21034             Perl_re_printf( aTHX_  "\n");
21035         else
21036             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
21037     }
21038 }
21039 
21040 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)21041 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
21042 {
21043     int bit;
21044     int set=0;
21045     regex_charset cs;
21046 
21047     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21048 
21049     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
21050         if (flags & (1<<bit)) {
21051 	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
21052 		continue;
21053 	    }
21054             if (!set++ && lead)
21055                 Perl_re_printf( aTHX_  "%s", lead);
21056             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
21057         }
21058     }
21059     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
21060             if (!set++ && lead) {
21061                 Perl_re_printf( aTHX_  "%s", lead);
21062             }
21063             switch (cs) {
21064                 case REGEX_UNICODE_CHARSET:
21065                     Perl_re_printf( aTHX_  "UNICODE");
21066                     break;
21067                 case REGEX_LOCALE_CHARSET:
21068                     Perl_re_printf( aTHX_  "LOCALE");
21069                     break;
21070                 case REGEX_ASCII_RESTRICTED_CHARSET:
21071                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
21072                     break;
21073                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
21074                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
21075                     break;
21076                 default:
21077                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
21078                     break;
21079             }
21080     }
21081     if (lead)  {
21082         if (set)
21083             Perl_re_printf( aTHX_  "\n");
21084         else
21085             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
21086     }
21087 }
21088 #endif
21089 
21090 void
Perl_regdump(pTHX_ const regexp * r)21091 Perl_regdump(pTHX_ const regexp *r)
21092 {
21093 #ifdef DEBUGGING
21094     int i;
21095     SV * const sv = sv_newmortal();
21096     SV *dsv= sv_newmortal();
21097     RXi_GET_DECL(r, ri);
21098     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21099 
21100     PERL_ARGS_ASSERT_REGDUMP;
21101 
21102     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
21103 
21104     /* Header fields of interest. */
21105     for (i = 0; i < 2; i++) {
21106         if (r->substrs->data[i].substr) {
21107             RE_PV_QUOTED_DECL(s, 0, dsv,
21108                             SvPVX_const(r->substrs->data[i].substr),
21109                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
21110                             PL_dump_re_max_len);
21111             Perl_re_printf( aTHX_
21112                           "%s %s%s at %" IVdf "..%" UVuf " ",
21113                           i ? "floating" : "anchored",
21114                           s,
21115                           RE_SV_TAIL(r->substrs->data[i].substr),
21116                           (IV)r->substrs->data[i].min_offset,
21117                           (UV)r->substrs->data[i].max_offset);
21118         }
21119         else if (r->substrs->data[i].utf8_substr) {
21120             RE_PV_QUOTED_DECL(s, 1, dsv,
21121                             SvPVX_const(r->substrs->data[i].utf8_substr),
21122                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
21123                             30);
21124             Perl_re_printf( aTHX_
21125                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
21126                           i ? "floating" : "anchored",
21127                           s,
21128                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
21129                           (IV)r->substrs->data[i].min_offset,
21130                           (UV)r->substrs->data[i].max_offset);
21131         }
21132     }
21133 
21134     if (r->check_substr || r->check_utf8)
21135         Perl_re_printf( aTHX_
21136 		      (const char *)
21137 		      (   r->check_substr == r->substrs->data[1].substr
21138 		       && r->check_utf8   == r->substrs->data[1].utf8_substr
21139 		       ? "(checking floating" : "(checking anchored"));
21140     if (r->intflags & PREGf_NOSCAN)
21141         Perl_re_printf( aTHX_  " noscan");
21142     if (r->extflags & RXf_CHECK_ALL)
21143         Perl_re_printf( aTHX_  " isall");
21144     if (r->check_substr || r->check_utf8)
21145         Perl_re_printf( aTHX_  ") ");
21146 
21147     if (ri->regstclass) {
21148         regprop(r, sv, ri->regstclass, NULL, NULL);
21149         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
21150     }
21151     if (r->intflags & PREGf_ANCH) {
21152         Perl_re_printf( aTHX_  "anchored");
21153         if (r->intflags & PREGf_ANCH_MBOL)
21154             Perl_re_printf( aTHX_  "(MBOL)");
21155         if (r->intflags & PREGf_ANCH_SBOL)
21156             Perl_re_printf( aTHX_  "(SBOL)");
21157         if (r->intflags & PREGf_ANCH_GPOS)
21158             Perl_re_printf( aTHX_  "(GPOS)");
21159         Perl_re_printf( aTHX_ " ");
21160     }
21161     if (r->intflags & PREGf_GPOS_SEEN)
21162         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
21163     if (r->intflags & PREGf_SKIP)
21164         Perl_re_printf( aTHX_  "plus ");
21165     if (r->intflags & PREGf_IMPLICIT)
21166         Perl_re_printf( aTHX_  "implicit ");
21167     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
21168     if (r->extflags & RXf_EVAL_SEEN)
21169         Perl_re_printf( aTHX_  "with eval ");
21170     Perl_re_printf( aTHX_  "\n");
21171     DEBUG_FLAGS_r({
21172         regdump_extflags("r->extflags: ", r->extflags);
21173         regdump_intflags("r->intflags: ", r->intflags);
21174     });
21175 #else
21176     PERL_ARGS_ASSERT_REGDUMP;
21177     PERL_UNUSED_CONTEXT;
21178     PERL_UNUSED_ARG(r);
21179 #endif	/* DEBUGGING */
21180 }
21181 
21182 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21183 #ifdef DEBUGGING
21184 
21185 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
21186      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
21187      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
21188      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
21189      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
21190      || _CC_VERTSPACE != 15
21191 #   error Need to adjust order of anyofs[]
21192 #  endif
21193 static const char * const anyofs[] = {
21194     "\\w",
21195     "\\W",
21196     "\\d",
21197     "\\D",
21198     "[:alpha:]",
21199     "[:^alpha:]",
21200     "[:lower:]",
21201     "[:^lower:]",
21202     "[:upper:]",
21203     "[:^upper:]",
21204     "[:punct:]",
21205     "[:^punct:]",
21206     "[:print:]",
21207     "[:^print:]",
21208     "[:alnum:]",
21209     "[:^alnum:]",
21210     "[:graph:]",
21211     "[:^graph:]",
21212     "[:cased:]",
21213     "[:^cased:]",
21214     "\\s",
21215     "\\S",
21216     "[:blank:]",
21217     "[:^blank:]",
21218     "[:xdigit:]",
21219     "[:^xdigit:]",
21220     "[:cntrl:]",
21221     "[:^cntrl:]",
21222     "[:ascii:]",
21223     "[:^ascii:]",
21224     "\\v",
21225     "\\V"
21226 };
21227 #endif
21228 
21229 /*
21230 - regprop - printable representation of opcode, with run time support
21231 */
21232 
21233 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)21234 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21235 {
21236 #ifdef DEBUGGING
21237     int k;
21238     RXi_GET_DECL(prog, progi);
21239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21240 
21241     PERL_ARGS_ASSERT_REGPROP;
21242 
21243     SvPVCLEAR(sv);
21244 
21245     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
21246         if (pRExC_state) {  /* This gives more info, if we have it */
21247             FAIL3("panic: corrupted regexp opcode %d > %d",
21248                   (int)OP(o), (int)REGNODE_MAX);
21249         }
21250         else {
21251             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21252                              (int)OP(o), (int)REGNODE_MAX);
21253         }
21254     }
21255     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21256 
21257     k = PL_regkind[OP(o)];
21258 
21259     if (k == EXACT) {
21260 	sv_catpvs(sv, " ");
21261 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21262 	 * is a crude hack but it may be the best for now since
21263 	 * we have no flag "this EXACTish node was UTF-8"
21264 	 * --jhi */
21265 	pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21266                   PL_colors[0], PL_colors[1],
21267 		  PERL_PV_ESCAPE_UNI_DETECT |
21268 		  PERL_PV_ESCAPE_NONASCII   |
21269 		  PERL_PV_PRETTY_ELLIPSES   |
21270 		  PERL_PV_PRETTY_LTGT       |
21271 		  PERL_PV_PRETTY_NOCLEAR
21272 		  );
21273     } else if (k == TRIE) {
21274 	/* print the details of the trie in dumpuntil instead, as
21275 	 * progi->data isn't available here */
21276         const char op = OP(o);
21277         const U32 n = ARG(o);
21278         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21279                (reg_ac_data *)progi->data->data[n] :
21280                NULL;
21281         const reg_trie_data * const trie
21282 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21283 
21284         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21285         DEBUG_TRIE_COMPILE_r({
21286           if (trie->jump)
21287             sv_catpvs(sv, "(JUMP)");
21288           Perl_sv_catpvf(aTHX_ sv,
21289             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21290             (UV)trie->startstate,
21291             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21292             (UV)trie->wordcount,
21293             (UV)trie->minlen,
21294             (UV)trie->maxlen,
21295             (UV)TRIE_CHARCOUNT(trie),
21296             (UV)trie->uniquecharcount
21297           );
21298         });
21299         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21300             sv_catpvs(sv, "[");
21301             (void) put_charclass_bitmap_innards(sv,
21302                                                 ((IS_ANYOF_TRIE(op))
21303                                                  ? ANYOF_BITMAP(o)
21304                                                  : TRIE_BITMAP(trie)),
21305                                                 NULL,
21306                                                 NULL,
21307                                                 NULL,
21308                                                 0,
21309                                                 FALSE
21310                                                );
21311             sv_catpvs(sv, "]");
21312         }
21313     } else if (k == CURLY) {
21314         U32 lo = ARG1(o), hi = ARG2(o);
21315 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21316 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21317         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21318         if (hi == REG_INFTY)
21319             sv_catpvs(sv, "INFTY");
21320         else
21321             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21322         sv_catpvs(sv, "}");
21323     }
21324     else if (k == WHILEM && o->flags)			/* Ordinal/of */
21325 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21326     else if (k == REF || k == OPEN || k == CLOSE
21327              || k == GROUPP || OP(o)==ACCEPT)
21328     {
21329         AV *name_list= NULL;
21330         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21331         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
21332 	if ( RXp_PAREN_NAMES(prog) ) {
21333             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21334         } else if ( pRExC_state ) {
21335             name_list= RExC_paren_name_list;
21336         }
21337         if (name_list) {
21338             if ( k != REF || (OP(o) < REFN)) {
21339                 SV **name= av_fetch(name_list, parno, 0 );
21340 	        if (name)
21341 	            Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21342             }
21343             else {
21344                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21345                 I32 *nums=(I32*)SvPVX(sv_dat);
21346                 SV **name= av_fetch(name_list, nums[0], 0 );
21347                 I32 n;
21348                 if (name) {
21349                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
21350                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21351                                     (n ? "," : ""), (IV)nums[n]);
21352                     }
21353                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21354                 }
21355             }
21356         }
21357         if ( k == REF && reginfo) {
21358             U32 n = ARG(o);  /* which paren pair */
21359             I32 ln = prog->offs[n].start;
21360             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21361                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21362             else if (ln == prog->offs[n].end)
21363                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21364             else {
21365                 const char *s = reginfo->strbeg + ln;
21366                 Perl_sv_catpvf(aTHX_ sv, ": ");
21367                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21368                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21369             }
21370         }
21371     } else if (k == GOSUB) {
21372         AV *name_list= NULL;
21373         if ( RXp_PAREN_NAMES(prog) ) {
21374             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21375         } else if ( pRExC_state ) {
21376             name_list= RExC_paren_name_list;
21377         }
21378 
21379         /* Paren and offset */
21380         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21381                 (int)((o + (int)ARG2L(o)) - progi->program) );
21382         if (name_list) {
21383             SV **name= av_fetch(name_list, ARG(o), 0 );
21384             if (name)
21385                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21386         }
21387     }
21388     else if (k == LOGICAL)
21389         /* 2: embedded, otherwise 1 */
21390 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21391     else if (k == ANYOF || k == ANYOFR) {
21392         U8 flags;
21393         char * bitmap;
21394         U32 arg;
21395         bool do_sep = FALSE;    /* Do we need to separate various components of
21396                                    the output? */
21397         /* Set if there is still an unresolved user-defined property */
21398         SV *unresolved                = NULL;
21399 
21400         /* Things that are ignored except when the runtime locale is UTF-8 */
21401         SV *only_utf8_locale_invlist = NULL;
21402 
21403         /* Code points that don't fit in the bitmap */
21404         SV *nonbitmap_invlist = NULL;
21405 
21406         /* And things that aren't in the bitmap, but are small enough to be */
21407         SV* bitmap_range_not_in_bitmap = NULL;
21408 
21409         bool inverted;
21410 
21411         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21412             flags = 0;
21413             bitmap = NULL;
21414             arg = 0;
21415         }
21416         else {
21417             flags = ANYOF_FLAGS(o);
21418             bitmap = ANYOF_BITMAP(o);
21419             arg = ARG(o);
21420         }
21421 
21422 	if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21423             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21424                 sv_catpvs(sv, "{utf8-locale-reqd}");
21425             }
21426             if (flags & ANYOFL_FOLD) {
21427                 sv_catpvs(sv, "{i}");
21428             }
21429         }
21430 
21431         inverted = flags & ANYOF_INVERT;
21432 
21433         /* If there is stuff outside the bitmap, get it */
21434         if (arg != ANYOF_ONLY_HAS_BITMAP) {
21435             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21436                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21437                                             ANYOFRbase(o),
21438                                             ANYOFRbase(o) + ANYOFRdelta(o));
21439             }
21440             else {
21441 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21442                 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21443                                                 &unresolved,
21444                                                 &only_utf8_locale_invlist,
21445                                                 &nonbitmap_invlist);
21446 #else
21447                 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21448                                                 &unresolved,
21449                                                 &only_utf8_locale_invlist,
21450                                                 &nonbitmap_invlist);
21451 #endif
21452             }
21453 
21454             /* The non-bitmap data may contain stuff that could fit in the
21455              * bitmap.  This could come from a user-defined property being
21456              * finally resolved when this call was done; or much more likely
21457              * because there are matches that require UTF-8 to be valid, and so
21458              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
21459             _invlist_intersection(nonbitmap_invlist,
21460                                   PL_InBitmap,
21461                                   &bitmap_range_not_in_bitmap);
21462             /* Leave just the things that don't fit into the bitmap */
21463             _invlist_subtract(nonbitmap_invlist,
21464                               PL_InBitmap,
21465                               &nonbitmap_invlist);
21466         }
21467 
21468         /* Obey this flag to add all above-the-bitmap code points */
21469         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21470             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21471                                                       NUM_ANYOF_CODE_POINTS,
21472                                                       UV_MAX);
21473         }
21474 
21475         /* Ready to start outputting.  First, the initial left bracket */
21476 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21477 
21478         /* ANYOFH by definition doesn't have anything that will fit inside the
21479          * bitmap;  ANYOFR may or may not. */
21480         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21481             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21482                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21483         {
21484             /* Then all the things that could fit in the bitmap */
21485             do_sep = put_charclass_bitmap_innards(sv,
21486                                                   bitmap,
21487                                                   bitmap_range_not_in_bitmap,
21488                                                   only_utf8_locale_invlist,
21489                                                   o,
21490                                                   flags,
21491 
21492                                                   /* Can't try inverting for a
21493                                                    * better display if there
21494                                                    * are things that haven't
21495                                                    * been resolved */
21496                                                   unresolved != NULL
21497                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
21498             SvREFCNT_dec(bitmap_range_not_in_bitmap);
21499 
21500             /* If there are user-defined properties which haven't been defined
21501              * yet, output them.  If the result is not to be inverted, it is
21502              * clearest to output them in a separate [] from the bitmap range
21503              * stuff.  If the result is to be complemented, we have to show
21504              * everything in one [], as the inversion applies to the whole
21505              * thing.  Use {braces} to separate them from anything in the
21506              * bitmap and anything above the bitmap. */
21507             if (unresolved) {
21508                 if (inverted) {
21509                     if (! do_sep) { /* If didn't output anything in the bitmap
21510                                      */
21511                         sv_catpvs(sv, "^");
21512                     }
21513                     sv_catpvs(sv, "{");
21514                 }
21515                 else if (do_sep) {
21516                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21517                                                       PL_colors[0]);
21518                 }
21519                 sv_catsv(sv, unresolved);
21520                 if (inverted) {
21521                     sv_catpvs(sv, "}");
21522                 }
21523                 do_sep = ! inverted;
21524             }
21525         }
21526 
21527         /* And, finally, add the above-the-bitmap stuff */
21528         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21529             SV* contents;
21530 
21531             /* See if truncation size is overridden */
21532             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21533                                     ? PL_dump_re_max_len
21534                                     : 256;
21535 
21536             /* This is output in a separate [] */
21537             if (do_sep) {
21538                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21539             }
21540 
21541             /* And, for easy of understanding, it is shown in the
21542              * uncomplemented form if possible.  The one exception being if
21543              * there are unresolved items, where the inversion has to be
21544              * delayed until runtime */
21545             if (inverted && ! unresolved) {
21546                 _invlist_invert(nonbitmap_invlist);
21547                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21548             }
21549 
21550             contents = invlist_contents(nonbitmap_invlist,
21551                                         FALSE /* output suitable for catsv */
21552                                        );
21553 
21554             /* If the output is shorter than the permissible maximum, just do it. */
21555             if (SvCUR(contents) <= dump_len) {
21556                 sv_catsv(sv, contents);
21557             }
21558             else {
21559                 const char * contents_string = SvPVX(contents);
21560                 STRLEN i = dump_len;
21561 
21562                 /* Otherwise, start at the permissible max and work back to the
21563                  * first break possibility */
21564                 while (i > 0 && contents_string[i] != ' ') {
21565                     i--;
21566                 }
21567                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21568                                        find a legal break */
21569                     i = dump_len;
21570                 }
21571 
21572                 sv_catpvn(sv, contents_string, i);
21573                 sv_catpvs(sv, "...");
21574             }
21575 
21576             SvREFCNT_dec_NN(contents);
21577             SvREFCNT_dec_NN(nonbitmap_invlist);
21578         }
21579 
21580         /* And finally the matching, closing ']' */
21581 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21582 
21583         if (OP(o) == ANYOFHs) {
21584             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21585         }
21586         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21587             U8 lowest = (OP(o) != ANYOFHr)
21588                          ? FLAGS(o)
21589                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21590             U8 highest = (OP(o) == ANYOFHr)
21591                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21592                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21593                            ? 0xFF
21594                            : lowest;
21595 #ifndef EBCDIC
21596             if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21597 #endif
21598             {
21599                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21600                 if (lowest != highest) {
21601                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21602                 }
21603                 Perl_sv_catpvf(aTHX_ sv, ")");
21604             }
21605         }
21606 
21607         SvREFCNT_dec(unresolved);
21608     }
21609     else if (k == ANYOFM) {
21610         SV * cp_list = get_ANYOFM_contents(o);
21611 
21612 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21613         if (OP(o) == NANYOFM) {
21614             _invlist_invert(cp_list);
21615         }
21616 
21617         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21618 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21619 
21620         SvREFCNT_dec(cp_list);
21621     }
21622     else if (k == POSIXD || k == NPOSIXD) {
21623         U8 index = FLAGS(o) * 2;
21624         if (index < C_ARRAY_LENGTH(anyofs)) {
21625             if (*anyofs[index] != '[')  {
21626                 sv_catpvs(sv, "[");
21627             }
21628             sv_catpv(sv, anyofs[index]);
21629             if (*anyofs[index] != '[')  {
21630                 sv_catpvs(sv, "]");
21631             }
21632         }
21633         else {
21634             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21635         }
21636     }
21637     else if (k == BOUND || k == NBOUND) {
21638         /* Must be synced with order of 'bound_type' in regcomp.h */
21639         const char * const bounds[] = {
21640             "",      /* Traditional */
21641             "{gcb}",
21642             "{lb}",
21643             "{sb}",
21644             "{wb}"
21645         };
21646         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21647         sv_catpv(sv, bounds[FLAGS(o)]);
21648     }
21649     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21650 	Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21651         if (o->next_off) {
21652             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21653         }
21654 	Perl_sv_catpvf(aTHX_ sv, "]");
21655     }
21656     else if (OP(o) == SBOL)
21657         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21658 
21659     /* add on the verb argument if there is one */
21660     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21661         if ( ARG(o) )
21662             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21663                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21664         else
21665             sv_catpvs(sv, ":NULL");
21666     }
21667 #else
21668     PERL_UNUSED_CONTEXT;
21669     PERL_UNUSED_ARG(sv);
21670     PERL_UNUSED_ARG(o);
21671     PERL_UNUSED_ARG(prog);
21672     PERL_UNUSED_ARG(reginfo);
21673     PERL_UNUSED_ARG(pRExC_state);
21674 #endif	/* DEBUGGING */
21675 }
21676 
21677 
21678 
21679 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)21680 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21681 {				/* Assume that RE_INTUIT is set */
21682     /* Returns an SV containing a string that must appear in the target for it
21683      * to match, or NULL if nothing is known that must match.
21684      *
21685      * CAUTION: the SV can be freed during execution of the regex engine */
21686 
21687     struct regexp *const prog = ReANY(r);
21688     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21689 
21690     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21691     PERL_UNUSED_CONTEXT;
21692 
21693     DEBUG_COMPILE_r(
21694 	{
21695             if (prog->maxlen > 0) {
21696                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21697 		      ? prog->check_utf8 : prog->check_substr);
21698 
21699                 if (!PL_colorset) reginitcolors();
21700                 Perl_re_printf( aTHX_
21701 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21702 		      PL_colors[4],
21703 		      RX_UTF8(r) ? "utf8 " : "",
21704 		      PL_colors[5], PL_colors[0],
21705 		      s,
21706 		      PL_colors[1],
21707 		      (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21708             }
21709 	} );
21710 
21711     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21712     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21713 }
21714 
21715 /*
21716    pregfree()
21717 
21718    handles refcounting and freeing the perl core regexp structure. When
21719    it is necessary to actually free the structure the first thing it
21720    does is call the 'free' method of the regexp_engine associated to
21721    the regexp, allowing the handling of the void *pprivate; member
21722    first. (This routine is not overridable by extensions, which is why
21723    the extensions free is called first.)
21724 
21725    See regdupe and regdupe_internal if you change anything here.
21726 */
21727 #ifndef PERL_IN_XSUB_RE
21728 void
Perl_pregfree(pTHX_ REGEXP * r)21729 Perl_pregfree(pTHX_ REGEXP *r)
21730 {
21731     SvREFCNT_dec(r);
21732 }
21733 
21734 void
Perl_pregfree2(pTHX_ REGEXP * rx)21735 Perl_pregfree2(pTHX_ REGEXP *rx)
21736 {
21737     struct regexp *const r = ReANY(rx);
21738     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21739 
21740     PERL_ARGS_ASSERT_PREGFREE2;
21741 
21742     if (! r)
21743         return;
21744 
21745     if (r->mother_re) {
21746         ReREFCNT_dec(r->mother_re);
21747     } else {
21748         CALLREGFREE_PVT(rx); /* free the private data */
21749         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21750     }
21751     if (r->substrs) {
21752         int i;
21753         for (i = 0; i < 2; i++) {
21754             SvREFCNT_dec(r->substrs->data[i].substr);
21755             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21756         }
21757 	Safefree(r->substrs);
21758     }
21759     RX_MATCH_COPY_FREE(rx);
21760 #ifdef PERL_ANY_COW
21761     SvREFCNT_dec(r->saved_copy);
21762 #endif
21763     Safefree(r->offs);
21764     SvREFCNT_dec(r->qr_anoncv);
21765     if (r->recurse_locinput)
21766         Safefree(r->recurse_locinput);
21767 }
21768 
21769 
21770 /*  reg_temp_copy()
21771 
21772     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21773     except that dsv will be created if NULL.
21774 
21775     This function is used in two main ways. First to implement
21776         $r = qr/....; $s = $$r;
21777 
21778     Secondly, it is used as a hacky workaround to the structural issue of
21779     match results
21780     being stored in the regexp structure which is in turn stored in
21781     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21782     could be PL_curpm in multiple contexts, and could require multiple
21783     result sets being associated with the pattern simultaneously, such
21784     as when doing a recursive match with (??{$qr})
21785 
21786     The solution is to make a lightweight copy of the regexp structure
21787     when a qr// is returned from the code executed by (??{$qr}) this
21788     lightweight copy doesn't actually own any of its data except for
21789     the starp/end and the actual regexp structure itself.
21790 
21791 */
21792 
21793 
21794 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)21795 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21796 {
21797     struct regexp *drx;
21798     struct regexp *const srx = ReANY(ssv);
21799     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21800 
21801     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21802 
21803     if (!dsv)
21804 	dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21805     else {
21806         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21807 
21808         /* our only valid caller, sv_setsv_flags(), should have done
21809          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21810         assert(!SvOOK(dsv));
21811         assert(!SvIsCOW(dsv));
21812         assert(!SvROK(dsv));
21813 
21814         if (SvPVX_const(dsv)) {
21815             if (SvLEN(dsv))
21816                 Safefree(SvPVX(dsv));
21817             SvPVX(dsv) = NULL;
21818         }
21819         SvLEN_set(dsv, 0);
21820         SvCUR_set(dsv, 0);
21821 	SvOK_off((SV *)dsv);
21822 
21823 	if (islv) {
21824 	    /* For PVLVs, the head (sv_any) points to an XPVLV, while
21825              * the LV's xpvlenu_rx will point to a regexp body, which
21826              * we allocate here */
21827 	    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21828 	    assert(!SvPVX(dsv));
21829             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21830 	    temp->sv_any = NULL;
21831 	    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21832 	    SvREFCNT_dec_NN(temp);
21833 	    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21834 	       ing below will not set it. */
21835 	    SvCUR_set(dsv, SvCUR(ssv));
21836 	}
21837     }
21838     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21839        sv_force_normal(sv) is called.  */
21840     SvFAKE_on(dsv);
21841     drx = ReANY(dsv);
21842 
21843     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21844     SvPV_set(dsv, RX_WRAPPED(ssv));
21845     /* We share the same string buffer as the original regexp, on which we
21846        hold a reference count, incremented when mother_re is set below.
21847        The string pointer is copied here, being part of the regexp struct.
21848      */
21849     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21850 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21851     if (!islv)
21852         SvLEN_set(dsv, 0);
21853     if (srx->offs) {
21854         const I32 npar = srx->nparens+1;
21855         Newx(drx->offs, npar, regexp_paren_pair);
21856         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21857     }
21858     if (srx->substrs) {
21859         int i;
21860         Newx(drx->substrs, 1, struct reg_substr_data);
21861 	StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21862 
21863         for (i = 0; i < 2; i++) {
21864             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21865             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21866         }
21867 
21868 	/* check_substr and check_utf8, if non-NULL, point to either their
21869 	   anchored or float namesakes, and don't hold a second reference.  */
21870     }
21871     RX_MATCH_COPIED_off(dsv);
21872 #ifdef PERL_ANY_COW
21873     drx->saved_copy = NULL;
21874 #endif
21875     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21876     SvREFCNT_inc_void(drx->qr_anoncv);
21877     if (srx->recurse_locinput)
21878         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21879 
21880     return dsv;
21881 }
21882 #endif
21883 
21884 
21885 /* regfree_internal()
21886 
21887    Free the private data in a regexp. This is overloadable by
21888    extensions. Perl takes care of the regexp structure in pregfree(),
21889    this covers the *pprivate pointer which technically perl doesn't
21890    know about, however of course we have to handle the
21891    regexp_internal structure when no extension is in use.
21892 
21893    Note this is called before freeing anything in the regexp
21894    structure.
21895  */
21896 
21897 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)21898 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21899 {
21900     struct regexp *const r = ReANY(rx);
21901     RXi_GET_DECL(r, ri);
21902     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21903 
21904     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21905 
21906     if (! ri) {
21907         return;
21908     }
21909 
21910     DEBUG_COMPILE_r({
21911 	if (!PL_colorset)
21912 	    reginitcolors();
21913 	{
21914 	    SV *dsv= sv_newmortal();
21915             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21916                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21917             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21918                 PL_colors[4], PL_colors[5], s);
21919         }
21920     });
21921 
21922 #ifdef RE_TRACK_PATTERN_OFFSETS
21923     if (ri->u.offsets)
21924         Safefree(ri->u.offsets);             /* 20010421 MJD */
21925 #endif
21926     if (ri->code_blocks)
21927         S_free_codeblocks(aTHX_ ri->code_blocks);
21928 
21929     if (ri->data) {
21930 	int n = ri->data->count;
21931 
21932 	while (--n >= 0) {
21933           /* If you add a ->what type here, update the comment in regcomp.h */
21934 	    switch (ri->data->what[n]) {
21935 	    case 'a':
21936 	    case 'r':
21937 	    case 's':
21938 	    case 'S':
21939 	    case 'u':
21940 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21941 		break;
21942 	    case 'f':
21943 		Safefree(ri->data->data[n]);
21944 		break;
21945 	    case 'l':
21946 	    case 'L':
21947 	        break;
21948             case 'T':
21949                 { /* Aho Corasick add-on structure for a trie node.
21950                      Used in stclass optimization only */
21951                     U32 refcount;
21952                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21953                     OP_REFCNT_LOCK;
21954                     refcount = --aho->refcount;
21955                     OP_REFCNT_UNLOCK;
21956                     if ( !refcount ) {
21957                         PerlMemShared_free(aho->states);
21958                         PerlMemShared_free(aho->fail);
21959 			 /* do this last!!!! */
21960                         PerlMemShared_free(ri->data->data[n]);
21961                         /* we should only ever get called once, so
21962                          * assert as much, and also guard the free
21963                          * which /might/ happen twice. At the least
21964                          * it will make code anlyzers happy and it
21965                          * doesn't cost much. - Yves */
21966                         assert(ri->regstclass);
21967                         if (ri->regstclass) {
21968                             PerlMemShared_free(ri->regstclass);
21969                             ri->regstclass = 0;
21970                         }
21971                     }
21972                 }
21973                 break;
21974 	    case 't':
21975 	        {
21976 	            /* trie structure. */
21977 	            U32 refcount;
21978 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21979                     OP_REFCNT_LOCK;
21980                     refcount = --trie->refcount;
21981                     OP_REFCNT_UNLOCK;
21982                     if ( !refcount ) {
21983                         PerlMemShared_free(trie->charmap);
21984                         PerlMemShared_free(trie->states);
21985                         PerlMemShared_free(trie->trans);
21986                         if (trie->bitmap)
21987                             PerlMemShared_free(trie->bitmap);
21988                         if (trie->jump)
21989                             PerlMemShared_free(trie->jump);
21990 			PerlMemShared_free(trie->wordinfo);
21991                         /* do this last!!!! */
21992                         PerlMemShared_free(ri->data->data[n]);
21993 		    }
21994 		}
21995 		break;
21996 	    default:
21997 		Perl_croak(aTHX_ "panic: regfree data code '%c'",
21998                                                     ri->data->what[n]);
21999 	    }
22000 	}
22001 	Safefree(ri->data->what);
22002 	Safefree(ri->data);
22003     }
22004 
22005     Safefree(ri);
22006 }
22007 
22008 #define av_dup_inc(s, t)	MUTABLE_AV(sv_dup_inc((const SV *)s, t))
22009 #define hv_dup_inc(s, t)	MUTABLE_HV(sv_dup_inc((const SV *)s, t))
22010 #define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
22011 
22012 /*
22013 =for apidoc re_dup_guts
22014 Duplicate a regexp.
22015 
22016 This routine is expected to clone a given regexp structure. It is only
22017 compiled under USE_ITHREADS.
22018 
22019 After all of the core data stored in struct regexp is duplicated
22020 the C<regexp_engine.dupe> method is used to copy any private data
22021 stored in the *pprivate pointer. This allows extensions to handle
22022 any duplication they need to do.
22023 
22024 =cut
22025 
22026    See pregfree() and regfree_internal() if you change anything here.
22027 */
22028 #if defined(USE_ITHREADS)
22029 #ifndef PERL_IN_XSUB_RE
22030 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)22031 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
22032 {
22033     I32 npar;
22034     const struct regexp *r = ReANY(sstr);
22035     struct regexp *ret = ReANY(dstr);
22036 
22037     PERL_ARGS_ASSERT_RE_DUP_GUTS;
22038 
22039     npar = r->nparens+1;
22040     Newx(ret->offs, npar, regexp_paren_pair);
22041     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
22042 
22043     if (ret->substrs) {
22044 	/* Do it this way to avoid reading from *r after the StructCopy().
22045 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
22046 	   cache, it doesn't matter.  */
22047         int i;
22048 	const bool anchored = r->check_substr
22049 	    ? r->check_substr == r->substrs->data[0].substr
22050 	    : r->check_utf8   == r->substrs->data[0].utf8_substr;
22051         Newx(ret->substrs, 1, struct reg_substr_data);
22052 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
22053 
22054         for (i = 0; i < 2; i++) {
22055             ret->substrs->data[i].substr =
22056                         sv_dup_inc(ret->substrs->data[i].substr, param);
22057             ret->substrs->data[i].utf8_substr =
22058                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
22059         }
22060 
22061 	/* check_substr and check_utf8, if non-NULL, point to either their
22062 	   anchored or float namesakes, and don't hold a second reference.  */
22063 
22064 	if (ret->check_substr) {
22065 	    if (anchored) {
22066 		assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
22067 
22068 		ret->check_substr = ret->substrs->data[0].substr;
22069 		ret->check_utf8   = ret->substrs->data[0].utf8_substr;
22070 	    } else {
22071 		assert(r->check_substr == r->substrs->data[1].substr);
22072 		assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
22073 
22074 		ret->check_substr = ret->substrs->data[1].substr;
22075 		ret->check_utf8   = ret->substrs->data[1].utf8_substr;
22076 	    }
22077 	} else if (ret->check_utf8) {
22078 	    if (anchored) {
22079 		ret->check_utf8 = ret->substrs->data[0].utf8_substr;
22080 	    } else {
22081 		ret->check_utf8 = ret->substrs->data[1].utf8_substr;
22082 	    }
22083 	}
22084     }
22085 
22086     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
22087     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
22088     if (r->recurse_locinput)
22089         Newx(ret->recurse_locinput, r->nparens + 1, char *);
22090 
22091     if (ret->pprivate)
22092 	RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
22093 
22094     if (RX_MATCH_COPIED(dstr))
22095 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
22096     else
22097 	ret->subbeg = NULL;
22098 #ifdef PERL_ANY_COW
22099     ret->saved_copy = NULL;
22100 #endif
22101 
22102     /* Whether mother_re be set or no, we need to copy the string.  We
22103        cannot refrain from copying it when the storage points directly to
22104        our mother regexp, because that's
22105 	       1: a buffer in a different thread
22106 	       2: something we no longer hold a reference on
22107 	       so we need to copy it locally.  */
22108     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
22109     /* set malloced length to a non-zero value so it will be freed
22110      * (otherwise in combination with SVf_FAKE it looks like an alien
22111      * buffer). It doesn't have to be the actual malloced size, since it
22112      * should never be grown */
22113     SvLEN_set(dstr, SvCUR(sstr)+1);
22114     ret->mother_re   = NULL;
22115 }
22116 #endif /* PERL_IN_XSUB_RE */
22117 
22118 /*
22119    regdupe_internal()
22120 
22121    This is the internal complement to regdupe() which is used to copy
22122    the structure pointed to by the *pprivate pointer in the regexp.
22123    This is the core version of the extension overridable cloning hook.
22124    The regexp structure being duplicated will be copied by perl prior
22125    to this and will be provided as the regexp *r argument, however
22126    with the /old/ structures pprivate pointer value. Thus this routine
22127    may override any copying normally done by perl.
22128 
22129    It returns a pointer to the new regexp_internal structure.
22130 */
22131 
22132 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)22133 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
22134 {
22135     struct regexp *const r = ReANY(rx);
22136     regexp_internal *reti;
22137     int len;
22138     RXi_GET_DECL(r, ri);
22139 
22140     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
22141 
22142     len = ProgLen(ri);
22143 
22144     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22145           char, regexp_internal);
22146     Copy(ri->program, reti->program, len+1, regnode);
22147 
22148 
22149     if (ri->code_blocks) {
22150 	int n;
22151 	Newx(reti->code_blocks, 1, struct reg_code_blocks);
22152 	Newx(reti->code_blocks->cb, ri->code_blocks->count,
22153                     struct reg_code_block);
22154 	Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22155              ri->code_blocks->count, struct reg_code_block);
22156 	for (n = 0; n < ri->code_blocks->count; n++)
22157 	     reti->code_blocks->cb[n].src_regex = (REGEXP*)
22158 		    sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22159         reti->code_blocks->count = ri->code_blocks->count;
22160         reti->code_blocks->refcnt = 1;
22161     }
22162     else
22163 	reti->code_blocks = NULL;
22164 
22165     reti->regstclass = NULL;
22166 
22167     if (ri->data) {
22168 	struct reg_data *d;
22169         const int count = ri->data->count;
22170 	int i;
22171 
22172 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22173 		char, struct reg_data);
22174 	Newx(d->what, count, U8);
22175 
22176 	d->count = count;
22177 	for (i = 0; i < count; i++) {
22178 	    d->what[i] = ri->data->what[i];
22179 	    switch (d->what[i]) {
22180 	        /* see also regcomp.h and regfree_internal() */
22181             case 'a': /* actually an AV, but the dup function is identical.
22182                          values seem to be "plain sv's" generally. */
22183             case 'r': /* a compiled regex (but still just another SV) */
22184             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22185                          this use case should go away, the code could have used
22186                          'a' instead - see S_set_ANYOF_arg() for array contents. */
22187             case 'S': /* actually an SV, but the dup function is identical.  */
22188             case 'u': /* actually an HV, but the dup function is identical.
22189                          values are "plain sv's" */
22190 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22191 		break;
22192 	    case 'f':
22193                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22194                  * patterns which could start with several different things. Pre-TRIE
22195                  * this was more important than it is now, however this still helps
22196                  * in some places, for instance /x?a+/ might produce a SSC equivalent
22197                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22198                  * in regexec.c
22199                  */
22200 		/* This is cheating. */
22201 		Newx(d->data[i], 1, regnode_ssc);
22202 		StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22203 		reti->regstclass = (regnode*)d->data[i];
22204 		break;
22205 	    case 'T':
22206                 /* AHO-CORASICK fail table */
22207                 /* Trie stclasses are readonly and can thus be shared
22208 		 * without duplication. We free the stclass in pregfree
22209 		 * when the corresponding reg_ac_data struct is freed.
22210 		 */
22211 		reti->regstclass= ri->regstclass;
22212 		/* FALLTHROUGH */
22213 	    case 't':
22214                 /* TRIE transition table */
22215 		OP_REFCNT_LOCK;
22216 		((reg_trie_data*)ri->data->data[i])->refcount++;
22217 		OP_REFCNT_UNLOCK;
22218 		/* FALLTHROUGH */
22219             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22220             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22221                          is not from another regexp */
22222 		d->data[i] = ri->data->data[i];
22223 		break;
22224             default:
22225                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22226                                                            ri->data->what[i]);
22227 	    }
22228 	}
22229 
22230 	reti->data = d;
22231     }
22232     else
22233 	reti->data = NULL;
22234 
22235     reti->name_list_idx = ri->name_list_idx;
22236 
22237 #ifdef RE_TRACK_PATTERN_OFFSETS
22238     if (ri->u.offsets) {
22239         Newx(reti->u.offsets, 2*len+1, U32);
22240         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22241     }
22242 #else
22243     SetProgLen(reti, len);
22244 #endif
22245 
22246     return (void*)reti;
22247 }
22248 
22249 #endif    /* USE_ITHREADS */
22250 
22251 #ifndef PERL_IN_XSUB_RE
22252 
22253 /*
22254  - regnext - dig the "next" pointer out of a node
22255  */
22256 regnode *
Perl_regnext(pTHX_ regnode * p)22257 Perl_regnext(pTHX_ regnode *p)
22258 {
22259     I32 offset;
22260 
22261     if (!p)
22262 	return(NULL);
22263 
22264     if (OP(p) > REGNODE_MAX) {		/* regnode.type is unsigned */
22265 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22266                                                 (int)OP(p), (int)REGNODE_MAX);
22267     }
22268 
22269     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22270     if (offset == 0)
22271 	return(NULL);
22272 
22273     return(p+offset);
22274 }
22275 
22276 #endif
22277 
22278 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)22279 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22280 {
22281     va_list args;
22282     STRLEN len = strlen(pat);
22283     char buf[512];
22284     SV *msv;
22285     const char *message;
22286 
22287     PERL_ARGS_ASSERT_RE_CROAK;
22288 
22289     if (len > 510)
22290 	len = 510;
22291     Copy(pat, buf, len , char);
22292     buf[len] = '\n';
22293     buf[len + 1] = '\0';
22294     va_start(args, pat);
22295     msv = vmess(buf, &args);
22296     va_end(args);
22297     message = SvPV_const(msv, len);
22298     if (len > 512)
22299 	len = 512;
22300     Copy(message, buf, len , char);
22301     /* len-1 to avoid \n */
22302     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22303 }
22304 
22305 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
22306 
22307 #ifndef PERL_IN_XSUB_RE
22308 void
Perl_save_re_context(pTHX)22309 Perl_save_re_context(pTHX)
22310 {
22311     I32 nparens = -1;
22312     I32 i;
22313 
22314     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22315 
22316     if (PL_curpm) {
22317 	const REGEXP * const rx = PM_GETRE(PL_curpm);
22318 	if (rx)
22319             nparens = RX_NPARENS(rx);
22320     }
22321 
22322     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22323      * that PL_curpm will be null, but that utf8.pm and the modules it
22324      * loads will only use $1..$3.
22325      * The t/porting/re_context.t test file checks this assumption.
22326      */
22327     if (nparens == -1)
22328         nparens = 3;
22329 
22330     for (i = 1; i <= nparens; i++) {
22331         char digits[TYPE_CHARS(long)];
22332         const STRLEN len = my_snprintf(digits, sizeof(digits),
22333                                        "%lu", (long)i);
22334         GV *const *const gvp
22335             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22336 
22337         if (gvp) {
22338             GV * const gv = *gvp;
22339             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22340                 save_scalar(gv);
22341         }
22342     }
22343 }
22344 #endif
22345 
22346 #ifdef DEBUGGING
22347 
22348 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)22349 S_put_code_point(pTHX_ SV *sv, UV c)
22350 {
22351     PERL_ARGS_ASSERT_PUT_CODE_POINT;
22352 
22353     if (c > 255) {
22354         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22355     }
22356     else if (isPRINT(c)) {
22357 	const char string = (char) c;
22358 
22359         /* We use {phrase} as metanotation in the class, so also escape literal
22360          * braces */
22361 	if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22362 	    sv_catpvs(sv, "\\");
22363 	sv_catpvn(sv, &string, 1);
22364     }
22365     else if (isMNEMONIC_CNTRL(c)) {
22366         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22367     }
22368     else {
22369         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22370     }
22371 }
22372 
22373 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)22374 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22375 {
22376     /* Appends to 'sv' a displayable version of the range of code points from
22377      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
22378      * that have them, when they occur at the beginning or end of the range.
22379      * It uses hex to output the remaining code points, unless 'allow_literals'
22380      * is true, in which case the printable ASCII ones are output as-is (though
22381      * some of these will be escaped by put_code_point()).
22382      *
22383      * NOTE:  This is designed only for printing ranges of code points that fit
22384      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
22385      */
22386 
22387     const unsigned int min_range_count = 3;
22388 
22389     assert(start <= end);
22390 
22391     PERL_ARGS_ASSERT_PUT_RANGE;
22392 
22393     while (start <= end) {
22394         UV this_end;
22395         const char * format;
22396 
22397         if (    end - start < min_range_count
22398             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22399         {
22400             /* Output a range of 1 or 2 chars individually, or longer ranges
22401              * when printable */
22402             for (; start <= end; start++) {
22403                 put_code_point(sv, start);
22404             }
22405             break;
22406         }
22407 
22408         /* If permitted by the input options, and there is a possibility that
22409          * this range contains a printable literal, look to see if there is
22410          * one. */
22411         if (allow_literals && start <= MAX_PRINT_A) {
22412 
22413             /* If the character at the beginning of the range isn't an ASCII
22414              * printable, effectively split the range into two parts:
22415              *  1) the portion before the first such printable,
22416              *  2) the rest
22417              * and output them separately. */
22418             if (! isPRINT_A(start)) {
22419                 UV temp_end = start + 1;
22420 
22421                 /* There is no point looking beyond the final possible
22422                  * printable, in MAX_PRINT_A */
22423                 UV max = MIN(end, MAX_PRINT_A);
22424 
22425                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22426                     temp_end++;
22427                 }
22428 
22429                 /* Here, temp_end points to one beyond the first printable if
22430                  * found, or to one beyond 'max' if not.  If none found, make
22431                  * sure that we use the entire range */
22432                 if (temp_end > MAX_PRINT_A) {
22433                     temp_end = end + 1;
22434                 }
22435 
22436                 /* Output the first part of the split range: the part that
22437                  * doesn't have printables, with the parameter set to not look
22438                  * for literals (otherwise we would infinitely recurse) */
22439                 put_range(sv, start, temp_end - 1, FALSE);
22440 
22441                 /* The 2nd part of the range (if any) starts here. */
22442                 start = temp_end;
22443 
22444                 /* We do a continue, instead of dropping down, because even if
22445                  * the 2nd part is non-empty, it could be so short that we want
22446                  * to output it as individual characters, as tested for at the
22447                  * top of this loop.  */
22448                 continue;
22449             }
22450 
22451             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
22452              * output a sub-range of just the digits or letters, then process
22453              * the remaining portion as usual. */
22454             if (isALPHANUMERIC_A(start)) {
22455                 UV mask = (isDIGIT_A(start))
22456                            ? _CC_DIGIT
22457                              : isUPPER_A(start)
22458                                ? _CC_UPPER
22459                                : _CC_LOWER;
22460                 UV temp_end = start + 1;
22461 
22462                 /* Find the end of the sub-range that includes just the
22463                  * characters in the same class as the first character in it */
22464                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22465                     temp_end++;
22466                 }
22467                 temp_end--;
22468 
22469                 /* For short ranges, don't duplicate the code above to output
22470                  * them; just call recursively */
22471                 if (temp_end - start < min_range_count) {
22472                     put_range(sv, start, temp_end, FALSE);
22473                 }
22474                 else {  /* Output as a range */
22475                     put_code_point(sv, start);
22476                     sv_catpvs(sv, "-");
22477                     put_code_point(sv, temp_end);
22478                 }
22479                 start = temp_end + 1;
22480                 continue;
22481             }
22482 
22483             /* We output any other printables as individual characters */
22484             if (isPUNCT_A(start) || isSPACE_A(start)) {
22485                 while (start <= end && (isPUNCT_A(start)
22486                                         || isSPACE_A(start)))
22487                 {
22488                     put_code_point(sv, start);
22489                     start++;
22490                 }
22491                 continue;
22492             }
22493         } /* End of looking for literals */
22494 
22495         /* Here is not to output as a literal.  Some control characters have
22496          * mnemonic names.  Split off any of those at the beginning and end of
22497          * the range to print mnemonically.  It isn't possible for many of
22498          * these to be in a row, so this won't overwhelm with output */
22499         if (   start <= end
22500             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22501         {
22502             while (isMNEMONIC_CNTRL(start) && start <= end) {
22503                 put_code_point(sv, start);
22504                 start++;
22505             }
22506 
22507             /* If this didn't take care of the whole range ... */
22508             if (start <= end) {
22509 
22510                 /* Look backwards from the end to find the final non-mnemonic
22511                  * */
22512                 UV temp_end = end;
22513                 while (isMNEMONIC_CNTRL(temp_end)) {
22514                     temp_end--;
22515                 }
22516 
22517                 /* And separately output the interior range that doesn't start
22518                  * or end with mnemonics */
22519                 put_range(sv, start, temp_end, FALSE);
22520 
22521                 /* Then output the mnemonic trailing controls */
22522                 start = temp_end + 1;
22523                 while (start <= end) {
22524                     put_code_point(sv, start);
22525                     start++;
22526                 }
22527                 break;
22528             }
22529         }
22530 
22531         /* As a final resort, output the range or subrange as hex. */
22532 
22533         if (start >= NUM_ANYOF_CODE_POINTS) {
22534             this_end = end;
22535         }
22536         else {  /* Have to split range at the bitmap boundary */
22537             this_end = (end < NUM_ANYOF_CODE_POINTS)
22538                         ? end
22539                         : NUM_ANYOF_CODE_POINTS - 1;
22540         }
22541 #if NUM_ANYOF_CODE_POINTS > 256
22542         format = (this_end < 256)
22543                  ? "\\x%02" UVXf "-\\x%02" UVXf
22544                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22545 #else
22546         format = "\\x%02" UVXf "-\\x%02" UVXf;
22547 #endif
22548         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22549         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22550         GCC_DIAG_RESTORE_STMT;
22551         break;
22552     }
22553 }
22554 
22555 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)22556 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22557 {
22558     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22559      * 'invlist' */
22560 
22561     UV start, end;
22562     bool allow_literals = TRUE;
22563 
22564     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22565 
22566     /* Generally, it is more readable if printable characters are output as
22567      * literals, but if a range (nearly) spans all of them, it's best to output
22568      * it as a single range.  This code will use a single range if all but 2
22569      * ASCII printables are in it */
22570     invlist_iterinit(invlist);
22571     while (invlist_iternext(invlist, &start, &end)) {
22572 
22573         /* If the range starts beyond the final printable, it doesn't have any
22574          * in it */
22575         if (start > MAX_PRINT_A) {
22576             break;
22577         }
22578 
22579         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22580          * all but two, the range must start and end no later than 2 from
22581          * either end */
22582         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22583             if (end > MAX_PRINT_A) {
22584                 end = MAX_PRINT_A;
22585             }
22586             if (start < ' ') {
22587                 start = ' ';
22588             }
22589             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22590                 allow_literals = FALSE;
22591             }
22592             break;
22593         }
22594     }
22595     invlist_iterfinish(invlist);
22596 
22597     /* Here we have figured things out.  Output each range */
22598     invlist_iterinit(invlist);
22599     while (invlist_iternext(invlist, &start, &end)) {
22600         if (start >= NUM_ANYOF_CODE_POINTS) {
22601             break;
22602         }
22603         put_range(sv, start, end, allow_literals);
22604     }
22605     invlist_iterfinish(invlist);
22606 
22607     return;
22608 }
22609 
22610 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)22611 S_put_charclass_bitmap_innards_common(pTHX_
22612         SV* invlist,            /* The bitmap */
22613         SV* posixes,            /* Under /l, things like [:word:], \S */
22614         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22615         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22616         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22617         const bool invert       /* Is the result to be inverted? */
22618 )
22619 {
22620     /* Create and return an SV containing a displayable version of the bitmap
22621      * and associated information determined by the input parameters.  If the
22622      * output would have been only the inversion indicator '^', NULL is instead
22623      * returned. */
22624 
22625     SV * output;
22626 
22627     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22628 
22629     if (invert) {
22630         output = newSVpvs("^");
22631     }
22632     else {
22633         output = newSVpvs("");
22634     }
22635 
22636     /* First, the code points in the bitmap that are unconditionally there */
22637     put_charclass_bitmap_innards_invlist(output, invlist);
22638 
22639     /* Traditionally, these have been placed after the main code points */
22640     if (posixes) {
22641         sv_catsv(output, posixes);
22642     }
22643 
22644     if (only_utf8 && _invlist_len(only_utf8)) {
22645         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22646         put_charclass_bitmap_innards_invlist(output, only_utf8);
22647     }
22648 
22649     if (not_utf8 && _invlist_len(not_utf8)) {
22650         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22651         put_charclass_bitmap_innards_invlist(output, not_utf8);
22652     }
22653 
22654     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22655         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22656         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22657 
22658         /* This is the only list in this routine that can legally contain code
22659          * points outside the bitmap range.  The call just above to
22660          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22661          * output them here.  There's about a half-dozen possible, and none in
22662          * contiguous ranges longer than 2 */
22663         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22664             UV start, end;
22665             SV* above_bitmap = NULL;
22666 
22667             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22668 
22669             invlist_iterinit(above_bitmap);
22670             while (invlist_iternext(above_bitmap, &start, &end)) {
22671                 UV i;
22672 
22673                 for (i = start; i <= end; i++) {
22674                     put_code_point(output, i);
22675                 }
22676             }
22677             invlist_iterfinish(above_bitmap);
22678             SvREFCNT_dec_NN(above_bitmap);
22679         }
22680     }
22681 
22682     if (invert && SvCUR(output) == 1) {
22683         return NULL;
22684     }
22685 
22686     return output;
22687 }
22688 
22689 STATIC bool
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const U8 flags,const bool force_as_is_display)22690 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22691                                      char *bitmap,
22692                                      SV *nonbitmap_invlist,
22693                                      SV *only_utf8_locale_invlist,
22694                                      const regnode * const node,
22695                                      const U8 flags,
22696                                      const bool force_as_is_display)
22697 {
22698     /* Appends to 'sv' a displayable version of the innards of the bracketed
22699      * character class defined by the other arguments:
22700      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22701      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22702      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22703      *      none.  The reasons for this could be that they require some
22704      *      condition such as the target string being or not being in UTF-8
22705      *      (under /d), or because they came from a user-defined property that
22706      *      was not resolved at the time of the regex compilation (under /u)
22707      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22708      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22709      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22710      *      above two parameters are not null, and is passed so that this
22711      *      routine can tease apart the various reasons for them.
22712      *  'flags' is the flags field of 'node'
22713      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22714      *      to invert things to see if that leads to a cleaner display.  If
22715      *      FALSE, this routine is free to use its judgment about doing this.
22716      *
22717      * It returns TRUE if there was actually something output.  (It may be that
22718      * the bitmap, etc is empty.)
22719      *
22720      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22721      * bitmap, with the succeeding parameters set to NULL, and the final one to
22722      * FALSE.
22723      */
22724 
22725     /* In general, it tries to display the 'cleanest' representation of the
22726      * innards, choosing whether to display them inverted or not, regardless of
22727      * whether the class itself is to be inverted.  However,  there are some
22728      * cases where it can't try inverting, as what actually matches isn't known
22729      * until runtime, and hence the inversion isn't either. */
22730 
22731     bool inverting_allowed = ! force_as_is_display;
22732 
22733     int i;
22734     STRLEN orig_sv_cur = SvCUR(sv);
22735 
22736     SV* invlist;            /* Inversion list we accumulate of code points that
22737                                are unconditionally matched */
22738     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22739                                UTF-8 */
22740     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22741                              */
22742     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22743     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22744                                        is UTF-8 */
22745 
22746     SV* as_is_display;      /* The output string when we take the inputs
22747                                literally */
22748     SV* inverted_display;   /* The output string when we invert the inputs */
22749 
22750     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22751                                                    to match? */
22752     /* We are biased in favor of displaying things without them being inverted,
22753      * as that is generally easier to understand */
22754     const int bias = 5;
22755 
22756     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22757 
22758     /* Start off with whatever code points are passed in.  (We clone, so we
22759      * don't change the caller's list) */
22760     if (nonbitmap_invlist) {
22761         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22762         invlist = invlist_clone(nonbitmap_invlist, NULL);
22763     }
22764     else {  /* Worst case size is every other code point is matched */
22765         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22766     }
22767 
22768     if (flags) {
22769         if (OP(node) == ANYOFD) {
22770 
22771             /* This flag indicates that the code points below 0x100 in the
22772              * nonbitmap list are precisely the ones that match only when the
22773              * target is UTF-8 (they should all be non-ASCII). */
22774             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22775             {
22776                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22777                 _invlist_subtract(invlist, only_utf8, &invlist);
22778             }
22779 
22780             /* And this flag for matching all non-ASCII 0xFF and below */
22781             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22782             {
22783                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22784             }
22785         }
22786         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22787 
22788             /* If either of these flags are set, what matches isn't
22789              * determinable except during execution, so don't know enough here
22790              * to invert */
22791             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22792                 inverting_allowed = FALSE;
22793             }
22794 
22795             /* What the posix classes match also varies at runtime, so these
22796              * will be output symbolically. */
22797             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22798                 int i;
22799 
22800                 posixes = newSVpvs("");
22801                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22802                     if (ANYOF_POSIXL_TEST(node, i)) {
22803                         sv_catpv(posixes, anyofs[i]);
22804                     }
22805                 }
22806             }
22807         }
22808     }
22809 
22810     /* Accumulate the bit map into the unconditional match list */
22811     if (bitmap) {
22812         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22813             if (BITMAP_TEST(bitmap, i)) {
22814                 int start = i++;
22815                 for (;
22816                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22817                      i++)
22818                 { /* empty */ }
22819                 invlist = _add_range_to_invlist(invlist, start, i-1);
22820             }
22821         }
22822     }
22823 
22824     /* Make sure that the conditional match lists don't have anything in them
22825      * that match unconditionally; otherwise the output is quite confusing.
22826      * This could happen if the code that populates these misses some
22827      * duplication. */
22828     if (only_utf8) {
22829         _invlist_subtract(only_utf8, invlist, &only_utf8);
22830     }
22831     if (not_utf8) {
22832         _invlist_subtract(not_utf8, invlist, &not_utf8);
22833     }
22834 
22835     if (only_utf8_locale_invlist) {
22836 
22837         /* Since this list is passed in, we have to make a copy before
22838          * modifying it */
22839         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22840 
22841         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22842 
22843         /* And, it can get really weird for us to try outputting an inverted
22844          * form of this list when it has things above the bitmap, so don't even
22845          * try */
22846         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22847             inverting_allowed = FALSE;
22848         }
22849     }
22850 
22851     /* Calculate what the output would be if we take the input as-is */
22852     as_is_display = put_charclass_bitmap_innards_common(invlist,
22853                                                     posixes,
22854                                                     only_utf8,
22855                                                     not_utf8,
22856                                                     only_utf8_locale,
22857                                                     invert);
22858 
22859     /* If have to take the output as-is, just do that */
22860     if (! inverting_allowed) {
22861         if (as_is_display) {
22862             sv_catsv(sv, as_is_display);
22863             SvREFCNT_dec_NN(as_is_display);
22864         }
22865     }
22866     else { /* But otherwise, create the output again on the inverted input, and
22867               use whichever version is shorter */
22868 
22869         int inverted_bias, as_is_bias;
22870 
22871         /* We will apply our bias to whichever of the results doesn't have
22872          * the '^' */
22873         if (invert) {
22874             invert = FALSE;
22875             as_is_bias = bias;
22876             inverted_bias = 0;
22877         }
22878         else {
22879             invert = TRUE;
22880             as_is_bias = 0;
22881             inverted_bias = bias;
22882         }
22883 
22884         /* Now invert each of the lists that contribute to the output,
22885          * excluding from the result things outside the possible range */
22886 
22887         /* For the unconditional inversion list, we have to add in all the
22888          * conditional code points, so that when inverted, they will be gone
22889          * from it */
22890         _invlist_union(only_utf8, invlist, &invlist);
22891         _invlist_union(not_utf8, invlist, &invlist);
22892         _invlist_union(only_utf8_locale, invlist, &invlist);
22893         _invlist_invert(invlist);
22894         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22895 
22896         if (only_utf8) {
22897             _invlist_invert(only_utf8);
22898             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22899         }
22900         else if (not_utf8) {
22901 
22902             /* If a code point matches iff the target string is not in UTF-8,
22903              * then complementing the result has it not match iff not in UTF-8,
22904              * which is the same thing as matching iff it is UTF-8. */
22905             only_utf8 = not_utf8;
22906             not_utf8 = NULL;
22907         }
22908 
22909         if (only_utf8_locale) {
22910             _invlist_invert(only_utf8_locale);
22911             _invlist_intersection(only_utf8_locale,
22912                                   PL_InBitmap,
22913                                   &only_utf8_locale);
22914         }
22915 
22916         inverted_display = put_charclass_bitmap_innards_common(
22917                                             invlist,
22918                                             posixes,
22919                                             only_utf8,
22920                                             not_utf8,
22921                                             only_utf8_locale, invert);
22922 
22923         /* Use the shortest representation, taking into account our bias
22924          * against showing it inverted */
22925         if (   inverted_display
22926             && (   ! as_is_display
22927                 || (  SvCUR(inverted_display) + inverted_bias
22928                     < SvCUR(as_is_display)    + as_is_bias)))
22929         {
22930 	    sv_catsv(sv, inverted_display);
22931         }
22932         else if (as_is_display) {
22933 	    sv_catsv(sv, as_is_display);
22934         }
22935 
22936         SvREFCNT_dec(as_is_display);
22937         SvREFCNT_dec(inverted_display);
22938     }
22939 
22940     SvREFCNT_dec_NN(invlist);
22941     SvREFCNT_dec(only_utf8);
22942     SvREFCNT_dec(not_utf8);
22943     SvREFCNT_dec(posixes);
22944     SvREFCNT_dec(only_utf8_locale);
22945 
22946     return SvCUR(sv) > orig_sv_cur;
22947 }
22948 
22949 #define CLEAR_OPTSTART                                                       \
22950     if (optstart) STMT_START {                                               \
22951         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22952                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22953         optstart=NULL;                                                       \
22954     } STMT_END
22955 
22956 #define DUMPUNTIL(b,e)                                                       \
22957                     CLEAR_OPTSTART;                                          \
22958                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22959 
22960 STATIC const regnode *
S_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)22961 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22962 	    const regnode *last, const regnode *plast,
22963 	    SV* sv, I32 indent, U32 depth)
22964 {
22965     U8 op = PSEUDO;	/* Arbitrary non-END op. */
22966     const regnode *next;
22967     const regnode *optstart= NULL;
22968 
22969     RXi_GET_DECL(r, ri);
22970     DECLARE_AND_GET_RE_DEBUG_FLAGS;
22971 
22972     PERL_ARGS_ASSERT_DUMPUNTIL;
22973 
22974 #ifdef DEBUG_DUMPUNTIL
22975     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22976         last ? last-start : 0, plast ? plast-start : 0);
22977 #endif
22978 
22979     if (plast && plast < last)
22980         last= plast;
22981 
22982     while (PL_regkind[op] != END && (!last || node < last)) {
22983         assert(node);
22984 	/* While that wasn't END last time... */
22985 	NODE_ALIGN(node);
22986 	op = OP(node);
22987 	if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22988 	    indent--;
22989 	next = regnext((regnode *)node);
22990 
22991 	/* Where, what. */
22992 	if (OP(node) == OPTIMIZED) {
22993 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22994 	        optstart = node;
22995 	    else
22996 		goto after_print;
22997 	} else
22998 	    CLEAR_OPTSTART;
22999 
23000         regprop(r, sv, node, NULL, NULL);
23001         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
23002 		      (int)(2*indent + 1), "", SvPVX_const(sv));
23003 
23004         if (OP(node) != OPTIMIZED) {
23005             if (next == NULL)		/* Next ptr. */
23006                 Perl_re_printf( aTHX_  " (0)");
23007             else if (PL_regkind[(U8)op] == BRANCH
23008                      && PL_regkind[OP(next)] != BRANCH )
23009                 Perl_re_printf( aTHX_  " (FAIL)");
23010             else
23011                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
23012             Perl_re_printf( aTHX_ "\n");
23013         }
23014 
23015       after_print:
23016 	if (PL_regkind[(U8)op] == BRANCHJ) {
23017 	    assert(next);
23018 	    {
23019                 const regnode *nnode = (OP(next) == LONGJMP
23020                                        ? regnext((regnode *)next)
23021                                        : next);
23022                 if (last && nnode > last)
23023                     nnode = last;
23024                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
23025 	    }
23026 	}
23027 	else if (PL_regkind[(U8)op] == BRANCH) {
23028 	    assert(next);
23029 	    DUMPUNTIL(NEXTOPER(node), next);
23030 	}
23031 	else if ( PL_regkind[(U8)op]  == TRIE ) {
23032 	    const regnode *this_trie = node;
23033 	    const char op = OP(node);
23034             const U32 n = ARG(node);
23035 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
23036                (reg_ac_data *)ri->data->data[n] :
23037                NULL;
23038 	    const reg_trie_data * const trie =
23039 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
23040 #ifdef DEBUGGING
23041 	    AV *const trie_words
23042                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
23043 #endif
23044 	    const regnode *nextbranch= NULL;
23045 	    I32 word_idx;
23046             SvPVCLEAR(sv);
23047 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
23048 		SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
23049 
23050                 Perl_re_indentf( aTHX_  "%s ",
23051                     indent+3,
23052                     elem_ptr
23053                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
23054                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
23055                                 PL_colors[0], PL_colors[1],
23056                                 (SvUTF8(*elem_ptr)
23057                                  ? PERL_PV_ESCAPE_UNI
23058                                  : 0)
23059                                 | PERL_PV_PRETTY_ELLIPSES
23060                                 | PERL_PV_PRETTY_LTGT
23061                             )
23062                     : "???"
23063                 );
23064                 if (trie->jump) {
23065                     U16 dist= trie->jump[word_idx+1];
23066                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
23067                                (UV)((dist ? this_trie + dist : next) - start));
23068                     if (dist) {
23069                         if (!nextbranch)
23070                             nextbranch= this_trie + trie->jump[0];
23071 			DUMPUNTIL(this_trie + dist, nextbranch);
23072                     }
23073                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
23074                         nextbranch= regnext((regnode *)nextbranch);
23075                 } else {
23076                     Perl_re_printf( aTHX_  "\n");
23077 		}
23078 	    }
23079 	    if (last && next > last)
23080 	        node= last;
23081 	    else
23082 	        node= next;
23083 	}
23084 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
23085 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
23086                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
23087 	}
23088 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
23089 	    assert(next);
23090 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
23091 	}
23092 	else if ( op == PLUS || op == STAR) {
23093 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
23094 	}
23095 	else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
23096             /* Literal string, where present. */
23097 	    node += NODE_SZ_STR(node) - 1;
23098 	    node = NEXTOPER(node);
23099 	}
23100 	else {
23101 	    node = NEXTOPER(node);
23102 	    node += regarglen[(U8)op];
23103 	}
23104 	if (op == CURLYX || op == OPEN || op == SROPEN)
23105 	    indent++;
23106     }
23107     CLEAR_OPTSTART;
23108 #ifdef DEBUG_DUMPUNTIL
23109     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
23110 #endif
23111     return node;
23112 }
23113 
23114 #endif	/* DEBUGGING */
23115 
23116 #ifndef PERL_IN_XSUB_RE
23117 
23118 #  include "uni_keywords.h"
23119 
23120 void
Perl_init_uniprops(pTHX)23121 Perl_init_uniprops(pTHX)
23122 {
23123 
23124 #  ifdef DEBUGGING
23125     char * dump_len_string;
23126 
23127     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
23128     if (   ! dump_len_string
23129         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
23130     {
23131         PL_dump_re_max_len = 60;    /* A reasonable default */
23132     }
23133 #  endif
23134 
23135     PL_user_def_props = newHV();
23136 
23137 #  ifdef USE_ITHREADS
23138 
23139     HvSHAREKEYS_off(PL_user_def_props);
23140     PL_user_def_props_aTHX = aTHX;
23141 
23142 #  endif
23143 
23144     /* Set up the inversion list interpreter-level variables */
23145 
23146     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23147     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23148     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23149     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23150     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23151     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23152     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23153     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23154     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23155     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23156     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23157     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23158     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23159     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23160     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23161     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23162 
23163     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23164     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23165     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23166     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23167     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23168     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23169     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23170     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23171     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23172     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23173     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23174     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23175     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23176     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23177     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23178     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23179 
23180     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23181     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23182     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23183     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23184     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23185 
23186     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23187     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23188     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23189     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23190 
23191     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23192 
23193     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23194     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23195 
23196     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23197     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23198 
23199     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23200     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23201                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23202     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23203                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23204     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23205     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23206     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23207     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23208     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23209     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23210     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23211     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23212     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23213 
23214 #  ifdef UNI_XIDC
23215     /* The below are used only by deprecated functions.  They could be removed */
23216     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23217     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23218     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23219 #  endif
23220 }
23221 
23222 /* These four functions are compiled only in regcomp.c, where they have access
23223  * to the data they return.  They are a way for re_comp.c to get access to that
23224  * data without having to compile the whole data structures. */
23225 
23226 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)23227 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23228 {
23229     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23230 
23231     return match_uniprop((U8 *) key, key_len);
23232 }
23233 
23234 SV *
Perl_get_prop_definition(pTHX_ const int table_index)23235 Perl_get_prop_definition(pTHX_ const int table_index)
23236 {
23237     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23238 
23239     /* Create and return the inversion list */
23240     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23241 }
23242 
23243 const char * const *
Perl_get_prop_values(const int table_index)23244 Perl_get_prop_values(const int table_index)
23245 {
23246     PERL_ARGS_ASSERT_GET_PROP_VALUES;
23247 
23248     return UNI_prop_value_ptrs[table_index];
23249 }
23250 
23251 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)23252 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23253 {
23254     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23255 
23256     return deprecated_property_msgs[warning_offset];
23257 }
23258 
23259 #  if 0
23260 
23261 This code was mainly added for backcompat to give a warning for non-portable
23262 code points in user-defined properties.  But experiments showed that the
23263 warning in earlier perls were only omitted on overflow, which should be an
23264 error, so there really isnt a backcompat issue, and actually adding the
23265 warning when none was present before might cause breakage, for little gain.  So
23266 khw left this code in, but not enabled.  Tests were never added.
23267 
23268 embed.fnc entry:
23269 Ei	|const char *|get_extended_utf8_msg|const UV cp
23270 
23271 PERL_STATIC_INLINE const char *
23272 S_get_extended_utf8_msg(pTHX_ const UV cp)
23273 {
23274     U8 dummy[UTF8_MAXBYTES + 1];
23275     HV *msgs;
23276     SV **msg;
23277 
23278     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23279                              &msgs);
23280 
23281     msg = hv_fetchs(msgs, "text", 0);
23282     assert(msg);
23283 
23284     (void) sv_2mortal((SV *) msgs);
23285 
23286     return SvPVX(*msg);
23287 }
23288 
23289 #  endif
23290 #endif /* end of ! PERL_IN_XSUB_RE */
23291 
23292 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)23293 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23294                          const bool ignore_case)
23295 {
23296     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23297      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
23298      * because nothing outside of ASCII will match.  Use /m because the input
23299      * string may be a bunch of lines strung together.
23300      *
23301      * Also sets up the debugging info */
23302 
23303     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23304     U32 rx_flags;
23305     SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23306     REGEXP * subpattern_re;
23307     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23308 
23309     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23310 
23311     if (ignore_case) {
23312         flags |= PMf_FOLD;
23313     }
23314     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23315 
23316     /* Like in op.c, we copy the compile time pm flags to the rx ones */
23317     rx_flags = flags & RXf_PMf_COMPILETIME;
23318 
23319 #ifndef PERL_IN_XSUB_RE
23320     /* Use the core engine if this file is regcomp.c.  That means no
23321      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23322     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23323                                              &PL_core_reg_engine,
23324                                              NULL, NULL,
23325                                              rx_flags, flags);
23326 #else
23327     if (isDEBUG_WILDCARD) {
23328         /* Use the special debugging engine if this file is re_comp.c and wants
23329          * to output the wildcard matching.  This uses whatever
23330          * 'use re "Debug ..." is in effect */
23331         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23332                                                  &my_reg_engine,
23333                                                  NULL, NULL,
23334                                                  rx_flags, flags);
23335     }
23336     else {
23337         /* Use the special wildcard engine if this file is re_comp.c and
23338          * doesn't want to output the wildcard matching.  This uses whatever
23339          * 'use re "Debug ..." is in effect for compilation, but this engine
23340          * structure has been set up so that it uses the core engine for
23341          * execution, so no execution debugging as a result of re.pm will be
23342          * displayed. */
23343         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23344                                                  &wild_reg_engine,
23345                                                  NULL, NULL,
23346                                                  rx_flags, flags);
23347         /* XXX The above has the effect that any user-supplied regex engine
23348          * won't be called for matching wildcards.  That might be good, or bad.
23349          * It could be changed in several ways.  The reason it is done the
23350          * current way is to avoid having to save and restore
23351          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
23352          * could be used.  Another suggestion is to keep the authoritative
23353          * value of the debug flags in a thread-local variable and add set/get
23354          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23355          * Still another is to pass a flag, say in the engine's intflags that
23356          * would be checked each time before doing the debug output */
23357     }
23358 #endif
23359 
23360     assert(subpattern_re);  /* Should have died if didn't compile successfully */
23361     return subpattern_re;
23362 }
23363 
23364 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)23365 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23366 	 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23367 {
23368     I32 result;
23369     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23370 
23371     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23372 
23373     ENTER;
23374 
23375     /* The compilation has set things up so that if the program doesn't want to
23376      * see the wildcard matching procedure, it will get the core execution
23377      * engine, which is subject only to -Dr.  So we have to turn that off
23378      * around this procedure */
23379     if (! isDEBUG_WILDCARD) {
23380         /* Note! Casts away 'volatile' */
23381         SAVEI32(PL_debug);
23382         PL_debug &= ~ DEBUG_r_FLAG;
23383     }
23384 
23385     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23386                          NULL, nosave);
23387     LEAVE;
23388 
23389     return result;
23390 }
23391 
23392 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)23393 S_handle_user_defined_property(pTHX_
23394 
23395     /* Parses the contents of a user-defined property definition; returning the
23396      * expanded definition if possible.  If so, the return is an inversion
23397      * list.
23398      *
23399      * If there are subroutines that are part of the expansion and which aren't
23400      * known at the time of the call to this function, this returns what
23401      * parse_uniprop_string() returned for the first one encountered.
23402      *
23403      * If an error was found, NULL is returned, and 'msg' gets a suitable
23404      * message appended to it.  (Appending allows the back trace of how we got
23405      * to the faulty definition to be displayed through nested calls of
23406      * user-defined subs.)
23407      *
23408      * The caller IS responsible for freeing any returned SV.
23409      *
23410      * The syntax of the contents is pretty much described in perlunicode.pod,
23411      * but we also allow comments on each line */
23412 
23413     const char * name,          /* Name of property */
23414     const STRLEN name_len,      /* The name's length in bytes */
23415     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23416     const bool to_fold,         /* ? Is this under /i */
23417     const bool runtime,         /* ? Are we in compile- or run-time */
23418     const bool deferrable,      /* Is it ok for this property's full definition
23419                                    to be deferred until later? */
23420     SV* contents,               /* The property's definition */
23421     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
23422                                    getting called unless this is thought to be
23423                                    a user-defined property */
23424     SV * msg,                   /* Any error or warning msg(s) are appended to
23425                                    this */
23426     const STRLEN level)         /* Recursion level of this call */
23427 {
23428     STRLEN len;
23429     const char * string         = SvPV_const(contents, len);
23430     const char * const e        = string + len;
23431     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23432     const STRLEN msgs_length_on_entry = SvCUR(msg);
23433 
23434     const char * s0 = string;   /* Points to first byte in the current line
23435                                    being parsed in 'string' */
23436     const char overflow_msg[] = "Code point too large in \"";
23437     SV* running_definition = NULL;
23438 
23439     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23440 
23441     *user_defined_ptr = TRUE;
23442 
23443     /* Look at each line */
23444     while (s0 < e) {
23445         const char * s;     /* Current byte */
23446         char op = '+';      /* Default operation is 'union' */
23447         IV   min = 0;       /* range begin code point */
23448         IV   max = -1;      /* and range end */
23449         SV* this_definition;
23450 
23451         /* Skip comment lines */
23452         if (*s0 == '#') {
23453             s0 = strchr(s0, '\n');
23454             if (s0 == NULL) {
23455                 break;
23456             }
23457             s0++;
23458             continue;
23459         }
23460 
23461         /* For backcompat, allow an empty first line */
23462         if (*s0 == '\n') {
23463             s0++;
23464             continue;
23465         }
23466 
23467         /* First character in the line may optionally be the operation */
23468         if (   *s0 == '+'
23469             || *s0 == '!'
23470             || *s0 == '-'
23471             || *s0 == '&')
23472         {
23473             op = *s0++;
23474         }
23475 
23476         /* If the line is one or two hex digits separated by blank space, its
23477          * a range; otherwise it is either another user-defined property or an
23478          * error */
23479 
23480         s = s0;
23481 
23482         if (! isXDIGIT(*s)) {
23483             goto check_if_property;
23484         }
23485 
23486         do { /* Each new hex digit will add 4 bits. */
23487             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23488                 s = strchr(s, '\n');
23489                 if (s == NULL) {
23490                     s = e;
23491                 }
23492                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23493                 sv_catpv(msg, overflow_msg);
23494                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23495                                      UTF8fARG(is_contents_utf8, s - s0, s0));
23496                 sv_catpvs(msg, "\"");
23497                 goto return_failure;
23498             }
23499 
23500             /* Accumulate this digit into the value */
23501             min = (min << 4) + READ_XDIGIT(s);
23502         } while (isXDIGIT(*s));
23503 
23504         while (isBLANK(*s)) { s++; }
23505 
23506         /* We allow comments at the end of the line */
23507         if (*s == '#') {
23508             s = strchr(s, '\n');
23509             if (s == NULL) {
23510                 s = e;
23511             }
23512             s++;
23513         }
23514         else if (s < e && *s != '\n') {
23515             if (! isXDIGIT(*s)) {
23516                 goto check_if_property;
23517             }
23518 
23519             /* Look for the high point of the range */
23520             max = 0;
23521             do {
23522                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23523                     s = strchr(s, '\n');
23524                     if (s == NULL) {
23525                         s = e;
23526                     }
23527                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23528                     sv_catpv(msg, overflow_msg);
23529                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23530                                       UTF8fARG(is_contents_utf8, s - s0, s0));
23531                     sv_catpvs(msg, "\"");
23532                     goto return_failure;
23533                 }
23534 
23535                 max = (max << 4) + READ_XDIGIT(s);
23536             } while (isXDIGIT(*s));
23537 
23538             while (isBLANK(*s)) { s++; }
23539 
23540             if (*s == '#') {
23541                 s = strchr(s, '\n');
23542                 if (s == NULL) {
23543                     s = e;
23544                 }
23545             }
23546             else if (s < e && *s != '\n') {
23547                 goto check_if_property;
23548             }
23549         }
23550 
23551         if (max == -1) {    /* The line only had one entry */
23552             max = min;
23553         }
23554         else if (max < min) {
23555             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23556             sv_catpvs(msg, "Illegal range in \"");
23557             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23558                                 UTF8fARG(is_contents_utf8, s - s0, s0));
23559             sv_catpvs(msg, "\"");
23560             goto return_failure;
23561         }
23562 
23563 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
23564 
23565         if (   UNICODE_IS_PERL_EXTENDED(min)
23566             || UNICODE_IS_PERL_EXTENDED(max))
23567         {
23568             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23569 
23570             /* If both code points are non-portable, warn only on the lower
23571              * one. */
23572             sv_catpv(msg, get_extended_utf8_msg(
23573                                             (UNICODE_IS_PERL_EXTENDED(min))
23574                                             ? min : max));
23575             sv_catpvs(msg, " in \"");
23576             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23577                                  UTF8fARG(is_contents_utf8, s - s0, s0));
23578             sv_catpvs(msg, "\"");
23579         }
23580 
23581 #  endif
23582 
23583         /* Here, this line contains a legal range */
23584         this_definition = sv_2mortal(_new_invlist(2));
23585         this_definition = _add_range_to_invlist(this_definition, min, max);
23586         goto calculate;
23587 
23588       check_if_property:
23589 
23590         /* Here it isn't a legal range line.  See if it is a legal property
23591          * line.  First find the end of the meat of the line */
23592         s = strpbrk(s, "#\n");
23593         if (s == NULL) {
23594             s = e;
23595         }
23596 
23597         /* Ignore trailing blanks in keeping with the requirements of
23598          * parse_uniprop_string() */
23599         s--;
23600         while (s > s0 && isBLANK_A(*s)) {
23601             s--;
23602         }
23603         s++;
23604 
23605         this_definition = parse_uniprop_string(s0, s - s0,
23606                                                is_utf8, to_fold, runtime,
23607                                                deferrable,
23608                                                NULL,
23609                                                user_defined_ptr, msg,
23610                                                (name_len == 0)
23611                                                 ? level /* Don't increase level
23612                                                            if input is empty */
23613                                                 : level + 1
23614                                               );
23615         if (this_definition == NULL) {
23616             goto return_failure;    /* 'msg' should have had the reason
23617                                        appended to it by the above call */
23618         }
23619 
23620         if (! is_invlist(this_definition)) {    /* Unknown at this time */
23621             return newSVsv(this_definition);
23622         }
23623 
23624         if (*s != '\n') {
23625             s = strchr(s, '\n');
23626             if (s == NULL) {
23627                 s = e;
23628             }
23629         }
23630 
23631       calculate:
23632 
23633         switch (op) {
23634             case '+':
23635                 _invlist_union(running_definition, this_definition,
23636                                                         &running_definition);
23637                 break;
23638             case '-':
23639                 _invlist_subtract(running_definition, this_definition,
23640                                                         &running_definition);
23641                 break;
23642             case '&':
23643                 _invlist_intersection(running_definition, this_definition,
23644                                                         &running_definition);
23645                 break;
23646             case '!':
23647                 _invlist_union_complement_2nd(running_definition,
23648                                         this_definition, &running_definition);
23649                 break;
23650             default:
23651                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23652                                  __FILE__, __LINE__, op);
23653                 break;
23654         }
23655 
23656         /* Position past the '\n' */
23657         s0 = s + 1;
23658     }   /* End of loop through the lines of 'contents' */
23659 
23660     /* Here, we processed all the lines in 'contents' without error.  If we
23661      * didn't add any warnings, simply return success */
23662     if (msgs_length_on_entry == SvCUR(msg)) {
23663 
23664         /* If the expansion was empty, the answer isn't nothing: its an empty
23665          * inversion list */
23666         if (running_definition == NULL) {
23667             running_definition = _new_invlist(1);
23668         }
23669 
23670         return running_definition;
23671     }
23672 
23673     /* Otherwise, add some explanatory text, but we will return success */
23674     goto return_msg;
23675 
23676   return_failure:
23677     running_definition = NULL;
23678 
23679   return_msg:
23680 
23681     if (name_len > 0) {
23682         sv_catpvs(msg, " in expansion of ");
23683         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23684     }
23685 
23686     return running_definition;
23687 }
23688 
23689 /* As explained below, certain operations need to take place in the first
23690  * thread created.  These macros switch contexts */
23691 #  ifdef USE_ITHREADS
23692 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
23693                                         PerlInterpreter * save_aTHX = aTHX;
23694 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
23695                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23696 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23697 #    define CUR_CONTEXT      aTHX
23698 #    define ORIGINAL_CONTEXT save_aTHX
23699 #  else
23700 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
23701 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23702 #    define RESTORE_CONTEXT                   NOOP
23703 #    define CUR_CONTEXT                       NULL
23704 #    define ORIGINAL_CONTEXT                  NULL
23705 #  endif
23706 
23707 STATIC void
S_delete_recursion_entry(pTHX_ void * key)23708 S_delete_recursion_entry(pTHX_ void *key)
23709 {
23710     /* Deletes the entry used to detect recursion when expanding user-defined
23711      * properties.  This is a function so it can be set up to be called even if
23712      * the program unexpectedly quits */
23713 
23714     SV ** current_entry;
23715     const STRLEN key_len = strlen((const char *) key);
23716     DECLARATION_FOR_GLOBAL_CONTEXT;
23717 
23718     SWITCH_TO_GLOBAL_CONTEXT;
23719 
23720     /* If the entry is one of these types, it is a permanent entry, and not the
23721      * one used to detect recursions.  This function should delete only the
23722      * recursion entry */
23723     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23724     if (     current_entry
23725         && ! is_invlist(*current_entry)
23726         && ! SvPOK(*current_entry))
23727     {
23728         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23729                                                                     G_DISCARD);
23730     }
23731 
23732     RESTORE_CONTEXT;
23733 }
23734 
23735 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)23736 S_get_fq_name(pTHX_
23737               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23738               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23739               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23740               const bool has_colon_colon
23741              )
23742 {
23743     /* Returns a mortal SV containing the fully qualified version of the input
23744      * name */
23745 
23746     SV * fq_name;
23747 
23748     fq_name = newSVpvs_flags("", SVs_TEMP);
23749 
23750     /* Use the current package if it wasn't included in our input */
23751     if (! has_colon_colon) {
23752         const HV * pkg = (IN_PERL_COMPILETIME)
23753                          ? PL_curstash
23754                          : CopSTASH(PL_curcop);
23755         const char* pkgname = HvNAME(pkg);
23756 
23757         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23758                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23759         sv_catpvs(fq_name, "::");
23760     }
23761 
23762     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23763                          UTF8fARG(is_utf8, name_len, name));
23764     return fq_name;
23765 }
23766 
23767 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)23768 S_parse_uniprop_string(pTHX_
23769 
23770     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23771      * now.  If so, the return is an inversion list.
23772      *
23773      * If the property is user-defined, it is a subroutine, which in turn
23774      * may call other subroutines.  This function will call the whole nest of
23775      * them to get the definition they return; if some aren't known at the time
23776      * of the call to this function, the fully qualified name of the highest
23777      * level sub is returned.  It is an error to call this function at runtime
23778      * without every sub defined.
23779      *
23780      * If an error was found, NULL is returned, and 'msg' gets a suitable
23781      * message appended to it.  (Appending allows the back trace of how we got
23782      * to the faulty definition to be displayed through nested calls of
23783      * user-defined subs.)
23784      *
23785      * The caller should NOT try to free any returned inversion list.
23786      *
23787      * Other parameters will be set on return as described below */
23788 
23789     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23790     Size_t name_len,            /* Its length in bytes, not including any
23791                                    trailing space */
23792     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23793     const bool to_fold,         /* ? Is this under /i */
23794     const bool runtime,         /* TRUE if this is being called at run time */
23795     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23796                                    known at this call */
23797     AV ** strings,              /* To return string property values, like named
23798                                    sequences */
23799     bool *user_defined_ptr,     /* Upon return from this function it will be
23800                                    set to TRUE if any component is a
23801                                    user-defined property */
23802     SV * msg,                   /* Any error or warning msg(s) are appended to
23803                                    this */
23804     const STRLEN level)         /* Recursion level of this call */
23805 {
23806     char* lookup_name;          /* normalized name for lookup in our tables */
23807     unsigned lookup_len;        /* Its length */
23808     enum { Not_Strict = 0,      /* Some properties have stricter name */
23809            Strict,              /* normalization rules, which we decide */
23810            As_Is                /* upon based on parsing */
23811          } stricter = Not_Strict;
23812 
23813     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23814      * (though it requires extra effort to download them from Unicode and
23815      * compile perl to know about them) */
23816     bool is_nv_type = FALSE;
23817 
23818     unsigned int i, j = 0;
23819     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23820     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23821     int table_index = 0;    /* The entry number for this property in the table
23822                                of all Unicode property names */
23823     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23824     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23825                                    the normalized name in certain situations */
23826     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23827                                    part of a package name */
23828     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
23829     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23830                                              property rather than a Unicode
23831                                              one. */
23832     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23833                                      if an error.  If it is an inversion list,
23834                                      it is the definition.  Otherwise it is a
23835                                      string containing the fully qualified sub
23836                                      name of 'name' */
23837     SV * fq_name = NULL;        /* For user-defined properties, the fully
23838                                    qualified name */
23839     bool invert_return = FALSE; /* ? Do we need to complement the result before
23840                                      returning it */
23841     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23842                                        explicit utf8:: package that we strip
23843                                        off  */
23844     /* The expansion of properties that could be either user-defined or
23845      * official unicode ones is deferred until runtime, including a marker for
23846      * those that might be in the latter category.  This boolean indicates if
23847      * we've seen that marker.  If not, what we're parsing can't be such an
23848      * official Unicode property whose expansion was deferred */
23849     bool could_be_deferred_official = FALSE;
23850 
23851     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23852 
23853     /* The input will be normalized into 'lookup_name' */
23854     Newx(lookup_name, name_len, char);
23855     SAVEFREEPV(lookup_name);
23856 
23857     /* Parse the input. */
23858     for (i = 0; i < name_len; i++) {
23859         char cur = name[i];
23860 
23861         /* Most of the characters in the input will be of this ilk, being parts
23862          * of a name */
23863         if (isIDCONT_A(cur)) {
23864 
23865             /* Case differences are ignored.  Our lookup routine assumes
23866              * everything is lowercase, so normalize to that */
23867             if (isUPPER_A(cur)) {
23868                 lookup_name[j++] = toLOWER_A(cur);
23869                 continue;
23870             }
23871 
23872             if (cur == '_') { /* Don't include these in the normalized name */
23873                 continue;
23874             }
23875 
23876             lookup_name[j++] = cur;
23877 
23878             /* The first character in a user-defined name must be of this type.
23879              * */
23880             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23881                 could_be_user_defined = FALSE;
23882             }
23883 
23884             continue;
23885         }
23886 
23887         /* Here, the character is not something typically in a name,  But these
23888          * two types of characters (and the '_' above) can be freely ignored in
23889          * most situations.  Later it may turn out we shouldn't have ignored
23890          * them, and we have to reparse, but we don't have enough information
23891          * yet to make that decision */
23892         if (cur == '-' || isSPACE_A(cur)) {
23893             could_be_user_defined = FALSE;
23894             continue;
23895         }
23896 
23897         /* An equals sign or single colon mark the end of the first part of
23898          * the property name */
23899         if (    cur == '='
23900             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23901         {
23902             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23903             equals_pos = j; /* Note where it occurred in the input */
23904             could_be_user_defined = FALSE;
23905             break;
23906         }
23907 
23908         /* If this looks like it is a marker we inserted at compile time,
23909          * set a flag and otherwise ignore it.  If it isn't in the final
23910          * position, keep it as it would have been user input. */
23911         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23912             && ! deferrable
23913             &&   could_be_user_defined
23914             &&   i == name_len - 1)
23915         {
23916             name_len--;
23917             could_be_deferred_official = TRUE;
23918             continue;
23919         }
23920 
23921         /* Otherwise, this character is part of the name. */
23922         lookup_name[j++] = cur;
23923 
23924         /* Here it isn't a single colon, so if it is a colon, it must be a
23925          * double colon */
23926         if (cur == ':') {
23927 
23928             /* A double colon should be a package qualifier.  We note its
23929              * position and continue.  Note that one could have
23930              *      pkg1::pkg2::...::foo
23931              * so that the position at the end of the loop will be just after
23932              * the final qualifier */
23933 
23934             i++;
23935             non_pkg_begin = i + 1;
23936             lookup_name[j++] = ':';
23937             lun_non_pkg_begin = j;
23938         }
23939         else { /* Only word chars (and '::') can be in a user-defined name */
23940             could_be_user_defined = FALSE;
23941         }
23942     } /* End of parsing through the lhs of the property name (or all of it if
23943          no rhs) */
23944 
23945 #  define STRLENs(s)  (sizeof("" s "") - 1)
23946 
23947     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23948      * be for a user-defined property, or it could be a Unicode property, as
23949      * all of them are considered to be for that package.  For the purposes of
23950      * parsing the rest of the property, strip it off */
23951     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23952         lookup_name +=  STRLENs("utf8::");
23953         j -=  STRLENs("utf8::");
23954         equals_pos -=  STRLENs("utf8::");
23955         stripped_utf8_pkg = TRUE;
23956     }
23957 
23958     /* Here, we are either done with the whole property name, if it was simple;
23959      * or are positioned just after the '=' if it is compound. */
23960 
23961     if (equals_pos >= 0) {
23962         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23963 
23964         /* Space immediately after the '=' is ignored */
23965         i++;
23966         for (; i < name_len; i++) {
23967             if (! isSPACE_A(name[i])) {
23968                 break;
23969             }
23970         }
23971 
23972         /* Most punctuation after the equals indicates a subpattern, like
23973          * \p{foo=/bar/} */
23974         if (   isPUNCT_A(name[i])
23975             &&  name[i] != '-'
23976             &&  name[i] != '+'
23977             &&  name[i] != '_'
23978             &&  name[i] != '{'
23979                 /* A backslash means the real delimitter is the next character,
23980                  * but it must be punctuation */
23981             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23982         {
23983             bool special_property = memEQs(lookup_name, j - 1, "name")
23984                                  || memEQs(lookup_name, j - 1, "na");
23985             if (! special_property) {
23986                 /* Find the property.  The table includes the equals sign, so
23987                  * we use 'j' as-is */
23988                 table_index = do_uniprop_match(lookup_name, j);
23989             }
23990             if (special_property || table_index) {
23991                 REGEXP * subpattern_re;
23992                 char open = name[i++];
23993                 char close;
23994                 const char * pos_in_brackets;
23995                 const char * const * prop_values;
23996                 bool escaped = 0;
23997 
23998                 /* Backslash => delimitter is the character following.  We
23999                  * already checked that it is punctuation */
24000                 if (open == '\\') {
24001                     open = name[i++];
24002                     escaped = 1;
24003                 }
24004 
24005                 /* This data structure is constructed so that the matching
24006                  * closing bracket is 3 past its matching opening.  The second
24007                  * set of closing is so that if the opening is something like
24008                  * ']', the closing will be that as well.  Something similar is
24009                  * done in toke.c */
24010                 pos_in_brackets = memCHRs("([<)]>)]>", open);
24011                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
24012 
24013                 if (    i >= name_len
24014                     ||  name[name_len-1] != close
24015                     || (escaped && name[name_len-2] != '\\')
24016                         /* Also make sure that there are enough characters.
24017                          * e.g., '\\\' would show up incorrectly as legal even
24018                          * though it is too short */
24019                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
24020                 {
24021                     sv_catpvs(msg, "Unicode property wildcard not terminated");
24022                     goto append_name_to_msg;
24023                 }
24024 
24025                 Perl_ck_warner_d(aTHX_
24026                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
24027                     "The Unicode property wildcards feature is experimental");
24028 
24029                 if (special_property) {
24030                     const char * error_msg;
24031                     const char * revised_name = name + i;
24032                     Size_t revised_name_len = name_len - (i + 1 + escaped);
24033 
24034                     /* Currently, the only 'special_property' is name, which we
24035                      * lookup in _charnames.pm */
24036 
24037                     if (! load_charnames(newSVpvs("placeholder"),
24038                                          revised_name, revised_name_len,
24039                                          &error_msg))
24040                     {
24041                         sv_catpv(msg, error_msg);
24042                         goto append_name_to_msg;
24043                     }
24044 
24045                     /* Farm this out to a function just to make the current
24046                      * function less unwieldy */
24047                     if (handle_names_wildcard(revised_name, revised_name_len,
24048                                               &prop_definition,
24049                                               strings))
24050                     {
24051                         return prop_definition;
24052                     }
24053 
24054                     goto failed;
24055                 }
24056 
24057                 prop_values = get_prop_values(table_index);
24058 
24059                 /* Now create and compile the wildcard subpattern.  Use /i
24060                  * because the property values are supposed to match with case
24061                  * ignored. */
24062                 subpattern_re = compile_wildcard(name + i,
24063                                                  name_len - i - 1 - escaped,
24064                                                  TRUE /* /i */
24065                                                 );
24066 
24067                 /* For each legal property value, see if the supplied pattern
24068                  * matches it. */
24069                 while (*prop_values) {
24070                     const char * const entry = *prop_values;
24071                     const Size_t len = strlen(entry);
24072                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
24073 
24074                     if (execute_wildcard(subpattern_re,
24075                                  (char *) entry,
24076                                  (char *) entry + len,
24077                                  (char *) entry, 0,
24078                                  entry_sv,
24079                                  0))
24080                     { /* Here, matched.  Add to the returned list */
24081                         Size_t total_len = j + len;
24082                         SV * sub_invlist = NULL;
24083                         char * this_string;
24084 
24085                         /* We know this is a legal \p{property=value}.  Call
24086                          * the function to return the list of code points that
24087                          * match it */
24088                         Newxz(this_string, total_len + 1, char);
24089                         Copy(lookup_name, this_string, j, char);
24090                         my_strlcat(this_string, entry, total_len + 1);
24091                         SAVEFREEPV(this_string);
24092                         sub_invlist = parse_uniprop_string(this_string,
24093                                                            total_len,
24094                                                            is_utf8,
24095                                                            to_fold,
24096                                                            runtime,
24097                                                            deferrable,
24098                                                            NULL,
24099                                                            user_defined_ptr,
24100                                                            msg,
24101                                                            level + 1);
24102                         _invlist_union(prop_definition, sub_invlist,
24103                                        &prop_definition);
24104                     }
24105 
24106                     prop_values++;  /* Next iteration, look at next propvalue */
24107                 } /* End of looking through property values; (the data
24108                      structure is terminated by a NULL ptr) */
24109 
24110                 SvREFCNT_dec_NN(subpattern_re);
24111 
24112                 if (prop_definition) {
24113                     return prop_definition;
24114                 }
24115 
24116                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
24117                 goto append_name_to_msg;
24118             }
24119 
24120             /* Here's how khw thinks we should proceed to handle the properties
24121              * not yet done:    Bidi Mirroring Glyph        can map to ""
24122                                 Bidi Paired Bracket         can map to ""
24123                                 Case Folding  (both full and simple)
24124                                             Shouldn't /i be good enough for Full
24125                                 Decomposition Mapping
24126                                 Equivalent Unified Ideograph    can map to ""
24127                                 Lowercase Mapping  (both full and simple)
24128                                 NFKC Case Fold                  can map to ""
24129                                 Titlecase Mapping  (both full and simple)
24130                                 Uppercase Mapping  (both full and simple)
24131              * Handle these the same way Name is done, using say, _wild.pm, but
24132              * having both loose and full, like in charclass_invlists.h.
24133              * Perhaps move block and script to that as they are somewhat large
24134              * in charclass_invlists.h.
24135              * For properties where the default is the code point itself, such
24136              * as any of the case changing mappings, the string would otherwise
24137              * consist of all Unicode code points in UTF-8 strung together.
24138              * This would be impractical.  So instead, examine their compiled
24139              * pattern, looking at the ssc.  If none, reject the pattern as an
24140              * error.  Otherwise run the pattern against every code point in
24141              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
24142              * And it might be good to create an API to return the ssc.
24143              * Or handle them like the algorithmic names are done
24144              */
24145         } /* End of is a wildcard subppattern */
24146 
24147         /* \p{name=...} is handled specially.  Instead of using the normal
24148          * mechanism involving charclass_invlists.h, it uses _charnames.pm
24149          * which has the necessary (huge) data accessible to it, and which
24150          * doesn't get loaded unless necessary.  The legal syntax for names is
24151          * somewhat different than other properties due both to the vagaries of
24152          * a few outlier official names, and the fact that only a few ASCII
24153          * characters are permitted in them */
24154         if (   memEQs(lookup_name, j - 1, "name")
24155             || memEQs(lookup_name, j - 1, "na"))
24156         {
24157             dSP;
24158             HV * table;
24159             SV * character;
24160             const char * error_msg;
24161             CV* lookup_loose;
24162             SV * character_name;
24163             STRLEN character_len;
24164             UV cp;
24165 
24166             stricter = As_Is;
24167 
24168             /* Since the RHS (after skipping initial space) is passed unchanged
24169              * to charnames, and there are different criteria for what are
24170              * legal characters in the name, just parse it here.  A character
24171              * name must begin with an ASCII alphabetic */
24172             if (! isALPHA(name[i])) {
24173                 goto failed;
24174             }
24175             lookup_name[j++] = name[i];
24176 
24177             for (++i; i < name_len; i++) {
24178                 /* Official names can only be in the ASCII range, and only
24179                  * certain characters */
24180                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24181                     goto failed;
24182                 }
24183                 lookup_name[j++] = name[i];
24184             }
24185 
24186             /* Finished parsing, save the name into an SV */
24187             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24188 
24189             /* Make sure _charnames is loaded.  (The parameters give context
24190              * for any errors generated */
24191             table = load_charnames(character_name, name, name_len, &error_msg);
24192             if (table == NULL) {
24193                 sv_catpv(msg, error_msg);
24194                 goto append_name_to_msg;
24195             }
24196 
24197             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24198             if (! lookup_loose) {
24199                 Perl_croak(aTHX_
24200                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
24201             }
24202 
24203             PUSHSTACKi(PERLSI_REGCOMP);
24204             ENTER ;
24205             SAVETMPS;
24206             save_re_context();
24207 
24208             PUSHMARK(SP) ;
24209             XPUSHs(character_name);
24210             PUTBACK;
24211             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24212 
24213             SPAGAIN ;
24214 
24215             character = POPs;
24216             SvREFCNT_inc_simple_void_NN(character);
24217 
24218             PUTBACK ;
24219             FREETMPS ;
24220             LEAVE ;
24221             POPSTACK;
24222 
24223             if (! SvOK(character)) {
24224                 goto failed;
24225             }
24226 
24227             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24228             if (character_len == SvCUR(character)) {
24229                 prop_definition = add_cp_to_invlist(NULL, cp);
24230             }
24231             else {
24232                 AV * this_string;
24233 
24234                 /* First of the remaining characters in the string. */
24235                 char * remaining = SvPVX(character) + character_len;
24236 
24237                 if (strings == NULL) {
24238                     goto failed;    /* XXX Perhaps a specific msg instead, like
24239                                        'not available here' */
24240                 }
24241 
24242                 if (*strings == NULL) {
24243                     *strings = newAV();
24244                 }
24245 
24246                 this_string = newAV();
24247                 av_push(this_string, newSVuv(cp));
24248 
24249                 do {
24250                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24251                     av_push(this_string, newSVuv(cp));
24252                     remaining += character_len;
24253                 } while (remaining < SvEND(character));
24254 
24255                 av_push(*strings, (SV *) this_string);
24256             }
24257 
24258             return prop_definition;
24259         }
24260 
24261         /* Certain properties whose values are numeric need special handling.
24262          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
24263          * purposes of checking if this is one of those properties */
24264         if (memBEGINPs(lookup_name, j, "is")) {
24265             lookup_offset = 2;
24266         }
24267 
24268         /* Then check if it is one of these specially-handled properties.  The
24269          * possibilities are hard-coded because easier this way, and the list
24270          * is unlikely to change.
24271          *
24272          * All numeric value type properties are of this ilk, and are also
24273          * special in a different way later on.  So find those first.  There
24274          * are several numeric value type properties in the Unihan DB (which is
24275          * unlikely to be compiled with perl, but we handle it here in case it
24276          * does get compiled).  They all end with 'numeric'.  The interiors
24277          * aren't checked for the precise property.  This would stop working if
24278          * a cjk property were to be created that ended with 'numeric' and
24279          * wasn't a numeric type */
24280         is_nv_type = memEQs(lookup_name + lookup_offset,
24281                        j - 1 - lookup_offset, "numericvalue")
24282                   || memEQs(lookup_name + lookup_offset,
24283                       j - 1 - lookup_offset, "nv")
24284                   || (   memENDPs(lookup_name + lookup_offset,
24285                             j - 1 - lookup_offset, "numeric")
24286                       && (   memBEGINPs(lookup_name + lookup_offset,
24287                                       j - 1 - lookup_offset, "cjk")
24288                           || memBEGINPs(lookup_name + lookup_offset,
24289                                       j - 1 - lookup_offset, "k")));
24290         if (   is_nv_type
24291             || memEQs(lookup_name + lookup_offset,
24292                       j - 1 - lookup_offset, "canonicalcombiningclass")
24293             || memEQs(lookup_name + lookup_offset,
24294                       j - 1 - lookup_offset, "ccc")
24295             || memEQs(lookup_name + lookup_offset,
24296                       j - 1 - lookup_offset, "age")
24297             || memEQs(lookup_name + lookup_offset,
24298                       j - 1 - lookup_offset, "in")
24299             || memEQs(lookup_name + lookup_offset,
24300                       j - 1 - lookup_offset, "presentin"))
24301         {
24302             unsigned int k;
24303 
24304             /* Since the stuff after the '=' is a number, we can't throw away
24305              * '-' willy-nilly, as those could be a minus sign.  Other stricter
24306              * rules also apply.  However, these properties all can have the
24307              * rhs not be a number, in which case they contain at least one
24308              * alphabetic.  In those cases, the stricter rules don't apply.
24309              * But the numeric type properties can have the alphas [Ee] to
24310              * signify an exponent, and it is still a number with stricter
24311              * rules.  So look for an alpha that signifies not-strict */
24312             stricter = Strict;
24313             for (k = i; k < name_len; k++) {
24314                 if (   isALPHA_A(name[k])
24315                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24316                 {
24317                     stricter = Not_Strict;
24318                     break;
24319                 }
24320             }
24321         }
24322 
24323         if (stricter) {
24324 
24325             /* A number may have a leading '+' or '-'.  The latter is retained
24326              * */
24327             if (name[i] == '+') {
24328                 i++;
24329             }
24330             else if (name[i] == '-') {
24331                 lookup_name[j++] = '-';
24332                 i++;
24333             }
24334 
24335             /* Skip leading zeros including single underscores separating the
24336              * zeros, or between the final leading zero and the first other
24337              * digit */
24338             for (; i < name_len - 1; i++) {
24339                 if (    name[i] != '0'
24340                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24341                 {
24342                     break;
24343                 }
24344             }
24345         }
24346     }
24347     else {  /* No '=' */
24348 
24349        /* Only a few properties without an '=' should be parsed with stricter
24350         * rules.  The list is unlikely to change. */
24351         if (   memBEGINPs(lookup_name, j, "perl")
24352             && memNEs(lookup_name + 4, j - 4, "space")
24353             && memNEs(lookup_name + 4, j - 4, "word"))
24354         {
24355             stricter = Strict;
24356 
24357             /* We set the inputs back to 0 and the code below will reparse,
24358              * using strict */
24359             i = j = 0;
24360         }
24361     }
24362 
24363     /* Here, we have either finished the property, or are positioned to parse
24364      * the remainder, and we know if stricter rules apply.  Finish out, if not
24365      * already done */
24366     for (; i < name_len; i++) {
24367         char cur = name[i];
24368 
24369         /* In all instances, case differences are ignored, and we normalize to
24370          * lowercase */
24371         if (isUPPER_A(cur)) {
24372             lookup_name[j++] = toLOWER(cur);
24373             continue;
24374         }
24375 
24376         /* An underscore is skipped, but not under strict rules unless it
24377          * separates two digits */
24378         if (cur == '_') {
24379             if (    stricter
24380                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
24381                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24382             {
24383                 lookup_name[j++] = '_';
24384             }
24385             continue;
24386         }
24387 
24388         /* Hyphens are skipped except under strict */
24389         if (cur == '-' && ! stricter) {
24390             continue;
24391         }
24392 
24393         /* XXX Bug in documentation.  It says white space skipped adjacent to
24394          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
24395          * in a number */
24396         if (isSPACE_A(cur) && ! stricter) {
24397             continue;
24398         }
24399 
24400         lookup_name[j++] = cur;
24401 
24402         /* Unless this is a non-trailing slash, we are done with it */
24403         if (i >= name_len - 1 || cur != '/') {
24404             continue;
24405         }
24406 
24407         slash_pos = j;
24408 
24409         /* A slash in the 'numeric value' property indicates that what follows
24410          * is a denominator.  It can have a leading '+' and '0's that should be
24411          * skipped.  But we have never allowed a negative denominator, so treat
24412          * a minus like every other character.  (No need to rule out a second
24413          * '/', as that won't match anything anyway */
24414         if (is_nv_type) {
24415             i++;
24416             if (i < name_len && name[i] == '+') {
24417                 i++;
24418             }
24419 
24420             /* Skip leading zeros including underscores separating digits */
24421             for (; i < name_len - 1; i++) {
24422                 if (   name[i] != '0'
24423                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24424                 {
24425                     break;
24426                 }
24427             }
24428 
24429             /* Store the first real character in the denominator */
24430             if (i < name_len) {
24431                 lookup_name[j++] = name[i];
24432             }
24433         }
24434     }
24435 
24436     /* Here are completely done parsing the input 'name', and 'lookup_name'
24437      * contains a copy, normalized.
24438      *
24439      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24440      * different from without the underscores.  */
24441     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
24442            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24443         && UNLIKELY(name[name_len-1] == '_'))
24444     {
24445         lookup_name[j++] = '&';
24446     }
24447 
24448     /* If the original input began with 'In' or 'Is', it could be a subroutine
24449      * call to a user-defined property instead of a Unicode property name. */
24450     if (    name_len - non_pkg_begin > 2
24451         &&  name[non_pkg_begin+0] == 'I'
24452         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24453     {
24454         /* Names that start with In have different characterstics than those
24455          * that start with Is */
24456         if (name[non_pkg_begin+1] == 's') {
24457             starts_with_Is = TRUE;
24458         }
24459     }
24460     else {
24461         could_be_user_defined = FALSE;
24462     }
24463 
24464     if (could_be_user_defined) {
24465         CV* user_sub;
24466 
24467         /* If the user defined property returns the empty string, it could
24468          * easily be because the pattern is being compiled before the data it
24469          * actually needs to compile is available.  This could be argued to be
24470          * a bug in the perl code, but this is a change of behavior for Perl,
24471          * so we handle it.  This means that intentionally returning nothing
24472          * will not be resolved until runtime */
24473         bool empty_return = FALSE;
24474 
24475         /* Here, the name could be for a user defined property, which are
24476          * implemented as subs. */
24477         user_sub = get_cvn_flags(name, name_len, 0);
24478         if (! user_sub) {
24479 
24480             /* Here, the property name could be a user-defined one, but there
24481              * is no subroutine to handle it (as of now).   Defer handling it
24482              * until runtime.  Otherwise, a block defined by Unicode in a later
24483              * release would get the synonym InFoo added for it, and existing
24484              * code that used that name would suddenly break if it referred to
24485              * the property before the sub was declared.  See [perl #134146] */
24486             if (deferrable) {
24487                 goto definition_deferred;
24488             }
24489 
24490             /* Here, we are at runtime, and didn't find the user property.  It
24491              * could be an official property, but only if no package was
24492              * specified, or just the utf8:: package. */
24493             if (could_be_deferred_official) {
24494                 lookup_name += lun_non_pkg_begin;
24495                 j -= lun_non_pkg_begin;
24496             }
24497             else if (! stripped_utf8_pkg) {
24498                 goto unknown_user_defined;
24499             }
24500 
24501             /* Drop down to look up in the official properties */
24502         }
24503         else {
24504             const char insecure[] = "Insecure user-defined property";
24505 
24506             /* Here, there is a sub by the correct name.  Normally we call it
24507              * to get the property definition */
24508             dSP;
24509             SV * user_sub_sv = MUTABLE_SV(user_sub);
24510             SV * error;     /* Any error returned by calling 'user_sub' */
24511             SV * key;       /* The key into the hash of user defined sub names
24512                              */
24513             SV * placeholder;
24514             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
24515 
24516             /* How many times to retry when another thread is in the middle of
24517              * expanding the same definition we want */
24518             PERL_INT_FAST8_T retry_countdown = 10;
24519 
24520             DECLARATION_FOR_GLOBAL_CONTEXT;
24521 
24522             /* If we get here, we know this property is user-defined */
24523             *user_defined_ptr = TRUE;
24524 
24525             /* We refuse to call a potentially tainted subroutine; returning an
24526              * error instead */
24527             if (TAINT_get) {
24528                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24529                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24530                 goto append_name_to_msg;
24531             }
24532 
24533             /* In principal, we only call each subroutine property definition
24534              * once during the life of the program.  This guarantees that the
24535              * property definition never changes.  The results of the single
24536              * sub call are stored in a hash, which is used instead for future
24537              * references to this property.  The property definition is thus
24538              * immutable.  But, to allow the user to have a /i-dependent
24539              * definition, we call the sub once for non-/i, and once for /i,
24540              * should the need arise, passing the /i status as a parameter.
24541              *
24542              * We start by constructing the hash key name, consisting of the
24543              * fully qualified subroutine name, preceded by the /i status, so
24544              * that there is a key for /i and a different key for non-/i */
24545             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24546             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24547                                           non_pkg_begin != 0);
24548             sv_catsv(key, fq_name);
24549             sv_2mortal(key);
24550 
24551             /* We only call the sub once throughout the life of the program
24552              * (with the /i, non-/i exception noted above).  That means the
24553              * hash must be global and accessible to all threads.  It is
24554              * created at program start-up, before any threads are created, so
24555              * is accessible to all children.  But this creates some
24556              * complications.
24557              *
24558              * 1) The keys can't be shared, or else problems arise; sharing is
24559              *    turned off at hash creation time
24560              * 2) All SVs in it are there for the remainder of the life of the
24561              *    program, and must be created in the same interpreter context
24562              *    as the hash, or else they will be freed from the wrong pool
24563              *    at global destruction time.  This is handled by switching to
24564              *    the hash's context to create each SV going into it, and then
24565              *    immediately switching back
24566              * 3) All accesses to the hash must be controlled by a mutex, to
24567              *    prevent two threads from getting an unstable state should
24568              *    they simultaneously be accessing it.  The code below is
24569              *    crafted so that the mutex is locked whenever there is an
24570              *    access and unlocked only when the next stable state is
24571              *    achieved.
24572              *
24573              * The hash stores either the definition of the property if it was
24574              * valid, or, if invalid, the error message that was raised.  We
24575              * use the type of SV to distinguish.
24576              *
24577              * There's also the need to guard against the definition expansion
24578              * from infinitely recursing.  This is handled by storing the aTHX
24579              * of the expanding thread during the expansion.  Again the SV type
24580              * is used to distinguish this from the other two cases.  If we
24581              * come to here and the hash entry for this property is our aTHX,
24582              * it means we have recursed, and the code assumes that we would
24583              * infinitely recurse, so instead stops and raises an error.
24584              * (Any recursion has always been treated as infinite recursion in
24585              * this feature.)
24586              *
24587              * If instead, the entry is for a different aTHX, it means that
24588              * that thread has gotten here first, and hasn't finished expanding
24589              * the definition yet.  We just have to wait until it is done.  We
24590              * sleep and retry a few times, returning an error if the other
24591              * thread doesn't complete. */
24592 
24593           re_fetch:
24594             USER_PROP_MUTEX_LOCK;
24595 
24596             /* If we have an entry for this key, the subroutine has already
24597              * been called once with this /i status. */
24598             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24599                                                    SvPVX(key), SvCUR(key), 0);
24600             if (saved_user_prop_ptr) {
24601 
24602                 /* If the saved result is an inversion list, it is the valid
24603                  * definition of this property */
24604                 if (is_invlist(*saved_user_prop_ptr)) {
24605                     prop_definition = *saved_user_prop_ptr;
24606 
24607                     /* The SV in the hash won't be removed until global
24608                      * destruction, so it is stable and we can unlock */
24609                     USER_PROP_MUTEX_UNLOCK;
24610 
24611                     /* The caller shouldn't try to free this SV */
24612                     return prop_definition;
24613                 }
24614 
24615                 /* Otherwise, if it is a string, it is the error message
24616                  * that was returned when we first tried to evaluate this
24617                  * property.  Fail, and append the message */
24618                 if (SvPOK(*saved_user_prop_ptr)) {
24619                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24620                     sv_catsv(msg, *saved_user_prop_ptr);
24621 
24622                     /* The SV in the hash won't be removed until global
24623                      * destruction, so it is stable and we can unlock */
24624                     USER_PROP_MUTEX_UNLOCK;
24625 
24626                     return NULL;
24627                 }
24628 
24629                 assert(SvIOK(*saved_user_prop_ptr));
24630 
24631                 /* Here, we have an unstable entry in the hash.  Either another
24632                  * thread is in the middle of expanding the property's
24633                  * definition, or we are ourselves recursing.  We use the aTHX
24634                  * in it to distinguish */
24635                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24636 
24637                     /* Here, it's another thread doing the expanding.  We've
24638                      * looked as much as we are going to at the contents of the
24639                      * hash entry.  It's safe to unlock. */
24640                     USER_PROP_MUTEX_UNLOCK;
24641 
24642                     /* Retry a few times */
24643                     if (retry_countdown-- > 0) {
24644                         PerlProc_sleep(1);
24645                         goto re_fetch;
24646                     }
24647 
24648                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24649                     sv_catpvs(msg, "Timeout waiting for another thread to "
24650                                    "define");
24651                     goto append_name_to_msg;
24652                 }
24653 
24654                 /* Here, we are recursing; don't dig any deeper */
24655                 USER_PROP_MUTEX_UNLOCK;
24656 
24657                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24658                 sv_catpvs(msg,
24659                           "Infinite recursion in user-defined property");
24660                 goto append_name_to_msg;
24661             }
24662 
24663             /* Here, this thread has exclusive control, and there is no entry
24664              * for this property in the hash.  So we have the go ahead to
24665              * expand the definition ourselves. */
24666 
24667             PUSHSTACKi(PERLSI_REGCOMP);
24668             ENTER;
24669 
24670             /* Create a temporary placeholder in the hash to detect recursion
24671              * */
24672             SWITCH_TO_GLOBAL_CONTEXT;
24673             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24674             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24675             RESTORE_CONTEXT;
24676 
24677             /* Now that we have a placeholder, we can let other threads
24678              * continue */
24679             USER_PROP_MUTEX_UNLOCK;
24680 
24681             /* Make sure the placeholder always gets destroyed */
24682             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24683 
24684             PUSHMARK(SP);
24685             SAVETMPS;
24686 
24687             /* Call the user's function, with the /i status as a parameter.
24688              * Note that we have gone to a lot of trouble to keep this call
24689              * from being within the locked mutex region. */
24690             XPUSHs(boolSV(to_fold));
24691             PUTBACK;
24692 
24693             /* The following block was taken from swash_init().  Presumably
24694              * they apply to here as well, though we no longer use a swash --
24695              * khw */
24696             SAVEHINTS();
24697             save_re_context();
24698             /* We might get here via a subroutine signature which uses a utf8
24699              * parameter name, at which point PL_subname will have been set
24700              * but not yet used. */
24701             save_item(PL_subname);
24702 
24703             /* G_SCALAR guarantees a single return value */
24704             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24705 
24706             SPAGAIN;
24707 
24708             error = ERRSV;
24709             if (TAINT_get || SvTRUE(error)) {
24710                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24711                 if (SvTRUE(error)) {
24712                     sv_catpvs(msg, "Error \"");
24713                     sv_catsv(msg, error);
24714                     sv_catpvs(msg, "\"");
24715                 }
24716                 if (TAINT_get) {
24717                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
24718                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24719                 }
24720 
24721                 if (name_len > 0) {
24722                     sv_catpvs(msg, " in expansion of ");
24723                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24724                                                                   name_len,
24725                                                                   name));
24726                 }
24727 
24728                 (void) POPs;
24729                 prop_definition = NULL;
24730             }
24731             else {
24732                 SV * contents = POPs;
24733 
24734                 /* The contents is supposed to be the expansion of the property
24735                  * definition.  If the definition is deferrable, and we got an
24736                  * empty string back, set a flag to later defer it (after clean
24737                  * up below). */
24738                 if (      deferrable
24739                     && (! SvPOK(contents) || SvCUR(contents) == 0))
24740                 {
24741                         empty_return = TRUE;
24742                 }
24743                 else { /* Otherwise, call a function to check for valid syntax,
24744                           and handle it */
24745 
24746                     prop_definition = handle_user_defined_property(
24747                                                     name, name_len,
24748                                                     is_utf8, to_fold, runtime,
24749                                                     deferrable,
24750                                                     contents, user_defined_ptr,
24751                                                     msg,
24752                                                     level);
24753                 }
24754             }
24755 
24756             /* Here, we have the results of the expansion.  Delete the
24757              * placeholder, and if the definition is now known, replace it with
24758              * that definition.  We need exclusive access to the hash, and we
24759              * can't let anyone else in, between when we delete the placeholder
24760              * and add the permanent entry */
24761             USER_PROP_MUTEX_LOCK;
24762 
24763             S_delete_recursion_entry(aTHX_ SvPVX(key));
24764 
24765             if (    ! empty_return
24766                 && (! prop_definition || is_invlist(prop_definition)))
24767             {
24768                 /* If we got success we use the inversion list defining the
24769                  * property; otherwise use the error message */
24770                 SWITCH_TO_GLOBAL_CONTEXT;
24771                 (void) hv_store_ent(PL_user_def_props,
24772                                     key,
24773                                     ((prop_definition)
24774                                      ? newSVsv(prop_definition)
24775                                      : newSVsv(msg)),
24776                                     0);
24777                 RESTORE_CONTEXT;
24778             }
24779 
24780             /* All done, and the hash now has a permanent entry for this
24781              * property.  Give up exclusive control */
24782             USER_PROP_MUTEX_UNLOCK;
24783 
24784             FREETMPS;
24785             LEAVE;
24786             POPSTACK;
24787 
24788             if (empty_return) {
24789                 goto definition_deferred;
24790             }
24791 
24792             if (prop_definition) {
24793 
24794                 /* If the definition is for something not known at this time,
24795                  * we toss it, and go return the main property name, as that's
24796                  * the one the user will be aware of */
24797                 if (! is_invlist(prop_definition)) {
24798                     SvREFCNT_dec_NN(prop_definition);
24799                     goto definition_deferred;
24800                 }
24801 
24802                 sv_2mortal(prop_definition);
24803             }
24804 
24805             /* And return */
24806             return prop_definition;
24807 
24808         }   /* End of calling the subroutine for the user-defined property */
24809     }       /* End of it could be a user-defined property */
24810 
24811     /* Here it wasn't a user-defined property that is known at this time.  See
24812      * if it is a Unicode property */
24813 
24814     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24815 
24816     /* Get the index into our pointer table of the inversion list corresponding
24817      * to the property */
24818     table_index = do_uniprop_match(lookup_name, lookup_len);
24819 
24820     /* If it didn't find the property ... */
24821     if (table_index == 0) {
24822 
24823         /* Try again stripping off any initial 'Is'.  This is because we
24824          * promise that an initial Is is optional.  The same isn't true of
24825          * names that start with 'In'.  Those can match only blocks, and the
24826          * lookup table already has those accounted for.  The lookup table also
24827          * has already accounted for Perl extensions (without and = sign)
24828          * starting with 'i's'. */
24829         if (starts_with_Is && equals_pos >= 0) {
24830             lookup_name += 2;
24831             lookup_len -= 2;
24832             equals_pos -= 2;
24833             slash_pos -= 2;
24834 
24835             table_index = do_uniprop_match(lookup_name, lookup_len);
24836         }
24837 
24838         if (table_index == 0) {
24839             char * canonical;
24840 
24841             /* Here, we didn't find it.  If not a numeric type property, and
24842              * can't be a user-defined one, it isn't a legal property */
24843             if (! is_nv_type) {
24844                 if (! could_be_user_defined) {
24845                     goto failed;
24846                 }
24847 
24848                 /* Here, the property name is legal as a user-defined one.   At
24849                  * compile time, it might just be that the subroutine for that
24850                  * property hasn't been encountered yet, but at runtime, it's
24851                  * an error to try to use an undefined one */
24852                 if (! deferrable) {
24853                     goto unknown_user_defined;;
24854                 }
24855 
24856                 goto definition_deferred;
24857             } /* End of isn't a numeric type property */
24858 
24859             /* The numeric type properties need more work to decide.  What we
24860              * do is make sure we have the number in canonical form and look
24861              * that up. */
24862 
24863             if (slash_pos < 0) {    /* No slash */
24864 
24865                 /* When it isn't a rational, take the input, convert it to a
24866                  * NV, then create a canonical string representation of that
24867                  * NV. */
24868 
24869                 NV value;
24870                 SSize_t value_len = lookup_len - equals_pos;
24871 
24872                 /* Get the value */
24873                 if (   value_len <= 0
24874                     || my_atof3(lookup_name + equals_pos, &value,
24875                                 value_len)
24876                           != lookup_name + lookup_len)
24877                 {
24878                     goto failed;
24879                 }
24880 
24881                 /* If the value is an integer, the canonical value is integral
24882                  * */
24883                 if (Perl_ceil(value) == value) {
24884                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24885                                             equals_pos, lookup_name, value);
24886                 }
24887                 else {  /* Otherwise, it is %e with a known precision */
24888                     char * exp_ptr;
24889 
24890                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24891                                                 equals_pos, lookup_name,
24892                                                 PL_E_FORMAT_PRECISION, value);
24893 
24894                     /* The exponent generated is expecting two digits, whereas
24895                      * %e on some systems will generate three.  Remove leading
24896                      * zeros in excess of 2 from the exponent.  We start
24897                      * looking for them after the '=' */
24898                     exp_ptr = strchr(canonical + equals_pos, 'e');
24899                     if (exp_ptr) {
24900                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24901                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24902 
24903                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24904 
24905                         if (excess_exponent_len > 0) {
24906                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24907                             SSize_t excess_leading_zeros
24908                                     = MIN(leading_zeros, excess_exponent_len);
24909                             if (excess_leading_zeros > 0) {
24910                                 Move(cur_ptr + excess_leading_zeros,
24911                                      cur_ptr,
24912                                      strlen(cur_ptr) - excess_leading_zeros
24913                                        + 1,  /* Copy the NUL as well */
24914                                      char);
24915                             }
24916                         }
24917                     }
24918                 }
24919             }
24920             else {  /* Has a slash.  Create a rational in canonical form  */
24921                 UV numerator, denominator, gcd, trial;
24922                 const char * end_ptr;
24923                 const char * sign = "";
24924 
24925                 /* We can't just find the numerator, denominator, and do the
24926                  * division, then use the method above, because that is
24927                  * inexact.  And the input could be a rational that is within
24928                  * epsilon (given our precision) of a valid rational, and would
24929                  * then incorrectly compare valid.
24930                  *
24931                  * We're only interested in the part after the '=' */
24932                 const char * this_lookup_name = lookup_name + equals_pos;
24933                 lookup_len -= equals_pos;
24934                 slash_pos -= equals_pos;
24935 
24936                 /* Handle any leading minus */
24937                 if (this_lookup_name[0] == '-') {
24938                     sign = "-";
24939                     this_lookup_name++;
24940                     lookup_len--;
24941                     slash_pos--;
24942                 }
24943 
24944                 /* Convert the numerator to numeric */
24945                 end_ptr = this_lookup_name + slash_pos;
24946                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24947                     goto failed;
24948                 }
24949 
24950                 /* It better have included all characters before the slash */
24951                 if (*end_ptr != '/') {
24952                     goto failed;
24953                 }
24954 
24955                 /* Set to look at just the denominator */
24956                 this_lookup_name += slash_pos;
24957                 lookup_len -= slash_pos;
24958                 end_ptr = this_lookup_name + lookup_len;
24959 
24960                 /* Convert the denominator to numeric */
24961                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24962                     goto failed;
24963                 }
24964 
24965                 /* It better be the rest of the characters, and don't divide by
24966                  * 0 */
24967                 if (   end_ptr != this_lookup_name + lookup_len
24968                     || denominator == 0)
24969                 {
24970                     goto failed;
24971                 }
24972 
24973                 /* Get the greatest common denominator using
24974                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24975                 gcd = numerator;
24976                 trial = denominator;
24977                 while (trial != 0) {
24978                     UV temp = trial;
24979                     trial = gcd % trial;
24980                     gcd = temp;
24981                 }
24982 
24983                 /* If already in lowest possible terms, we have already tried
24984                  * looking this up */
24985                 if (gcd == 1) {
24986                     goto failed;
24987                 }
24988 
24989                 /* Reduce the rational, which should put it in canonical form
24990                  * */
24991                 numerator /= gcd;
24992                 denominator /= gcd;
24993 
24994                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24995                         equals_pos, lookup_name, sign, numerator, denominator);
24996             }
24997 
24998             /* Here, we have the number in canonical form.  Try that */
24999             table_index = do_uniprop_match(canonical, strlen(canonical));
25000             if (table_index == 0) {
25001                 goto failed;
25002             }
25003         }   /* End of still didn't find the property in our table */
25004     }       /* End of       didn't find the property in our table */
25005 
25006     /* Here, we have a non-zero return, which is an index into a table of ptrs.
25007      * A negative return signifies that the real index is the absolute value,
25008      * but the result needs to be inverted */
25009     if (table_index < 0) {
25010         invert_return = TRUE;
25011         table_index = -table_index;
25012     }
25013 
25014     /* Out-of band indices indicate a deprecated property.  The proper index is
25015      * modulo it with the table size.  And dividing by the table size yields
25016      * an offset into a table constructed by regen/mk_invlists.pl to contain
25017      * the corresponding warning message */
25018     if (table_index > MAX_UNI_KEYWORD_INDEX) {
25019         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
25020         table_index %= MAX_UNI_KEYWORD_INDEX;
25021         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
25022                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
25023                 (int) name_len, name,
25024                 get_deprecated_property_msg(warning_offset));
25025     }
25026 
25027     /* In a few properties, a different property is used under /i.  These are
25028      * unlikely to change, so are hard-coded here. */
25029     if (to_fold) {
25030         if (   table_index == UNI_XPOSIXUPPER
25031             || table_index == UNI_XPOSIXLOWER
25032             || table_index == UNI_TITLE)
25033         {
25034             table_index = UNI_CASED;
25035         }
25036         else if (   table_index == UNI_UPPERCASELETTER
25037                  || table_index == UNI_LOWERCASELETTER
25038 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
25039                  || table_index == UNI_TITLECASELETTER
25040 #  endif
25041         ) {
25042             table_index = UNI_CASEDLETTER;
25043         }
25044         else if (  table_index == UNI_POSIXUPPER
25045                 || table_index == UNI_POSIXLOWER)
25046         {
25047             table_index = UNI_POSIXALPHA;
25048         }
25049     }
25050 
25051     /* Create and return the inversion list */
25052     prop_definition = get_prop_definition(table_index);
25053     sv_2mortal(prop_definition);
25054 
25055     /* See if there is a private use override to add to this definition */
25056     {
25057         COPHH * hinthash = (IN_PERL_COMPILETIME)
25058                            ? CopHINTHASH_get(&PL_compiling)
25059                            : CopHINTHASH_get(PL_curcop);
25060 	SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
25061 
25062         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
25063 
25064             /* See if there is an element in the hints hash for this table */
25065             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
25066             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
25067 
25068             if (pos) {
25069                 bool dummy;
25070                 SV * pu_definition;
25071                 SV * pu_invlist;
25072                 SV * expanded_prop_definition =
25073                             sv_2mortal(invlist_clone(prop_definition, NULL));
25074 
25075                 /* If so, it's definition is the string from here to the next
25076                  * \a character.  And its format is the same as a user-defined
25077                  * property */
25078                 pos += SvCUR(pu_lookup);
25079                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
25080                 pu_invlist = handle_user_defined_property(lookup_name,
25081                                                           lookup_len,
25082                                                           0, /* Not UTF-8 */
25083                                                           0, /* Not folded */
25084                                                           runtime,
25085                                                           deferrable,
25086                                                           pu_definition,
25087                                                           &dummy,
25088                                                           msg,
25089                                                           level);
25090                 if (TAINT_get) {
25091                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25092                     sv_catpvs(msg, "Insecure private-use override");
25093                     goto append_name_to_msg;
25094                 }
25095 
25096                 /* For now, as a safety measure, make sure that it doesn't
25097                  * override non-private use code points */
25098                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
25099 
25100                 /* Add it to the list to be returned */
25101                 _invlist_union(prop_definition, pu_invlist,
25102                                &expanded_prop_definition);
25103                 prop_definition = expanded_prop_definition;
25104                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
25105             }
25106         }
25107     }
25108 
25109     if (invert_return) {
25110         _invlist_invert(prop_definition);
25111     }
25112     return prop_definition;
25113 
25114   unknown_user_defined:
25115     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25116     sv_catpvs(msg, "Unknown user-defined property name");
25117     goto append_name_to_msg;
25118 
25119   failed:
25120     if (non_pkg_begin != 0) {
25121         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25122         sv_catpvs(msg, "Illegal user-defined property name");
25123     }
25124     else {
25125         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25126         sv_catpvs(msg, "Can't find Unicode property definition");
25127     }
25128     /* FALLTHROUGH */
25129 
25130   append_name_to_msg:
25131     {
25132         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
25133         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
25134 
25135         sv_catpv(msg, prefix);
25136         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
25137         sv_catpv(msg, suffix);
25138     }
25139 
25140     return NULL;
25141 
25142   definition_deferred:
25143 
25144     {
25145         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
25146 
25147         /* Here it could yet to be defined, so defer evaluation of this until
25148          * its needed at runtime.  We need the fully qualified property name to
25149          * avoid ambiguity */
25150         if (! fq_name) {
25151             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25152                                                                 is_qualified);
25153         }
25154 
25155         /* If it didn't come with a package, or the package is utf8::, this
25156          * actually could be an official Unicode property whose inclusion we
25157          * are deferring until runtime to make sure that it isn't overridden by
25158          * a user-defined property of the same name (which we haven't
25159          * encountered yet).  Add a marker to indicate this possibility, for
25160          * use at such time when we first need the definition during pattern
25161          * matching execution */
25162         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25163             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25164         }
25165 
25166         /* We also need a trailing newline */
25167         sv_catpvs(fq_name, "\n");
25168 
25169         *user_defined_ptr = TRUE;
25170         return fq_name;
25171     }
25172 }
25173 
25174 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)25175 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25176                               const STRLEN wname_len, /* Its length */
25177                               SV ** prop_definition,
25178                               AV ** strings)
25179 {
25180     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25181      * any matches, adding them to prop_definition */
25182 
25183     dSP;
25184 
25185     CV * get_names_info;        /* entry to charnames.pm to get info we need */
25186     SV * names_string;          /* Contains all character names, except algo */
25187     SV * algorithmic_names;     /* Contains info about algorithmically
25188                                    generated character names */
25189     REGEXP * subpattern_re;     /* The user's pattern to match with */
25190     struct regexp * prog;       /* The compiled pattern */
25191     char * all_names_start;     /* lib/unicore/Name.pl string of every
25192                                    (non-algorithmic) character name */
25193     char * cur_pos;             /* We match, effectively using /gc; this is
25194                                    where we are now */
25195     bool found_matches = FALSE; /* Did any name match so far? */
25196     SV * empty;                 /* For matching zero length names */
25197     SV * must_sv;               /* Contains the substring, if any, that must be
25198                                    in a name for the subpattern to match */
25199     const char * must;          /* The PV of 'must' */
25200     STRLEN must_len;            /* And its length */
25201     SV * syllable_name = NULL;  /* For Hangul syllables */
25202     const char hangul_prefix[] = "HANGUL SYLLABLE ";
25203     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25204 
25205     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25206      * syllable name, and these are immutable and guaranteed by the Unicode
25207      * standard to never be extended */
25208     const STRLEN syl_max_len = hangul_prefix_len + 7;
25209 
25210     IV i;
25211 
25212     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25213 
25214     /* Make sure _charnames is loaded.  (The parameters give context
25215      * for any errors generated */
25216     get_names_info = get_cv("_charnames::_get_names_info", 0);
25217     if (! get_names_info) {
25218         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25219     }
25220 
25221     /* Get the charnames data */
25222     PUSHSTACKi(PERLSI_REGCOMP);
25223     ENTER ;
25224     SAVETMPS;
25225     save_re_context();
25226 
25227     PUSHMARK(SP) ;
25228     PUTBACK;
25229 
25230     /* Special _charnames entry point that returns the info this routine
25231      * requires */
25232     call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25233 
25234     SPAGAIN ;
25235 
25236     /* Data structure for names which end in their very own code points */
25237     algorithmic_names = POPs;
25238     SvREFCNT_inc_simple_void_NN(algorithmic_names);
25239 
25240     /* The lib/unicore/Name.pl string */
25241     names_string = POPs;
25242     SvREFCNT_inc_simple_void_NN(names_string);
25243 
25244     PUTBACK ;
25245     FREETMPS ;
25246     LEAVE ;
25247     POPSTACK;
25248 
25249     if (   ! SvROK(names_string)
25250         || ! SvROK(algorithmic_names))
25251     {   /* Perhaps should panic instead XXX */
25252         SvREFCNT_dec(names_string);
25253         SvREFCNT_dec(algorithmic_names);
25254         return FALSE;
25255     }
25256 
25257     names_string = sv_2mortal(SvRV(names_string));
25258     all_names_start = SvPVX(names_string);
25259     cur_pos = all_names_start;
25260 
25261     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25262 
25263     /* Compile the subpattern consisting of the name being looked for */
25264     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25265 
25266     must_sv = re_intuit_string(subpattern_re);
25267     if (must_sv) {
25268         /* regexec.c can free the re_intuit_string() return. GH #17734 */
25269         must_sv = sv_2mortal(newSVsv(must_sv));
25270         must = SvPV(must_sv, must_len);
25271     }
25272     else {
25273         must = "";
25274         must_len = 0;
25275     }
25276 
25277     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
25278      * This works because the NUL causes the function to return early, thus
25279      * showing that there are characters in it other than the acceptable ones,
25280      * which is our desired result.) */
25281 
25282     prog = ReANY(subpattern_re);
25283 
25284     /* If only nothing is matched, skip to where empty names are looked for */
25285     if (prog->maxlen == 0) {
25286         goto check_empty;
25287     }
25288 
25289     /* And match against the string of all names /gc.  Don't even try if it
25290      * must match a character not found in any name. */
25291     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25292     {
25293         while (execute_wildcard(subpattern_re,
25294                                 cur_pos,
25295                                 SvEND(names_string),
25296                                 all_names_start, 0,
25297                                 names_string,
25298                                 0))
25299         { /* Here, matched. */
25300 
25301             /* Note the string entries look like
25302              *      00001\nSTART OF HEADING\n\n
25303              * so we could match anywhere in that string.  We have to rule out
25304              * matching a code point line */
25305             char * this_name_start = all_names_start
25306                                                 + RX_OFFS(subpattern_re)->start;
25307             char * this_name_end   = all_names_start
25308                                                 + RX_OFFS(subpattern_re)->end;
25309             char * cp_start;
25310             char * cp_end;
25311             UV cp = 0;      /* Silences some compilers */
25312             AV * this_string = NULL;
25313             bool is_multi = FALSE;
25314 
25315             /* If matched nothing, advance to next possible match */
25316             if (this_name_start == this_name_end) {
25317                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25318                                           SvEND(names_string) - this_name_end);
25319                 if (cur_pos == NULL) {
25320                     break;
25321                 }
25322             }
25323             else {
25324                 /* Position the next match to start beyond the current returned
25325                  * entry */
25326                 cur_pos = (char *) memchr(this_name_end, '\n',
25327                                           SvEND(names_string) - this_name_end);
25328             }
25329 
25330             /* Back up to the \n just before the beginning of the character. */
25331             cp_end = (char *) my_memrchr(all_names_start,
25332                                          '\n',
25333                                          this_name_start - all_names_start);
25334 
25335             /* If we didn't find a \n, it means it matched somewhere in the
25336              * initial '00000' in the string, so isn't a real match */
25337             if (cp_end == NULL) {
25338                 continue;
25339             }
25340 
25341             this_name_start = cp_end + 1;   /* The name starts just after */
25342             cp_end--;                       /* the \n, and the code point */
25343                                             /* ends just before it */
25344 
25345             /* All code points are 5 digits long */
25346             cp_start = cp_end - 4;
25347 
25348             /* This shouldn't happen, as we found a \n, and the first \n is
25349              * further along than what we subtracted */
25350             assert(cp_start >= all_names_start);
25351 
25352             if (cp_start == all_names_start) {
25353                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25354                 continue;
25355             }
25356 
25357             /* If the character is a blank, we either have a named sequence, or
25358              * something is wrong */
25359             if (*(cp_start - 1) == ' ') {
25360                 cp_start = (char *) my_memrchr(all_names_start,
25361                                                '\n',
25362                                                cp_start - all_names_start);
25363                 cp_start++;
25364             }
25365 
25366             assert(cp_start != NULL && cp_start >= all_names_start + 2);
25367 
25368             /* Except for the first line in the string, the sequence before the
25369              * code point is \n\n.  If that isn't the case here, we didn't
25370              * match the name of a character.  (We could have matched a named
25371              * sequence, not currently handled */
25372             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25373                 continue;
25374             }
25375 
25376             /* We matched!  Add this to the list */
25377             found_matches = TRUE;
25378 
25379             /* Loop through all the code points in the sequence */
25380             while (cp_start < cp_end) {
25381 
25382                 /* Calculate this code point from its 5 digits */
25383                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25384                    + (XDIGIT_VALUE(cp_start[1]) << 12)
25385                    + (XDIGIT_VALUE(cp_start[2]) << 8)
25386                    + (XDIGIT_VALUE(cp_start[3]) << 4)
25387                    +  XDIGIT_VALUE(cp_start[4]);
25388 
25389                 cp_start += 6;  /* Go past any blank */
25390 
25391                 if (cp_start < cp_end || is_multi) {
25392                     if (this_string == NULL) {
25393                         this_string = newAV();
25394                     }
25395 
25396                     is_multi = TRUE;
25397                     av_push(this_string, newSVuv(cp));
25398                 }
25399             }
25400 
25401             if (is_multi) { /* Was more than one code point */
25402                 if (*strings == NULL) {
25403                     *strings = newAV();
25404                 }
25405 
25406                 av_push(*strings, (SV *) this_string);
25407             }
25408             else {  /* Only a single code point */
25409                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25410             }
25411         } /* End of loop through the non-algorithmic names string */
25412     }
25413 
25414     /* There are also character names not in 'names_string'.  These are
25415      * algorithmically generatable.  Try this pattern on each possible one.
25416      * (khw originally planned to leave this out given the large number of
25417      * matches attempted; but the speed turned out to be quite acceptable
25418      *
25419      * There are plenty of opportunities to optimize to skip many of the tests.
25420      * beyond the rudimentary ones already here */
25421 
25422     /* First see if the subpattern matches any of the algorithmic generatable
25423      * Hangul syllable names.
25424      *
25425      * We know none of these syllable names will match if the input pattern
25426      * requires more bytes than any syllable has, or if the input pattern only
25427      * matches an empty name, or if the pattern has something it must match and
25428      * one of the characters in that isn't in any Hangul syllable. */
25429     if (    prog->minlen <= (SSize_t) syl_max_len
25430         &&  prog->maxlen > 0
25431         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25432     {
25433         /* These constants, names, values, and algorithm are adapted from the
25434          * Unicode standard, version 5.1, section 3.12, and should never
25435          * change. */
25436         const char * JamoL[] = {
25437             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25438             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25439         };
25440         const int LCount = C_ARRAY_LENGTH(JamoL);
25441 
25442         const char * JamoV[] = {
25443             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25444             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25445             "I"
25446         };
25447         const int VCount = C_ARRAY_LENGTH(JamoV);
25448 
25449         const char * JamoT[] = {
25450             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25451             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25452             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25453         };
25454         const int TCount = C_ARRAY_LENGTH(JamoT);
25455 
25456         int L, V, T;
25457 
25458         /* This is the initial Hangul syllable code point; each time through the
25459          * inner loop, it maps to the next higher code point.  For more info,
25460          * see the Hangul syllable section of the Unicode standard. */
25461         int cp = 0xAC00;
25462 
25463         syllable_name = sv_2mortal(newSV(syl_max_len));
25464         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25465 
25466         for (L = 0; L < LCount; L++) {
25467             for (V = 0; V < VCount; V++) {
25468                 for (T = 0; T < TCount; T++) {
25469 
25470                     /* Truncate back to the prefix, which is unvarying */
25471                     SvCUR_set(syllable_name, hangul_prefix_len);
25472 
25473                     sv_catpv(syllable_name, JamoL[L]);
25474                     sv_catpv(syllable_name, JamoV[V]);
25475                     sv_catpv(syllable_name, JamoT[T]);
25476 
25477                     if (execute_wildcard(subpattern_re,
25478                                 SvPVX(syllable_name),
25479                                 SvEND(syllable_name),
25480                                 SvPVX(syllable_name), 0,
25481                                 syllable_name,
25482                                 0))
25483                     {
25484                         *prop_definition = add_cp_to_invlist(*prop_definition,
25485                                                              cp);
25486                         found_matches = TRUE;
25487                     }
25488 
25489                     cp++;
25490                 }
25491             }
25492         }
25493     }
25494 
25495     /* The rest of the algorithmically generatable names are of the form
25496      * "PREFIX-code_point".  The prefixes and the code point limits of each
25497      * were returned to us in the array 'algorithmic_names' from data in
25498      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
25499     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25500         IV j;
25501 
25502         /* Each element of the array is a hash, giving the details for the
25503          * series of names it covers.  There is the base name of the characters
25504          * in the series, and the low and high code points in the series.  And,
25505          * for optimization purposes a string containing all the legal
25506          * characters that could possibly be in a name in this series. */
25507         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25508         SV * prefix = * hv_fetchs(this_series, "name", 0);
25509         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25510         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25511         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25512 
25513         /* Pre-allocate an SV with enough space */
25514         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25515                                                         SvPVX(prefix)));
25516         if (high >= 0x10000) {
25517             sv_catpvs(algo_name, "0");
25518         }
25519 
25520         /* This series can be skipped entirely if the pattern requires
25521          * something longer than any name in the series, or can only match an
25522          * empty name, or contains a character not found in any name in the
25523          * series */
25524         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
25525             &&  prog->maxlen > 0
25526             && (strspn(must, legal) == must_len))
25527         {
25528             for (j = low; j <= high; j++) { /* For each code point in the series */
25529 
25530                 /* Get its name, and see if it matches the subpattern */
25531                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25532                                      (unsigned) j);
25533 
25534                 if (execute_wildcard(subpattern_re,
25535                                     SvPVX(algo_name),
25536                                     SvEND(algo_name),
25537                                     SvPVX(algo_name), 0,
25538                                     algo_name,
25539                                     0))
25540                 {
25541                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
25542                     found_matches = TRUE;
25543                 }
25544             }
25545         }
25546     }
25547 
25548   check_empty:
25549     /* Finally, see if the subpattern matches an empty string */
25550     empty = newSVpvs("");
25551     if (execute_wildcard(subpattern_re,
25552                          SvPVX(empty),
25553                          SvEND(empty),
25554                          SvPVX(empty), 0,
25555                          empty,
25556                          0))
25557     {
25558         /* Many code points have empty names.  Currently these are the \p{GC=C}
25559          * ones, minus CC and CF */
25560 
25561         SV * empty_names_ref = get_prop_definition(UNI_C);
25562         SV * empty_names = invlist_clone(empty_names_ref, NULL);
25563 
25564         SV * subtract = get_prop_definition(UNI_CC);
25565 
25566         _invlist_subtract(empty_names, subtract, &empty_names);
25567         SvREFCNT_dec_NN(empty_names_ref);
25568         SvREFCNT_dec_NN(subtract);
25569 
25570         subtract = get_prop_definition(UNI_CF);
25571         _invlist_subtract(empty_names, subtract, &empty_names);
25572         SvREFCNT_dec_NN(subtract);
25573 
25574         _invlist_union(*prop_definition, empty_names, prop_definition);
25575         found_matches = TRUE;
25576         SvREFCNT_dec_NN(empty_names);
25577     }
25578     SvREFCNT_dec_NN(empty);
25579 
25580 #if 0
25581     /* If we ever were to accept aliases for, say private use names, we would
25582      * need to do something fancier to find empty names.  The code below works
25583      * (at the time it was written), and is slower than the above */
25584     const char empties_pat[] = "^.";
25585     if (strNE(name, empties_pat)) {
25586         SV * empty = newSVpvs("");
25587         if (execute_wildcard(subpattern_re,
25588                     SvPVX(empty),
25589                     SvEND(empty),
25590                     SvPVX(empty), 0,
25591                     empty,
25592                     0))
25593         {
25594             SV * empties = NULL;
25595 
25596             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25597 
25598             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25599             SvREFCNT_dec_NN(empties);
25600 
25601             found_matches = TRUE;
25602         }
25603         SvREFCNT_dec_NN(empty);
25604     }
25605 #endif
25606 
25607     SvREFCNT_dec_NN(subpattern_re);
25608     return found_matches;
25609 }
25610 
25611 /*
25612  * ex: set ts=8 sts=4 sw=4 et:
25613  */
25614