1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 
74 /* Note on debug output:
75  *
76  * This is set up so that -Dr turns on debugging like all other flags that are
77  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
78  * all regular expressions encountered in a program, and gives a huge amount of
79  * output for all but the shortest programs.
80  *
81  * The ability to output pattern debugging information lexically, and with much
82  * finer grained control was added, with 'use re qw(Debug ....);' available even
83  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
84  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85  * Those files are compiled and linked into the perl executable, and they are
86  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
87  * to re.pm.
88  *
89  * That would normally mean linking errors when two functions of the same name
90  * are attempted to be placed into the same executable.  That is solved in one
91  * of four ways:
92  *  1)  Static functions aren't known outside the file they are in, so for the
93  *      many functions of that type in this file, it just isn't a problem.
94  *  2)  Most externally known functions are enclosed in
95  *          #ifndef PERL_IN_XSUB_RE
96  *          ...
97  *          #endif
98  *      blocks, so there is only one defintion for them in the whole
99  *      executable, the one in regcomp.c (or regexec.c).  The implication of
100  *      that is any debugging info that comes from them is controlled only by
101  *      -Dr.  Further, any static function they call will also be the version
102  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
104  *      have different names, so that what gets loaded in the executable is
105  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
107  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108  *      versions and their callees are under control of re.pm.   The catch is
109  *      that references to all these go through the regexp_engine structure,
110  *      which is initialized in regcomp.h to the Perl_foo versions, and
111  *      substituted out in lexical scopes where 'use re' is in effect to the
112  *      'my_foo' ones.   That structure is public API, so it would be a hard
113  *      sell to add any additional members.
114  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
115  *      respectively, regexec.c and re_exec.c, they can have two different
116  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
117  *      embed.fnc.
118  *
119  * The bottom line is that if you add code to one of the public functions
120  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
121  * a new function that needs to do debugging or there is a chain of calls from
122  * it that need to do debugging, all functions in the chain should use options
123  * 2) or 4) above.
124  *
125  * A function may have to be split so that debugging stuff is static, but it
126  * calls out to some other function that only gets compiled in regcomp.c to
127  * access data that we don't want to duplicate.
128  */
129 
130 #include "EXTERN.h"
131 #define PERL_IN_REGCOMP_C
132 #include "perl.h"
133 
134 #define REG_COMP_C
135 #ifdef PERL_IN_XSUB_RE
136 #  include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
139 #else
140 #  include "regcomp.h"
141 #endif
142 
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
145 
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
152 
153 #ifndef STATIC
154 #define	STATIC	static
155 #endif
156 
157 /* this is a chain of data about sub patterns we are processing that
158    need to be handled separately/specially in study_chunk. Its so
159    we can simulate recursion without losing state.  */
160 struct scan_frame;
161 typedef struct scan_frame {
162     regnode *last_regnode;      /* last node to process in this frame */
163     regnode *next_regnode;      /* next node to process when last is reached */
164     U32 prev_recursed_depth;
165     I32 stopparen;              /* what stopparen do we use */
166     bool in_gosub;              /* this or an outer frame is for GOSUB */
167 
168     struct scan_frame *this_prev_frame; /* this previous frame */
169     struct scan_frame *prev_frame;      /* previous frame */
170     struct scan_frame *next_frame;      /* next frame */
171 } scan_frame;
172 
173 /* Certain characters are output as a sequence with the first being a
174  * backslash. */
175 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
176 
177 
178 struct RExC_state_t {
179     U32		flags;			/* RXf_* are we folding, multilining? */
180     U32		pm_flags;		/* PMf_* stuff from the calling PMOP */
181     char	*precomp;		/* uncompiled string. */
182     char	*precomp_end;		/* pointer to end of uncompiled string. */
183     REGEXP	*rx_sv;			/* The SV that is the regexp. */
184     regexp	*rx;                    /* perl core regexp structure */
185     regexp_internal	*rxi;           /* internal data for regexp object
186                                            pprivate field */
187     char	*start;			/* Start of input for compile */
188     char	*end;			/* End of input for compile */
189     char	*parse;			/* Input-scan pointer. */
190     char        *copy_start;            /* start of copy of input within
191                                            constructed parse string */
192     char        *save_copy_start;       /* Provides one level of saving
193                                            and restoring 'copy_start' */
194     char        *copy_start_in_input;   /* Position in input string
195                                            corresponding to copy_start */
196     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
197     regnode	*emit_start;		/* Start of emitted-code area */
198     regnode_offset emit;		/* Code-emit pointer */
199     I32		naughty;		/* How bad is this pattern? */
200     I32		sawback;		/* Did we see \1, ...? */
201     SSize_t	size;			/* Number of regnode equivalents in
202                                            pattern */
203     Size_t      sets_depth;              /* Counts recursion depth of already-
204                                            compiled regex set patterns */
205     U32		seen;
206 
207     I32      parens_buf_size;           /* #slots malloced open/close_parens */
208     regnode_offset *open_parens;	/* offsets to open parens */
209     regnode_offset *close_parens;	/* offsets to close parens */
210     HV		*paren_names;		/* Paren names */
211 
212     /* position beyond 'precomp' of the warning message furthest away from
213      * 'precomp'.  During the parse, no warnings are raised for any problems
214      * earlier in the parse than this position.  This works if warnings are
215      * raised the first time a given spot is parsed, and if only one
216      * independent warning is raised for any given spot */
217     Size_t	latest_warn_offset;
218 
219     I32         npar;                   /* Capture buffer count so far in the
220                                            parse, (OPEN) plus one. ("par" 0 is
221                                            the whole pattern)*/
222     I32         total_par;              /* During initial parse, is either 0,
223                                            or -1; the latter indicating a
224                                            reparse is needed.  After that pass,
225                                            it is what 'npar' became after the
226                                            pass.  Hence, it being > 0 indicates
227                                            we are in a reparse situation */
228     I32		nestroot;		/* root parens we are in - used by
229                                            accept */
230     I32		seen_zerolen;
231     regnode     *end_op;                /* END node in program */
232     I32		utf8;		/* whether the pattern is utf8 or not */
233     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
234 				/* XXX use this for future optimisation of case
235 				 * where pattern must be upgraded to utf8. */
236     I32		uni_semantics;	/* If a d charset modifier should use unicode
237 				   rules, even if the pattern is not in
238 				   utf8 */
239 
240     I32         recurse_count;          /* Number of recurse regops we have generated */
241     regnode	**recurse;		/* Recurse regops */
242     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
243                                            through */
244     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
245     I32		in_lookaround;
246     I32		contains_locale;
247     I32		override_recoding;
248     I32         recode_x_to_native;
249     I32		in_multi_char_class;
250     int		code_index;		/* next code_blocks[] slot */
251     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
252 					    within pattern */
253     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
254     scan_frame *frame_head;
255     scan_frame *frame_last;
256     U32         frame_count;
257     AV         *warn_text;
258     HV         *unlexed_names;
259     SV		*runtime_code_qr;	/* qr with the runtime code blocks */
260 #ifdef DEBUGGING
261     const char  *lastparse;
262     I32         lastnum;
263     U32         study_chunk_recursed_count;
264     AV          *paren_name_list;       /* idx -> name */
265     SV          *mysv1;
266     SV          *mysv2;
267 
268 #define RExC_lastparse	(pRExC_state->lastparse)
269 #define RExC_lastnum	(pRExC_state->lastnum)
270 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
271 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
272 #define RExC_mysv	(pRExC_state->mysv1)
273 #define RExC_mysv1	(pRExC_state->mysv1)
274 #define RExC_mysv2	(pRExC_state->mysv2)
275 
276 #endif
277     bool        seen_d_op;
278     bool        strict;
279     bool        study_started;
280     bool        in_script_run;
281     bool        use_BRANCHJ;
282     bool        sWARN_EXPERIMENTAL__VLB;
283     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
284 };
285 
286 #define RExC_flags	(pRExC_state->flags)
287 #define RExC_pm_flags	(pRExC_state->pm_flags)
288 #define RExC_precomp	(pRExC_state->precomp)
289 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
290 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
291 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
292 #define RExC_precomp_end (pRExC_state->precomp_end)
293 #define RExC_rx_sv	(pRExC_state->rx_sv)
294 #define RExC_rx		(pRExC_state->rx)
295 #define RExC_rxi	(pRExC_state->rxi)
296 #define RExC_start	(pRExC_state->start)
297 #define RExC_end	(pRExC_state->end)
298 #define RExC_parse	(pRExC_state->parse)
299 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
300 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
301 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
302                                                    under /d from /u ? */
303 
304 #ifdef RE_TRACK_PATTERN_OFFSETS
305 #  define RExC_offsets	(RExC_rxi->u.offsets) /* I am not like the
306                                                          others */
307 #endif
308 #define RExC_emit	(pRExC_state->emit)
309 #define RExC_emit_start	(pRExC_state->emit_start)
310 #define RExC_sawback	(pRExC_state->sawback)
311 #define RExC_seen	(pRExC_state->seen)
312 #define RExC_size	(pRExC_state->size)
313 #define RExC_maxlen        (pRExC_state->maxlen)
314 #define RExC_npar	(pRExC_state->npar)
315 #define RExC_total_parens	(pRExC_state->total_par)
316 #define RExC_parens_buf_size	(pRExC_state->parens_buf_size)
317 #define RExC_nestroot   (pRExC_state->nestroot)
318 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
319 #define RExC_utf8	(pRExC_state->utf8)
320 #define RExC_uni_semantics	(pRExC_state->uni_semantics)
321 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
322 #define RExC_open_parens	(pRExC_state->open_parens)
323 #define RExC_close_parens	(pRExC_state->close_parens)
324 #define RExC_end_op	(pRExC_state->end_op)
325 #define RExC_paren_names	(pRExC_state->paren_names)
326 #define RExC_recurse	(pRExC_state->recurse)
327 #define RExC_recurse_count	(pRExC_state->recurse_count)
328 #define RExC_sets_depth         (pRExC_state->sets_depth)
329 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
330 #define RExC_study_chunk_recursed_bytes  \
331                                    (pRExC_state->study_chunk_recursed_bytes)
332 #define RExC_in_lookaround	(pRExC_state->in_lookaround)
333 #define RExC_contains_locale	(pRExC_state->contains_locale)
334 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
335 
336 #ifdef EBCDIC
337 #  define SET_recode_x_to_native(x)                                         \
338                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
339 #else
340 #  define SET_recode_x_to_native(x) NOOP
341 #endif
342 
343 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
344 #define RExC_frame_head (pRExC_state->frame_head)
345 #define RExC_frame_last (pRExC_state->frame_last)
346 #define RExC_frame_count (pRExC_state->frame_count)
347 #define RExC_strict (pRExC_state->strict)
348 #define RExC_study_started      (pRExC_state->study_started)
349 #define RExC_warn_text (pRExC_state->warn_text)
350 #define RExC_in_script_run      (pRExC_state->in_script_run)
351 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
352 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
353 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
354 #define RExC_unlexed_names (pRExC_state->unlexed_names)
355 
356 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
357  * a flag to disable back-off on the fixed/floating substrings - if it's
358  * a high complexity pattern we assume the benefit of avoiding a full match
359  * is worth the cost of checking for the substrings even if they rarely help.
360  */
361 #define RExC_naughty	(pRExC_state->naughty)
362 #define TOO_NAUGHTY (10)
363 #define MARK_NAUGHTY(add) \
364     if (RExC_naughty < TOO_NAUGHTY) \
365         RExC_naughty += (add)
366 #define MARK_NAUGHTY_EXP(exp, add) \
367     if (RExC_naughty < TOO_NAUGHTY) \
368         RExC_naughty += RExC_naughty / (exp) + (add)
369 
370 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
371 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
372 	((*s) == '{' && regcurly(s)))
373 
374 /*
375  * Flags to be passed up and down.
376  */
377 #define	WORST		0	/* Worst case. */
378 #define	HASWIDTH	0x01	/* Known to not match null strings, could match
379                                    non-null ones. */
380 
381 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
382  * character.  (There needs to be a case: in the switch statement in regexec.c
383  * for any node marked SIMPLE.)  Note that this is not the same thing as
384  * REGNODE_SIMPLE */
385 #define	SIMPLE		0x02
386 #define	SPSTART		0x04	/* Starts with * or + */
387 #define POSTPONED	0x08    /* (?1),(?&name), (??{...}) or similar */
388 #define TRYAGAIN	0x10	/* Weeded out a declaration. */
389 #define RESTART_PARSE   0x20    /* Need to redo the parse */
390 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
391                                    calcuate sizes as UTF-8 */
392 
393 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
394 
395 /* whether trie related optimizations are enabled */
396 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
397 #define TRIE_STUDY_OPT
398 #define FULL_TRIE_STUDY
399 #define TRIE_STCLASS
400 #endif
401 
402 
403 
404 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
405 #define PBITVAL(paren) (1 << ((paren) & 7))
406 #define PAREN_OFFSET(depth) \
407     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
408 #define PAREN_TEST(depth, paren) \
409     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
410 #define PAREN_SET(depth, paren) \
411     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
412 #define PAREN_UNSET(depth, paren) \
413     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
414 
415 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
416                                      if (!UTF) {                           \
417                                          *flagp = RESTART_PARSE|NEED_UTF8; \
418                                          return 0;                         \
419                                      }                                     \
420                              } STMT_END
421 
422 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
423  * pattern is in UTF-8.  This latter condition is in case the outermost rules
424  * are locale.  See GH #17278 */
425 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
426 
427 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
428  * a flag that indicates we need to override /d with /u as a result of
429  * something in the pattern.  It should only be used in regards to calling
430  * set_regex_charset() or get_regex_charset() */
431 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
432     STMT_START {                                                            \
433             if (DEPENDS_SEMANTICS) {                                        \
434                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
435                 RExC_uni_semantics = 1;                                     \
436                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
437                     /* No need to restart the parse if we haven't seen      \
438                      * anything that differs between /u and /d, and no need \
439                      * to restart immediately if we're going to reparse     \
440                      * anyway to count parens */                            \
441                     *flagp |= RESTART_PARSE;                                \
442                     return restart_retval;                                  \
443                 }                                                           \
444             }                                                               \
445     } STMT_END
446 
447 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
448     STMT_START {                                                            \
449                 RExC_use_BRANCHJ = 1;                                       \
450                 *flagp |= RESTART_PARSE;                                    \
451                 return restart_retval;                                      \
452     } STMT_END
453 
454 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
455  * less.  After that, it must always be positive, because the whole re is
456  * considered to be surrounded by virtual parens.  Setting it to negative
457  * indicates there is some construct that needs to know the actual number of
458  * parens to be properly handled.  And that means an extra pass will be
459  * required after we've counted them all */
460 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
461 #define REQUIRE_PARENS_PASS                                                 \
462     STMT_START {  /* No-op if have completed a pass */                      \
463                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
464     } STMT_END
465 #define IN_PARENS_PASS (RExC_total_parens < 0)
466 
467 
468 /* This is used to return failure (zero) early from the calling function if
469  * various flags in 'flags' are set.  Two flags always cause a return:
470  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
471  * additional flags that should cause a return; 0 if none.  If the return will
472  * be done, '*flagp' is first set to be all of the flags that caused the
473  * return. */
474 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
475     STMT_START {                                                            \
476             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
477                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
478                 return 0;                                                   \
479             }                                                               \
480     } STMT_END
481 
482 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
483 
484 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
485                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
486 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
487                                     if (MUST_RESTART(*(flagp))) return 0
488 
489 /* This converts the named class defined in regcomp.h to its equivalent class
490  * number defined in handy.h. */
491 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
492 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
493 
494 #define _invlist_union_complement_2nd(a, b, output) \
495                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
496 #define _invlist_intersection_complement_2nd(a, b, output) \
497                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
498 
499 /* We add a marker if we are deferring expansion of a property that is both
500  * 1) potentiallly user-defined; and
501  * 2) could also be an official Unicode property.
502  *
503  * Without this marker, any deferred expansion can only be for a user-defined
504  * one.  This marker shouldn't conflict with any that could be in a legal name,
505  * and is appended to its name to indicate this.  There is a string and
506  * character form */
507 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
508 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
509 
510 /* What is infinity for optimization purposes */
511 #define OPTIMIZE_INFTY  SSize_t_MAX
512 
513 /* About scan_data_t.
514 
515   During optimisation we recurse through the regexp program performing
516   various inplace (keyhole style) optimisations. In addition study_chunk
517   and scan_commit populate this data structure with information about
518   what strings MUST appear in the pattern. We look for the longest
519   string that must appear at a fixed location, and we look for the
520   longest string that may appear at a floating location. So for instance
521   in the pattern:
522 
523     /FOO[xX]A.*B[xX]BAR/
524 
525   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
526   strings (because they follow a .* construct). study_chunk will identify
527   both FOO and BAR as being the longest fixed and floating strings respectively.
528 
529   The strings can be composites, for instance
530 
531      /(f)(o)(o)/
532 
533   will result in a composite fixed substring 'foo'.
534 
535   For each string some basic information is maintained:
536 
537   - min_offset
538     This is the position the string must appear at, or not before.
539     It also implicitly (when combined with minlenp) tells us how many
540     characters must match before the string we are searching for.
541     Likewise when combined with minlenp and the length of the string it
542     tells us how many characters must appear after the string we have
543     found.
544 
545   - max_offset
546     Only used for floating strings. This is the rightmost point that
547     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
548     string can occur infinitely far to the right.
549     For fixed strings, it is equal to min_offset.
550 
551   - minlenp
552     A pointer to the minimum number of characters of the pattern that the
553     string was found inside. This is important as in the case of positive
554     lookahead or positive lookbehind we can have multiple patterns
555     involved. Consider
556 
557     /(?=FOO).*F/
558 
559     The minimum length of the pattern overall is 3, the minimum length
560     of the lookahead part is 3, but the minimum length of the part that
561     will actually match is 1. So 'FOO's minimum length is 3, but the
562     minimum length for the F is 1. This is important as the minimum length
563     is used to determine offsets in front of and behind the string being
564     looked for.  Since strings can be composites this is the length of the
565     pattern at the time it was committed with a scan_commit. Note that
566     the length is calculated by study_chunk, so that the minimum lengths
567     are not known until the full pattern has been compiled, thus the
568     pointer to the value.
569 
570   - lookbehind
571 
572     In the case of lookbehind the string being searched for can be
573     offset past the start point of the final matching string.
574     If this value was just blithely removed from the min_offset it would
575     invalidate some of the calculations for how many chars must match
576     before or after (as they are derived from min_offset and minlen and
577     the length of the string being searched for).
578     When the final pattern is compiled and the data is moved from the
579     scan_data_t structure into the regexp structure the information
580     about lookbehind is factored in, with the information that would
581     have been lost precalculated in the end_shift field for the
582     associated string.
583 
584   The fields pos_min and pos_delta are used to store the minimum offset
585   and the delta to the maximum offset at the current point in the pattern.
586 
587 */
588 
589 struct scan_data_substrs {
590     SV      *str;       /* longest substring found in pattern */
591     SSize_t min_offset; /* earliest point in string it can appear */
592     SSize_t max_offset; /* latest point in string it can appear */
593     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
594     SSize_t lookbehind; /* is the pos of the string modified by LB */
595     I32 flags;          /* per substring SF_* and SCF_* flags */
596 };
597 
598 typedef struct scan_data_t {
599     /*I32 len_min;      unused */
600     /*I32 len_delta;    unused */
601     SSize_t pos_min;
602     SSize_t pos_delta;
603     SV *last_found;
604     SSize_t last_end;	    /* min value, <0 unless valid. */
605     SSize_t last_start_min;
606     SSize_t last_start_max;
607     U8      cur_is_floating; /* whether the last_* values should be set as
608                               * the next fixed (0) or floating (1)
609                               * substring */
610 
611     /* [0] is longest fixed substring so far, [1] is longest float so far */
612     struct scan_data_substrs  substrs[2];
613 
614     I32 flags;             /* common SF_* and SCF_* flags */
615     I32 whilem_c;
616     SSize_t *last_closep;
617     regnode_ssc *start_class;
618 } scan_data_t;
619 
620 /*
621  * Forward declarations for pregcomp()'s friends.
622  */
623 
624 static const scan_data_t zero_scan_data = {
625     0, 0, NULL, 0, 0, 0, 0,
626     {
627         { NULL, 0, 0, 0, 0, 0 },
628         { NULL, 0, 0, 0, 0, 0 },
629     },
630     0, 0, NULL, NULL
631 };
632 
633 /* study flags */
634 
635 #define SF_BEFORE_SEOL		0x0001
636 #define SF_BEFORE_MEOL		0x0002
637 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
638 
639 #define SF_IS_INF		0x0040
640 #define SF_HAS_PAR		0x0080
641 #define SF_IN_PAR		0x0100
642 #define SF_HAS_EVAL		0x0200
643 
644 
645 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
646  * longest substring in the pattern. When it is not set the optimiser keeps
647  * track of position, but does not keep track of the actual strings seen,
648  *
649  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
650  * /foo/i will not.
651  *
652  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
653  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
654  * turned off because of the alternation (BRANCH). */
655 #define SCF_DO_SUBSTR		0x0400
656 
657 #define SCF_DO_STCLASS_AND	0x0800
658 #define SCF_DO_STCLASS_OR	0x1000
659 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
660 #define SCF_WHILEM_VISITED_POS	0x2000
661 
662 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
663 #define SCF_SEEN_ACCEPT         0x8000
664 #define SCF_TRIE_DOING_RESTUDY 0x10000
665 #define SCF_IN_DEFINE          0x20000
666 
667 
668 
669 
670 #define UTF cBOOL(RExC_utf8)
671 
672 /* The enums for all these are ordered so things work out correctly */
673 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
674 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
675                                                      == REGEX_DEPENDS_CHARSET)
676 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
677 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
678                                                      >= REGEX_UNICODE_CHARSET)
679 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
680                                             == REGEX_ASCII_RESTRICTED_CHARSET)
681 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
682                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
683 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
684                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
685 
686 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
687 
688 /* For programs that want to be strictly Unicode compatible by dying if any
689  * attempt is made to match a non-Unicode code point against a Unicode
690  * property.  */
691 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
692 
693 #define OOB_NAMEDCLASS		-1
694 
695 /* There is no code point that is out-of-bounds, so this is problematic.  But
696  * its only current use is to initialize a variable that is always set before
697  * looked at. */
698 #define OOB_UNICODE		0xDEADBEEF
699 
700 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
701 
702 
703 /* length of regex to show in messages that don't mark a position within */
704 #define RegexLengthToShowInErrorMessages 127
705 
706 /*
707  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
708  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
709  * op/pragma/warn/regcomp.
710  */
711 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
712 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
713 
714 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
715                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
716 
717 /* The code in this file in places uses one level of recursion with parsing
718  * rebased to an alternate string constructed by us in memory.  This can take
719  * the form of something that is completely different from the input, or
720  * something that uses the input as part of the alternate.  In the first case,
721  * there should be no possibility of an error, as we are in complete control of
722  * the alternate string.  But in the second case we don't completely control
723  * the input portion, so there may be errors in that.  Here's an example:
724  *      /[abc\x{DF}def]/ui
725  * is handled specially because \x{df} folds to a sequence of more than one
726  * character: 'ss'.  What is done is to create and parse an alternate string,
727  * which looks like this:
728  *      /(?:\x{DF}|[abc\x{DF}def])/ui
729  * where it uses the input unchanged in the middle of something it constructs,
730  * which is a branch for the DF outside the character class, and clustering
731  * parens around the whole thing. (It knows enough to skip the DF inside the
732  * class while in this substitute parse.) 'abc' and 'def' may have errors that
733  * need to be reported.  The general situation looks like this:
734  *
735  *                                       |<------- identical ------>|
736  *              sI                       tI               xI       eI
737  * Input:       ---------------------------------------------------------------
738  * Constructed:         ---------------------------------------------------
739  *                      sC               tC               xC       eC     EC
740  *                                       |<------- identical ------>|
741  *
742  * sI..eI   is the portion of the input pattern we are concerned with here.
743  * sC..EC   is the constructed substitute parse string.
744  *  sC..tC  is constructed by us
745  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
746  *          In the diagram, these are vertically aligned.
747  *  eC..EC  is also constructed by us.
748  * xC       is the position in the substitute parse string where we found a
749  *          problem.
750  * xI       is the position in the original pattern corresponding to xC.
751  *
752  * We want to display a message showing the real input string.  Thus we need to
753  * translate from xC to xI.  We know that xC >= tC, since the portion of the
754  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
755  * get:
756  *      xI = tI + (xC - tC)
757  *
758  * When the substitute parse is constructed, the code needs to set:
759  *      RExC_start (sC)
760  *      RExC_end (eC)
761  *      RExC_copy_start_in_input  (tI)
762  *      RExC_copy_start_in_constructed (tC)
763  * and restore them when done.
764  *
765  * During normal processing of the input pattern, both
766  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
767  * sI, so that xC equals xI.
768  */
769 
770 #define sI              RExC_precomp
771 #define eI              RExC_precomp_end
772 #define sC              RExC_start
773 #define eC              RExC_end
774 #define tI              RExC_copy_start_in_input
775 #define tC              RExC_copy_start_in_constructed
776 #define xI(xC)          (tI + (xC - tC))
777 #define xI_offset(xC)   (xI(xC) - sI)
778 
779 #define REPORT_LOCATION_ARGS(xC)                                            \
780     UTF8fARG(UTF,                                                           \
781              (xI(xC) > eI) /* Don't run off end */                          \
782               ? eI - sI   /* Length before the <--HERE */                   \
783               : ((xI_offset(xC) >= 0)                                       \
784                  ? xI_offset(xC)                                            \
785                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
786                                     IVdf " trying to output message for "   \
787                                     " pattern %.*s",                        \
788                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
789                                     ((int) (eC - sC)), sC), 0)),            \
790              sI),         /* The input pattern printed up to the <--HERE */ \
791     UTF8fARG(UTF,                                                           \
792              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
793              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
794 
795 /* Used to point after bad bytes for an error message, but avoid skipping
796  * past a nul byte. */
797 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
798 
799 /* Set up to clean up after our imminent demise */
800 #define PREPARE_TO_DIE                                                      \
801     STMT_START {					                    \
802         if (RExC_rx_sv)                                                     \
803             SAVEFREESV(RExC_rx_sv);                                         \
804         if (RExC_open_parens)                                               \
805             SAVEFREEPV(RExC_open_parens);                                   \
806         if (RExC_close_parens)                                              \
807             SAVEFREEPV(RExC_close_parens);                                  \
808     } STMT_END
809 
810 /*
811  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
812  * arg. Show regex, up to a maximum length. If it's too long, chop and add
813  * "...".
814  */
815 #define _FAIL(code) STMT_START {					\
816     const char *ellipses = "";						\
817     IV len = RExC_precomp_end - RExC_precomp;				\
818 									\
819     PREPARE_TO_DIE;						        \
820     if (len > RegexLengthToShowInErrorMessages) {			\
821 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
822 	len = RegexLengthToShowInErrorMessages - 10;			\
823 	ellipses = "...";						\
824     }									\
825     code;                                                               \
826 } STMT_END
827 
828 #define	FAIL(msg) _FAIL(			    \
829     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",	    \
830 	    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
831 
832 #define	FAIL2(msg,arg) _FAIL(			    \
833     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",	    \
834 	    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
835 
836 #define	FAIL3(msg,arg1,arg2) _FAIL(			    \
837     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",	    \
838      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
839 
840 /*
841  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
842  */
843 #define	Simple_vFAIL(m) STMT_START {					\
844     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
845 	    m, REPORT_LOCATION_ARGS(RExC_parse));	                \
846 } STMT_END
847 
848 /*
849  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
850  */
851 #define	vFAIL(m) STMT_START {				\
852     PREPARE_TO_DIE;                                     \
853     Simple_vFAIL(m);					\
854 } STMT_END
855 
856 /*
857  * Like Simple_vFAIL(), but accepts two arguments.
858  */
859 #define	Simple_vFAIL2(m,a1) STMT_START {			\
860     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,		\
861                       REPORT_LOCATION_ARGS(RExC_parse));	\
862 } STMT_END
863 
864 /*
865  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
866  */
867 #define	vFAIL2(m,a1) STMT_START {			\
868     PREPARE_TO_DIE;                                     \
869     Simple_vFAIL2(m, a1);				\
870 } STMT_END
871 
872 
873 /*
874  * Like Simple_vFAIL(), but accepts three arguments.
875  */
876 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
877     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,		\
878 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
879 } STMT_END
880 
881 /*
882  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
883  */
884 #define	vFAIL3(m,a1,a2) STMT_START {			\
885     PREPARE_TO_DIE;                                     \
886     Simple_vFAIL3(m, a1, a2);				\
887 } STMT_END
888 
889 /*
890  * Like Simple_vFAIL(), but accepts four arguments.
891  */
892 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
893     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,	\
894 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
895 } STMT_END
896 
897 #define	vFAIL4(m,a1,a2,a3) STMT_START {			\
898     PREPARE_TO_DIE;                                     \
899     Simple_vFAIL4(m, a1, a2, a3);			\
900 } STMT_END
901 
902 /* A specialized version of vFAIL2 that works with UTF8f */
903 #define vFAIL2utf8f(m, a1) STMT_START {             \
904     PREPARE_TO_DIE;                                 \
905     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
906             REPORT_LOCATION_ARGS(RExC_parse));      \
907 } STMT_END
908 
909 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
910     PREPARE_TO_DIE;                                     \
911     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
912             REPORT_LOCATION_ARGS(RExC_parse));          \
913 } STMT_END
914 
915 /* Setting this to NULL is a signal to not output warnings */
916 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
917     STMT_START {                                                            \
918       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
919       RExC_copy_start_in_constructed = NULL;                                \
920     } STMT_END
921 #define RESTORE_WARNINGS                                                    \
922     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
923 
924 /* Since a warning can be generated multiple times as the input is reparsed, we
925  * output it the first time we come to that point in the parse, but suppress it
926  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
927  * generate any warnings */
928 #define TO_OUTPUT_WARNINGS(loc)                                         \
929   (   RExC_copy_start_in_constructed                                    \
930    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
931 
932 /* After we've emitted a warning, we save the position in the input so we don't
933  * output it again */
934 #define UPDATE_WARNINGS_LOC(loc)                                        \
935     STMT_START {                                                        \
936         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
937             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
938                                                        - RExC_precomp;  \
939         }                                                               \
940     } STMT_END
941 
942 /* 'warns' is the output of the packWARNx macro used in 'code' */
943 #define _WARN_HELPER(loc, warns, code)                                  \
944     STMT_START {                                                        \
945         if (! RExC_copy_start_in_constructed) {                         \
946             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
947                               " expected at '%s'",                      \
948                               __FILE__, __LINE__, loc);                 \
949         }                                                               \
950         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
951             if (ckDEAD(warns))                                          \
952                 PREPARE_TO_DIE;                                         \
953             code;                                                       \
954             UPDATE_WARNINGS_LOC(loc);                                   \
955         }                                                               \
956     } STMT_END
957 
958 /* m is not necessarily a "literal string", in this macro */
959 #define warn_non_literal_string(loc, packed_warn, m)                    \
960     _WARN_HELPER(loc, packed_warn,                                      \
961                       Perl_warner(aTHX_ packed_warn,                    \
962                                        "%s" REPORT_LOCATION,            \
963                                   m, REPORT_LOCATION_ARGS(loc)))
964 #define reg_warn_non_literal_string(loc, m)                             \
965                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
966 
967 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
968     STMT_START {                                                            \
969                 char * format;                                              \
970                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
971                 Newx(format, format_size, char);                            \
972                 my_strlcpy(format, m, format_size);                         \
973                 my_strlcat(format, REPORT_LOCATION, format_size);           \
974                 SAVEFREEPV(format);                                         \
975                 _WARN_HELPER(loc, packwarn,                                 \
976                       Perl_ck_warner(aTHX_ packwarn,                        \
977                                         format,                             \
978                                         a1, REPORT_LOCATION_ARGS(loc)));    \
979     } STMT_END
980 
981 #define	ckWARNreg(loc,m) 					        \
982     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
983                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
984                                           m REPORT_LOCATION,	        \
985 	                                  REPORT_LOCATION_ARGS(loc)))
986 
987 #define	vWARN(loc, m)           				        \
988     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
989                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
990                                        m REPORT_LOCATION,               \
991                                        REPORT_LOCATION_ARGS(loc)))      \
992 
993 #define	vWARN_dep(loc, m)           				        \
994     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
995                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
996                                        m REPORT_LOCATION,               \
997 	                               REPORT_LOCATION_ARGS(loc)))
998 
999 #define	ckWARNdep(loc,m)            				        \
1000     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
1001                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
1002 	                                    m REPORT_LOCATION,          \
1003 	                                    REPORT_LOCATION_ARGS(loc)))
1004 
1005 #define	ckWARNregdep(loc,m)             				    \
1006     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
1007                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
1008                                                       WARN_REGEXP),         \
1009 	                                     m REPORT_LOCATION,             \
1010 	                                     REPORT_LOCATION_ARGS(loc)))
1011 
1012 #define	ckWARN2reg_d(loc,m, a1)             				    \
1013     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1014                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1015 	                                    m REPORT_LOCATION,              \
1016 	                                    a1, REPORT_LOCATION_ARGS(loc)))
1017 
1018 #define	ckWARN2reg(loc, m, a1)                                              \
1019     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1020                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1021                                           m REPORT_LOCATION,	            \
1022                                           a1, REPORT_LOCATION_ARGS(loc)))
1023 
1024 #define	vWARN3(loc, m, a1, a2)          				    \
1025     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1026                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1027                                        m REPORT_LOCATION,                   \
1028 	                               a1, a2, REPORT_LOCATION_ARGS(loc)))
1029 
1030 #define	ckWARN3reg(loc, m, a1, a2)          				    \
1031     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1032                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1033                                           m REPORT_LOCATION,                \
1034 	                                  a1, a2,                           \
1035                                           REPORT_LOCATION_ARGS(loc)))
1036 
1037 #define	vWARN4(loc, m, a1, a2, a3)          				\
1038     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1039                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1040                                        m REPORT_LOCATION,               \
1041 	                               a1, a2, a3,                      \
1042                                        REPORT_LOCATION_ARGS(loc)))
1043 
1044 #define	ckWARN4reg(loc, m, a1, a2, a3)          			\
1045     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1046                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1047                                           m REPORT_LOCATION,            \
1048 	                                  a1, a2, a3,                   \
1049                                           REPORT_LOCATION_ARGS(loc)))
1050 
1051 #define	vWARN5(loc, m, a1, a2, a3, a4)          			\
1052     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1053                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1054                                        m REPORT_LOCATION,		\
1055 	                               a1, a2, a3, a4,                  \
1056                                        REPORT_LOCATION_ARGS(loc)))
1057 
1058 #define	ckWARNexperimental(loc, class, m)                               \
1059     STMT_START {                                                        \
1060         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1061             RExC_warned_ ## class = 1;                                  \
1062             _WARN_HELPER(loc, packWARN(class),                          \
1063                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1064                                             m REPORT_LOCATION,          \
1065                                             REPORT_LOCATION_ARGS(loc)));\
1066         }                                                               \
1067     } STMT_END
1068 
1069 /* Convert between a pointer to a node and its offset from the beginning of the
1070  * program */
1071 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1072 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1073 
1074 /* Macros for recording node offsets.   20001227 mjd@plover.com
1075  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
1076  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
1077  * Element 0 holds the number n.
1078  * Position is 1 indexed.
1079  */
1080 #ifndef RE_TRACK_PATTERN_OFFSETS
1081 #define Set_Node_Offset_To_R(offset,byte)
1082 #define Set_Node_Offset(node,byte)
1083 #define Set_Cur_Node_Offset
1084 #define Set_Node_Length_To_R(node,len)
1085 #define Set_Node_Length(node,len)
1086 #define Set_Node_Cur_Length(node,start)
1087 #define Node_Offset(n)
1088 #define Node_Length(n)
1089 #define Set_Node_Offset_Length(node,offset,len)
1090 #define ProgLen(ri) ri->u.proglen
1091 #define SetProgLen(ri,x) ri->u.proglen = x
1092 #define Track_Code(code)
1093 #else
1094 #define ProgLen(ri) ri->u.offsets[0]
1095 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1096 #define Set_Node_Offset_To_R(offset,byte) STMT_START {			\
1097 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
1098 		    __LINE__, (int)(offset), (int)(byte)));		\
1099 	if((offset) < 0) {						\
1100 	    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1101                                          (int)(offset));                \
1102 	} else {							\
1103             RExC_offsets[2*(offset)-1] = (byte);	                \
1104 	}								\
1105 } STMT_END
1106 
1107 #define Set_Node_Offset(node,byte)                                      \
1108     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1109 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1110 
1111 #define Set_Node_Length_To_R(node,len) STMT_START {			\
1112 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
1113 		__LINE__, (int)(node), (int)(len)));			\
1114 	if((node) < 0) {						\
1115 	    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1116                                          (int)(node));                  \
1117 	} else {							\
1118 	    RExC_offsets[2*(node)] = (len);				\
1119 	}								\
1120 } STMT_END
1121 
1122 #define Set_Node_Length(node,len) \
1123     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1124 #define Set_Node_Cur_Length(node, start)                \
1125     Set_Node_Length(node, RExC_parse - start)
1126 
1127 /* Get offsets and lengths */
1128 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1129 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1130 
1131 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
1132     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));	\
1133     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));	\
1134 } STMT_END
1135 
1136 #define Track_Code(code) STMT_START { code } STMT_END
1137 #endif
1138 
1139 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1140 #define EXPERIMENTAL_INPLACESCAN
1141 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1142 
1143 #ifdef DEBUGGING
1144 int
Perl_re_printf(pTHX_ const char * fmt,...)1145 Perl_re_printf(pTHX_ const char *fmt, ...)
1146 {
1147     va_list ap;
1148     int result;
1149     PerlIO *f= Perl_debug_log;
1150     PERL_ARGS_ASSERT_RE_PRINTF;
1151     va_start(ap, fmt);
1152     result = PerlIO_vprintf(f, fmt, ap);
1153     va_end(ap);
1154     return result;
1155 }
1156 
1157 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)1158 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1159 {
1160     va_list ap;
1161     int result;
1162     PerlIO *f= Perl_debug_log;
1163     PERL_ARGS_ASSERT_RE_INDENTF;
1164     va_start(ap, depth);
1165     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1166     result = PerlIO_vprintf(f, fmt, ap);
1167     va_end(ap);
1168     return result;
1169 }
1170 #endif /* DEBUGGING */
1171 
1172 #define DEBUG_RExC_seen()                                                   \
1173         DEBUG_OPTIMISE_MORE_r({                                             \
1174             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1175                                                                             \
1176             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1177                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1178                                                                             \
1179             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1180                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1181                                                                             \
1182             if (RExC_seen & REG_GPOS_SEEN)                                  \
1183                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1184                                                                             \
1185             if (RExC_seen & REG_RECURSE_SEEN)                               \
1186                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1187                                                                             \
1188             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1189                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1190                                                                             \
1191             if (RExC_seen & REG_VERBARG_SEEN)                               \
1192                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1193                                                                             \
1194             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1195                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1196                                                                             \
1197             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1198                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1199                                                                             \
1200             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1201                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1202                                                                             \
1203             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1204                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1205                                                                             \
1206             Perl_re_printf( aTHX_ "\n");                                    \
1207         });
1208 
1209 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1210   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1211 
1212 
1213 #ifdef DEBUGGING
1214 static void
S_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)1215 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1216                                     const char *close_str)
1217 {
1218     if (!flags)
1219         return;
1220 
1221     Perl_re_printf( aTHX_  "%s", open_str);
1222     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1223     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1224     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1225     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1226     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1227     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1228     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1229     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1230     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1231     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1232     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1233     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1234     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1235     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1236     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1237     Perl_re_printf( aTHX_  "%s", close_str);
1238 }
1239 
1240 
1241 static void
S_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf)1242 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1243                     U32 depth, int is_inf)
1244 {
1245     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1246 
1247     DEBUG_OPTIMISE_MORE_r({
1248         if (!data)
1249             return;
1250         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1251             depth,
1252             where,
1253             (IV)data->pos_min,
1254             (IV)data->pos_delta,
1255             (UV)data->flags
1256         );
1257 
1258         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1259 
1260         Perl_re_printf( aTHX_
1261             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1262             (IV)data->whilem_c,
1263             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1264             is_inf ? "INF " : ""
1265         );
1266 
1267         if (data->last_found) {
1268             int i;
1269             Perl_re_printf(aTHX_
1270                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1271                     SvPVX_const(data->last_found),
1272                     (IV)data->last_end,
1273                     (IV)data->last_start_min,
1274                     (IV)data->last_start_max
1275             );
1276 
1277             for (i = 0; i < 2; i++) {
1278                 Perl_re_printf(aTHX_
1279                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1280                     data->cur_is_floating == i ? "*" : "",
1281                     i ? "Float" : "Fixed",
1282                     SvPVX_const(data->substrs[i].str),
1283                     (IV)data->substrs[i].min_offset,
1284                     (IV)data->substrs[i].max_offset
1285                 );
1286                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1287             }
1288         }
1289 
1290         Perl_re_printf( aTHX_ "\n");
1291     });
1292 }
1293 
1294 
1295 static void
S_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)1296 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1297                 regnode *scan, U32 depth, U32 flags)
1298 {
1299     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1300 
1301     DEBUG_OPTIMISE_r({
1302         regnode *Next;
1303 
1304         if (!scan)
1305             return;
1306         Next = regnext(scan);
1307         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1308         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1309             depth,
1310             str,
1311             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1312             Next ? (REG_NODE_NUM(Next)) : 0 );
1313         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1314         Perl_re_printf( aTHX_  "\n");
1315    });
1316 }
1317 
1318 
1319 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1320                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1321 
1322 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1323                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1324 
1325 #else
1326 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1327 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1328 #endif
1329 
1330 
1331 /* =========================================================
1332  * BEGIN edit_distance stuff.
1333  *
1334  * This calculates how many single character changes of any type are needed to
1335  * transform a string into another one.  It is taken from version 3.1 of
1336  *
1337  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1338  */
1339 
1340 /* Our unsorted dictionary linked list.   */
1341 /* Note we use UVs, not chars. */
1342 
1343 struct dictionary{
1344   UV key;
1345   UV value;
1346   struct dictionary* next;
1347 };
1348 typedef struct dictionary item;
1349 
1350 
1351 PERL_STATIC_INLINE item*
push(UV key,item * curr)1352 push(UV key, item* curr)
1353 {
1354     item* head;
1355     Newx(head, 1, item);
1356     head->key = key;
1357     head->value = 0;
1358     head->next = curr;
1359     return head;
1360 }
1361 
1362 
1363 PERL_STATIC_INLINE item*
find(item * head,UV key)1364 find(item* head, UV key)
1365 {
1366     item* iterator = head;
1367     while (iterator){
1368         if (iterator->key == key){
1369             return iterator;
1370         }
1371         iterator = iterator->next;
1372     }
1373 
1374     return NULL;
1375 }
1376 
1377 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)1378 uniquePush(item* head, UV key)
1379 {
1380     item* iterator = head;
1381 
1382     while (iterator){
1383         if (iterator->key == key) {
1384             return head;
1385         }
1386         iterator = iterator->next;
1387     }
1388 
1389     return push(key, head);
1390 }
1391 
1392 PERL_STATIC_INLINE void
dict_free(item * head)1393 dict_free(item* head)
1394 {
1395     item* iterator = head;
1396 
1397     while (iterator) {
1398         item* temp = iterator;
1399         iterator = iterator->next;
1400         Safefree(temp);
1401     }
1402 
1403     head = NULL;
1404 }
1405 
1406 /* End of Dictionary Stuff */
1407 
1408 /* All calculations/work are done here */
1409 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)1410 S_edit_distance(const UV* src,
1411                 const UV* tgt,
1412                 const STRLEN x,             /* length of src[] */
1413                 const STRLEN y,             /* length of tgt[] */
1414                 const SSize_t maxDistance
1415 )
1416 {
1417     item *head = NULL;
1418     UV swapCount, swapScore, targetCharCount, i, j;
1419     UV *scores;
1420     UV score_ceil = x + y;
1421 
1422     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1423 
1424     /* intialize matrix start values */
1425     Newx(scores, ( (x + 2) * (y + 2)), UV);
1426     scores[0] = score_ceil;
1427     scores[1 * (y + 2) + 0] = score_ceil;
1428     scores[0 * (y + 2) + 1] = score_ceil;
1429     scores[1 * (y + 2) + 1] = 0;
1430     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1431 
1432     /* work loops    */
1433     /* i = src index */
1434     /* j = tgt index */
1435     for (i=1;i<=x;i++) {
1436         if (i < x)
1437             head = uniquePush(head, src[i]);
1438         scores[(i+1) * (y + 2) + 1] = i;
1439         scores[(i+1) * (y + 2) + 0] = score_ceil;
1440         swapCount = 0;
1441 
1442         for (j=1;j<=y;j++) {
1443             if (i == 1) {
1444                 if(j < y)
1445                 head = uniquePush(head, tgt[j]);
1446                 scores[1 * (y + 2) + (j + 1)] = j;
1447                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1448             }
1449 
1450             targetCharCount = find(head, tgt[j-1])->value;
1451             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1452 
1453             if (src[i-1] != tgt[j-1]){
1454                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1455             }
1456             else {
1457                 swapCount = j;
1458                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1459             }
1460         }
1461 
1462         find(head, src[i-1])->value = i;
1463     }
1464 
1465     {
1466         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1467         dict_free(head);
1468         Safefree(scores);
1469         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1470     }
1471 }
1472 
1473 /* END of edit_distance() stuff
1474  * ========================================================= */
1475 
1476 /* Mark that we cannot extend a found fixed substring at this point.
1477    Update the longest found anchored substring or the longest found
1478    floating substrings if needed. */
1479 
1480 STATIC void
S_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)1481 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1482                     SSize_t *minlenp, int is_inf)
1483 {
1484     const STRLEN l = CHR_SVLEN(data->last_found);
1485     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1486     const STRLEN old_l = CHR_SVLEN(longest_sv);
1487     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1488 
1489     PERL_ARGS_ASSERT_SCAN_COMMIT;
1490 
1491     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1492         const U8 i = data->cur_is_floating;
1493 	SvSetMagicSV(longest_sv, data->last_found);
1494         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1495 
1496 	if (!i) /* fixed */
1497 	    data->substrs[0].max_offset = data->substrs[0].min_offset;
1498 	else { /* float */
1499 	    data->substrs[1].max_offset =
1500                       (is_inf)
1501                        ? OPTIMIZE_INFTY
1502                        : (l
1503                           ? data->last_start_max
1504                           /* temporary underflow guard for 5.32 */
1505                           : data->pos_delta < 0 ? OPTIMIZE_INFTY
1506                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1507 					 ? OPTIMIZE_INFTY
1508 					 : data->pos_min + data->pos_delta));
1509         }
1510 
1511         data->substrs[i].flags &= ~SF_BEFORE_EOL;
1512         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1513         data->substrs[i].minlenp = minlenp;
1514         data->substrs[i].lookbehind = 0;
1515     }
1516 
1517     SvCUR_set(data->last_found, 0);
1518     {
1519 	SV * const sv = data->last_found;
1520 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
1521 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1522 	    if (mg)
1523 		mg->mg_len = 0;
1524 	}
1525     }
1526     data->last_end = -1;
1527     data->flags &= ~SF_BEFORE_EOL;
1528     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1529 }
1530 
1531 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1532  * list that describes which code points it matches */
1533 
1534 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)1535 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1536 {
1537     /* Set the SSC 'ssc' to match an empty string or any code point */
1538 
1539     PERL_ARGS_ASSERT_SSC_ANYTHING;
1540 
1541     assert(is_ANYOF_SYNTHETIC(ssc));
1542 
1543     /* mortalize so won't leak */
1544     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1545     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1546 }
1547 
1548 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)1549 S_ssc_is_anything(const regnode_ssc *ssc)
1550 {
1551     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1552      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1553      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1554      * in any way, so there's no point in using it */
1555 
1556     UV start, end;
1557     bool ret;
1558 
1559     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1560 
1561     assert(is_ANYOF_SYNTHETIC(ssc));
1562 
1563     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1564         return FALSE;
1565     }
1566 
1567     /* See if the list consists solely of the range 0 - Infinity */
1568     invlist_iterinit(ssc->invlist);
1569     ret = invlist_iternext(ssc->invlist, &start, &end)
1570           && start == 0
1571           && end == UV_MAX;
1572 
1573     invlist_iterfinish(ssc->invlist);
1574 
1575     if (ret) {
1576         return TRUE;
1577     }
1578 
1579     /* If e.g., both \w and \W are set, matches everything */
1580     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1581         int i;
1582         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1583             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1584                 return TRUE;
1585             }
1586         }
1587     }
1588 
1589     return FALSE;
1590 }
1591 
1592 STATIC void
S_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)1593 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1594 {
1595     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1596      * string, any code point, or any posix class under locale */
1597 
1598     PERL_ARGS_ASSERT_SSC_INIT;
1599 
1600     Zero(ssc, 1, regnode_ssc);
1601     set_ANYOF_SYNTHETIC(ssc);
1602     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1603     ssc_anything(ssc);
1604 
1605     /* If any portion of the regex is to operate under locale rules that aren't
1606      * fully known at compile time, initialization includes it.  The reason
1607      * this isn't done for all regexes is that the optimizer was written under
1608      * the assumption that locale was all-or-nothing.  Given the complexity and
1609      * lack of documentation in the optimizer, and that there are inadequate
1610      * test cases for locale, many parts of it may not work properly, it is
1611      * safest to avoid locale unless necessary. */
1612     if (RExC_contains_locale) {
1613 	ANYOF_POSIXL_SETALL(ssc);
1614     }
1615     else {
1616 	ANYOF_POSIXL_ZERO(ssc);
1617     }
1618 }
1619 
1620 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)1621 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1622                         const regnode_ssc *ssc)
1623 {
1624     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1625      * to the list of code points matched, and locale posix classes; hence does
1626      * not check its flags) */
1627 
1628     UV start, end;
1629     bool ret;
1630 
1631     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1632 
1633     assert(is_ANYOF_SYNTHETIC(ssc));
1634 
1635     invlist_iterinit(ssc->invlist);
1636     ret = invlist_iternext(ssc->invlist, &start, &end)
1637           && start == 0
1638           && end == UV_MAX;
1639 
1640     invlist_iterfinish(ssc->invlist);
1641 
1642     if (! ret) {
1643         return FALSE;
1644     }
1645 
1646     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1647         return FALSE;
1648     }
1649 
1650     return TRUE;
1651 }
1652 
1653 #define INVLIST_INDEX 0
1654 #define ONLY_LOCALE_MATCHES_INDEX 1
1655 #define DEFERRED_USER_DEFINED_INDEX 2
1656 
1657 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)1658 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1659                                const regnode_charclass* const node)
1660 {
1661     /* Returns a mortal inversion list defining which code points are matched
1662      * by 'node', which is of type ANYOF.  Handles complementing the result if
1663      * appropriate.  If some code points aren't knowable at this time, the
1664      * returned list must, and will, contain every code point that is a
1665      * possibility. */
1666 
1667     dVAR;
1668     SV* invlist = NULL;
1669     SV* only_utf8_locale_invlist = NULL;
1670     unsigned int i;
1671     const U32 n = ARG(node);
1672     bool new_node_has_latin1 = FALSE;
1673     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1674                       ? 0
1675                       : ANYOF_FLAGS(node);
1676 
1677     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1678 
1679     /* Look at the data structure created by S_set_ANYOF_arg() */
1680     if (n != ANYOF_ONLY_HAS_BITMAP) {
1681         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1682         AV * const av = MUTABLE_AV(SvRV(rv));
1683         SV **const ary = AvARRAY(av);
1684         assert(RExC_rxi->data->what[n] == 's');
1685 
1686         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1687 
1688             /* Here there are things that won't be known until runtime -- we
1689              * have to assume it could be anything */
1690             invlist = sv_2mortal(_new_invlist(1));
1691             return _add_range_to_invlist(invlist, 0, UV_MAX);
1692         }
1693         else if (ary[INVLIST_INDEX]) {
1694 
1695             /* Use the node's inversion list */
1696             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1697         }
1698 
1699         /* Get the code points valid only under UTF-8 locales */
1700         if (   (flags & ANYOFL_FOLD)
1701             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1702         {
1703             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1704         }
1705     }
1706 
1707     if (! invlist) {
1708         invlist = sv_2mortal(_new_invlist(0));
1709     }
1710 
1711     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1712      * code points, and an inversion list for the others, but if there are code
1713      * points that should match only conditionally on the target string being
1714      * UTF-8, those are placed in the inversion list, and not the bitmap.
1715      * Since there are circumstances under which they could match, they are
1716      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1717      * to exclude them here, so that when we invert below, the end result
1718      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1719      * have to do this here before we add the unconditionally matched code
1720      * points */
1721     if (flags & ANYOF_INVERT) {
1722         _invlist_intersection_complement_2nd(invlist,
1723                                              PL_UpperLatin1,
1724                                              &invlist);
1725     }
1726 
1727     /* Add in the points from the bit map */
1728     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1729         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1730             if (ANYOF_BITMAP_TEST(node, i)) {
1731                 unsigned int start = i++;
1732 
1733                 for (;    i < NUM_ANYOF_CODE_POINTS
1734                        && ANYOF_BITMAP_TEST(node, i); ++i)
1735                 {
1736                     /* empty */
1737                 }
1738                 invlist = _add_range_to_invlist(invlist, start, i-1);
1739                 new_node_has_latin1 = TRUE;
1740             }
1741         }
1742     }
1743 
1744     /* If this can match all upper Latin1 code points, have to add them
1745      * as well.  But don't add them if inverting, as when that gets done below,
1746      * it would exclude all these characters, including the ones it shouldn't
1747      * that were added just above */
1748     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1749         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1750     {
1751         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1752     }
1753 
1754     /* Similarly for these */
1755     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1756         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1757     }
1758 
1759     if (flags & ANYOF_INVERT) {
1760         _invlist_invert(invlist);
1761     }
1762     else if (flags & ANYOFL_FOLD) {
1763         if (new_node_has_latin1) {
1764 
1765             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1766              * the locale.  We can skip this if there are no 0-255 at all. */
1767             _invlist_union(invlist, PL_Latin1, &invlist);
1768 
1769             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1770             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1771         }
1772         else {
1773             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1774                 invlist = add_cp_to_invlist(invlist, 'I');
1775             }
1776             if (_invlist_contains_cp(invlist,
1777                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1778             {
1779                 invlist = add_cp_to_invlist(invlist, 'i');
1780             }
1781         }
1782     }
1783 
1784     /* Similarly add the UTF-8 locale possible matches.  These have to be
1785      * deferred until after the non-UTF-8 locale ones are taken care of just
1786      * above, or it leads to wrong results under ANYOF_INVERT */
1787     if (only_utf8_locale_invlist) {
1788         _invlist_union_maybe_complement_2nd(invlist,
1789                                             only_utf8_locale_invlist,
1790                                             flags & ANYOF_INVERT,
1791                                             &invlist);
1792     }
1793 
1794     return invlist;
1795 }
1796 
1797 /* These two functions currently do the exact same thing */
1798 #define ssc_init_zero		ssc_init
1799 
1800 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1801 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1802 
1803 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1804  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1805  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1806 
1807 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)1808 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1809                 const regnode_charclass *and_with)
1810 {
1811     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1812      * another SSC or a regular ANYOF class.  Can create false positives. */
1813 
1814     SV* anded_cp_list;
1815     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1816                           ? 0
1817                           : ANYOF_FLAGS(and_with);
1818     U8  anded_flags;
1819 
1820     PERL_ARGS_ASSERT_SSC_AND;
1821 
1822     assert(is_ANYOF_SYNTHETIC(ssc));
1823 
1824     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1825      * the code point inversion list and just the relevant flags */
1826     if (is_ANYOF_SYNTHETIC(and_with)) {
1827         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1828         anded_flags = and_with_flags;
1829 
1830         /* XXX This is a kludge around what appears to be deficiencies in the
1831          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1832          * there are paths through the optimizer where it doesn't get weeded
1833          * out when it should.  And if we don't make some extra provision for
1834          * it like the code just below, it doesn't get added when it should.
1835          * This solution is to add it only when AND'ing, which is here, and
1836          * only when what is being AND'ed is the pristine, original node
1837          * matching anything.  Thus it is like adding it to ssc_anything() but
1838          * only when the result is to be AND'ed.  Probably the same solution
1839          * could be adopted for the same problem we have with /l matching,
1840          * which is solved differently in S_ssc_init(), and that would lead to
1841          * fewer false positives than that solution has.  But if this solution
1842          * creates bugs, the consequences are only that a warning isn't raised
1843          * that should be; while the consequences for having /l bugs is
1844          * incorrect matches */
1845         if (ssc_is_anything((regnode_ssc *)and_with)) {
1846             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1847         }
1848     }
1849     else {
1850         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1851         if (OP(and_with) == ANYOFD) {
1852             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1853         }
1854         else {
1855             anded_flags = and_with_flags
1856             &( ANYOF_COMMON_FLAGS
1857               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1858               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1859             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1860                 anded_flags &=
1861                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1862             }
1863         }
1864     }
1865 
1866     ANYOF_FLAGS(ssc) &= anded_flags;
1867 
1868     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1869      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1870      * 'and_with' may be inverted.  When not inverted, we have the situation of
1871      * computing:
1872      *  (C1 | P1) & (C2 | P2)
1873      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1874      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1875      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1876      *                    <=  ((C1 & C2) | P1 | P2)
1877      * Alternatively, the last few steps could be:
1878      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1879      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1880      *                    <=  (C1 | C2 | (P1 & P2))
1881      * We favor the second approach if either P1 or P2 is non-empty.  This is
1882      * because these components are a barrier to doing optimizations, as what
1883      * they match cannot be known until the moment of matching as they are
1884      * dependent on the current locale, 'AND"ing them likely will reduce or
1885      * eliminate them.
1886      * But we can do better if we know that C1,P1 are in their initial state (a
1887      * frequent occurrence), each matching everything:
1888      *  (<everything>) & (C2 | P2) =  C2 | P2
1889      * Similarly, if C2,P2 are in their initial state (again a frequent
1890      * occurrence), the result is a no-op
1891      *  (C1 | P1) & (<everything>) =  C1 | P1
1892      *
1893      * Inverted, we have
1894      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1895      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1896      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1897      * */
1898 
1899     if ((and_with_flags & ANYOF_INVERT)
1900         && ! is_ANYOF_SYNTHETIC(and_with))
1901     {
1902         unsigned int i;
1903 
1904         ssc_intersection(ssc,
1905                          anded_cp_list,
1906                          FALSE /* Has already been inverted */
1907                          );
1908 
1909         /* If either P1 or P2 is empty, the intersection will be also; can skip
1910          * the loop */
1911         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1912             ANYOF_POSIXL_ZERO(ssc);
1913         }
1914         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1915 
1916             /* Note that the Posix class component P from 'and_with' actually
1917              * looks like:
1918              *      P = Pa | Pb | ... | Pn
1919              * where each component is one posix class, such as in [\w\s].
1920              * Thus
1921              *      ~P = ~(Pa | Pb | ... | Pn)
1922              *         = ~Pa & ~Pb & ... & ~Pn
1923              *        <= ~Pa | ~Pb | ... | ~Pn
1924              * The last is something we can easily calculate, but unfortunately
1925              * is likely to have many false positives.  We could do better
1926              * in some (but certainly not all) instances if two classes in
1927              * P have known relationships.  For example
1928              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1929              * So
1930              *      :lower: & :print: = :lower:
1931              * And similarly for classes that must be disjoint.  For example,
1932              * since \s and \w can have no elements in common based on rules in
1933              * the POSIX standard,
1934              *      \w & ^\S = nothing
1935              * Unfortunately, some vendor locales do not meet the Posix
1936              * standard, in particular almost everything by Microsoft.
1937              * The loop below just changes e.g., \w into \W and vice versa */
1938 
1939             regnode_charclass_posixl temp;
1940             int add = 1;    /* To calculate the index of the complement */
1941 
1942             Zero(&temp, 1, regnode_charclass_posixl);
1943             ANYOF_POSIXL_ZERO(&temp);
1944             for (i = 0; i < ANYOF_MAX; i++) {
1945                 assert(i % 2 != 0
1946                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1947                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1948 
1949                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1950                     ANYOF_POSIXL_SET(&temp, i + add);
1951                 }
1952                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1953             }
1954             ANYOF_POSIXL_AND(&temp, ssc);
1955 
1956         } /* else ssc already has no posixes */
1957     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1958          in its initial state */
1959     else if (! is_ANYOF_SYNTHETIC(and_with)
1960              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1961     {
1962         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1963          * copy it over 'ssc' */
1964         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1965             if (is_ANYOF_SYNTHETIC(and_with)) {
1966                 StructCopy(and_with, ssc, regnode_ssc);
1967             }
1968             else {
1969                 ssc->invlist = anded_cp_list;
1970                 ANYOF_POSIXL_ZERO(ssc);
1971                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1972                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1973                 }
1974             }
1975         }
1976         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1977                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1978         {
1979             /* One or the other of P1, P2 is non-empty. */
1980             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1981                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1982             }
1983             ssc_union(ssc, anded_cp_list, FALSE);
1984         }
1985         else { /* P1 = P2 = empty */
1986             ssc_intersection(ssc, anded_cp_list, FALSE);
1987         }
1988     }
1989 }
1990 
1991 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)1992 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1993                const regnode_charclass *or_with)
1994 {
1995     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1996      * another SSC or a regular ANYOF class.  Can create false positives if
1997      * 'or_with' is to be inverted. */
1998 
1999     SV* ored_cp_list;
2000     U8 ored_flags;
2001     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
2002                          ? 0
2003                          : ANYOF_FLAGS(or_with);
2004 
2005     PERL_ARGS_ASSERT_SSC_OR;
2006 
2007     assert(is_ANYOF_SYNTHETIC(ssc));
2008 
2009     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2010      * the code point inversion list and just the relevant flags */
2011     if (is_ANYOF_SYNTHETIC(or_with)) {
2012         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2013         ored_flags = or_with_flags;
2014     }
2015     else {
2016         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2017         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2018         if (OP(or_with) != ANYOFD) {
2019             ored_flags
2020             |= or_with_flags
2021              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2022                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2023             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2024                 ored_flags |=
2025                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2026             }
2027         }
2028     }
2029 
2030     ANYOF_FLAGS(ssc) |= ored_flags;
2031 
2032     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2033      * C2 is the list of code points in 'or-with'; P2, its posix classes.
2034      * 'or_with' may be inverted.  When not inverted, we have the simple
2035      * situation of computing:
2036      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
2037      * If P1|P2 yields a situation with both a class and its complement are
2038      * set, like having both \w and \W, this matches all code points, and we
2039      * can delete these from the P component of the ssc going forward.  XXX We
2040      * might be able to delete all the P components, but I (khw) am not certain
2041      * about this, and it is better to be safe.
2042      *
2043      * Inverted, we have
2044      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
2045      *                         <=  (C1 | P1) | ~C2
2046      *                         <=  (C1 | ~C2) | P1
2047      * (which results in actually simpler code than the non-inverted case)
2048      * */
2049 
2050     if ((or_with_flags & ANYOF_INVERT)
2051         && ! is_ANYOF_SYNTHETIC(or_with))
2052     {
2053         /* We ignore P2, leaving P1 going forward */
2054     }   /* else  Not inverted */
2055     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2056         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2057         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2058             unsigned int i;
2059             for (i = 0; i < ANYOF_MAX; i += 2) {
2060                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2061                 {
2062                     ssc_match_all_cp(ssc);
2063                     ANYOF_POSIXL_CLEAR(ssc, i);
2064                     ANYOF_POSIXL_CLEAR(ssc, i+1);
2065                 }
2066             }
2067         }
2068     }
2069 
2070     ssc_union(ssc,
2071               ored_cp_list,
2072               FALSE /* Already has been inverted */
2073               );
2074 }
2075 
2076 STATIC void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)2077 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2078 {
2079     PERL_ARGS_ASSERT_SSC_UNION;
2080 
2081     assert(is_ANYOF_SYNTHETIC(ssc));
2082 
2083     _invlist_union_maybe_complement_2nd(ssc->invlist,
2084                                         invlist,
2085                                         invert2nd,
2086                                         &ssc->invlist);
2087 }
2088 
2089 STATIC void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)2090 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2091                          SV* const invlist,
2092                          const bool invert2nd)
2093 {
2094     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2095 
2096     assert(is_ANYOF_SYNTHETIC(ssc));
2097 
2098     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2099                                                invlist,
2100                                                invert2nd,
2101                                                &ssc->invlist);
2102 }
2103 
2104 STATIC void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)2105 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2106 {
2107     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2108 
2109     assert(is_ANYOF_SYNTHETIC(ssc));
2110 
2111     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2112 }
2113 
2114 STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)2115 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2116 {
2117     /* AND just the single code point 'cp' into the SSC 'ssc' */
2118 
2119     SV* cp_list = _new_invlist(2);
2120 
2121     PERL_ARGS_ASSERT_SSC_CP_AND;
2122 
2123     assert(is_ANYOF_SYNTHETIC(ssc));
2124 
2125     cp_list = add_cp_to_invlist(cp_list, cp);
2126     ssc_intersection(ssc, cp_list,
2127                      FALSE /* Not inverted */
2128                      );
2129     SvREFCNT_dec_NN(cp_list);
2130 }
2131 
2132 STATIC void
S_ssc_clear_locale(regnode_ssc * ssc)2133 S_ssc_clear_locale(regnode_ssc *ssc)
2134 {
2135     /* Set the SSC 'ssc' to not match any locale things */
2136     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2137 
2138     assert(is_ANYOF_SYNTHETIC(ssc));
2139 
2140     ANYOF_POSIXL_ZERO(ssc);
2141     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2142 }
2143 
2144 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2145 
2146 STATIC bool
S_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)2147 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2148 {
2149     /* The synthetic start class is used to hopefully quickly winnow down
2150      * places where a pattern could start a match in the target string.  If it
2151      * doesn't really narrow things down that much, there isn't much point to
2152      * having the overhead of using it.  This function uses some very crude
2153      * heuristics to decide if to use the ssc or not.
2154      *
2155      * It returns TRUE if 'ssc' rules out more than half what it considers to
2156      * be the "likely" possible matches, but of course it doesn't know what the
2157      * actual things being matched are going to be; these are only guesses
2158      *
2159      * For /l matches, it assumes that the only likely matches are going to be
2160      *      in the 0-255 range, uniformly distributed, so half of that is 127
2161      * For /a and /d matches, it assumes that the likely matches will be just
2162      *      the ASCII range, so half of that is 63
2163      * For /u and there isn't anything matching above the Latin1 range, it
2164      *      assumes that that is the only range likely to be matched, and uses
2165      *      half that as the cut-off: 127.  If anything matches above Latin1,
2166      *      it assumes that all of Unicode could match (uniformly), except for
2167      *      non-Unicode code points and things in the General Category "Other"
2168      *      (unassigned, private use, surrogates, controls and formats).  This
2169      *      is a much large number. */
2170 
2171     U32 count = 0;      /* Running total of number of code points matched by
2172                            'ssc' */
2173     UV start, end;      /* Start and end points of current range in inversion
2174                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2175     const U32 max_code_points = (LOC)
2176                                 ?  256
2177                                 : ((  ! UNI_SEMANTICS
2178                                     ||  invlist_highest(ssc->invlist) < 256)
2179                                   ? 128
2180                                   : NON_OTHER_COUNT);
2181     const U32 max_match = max_code_points / 2;
2182 
2183     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2184 
2185     invlist_iterinit(ssc->invlist);
2186     while (invlist_iternext(ssc->invlist, &start, &end)) {
2187         if (start >= max_code_points) {
2188             break;
2189         }
2190         end = MIN(end, max_code_points - 1);
2191         count += end - start + 1;
2192         if (count >= max_match) {
2193             invlist_iterfinish(ssc->invlist);
2194             return FALSE;
2195         }
2196     }
2197 
2198     return TRUE;
2199 }
2200 
2201 
2202 STATIC void
S_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)2203 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2204 {
2205     /* The inversion list in the SSC is marked mortal; now we need a more
2206      * permanent copy, which is stored the same way that is done in a regular
2207      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2208      * map */
2209 
2210     SV* invlist = invlist_clone(ssc->invlist, NULL);
2211 
2212     PERL_ARGS_ASSERT_SSC_FINALIZE;
2213 
2214     assert(is_ANYOF_SYNTHETIC(ssc));
2215 
2216     /* The code in this file assumes that all but these flags aren't relevant
2217      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2218      * by the time we reach here */
2219     assert(! (ANYOF_FLAGS(ssc)
2220         & ~( ANYOF_COMMON_FLAGS
2221             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2222             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2223 
2224     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2225 
2226     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2227     SvREFCNT_dec(invlist);
2228 
2229     /* Make sure is clone-safe */
2230     ssc->invlist = NULL;
2231 
2232     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2233         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2234         OP(ssc) = ANYOFPOSIXL;
2235     }
2236     else if (RExC_contains_locale) {
2237         OP(ssc) = ANYOFL;
2238     }
2239 
2240     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2241 }
2242 
2243 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2244 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2245 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2246 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2247                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2248                                : 0 )
2249 
2250 
2251 #ifdef DEBUGGING
2252 /*
2253    dump_trie(trie,widecharmap,revcharmap)
2254    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2255    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2256 
2257    These routines dump out a trie in a somewhat readable format.
2258    The _interim_ variants are used for debugging the interim
2259    tables that are used to generate the final compressed
2260    representation which is what dump_trie expects.
2261 
2262    Part of the reason for their existence is to provide a form
2263    of documentation as to how the different representations function.
2264 
2265 */
2266 
2267 /*
2268   Dumps the final compressed table form of the trie to Perl_debug_log.
2269   Used for debugging make_trie().
2270 */
2271 
2272 STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 depth)2273 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2274 	    AV *revcharmap, U32 depth)
2275 {
2276     U32 state;
2277     SV *sv=sv_newmortal();
2278     int colwidth= widecharmap ? 6 : 4;
2279     U16 word;
2280     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2281 
2282     PERL_ARGS_ASSERT_DUMP_TRIE;
2283 
2284     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2285         depth+1, "Match","Base","Ofs" );
2286 
2287     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2288 	SV ** const tmp = av_fetch( revcharmap, state, 0);
2289         if ( tmp ) {
2290             Perl_re_printf( aTHX_  "%*s",
2291                 colwidth,
2292                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2293 	                    PL_colors[0], PL_colors[1],
2294 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2295 	                    PERL_PV_ESCAPE_FIRSTCHAR
2296                 )
2297             );
2298         }
2299     }
2300     Perl_re_printf( aTHX_  "\n");
2301     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2302 
2303     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2304         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2305     Perl_re_printf( aTHX_  "\n");
2306 
2307     for( state = 1 ; state < trie->statecount ; state++ ) {
2308 	const U32 base = trie->states[ state ].trans.base;
2309 
2310         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2311 
2312         if ( trie->states[ state ].wordnum ) {
2313             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2314         } else {
2315             Perl_re_printf( aTHX_  "%6s", "" );
2316         }
2317 
2318         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2319 
2320         if ( base ) {
2321             U32 ofs = 0;
2322 
2323             while( ( base + ofs  < trie->uniquecharcount ) ||
2324                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2325                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2326                                                                     != state))
2327                     ofs++;
2328 
2329             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2330 
2331             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2332                 if ( ( base + ofs >= trie->uniquecharcount )
2333                         && ( base + ofs - trie->uniquecharcount
2334                                                         < trie->lasttrans )
2335                         && trie->trans[ base + ofs
2336                                     - trie->uniquecharcount ].check == state )
2337                 {
2338                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2339                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2340                    );
2341                 } else {
2342                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2343                 }
2344             }
2345 
2346             Perl_re_printf( aTHX_  "]");
2347 
2348         }
2349         Perl_re_printf( aTHX_  "\n" );
2350     }
2351     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2352                                 depth);
2353     for (word=1; word <= trie->wordcount; word++) {
2354         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2355 	    (int)word, (int)(trie->wordinfo[word].prev),
2356 	    (int)(trie->wordinfo[word].len));
2357     }
2358     Perl_re_printf( aTHX_  "\n" );
2359 }
2360 /*
2361   Dumps a fully constructed but uncompressed trie in list form.
2362   List tries normally only are used for construction when the number of
2363   possible chars (trie->uniquecharcount) is very high.
2364   Used for debugging make_trie().
2365 */
2366 STATIC void
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2367 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2368 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2369 			 U32 depth)
2370 {
2371     U32 state;
2372     SV *sv=sv_newmortal();
2373     int colwidth= widecharmap ? 6 : 4;
2374     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2375 
2376     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2377 
2378     /* print out the table precompression.  */
2379     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2380             depth+1 );
2381     Perl_re_indentf( aTHX_  "%s",
2382             depth+1, "------:-----+-----------------\n" );
2383 
2384     for( state=1 ; state < next_alloc ; state ++ ) {
2385         U16 charid;
2386 
2387         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2388             depth+1, (UV)state  );
2389         if ( ! trie->states[ state ].wordnum ) {
2390             Perl_re_printf( aTHX_  "%5s| ","");
2391         } else {
2392             Perl_re_printf( aTHX_  "W%4x| ",
2393                 trie->states[ state ].wordnum
2394             );
2395         }
2396         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2397 	    SV ** const tmp = av_fetch( revcharmap,
2398                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2399 	    if ( tmp ) {
2400                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2401                     colwidth,
2402                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2403                               colwidth,
2404                               PL_colors[0], PL_colors[1],
2405                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2406                               | PERL_PV_ESCAPE_FIRSTCHAR
2407                     ) ,
2408                     TRIE_LIST_ITEM(state, charid).forid,
2409                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2410                 );
2411                 if (!(charid % 10))
2412                     Perl_re_printf( aTHX_  "\n%*s| ",
2413                         (int)((depth * 2) + 14), "");
2414             }
2415         }
2416         Perl_re_printf( aTHX_  "\n");
2417     }
2418 }
2419 
2420 /*
2421   Dumps a fully constructed but uncompressed trie in table form.
2422   This is the normal DFA style state transition table, with a few
2423   twists to facilitate compression later.
2424   Used for debugging make_trie().
2425 */
2426 STATIC void
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2427 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2428 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
2429 			  U32 depth)
2430 {
2431     U32 state;
2432     U16 charid;
2433     SV *sv=sv_newmortal();
2434     int colwidth= widecharmap ? 6 : 4;
2435     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2436 
2437     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2438 
2439     /*
2440        print out the table precompression so that we can do a visual check
2441        that they are identical.
2442      */
2443 
2444     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2445 
2446     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2447 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
2448         if ( tmp ) {
2449             Perl_re_printf( aTHX_  "%*s",
2450                 colwidth,
2451                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2452 	                    PL_colors[0], PL_colors[1],
2453 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2454 	                    PERL_PV_ESCAPE_FIRSTCHAR
2455                 )
2456             );
2457         }
2458     }
2459 
2460     Perl_re_printf( aTHX_ "\n");
2461     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2462 
2463     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2464         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2465     }
2466 
2467     Perl_re_printf( aTHX_  "\n" );
2468 
2469     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2470 
2471         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2472             depth+1,
2473             (UV)TRIE_NODENUM( state ) );
2474 
2475         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2476             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2477             if (v)
2478                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2479             else
2480                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2481         }
2482         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2483             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2484                                             (UV)trie->trans[ state ].check );
2485         } else {
2486             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2487                                             (UV)trie->trans[ state ].check,
2488             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2489         }
2490     }
2491 }
2492 
2493 #endif
2494 
2495 
2496 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2497   startbranch: the first branch in the whole branch sequence
2498   first      : start branch of sequence of branch-exact nodes.
2499 	       May be the same as startbranch
2500   last       : Thing following the last branch.
2501 	       May be the same as tail.
2502   tail       : item following the branch sequence
2503   count      : words in the sequence
2504   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2505   depth      : indent depth
2506 
2507 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2508 
2509 A trie is an N'ary tree where the branches are determined by digital
2510 decomposition of the key. IE, at the root node you look up the 1st character and
2511 follow that branch repeat until you find the end of the branches. Nodes can be
2512 marked as "accepting" meaning they represent a complete word. Eg:
2513 
2514   /he|she|his|hers/
2515 
2516 would convert into the following structure. Numbers represent states, letters
2517 following numbers represent valid transitions on the letter from that state, if
2518 the number is in square brackets it represents an accepting state, otherwise it
2519 will be in parenthesis.
2520 
2521       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2522       |    |
2523       |   (2)
2524       |    |
2525      (1)   +-i->(6)-+-s->[7]
2526       |
2527       +-s->(3)-+-h->(4)-+-e->[5]
2528 
2529       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2530 
2531 This shows that when matching against the string 'hers' we will begin at state 1
2532 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2533 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2534 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2535 single traverse. We store a mapping from accepting to state to which word was
2536 matched, and then when we have multiple possibilities we try to complete the
2537 rest of the regex in the order in which they occurred in the alternation.
2538 
2539 The only prior NFA like behaviour that would be changed by the TRIE support is
2540 the silent ignoring of duplicate alternations which are of the form:
2541 
2542  / (DUPE|DUPE) X? (?{ ... }) Y /x
2543 
2544 Thus EVAL blocks following a trie may be called a different number of times with
2545 and without the optimisation. With the optimisations dupes will be silently
2546 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2547 the following demonstrates:
2548 
2549  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2550 
2551 which prints out 'word' three times, but
2552 
2553  'words'=~/(word|word|word)(?{ print $1 })S/
2554 
2555 which doesnt print it out at all. This is due to other optimisations kicking in.
2556 
2557 Example of what happens on a structural level:
2558 
2559 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2560 
2561    1: CURLYM[1] {1,32767}(18)
2562    5:   BRANCH(8)
2563    6:     EXACT <ac>(16)
2564    8:   BRANCH(11)
2565    9:     EXACT <ad>(16)
2566   11:   BRANCH(14)
2567   12:     EXACT <ab>(16)
2568   16:   SUCCEED(0)
2569   17:   NOTHING(18)
2570   18: END(0)
2571 
2572 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2573 and should turn into:
2574 
2575    1: CURLYM[1] {1,32767}(18)
2576    5:   TRIE(16)
2577 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2578 	  <ac>
2579 	  <ad>
2580 	  <ab>
2581   16:   SUCCEED(0)
2582   17:   NOTHING(18)
2583   18: END(0)
2584 
2585 Cases where tail != last would be like /(?foo|bar)baz/:
2586 
2587    1: BRANCH(4)
2588    2:   EXACT <foo>(8)
2589    4: BRANCH(7)
2590    5:   EXACT <bar>(8)
2591    7: TAIL(8)
2592    8: EXACT <baz>(10)
2593   10: END(0)
2594 
2595 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2596 and would end up looking like:
2597 
2598     1: TRIE(8)
2599       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2600 	<foo>
2601 	<bar>
2602    7: TAIL(8)
2603    8: EXACT <baz>(10)
2604   10: END(0)
2605 
2606     d = uvchr_to_utf8_flags(d, uv, 0);
2607 
2608 is the recommended Unicode-aware way of saying
2609 
2610     *(d++) = uv;
2611 */
2612 
2613 #define TRIE_STORE_REVCHAR(val)                                            \
2614     STMT_START {                                                           \
2615 	if (UTF) {							   \
2616             SV *zlopp = newSV(UTF8_MAXBYTES);				   \
2617 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
2618             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2619             *kapow = '\0';                                                 \
2620 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
2621 	    SvPOK_on(zlopp);						   \
2622 	    SvUTF8_on(zlopp);						   \
2623 	    av_push(revcharmap, zlopp);					   \
2624 	} else {							   \
2625             char ooooff = (char)val;                                           \
2626 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
2627 	}								   \
2628         } STMT_END
2629 
2630 /* This gets the next character from the input, folding it if not already
2631  * folded. */
2632 #define TRIE_READ_CHAR STMT_START {                                           \
2633     wordlen++;                                                                \
2634     if ( UTF ) {                                                              \
2635         /* if it is UTF then it is either already folded, or does not need    \
2636          * folding */                                                         \
2637         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2638     }                                                                         \
2639     else if (folder == PL_fold_latin1) {                                      \
2640         /* This folder implies Unicode rules, which in the range expressible  \
2641          *  by not UTF is the lower case, with the two exceptions, one of     \
2642          *  which should have been taken care of before calling this */       \
2643         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2644         uvc = toLOWER_L1(*uc);                                                \
2645         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2646         len = 1;                                                              \
2647     } else {                                                                  \
2648         /* raw data, will be folded later if needed */                        \
2649         uvc = (U32)*uc;                                                       \
2650         len = 1;                                                              \
2651     }                                                                         \
2652 } STMT_END
2653 
2654 
2655 
2656 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2657     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2658 	U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2659 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2660         TRIE_LIST_LEN( state ) = ging;                          \
2661     }                                                           \
2662     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2663     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2664     TRIE_LIST_CUR( state )++;                                   \
2665 } STMT_END
2666 
2667 #define TRIE_LIST_NEW(state) STMT_START {                       \
2668     Newx( trie->states[ state ].trans.list,                     \
2669 	4, reg_trie_trans_le );                                 \
2670      TRIE_LIST_CUR( state ) = 1;                                \
2671      TRIE_LIST_LEN( state ) = 4;                                \
2672 } STMT_END
2673 
2674 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2675     U16 dupe= trie->states[ state ].wordnum;                    \
2676     regnode * const noper_next = regnext( noper );              \
2677                                                                 \
2678     DEBUG_r({                                                   \
2679         /* store the word for dumping */                        \
2680         SV* tmp;                                                \
2681         if (OP(noper) != NOTHING)                               \
2682             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
2683         else                                                    \
2684             tmp = newSVpvn_utf8( "", 0, UTF );			\
2685         av_push( trie_words, tmp );                             \
2686     });                                                         \
2687                                                                 \
2688     curword++;                                                  \
2689     trie->wordinfo[curword].prev   = 0;                         \
2690     trie->wordinfo[curword].len    = wordlen;                   \
2691     trie->wordinfo[curword].accept = state;                     \
2692                                                                 \
2693     if ( noper_next < tail ) {                                  \
2694         if (!trie->jump)                                        \
2695             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2696                                                  sizeof(U16) ); \
2697         trie->jump[curword] = (U16)(noper_next - convert);      \
2698         if (!jumper)                                            \
2699             jumper = noper_next;                                \
2700         if (!nextbranch)                                        \
2701             nextbranch= regnext(cur);                           \
2702     }                                                           \
2703                                                                 \
2704     if ( dupe ) {                                               \
2705         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2706         /* chain, so that when the bits of chain are later    */\
2707         /* linked together, the dups appear in the chain      */\
2708 	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2709 	trie->wordinfo[dupe].prev = curword;                    \
2710     } else {                                                    \
2711         /* we haven't inserted this word yet.                */ \
2712         trie->states[ state ].wordnum = curword;                \
2713     }                                                           \
2714 } STMT_END
2715 
2716 
2717 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
2718      ( ( base + charid >=  ucharcount					\
2719          && base + charid < ubound					\
2720          && state == trie->trans[ base - ucharcount + charid ].check	\
2721          && trie->trans[ base - ucharcount + charid ].next )		\
2722            ? trie->trans[ base - ucharcount + charid ].next		\
2723            : ( state==1 ? special : 0 )					\
2724       )
2725 
2726 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2727 STMT_START {                                                \
2728     TRIE_BITMAP_SET(trie, uvc);                             \
2729     /* store the folded codepoint */                        \
2730     if ( folder )                                           \
2731         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2732                                                             \
2733     if ( !UTF ) {                                           \
2734         /* store first byte of utf8 representation of */    \
2735         /* variant codepoints */                            \
2736         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2737             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2738         }                                                   \
2739     }                                                       \
2740 } STMT_END
2741 #define MADE_TRIE       1
2742 #define MADE_JUMP_TRIE  2
2743 #define MADE_EXACT_TRIE 4
2744 
2745 STATIC I32
S_make_trie(pTHX_ RExC_state_t * pRExC_state,regnode * startbranch,regnode * first,regnode * last,regnode * tail,U32 word_count,U32 flags,U32 depth)2746 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2747                   regnode *first, regnode *last, regnode *tail,
2748                   U32 word_count, U32 flags, U32 depth)
2749 {
2750     /* first pass, loop through and scan words */
2751     reg_trie_data *trie;
2752     HV *widecharmap = NULL;
2753     AV *revcharmap = newAV();
2754     regnode *cur;
2755     STRLEN len = 0;
2756     UV uvc = 0;
2757     U16 curword = 0;
2758     U32 next_alloc = 0;
2759     regnode *jumper = NULL;
2760     regnode *nextbranch = NULL;
2761     regnode *convert = NULL;
2762     U32 *prev_states; /* temp array mapping each state to previous one */
2763     /* we just use folder as a flag in utf8 */
2764     const U8 * folder = NULL;
2765 
2766     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2767      * which stands for one trie structure, one hash, optionally followed
2768      * by two arrays */
2769 #ifdef DEBUGGING
2770     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2771     AV *trie_words = NULL;
2772     /* along with revcharmap, this only used during construction but both are
2773      * useful during debugging so we store them in the struct when debugging.
2774      */
2775 #else
2776     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2777     STRLEN trie_charcount=0;
2778 #endif
2779     SV *re_trie_maxbuff;
2780     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2781 
2782     PERL_ARGS_ASSERT_MAKE_TRIE;
2783 #ifndef DEBUGGING
2784     PERL_UNUSED_ARG(depth);
2785 #endif
2786 
2787     switch (flags) {
2788         case EXACT: case EXACT_REQ8: case EXACTL: break;
2789 	case EXACTFAA:
2790         case EXACTFUP:
2791 	case EXACTFU:
2792 	case EXACTFLU8: folder = PL_fold_latin1; break;
2793 	case EXACTF:  folder = PL_fold; break;
2794         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2795     }
2796 
2797     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2798     trie->refcount = 1;
2799     trie->startstate = 1;
2800     trie->wordcount = word_count;
2801     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2802     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2803     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2804 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2805     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2806                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2807 
2808     DEBUG_r({
2809         trie_words = newAV();
2810     });
2811 
2812     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2813     assert(re_trie_maxbuff);
2814     if (!SvIOK(re_trie_maxbuff)) {
2815         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2816     }
2817     DEBUG_TRIE_COMPILE_r({
2818         Perl_re_indentf( aTHX_
2819           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2820           depth+1,
2821           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2822           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2823     });
2824 
2825    /* Find the node we are going to overwrite */
2826     if ( first == startbranch && OP( last ) != BRANCH ) {
2827         /* whole branch chain */
2828         convert = first;
2829     } else {
2830         /* branch sub-chain */
2831         convert = NEXTOPER( first );
2832     }
2833 
2834     /*  -- First loop and Setup --
2835 
2836        We first traverse the branches and scan each word to determine if it
2837        contains widechars, and how many unique chars there are, this is
2838        important as we have to build a table with at least as many columns as we
2839        have unique chars.
2840 
2841        We use an array of integers to represent the character codes 0..255
2842        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2843        the native representation of the character value as the key and IV's for
2844        the coded index.
2845 
2846        *TODO* If we keep track of how many times each character is used we can
2847        remap the columns so that the table compression later on is more
2848        efficient in terms of memory by ensuring the most common value is in the
2849        middle and the least common are on the outside.  IMO this would be better
2850        than a most to least common mapping as theres a decent chance the most
2851        common letter will share a node with the least common, meaning the node
2852        will not be compressible. With a middle is most common approach the worst
2853        case is when we have the least common nodes twice.
2854 
2855      */
2856 
2857     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2858         regnode *noper = NEXTOPER( cur );
2859         const U8 *uc;
2860         const U8 *e;
2861         int foldlen = 0;
2862         U32 wordlen      = 0;         /* required init */
2863         STRLEN minchars = 0;
2864         STRLEN maxchars = 0;
2865         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2866                                                bitmap?*/
2867 
2868         if (OP(noper) == NOTHING) {
2869             /* skip past a NOTHING at the start of an alternation
2870              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2871              *
2872              * If the next node is not something we are supposed to process
2873              * we will just ignore it due to the condition guarding the
2874              * next block.
2875              */
2876 
2877             regnode *noper_next= regnext(noper);
2878             if (noper_next < tail)
2879                 noper= noper_next;
2880         }
2881 
2882         if (    noper < tail
2883             && (    OP(noper) == flags
2884                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2885                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2886                                          || OP(noper) == EXACTFUP))))
2887         {
2888             uc= (U8*)STRING(noper);
2889             e= uc + STR_LEN(noper);
2890         } else {
2891             trie->minlen= 0;
2892             continue;
2893         }
2894 
2895 
2896         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2897             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2898                                           regardless of encoding */
2899             if (OP( noper ) == EXACTFUP) {
2900                 /* false positives are ok, so just set this */
2901                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2902             }
2903         }
2904 
2905         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2906                                            branch */
2907             TRIE_CHARCOUNT(trie)++;
2908             TRIE_READ_CHAR;
2909 
2910             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2911              * is in effect.  Under /i, this character can match itself, or
2912              * anything that folds to it.  If not under /i, it can match just
2913              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2914              * all fold to k, and all are single characters.   But some folds
2915              * expand to more than one character, so for example LATIN SMALL
2916              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2917              * the string beginning at 'uc' is 'ffi', it could be matched by
2918              * three characters, or just by the one ligature character. (It
2919              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2920              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2921              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2922              * match.)  The trie needs to know the minimum and maximum number
2923              * of characters that could match so that it can use size alone to
2924              * quickly reject many match attempts.  The max is simple: it is
2925              * the number of folded characters in this branch (since a fold is
2926              * never shorter than what folds to it. */
2927 
2928             maxchars++;
2929 
2930             /* And the min is equal to the max if not under /i (indicated by
2931              * 'folder' being NULL), or there are no multi-character folds.  If
2932              * there is a multi-character fold, the min is incremented just
2933              * once, for the character that folds to the sequence.  Each
2934              * character in the sequence needs to be added to the list below of
2935              * characters in the trie, but we count only the first towards the
2936              * min number of characters needed.  This is done through the
2937              * variable 'foldlen', which is returned by the macros that look
2938              * for these sequences as the number of bytes the sequence
2939              * occupies.  Each time through the loop, we decrement 'foldlen' by
2940              * how many bytes the current char occupies.  Only when it reaches
2941              * 0 do we increment 'minchars' or look for another multi-character
2942              * sequence. */
2943             if (folder == NULL) {
2944                 minchars++;
2945             }
2946             else if (foldlen > 0) {
2947                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2948             }
2949             else {
2950                 minchars++;
2951 
2952                 /* See if *uc is the beginning of a multi-character fold.  If
2953                  * so, we decrement the length remaining to look at, to account
2954                  * for the current character this iteration.  (We can use 'uc'
2955                  * instead of the fold returned by TRIE_READ_CHAR because for
2956                  * non-UTF, the latin1_safe macro is smart enough to account
2957                  * for all the unfolded characters, and because for UTF, the
2958                  * string will already have been folded earlier in the
2959                  * compilation process */
2960                 if (UTF) {
2961                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2962                         foldlen -= UTF8SKIP(uc);
2963                     }
2964                 }
2965                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2966                     foldlen--;
2967                 }
2968             }
2969 
2970             /* The current character (and any potential folds) should be added
2971              * to the possible matching characters for this position in this
2972              * branch */
2973             if ( uvc < 256 ) {
2974                 if ( folder ) {
2975                     U8 folded= folder[ (U8) uvc ];
2976                     if ( !trie->charmap[ folded ] ) {
2977                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2978                         TRIE_STORE_REVCHAR( folded );
2979                     }
2980                 }
2981                 if ( !trie->charmap[ uvc ] ) {
2982                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2983                     TRIE_STORE_REVCHAR( uvc );
2984                 }
2985                 if ( set_bit ) {
2986 		    /* store the codepoint in the bitmap, and its folded
2987 		     * equivalent. */
2988                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2989                     set_bit = 0; /* We've done our bit :-) */
2990                 }
2991             } else {
2992 
2993                 /* XXX We could come up with the list of code points that fold
2994                  * to this using PL_utf8_foldclosures, except not for
2995                  * multi-char folds, as there may be multiple combinations
2996                  * there that could work, which needs to wait until runtime to
2997                  * resolve (The comment about LIGATURE FFI above is such an
2998                  * example */
2999 
3000                 SV** svpp;
3001                 if ( !widecharmap )
3002                     widecharmap = newHV();
3003 
3004                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
3005 
3006                 if ( !svpp )
3007                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3008 
3009                 if ( !SvTRUE( *svpp ) ) {
3010                     sv_setiv( *svpp, ++trie->uniquecharcount );
3011                     TRIE_STORE_REVCHAR(uvc);
3012                 }
3013             }
3014         } /* end loop through characters in this branch of the trie */
3015 
3016         /* We take the min and max for this branch and combine to find the min
3017          * and max for all branches processed so far */
3018         if( cur == first ) {
3019             trie->minlen = minchars;
3020             trie->maxlen = maxchars;
3021         } else if (minchars < trie->minlen) {
3022             trie->minlen = minchars;
3023         } else if (maxchars > trie->maxlen) {
3024             trie->maxlen = maxchars;
3025         }
3026     } /* end first pass */
3027     DEBUG_TRIE_COMPILE_r(
3028         Perl_re_indentf( aTHX_
3029                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3030                 depth+1,
3031                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3032 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3033 		(int)trie->minlen, (int)trie->maxlen )
3034     );
3035 
3036     /*
3037         We now know what we are dealing with in terms of unique chars and
3038         string sizes so we can calculate how much memory a naive
3039         representation using a flat table  will take. If it's over a reasonable
3040         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3041         conservative but potentially much slower representation using an array
3042         of lists.
3043 
3044         At the end we convert both representations into the same compressed
3045         form that will be used in regexec.c for matching with. The latter
3046         is a form that cannot be used to construct with but has memory
3047         properties similar to the list form and access properties similar
3048         to the table form making it both suitable for fast searches and
3049         small enough that its feasable to store for the duration of a program.
3050 
3051         See the comment in the code where the compressed table is produced
3052         inplace from the flat tabe representation for an explanation of how
3053         the compression works.
3054 
3055     */
3056 
3057 
3058     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3059     prev_states[1] = 0;
3060 
3061     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3062                                                     > SvIV(re_trie_maxbuff) )
3063     {
3064         /*
3065             Second Pass -- Array Of Lists Representation
3066 
3067             Each state will be represented by a list of charid:state records
3068             (reg_trie_trans_le) the first such element holds the CUR and LEN
3069             points of the allocated array. (See defines above).
3070 
3071             We build the initial structure using the lists, and then convert
3072             it into the compressed table form which allows faster lookups
3073             (but cant be modified once converted).
3074         */
3075 
3076         STRLEN transcount = 1;
3077 
3078         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3079             depth+1));
3080 
3081 	trie->states = (reg_trie_state *)
3082 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3083 				  sizeof(reg_trie_state) );
3084         TRIE_LIST_NEW(1);
3085         next_alloc = 2;
3086 
3087         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3088 
3089             regnode *noper   = NEXTOPER( cur );
3090 	    U32 state        = 1;         /* required init */
3091 	    U16 charid       = 0;         /* sanity init */
3092             U32 wordlen      = 0;         /* required init */
3093 
3094             if (OP(noper) == NOTHING) {
3095                 regnode *noper_next= regnext(noper);
3096                 if (noper_next < tail)
3097                     noper= noper_next;
3098                 /* we will undo this assignment if noper does not
3099                  * point at a trieable type in the else clause of
3100                  * the following statement. */
3101             }
3102 
3103             if (    noper < tail
3104                 && (    OP(noper) == flags
3105                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3106                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3107                                              || OP(noper) == EXACTFUP))))
3108             {
3109                 const U8 *uc= (U8*)STRING(noper);
3110                 const U8 *e= uc + STR_LEN(noper);
3111 
3112                 for ( ; uc < e ; uc += len ) {
3113 
3114                     TRIE_READ_CHAR;
3115 
3116                     if ( uvc < 256 ) {
3117                         charid = trie->charmap[ uvc ];
3118 		    } else {
3119                         SV** const svpp = hv_fetch( widecharmap,
3120                                                     (char*)&uvc,
3121                                                     sizeof( UV ),
3122                                                     0);
3123                         if ( !svpp ) {
3124                             charid = 0;
3125                         } else {
3126                             charid=(U16)SvIV( *svpp );
3127                         }
3128 		    }
3129                     /* charid is now 0 if we dont know the char read, or
3130                      * nonzero if we do */
3131                     if ( charid ) {
3132 
3133                         U16 check;
3134                         U32 newstate = 0;
3135 
3136                         charid--;
3137                         if ( !trie->states[ state ].trans.list ) {
3138                             TRIE_LIST_NEW( state );
3139 			}
3140                         for ( check = 1;
3141                               check <= TRIE_LIST_USED( state );
3142                               check++ )
3143                         {
3144                             if ( TRIE_LIST_ITEM( state, check ).forid
3145                                                                     == charid )
3146                             {
3147                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3148                                 break;
3149                             }
3150                         }
3151                         if ( ! newstate ) {
3152                             newstate = next_alloc++;
3153 			    prev_states[newstate] = state;
3154                             TRIE_LIST_PUSH( state, charid, newstate );
3155                             transcount++;
3156                         }
3157                         state = newstate;
3158                     } else {
3159                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3160 		    }
3161 		}
3162             } else {
3163                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3164                  * on a trieable type. So we need to reset noper back to point at the first regop
3165                  * in the branch before we call TRIE_HANDLE_WORD()
3166                 */
3167                 noper= NEXTOPER(cur);
3168             }
3169             TRIE_HANDLE_WORD(state);
3170 
3171         } /* end second pass */
3172 
3173         /* next alloc is the NEXT state to be allocated */
3174         trie->statecount = next_alloc;
3175         trie->states = (reg_trie_state *)
3176 	    PerlMemShared_realloc( trie->states,
3177 				   next_alloc
3178 				   * sizeof(reg_trie_state) );
3179 
3180         /* and now dump it out before we compress it */
3181         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3182 							 revcharmap, next_alloc,
3183 							 depth+1)
3184         );
3185 
3186         trie->trans = (reg_trie_trans *)
3187 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3188         {
3189             U32 state;
3190             U32 tp = 0;
3191             U32 zp = 0;
3192 
3193 
3194             for( state=1 ; state < next_alloc ; state ++ ) {
3195                 U32 base=0;
3196 
3197                 /*
3198                 DEBUG_TRIE_COMPILE_MORE_r(
3199                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3200                 );
3201                 */
3202 
3203                 if (trie->states[state].trans.list) {
3204                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3205                     U16 maxid=minid;
3206 		    U16 idx;
3207 
3208                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3209 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3210 			if ( forid < minid ) {
3211 			    minid=forid;
3212 			} else if ( forid > maxid ) {
3213 			    maxid=forid;
3214 			}
3215                     }
3216                     if ( transcount < tp + maxid - minid + 1) {
3217                         transcount *= 2;
3218 			trie->trans = (reg_trie_trans *)
3219 			    PerlMemShared_realloc( trie->trans,
3220 						     transcount
3221 						     * sizeof(reg_trie_trans) );
3222                         Zero( trie->trans + (transcount / 2),
3223                               transcount / 2,
3224                               reg_trie_trans );
3225                     }
3226                     base = trie->uniquecharcount + tp - minid;
3227                     if ( maxid == minid ) {
3228                         U32 set = 0;
3229                         for ( ; zp < tp ; zp++ ) {
3230                             if ( ! trie->trans[ zp ].next ) {
3231                                 base = trie->uniquecharcount + zp - minid;
3232                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3233                                                                    1).newstate;
3234                                 trie->trans[ zp ].check = state;
3235                                 set = 1;
3236                                 break;
3237                             }
3238                         }
3239                         if ( !set ) {
3240                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3241                                                                    1).newstate;
3242                             trie->trans[ tp ].check = state;
3243                             tp++;
3244                             zp = tp;
3245                         }
3246                     } else {
3247                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3248                             const U32 tid = base
3249                                            - trie->uniquecharcount
3250                                            + TRIE_LIST_ITEM( state, idx ).forid;
3251                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3252                                                                 idx ).newstate;
3253                             trie->trans[ tid ].check = state;
3254                         }
3255                         tp += ( maxid - minid + 1 );
3256                     }
3257                     Safefree(trie->states[ state ].trans.list);
3258                 }
3259                 /*
3260                 DEBUG_TRIE_COMPILE_MORE_r(
3261                     Perl_re_printf( aTHX_  " base: %d\n",base);
3262                 );
3263                 */
3264                 trie->states[ state ].trans.base=base;
3265             }
3266             trie->lasttrans = tp + 1;
3267         }
3268     } else {
3269         /*
3270            Second Pass -- Flat Table Representation.
3271 
3272            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3273            each.  We know that we will need Charcount+1 trans at most to store
3274            the data (one row per char at worst case) So we preallocate both
3275            structures assuming worst case.
3276 
3277            We then construct the trie using only the .next slots of the entry
3278            structs.
3279 
3280            We use the .check field of the first entry of the node temporarily
3281            to make compression both faster and easier by keeping track of how
3282            many non zero fields are in the node.
3283 
3284            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3285            transition.
3286 
3287            There are two terms at use here: state as a TRIE_NODEIDX() which is
3288            a number representing the first entry of the node, and state as a
3289            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3290            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3291            if there are 2 entrys per node. eg:
3292 
3293              A B       A B
3294           1. 2 4    1. 3 7
3295           2. 0 3    3. 0 5
3296           3. 0 0    5. 0 0
3297           4. 0 0    7. 0 0
3298 
3299            The table is internally in the right hand, idx form. However as we
3300            also have to deal with the states array which is indexed by nodenum
3301            we have to use TRIE_NODENUM() to convert.
3302 
3303         */
3304         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3305             depth+1));
3306 
3307 	trie->trans = (reg_trie_trans *)
3308 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3309 				  * trie->uniquecharcount + 1,
3310 				  sizeof(reg_trie_trans) );
3311         trie->states = (reg_trie_state *)
3312 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3313 				  sizeof(reg_trie_state) );
3314         next_alloc = trie->uniquecharcount + 1;
3315 
3316 
3317         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3318 
3319             regnode *noper   = NEXTOPER( cur );
3320 
3321             U32 state        = 1;         /* required init */
3322 
3323             U16 charid       = 0;         /* sanity init */
3324             U32 accept_state = 0;         /* sanity init */
3325 
3326             U32 wordlen      = 0;         /* required init */
3327 
3328             if (OP(noper) == NOTHING) {
3329                 regnode *noper_next= regnext(noper);
3330                 if (noper_next < tail)
3331                     noper= noper_next;
3332                 /* we will undo this assignment if noper does not
3333                  * point at a trieable type in the else clause of
3334                  * the following statement. */
3335             }
3336 
3337             if (    noper < tail
3338                 && (    OP(noper) == flags
3339                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3340                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3341                                              || OP(noper) == EXACTFUP))))
3342             {
3343                 const U8 *uc= (U8*)STRING(noper);
3344                 const U8 *e= uc + STR_LEN(noper);
3345 
3346                 for ( ; uc < e ; uc += len ) {
3347 
3348                     TRIE_READ_CHAR;
3349 
3350                     if ( uvc < 256 ) {
3351                         charid = trie->charmap[ uvc ];
3352                     } else {
3353                         SV* const * const svpp = hv_fetch( widecharmap,
3354                                                            (char*)&uvc,
3355                                                            sizeof( UV ),
3356                                                            0);
3357                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3358                     }
3359                     if ( charid ) {
3360                         charid--;
3361                         if ( !trie->trans[ state + charid ].next ) {
3362                             trie->trans[ state + charid ].next = next_alloc;
3363                             trie->trans[ state ].check++;
3364 			    prev_states[TRIE_NODENUM(next_alloc)]
3365 				    = TRIE_NODENUM(state);
3366                             next_alloc += trie->uniquecharcount;
3367                         }
3368                         state = trie->trans[ state + charid ].next;
3369                     } else {
3370                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3371                     }
3372                     /* charid is now 0 if we dont know the char read, or
3373                      * nonzero if we do */
3374                 }
3375             } else {
3376                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3377                  * on a trieable type. So we need to reset noper back to point at the first regop
3378                  * in the branch before we call TRIE_HANDLE_WORD().
3379                 */
3380                 noper= NEXTOPER(cur);
3381             }
3382             accept_state = TRIE_NODENUM( state );
3383             TRIE_HANDLE_WORD(accept_state);
3384 
3385         } /* end second pass */
3386 
3387         /* and now dump it out before we compress it */
3388         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3389 							  revcharmap,
3390 							  next_alloc, depth+1));
3391 
3392         {
3393         /*
3394            * Inplace compress the table.*
3395 
3396            For sparse data sets the table constructed by the trie algorithm will
3397            be mostly 0/FAIL transitions or to put it another way mostly empty.
3398            (Note that leaf nodes will not contain any transitions.)
3399 
3400            This algorithm compresses the tables by eliminating most such
3401            transitions, at the cost of a modest bit of extra work during lookup:
3402 
3403            - Each states[] entry contains a .base field which indicates the
3404            index in the state[] array wheres its transition data is stored.
3405 
3406            - If .base is 0 there are no valid transitions from that node.
3407 
3408            - If .base is nonzero then charid is added to it to find an entry in
3409            the trans array.
3410 
3411            -If trans[states[state].base+charid].check!=state then the
3412            transition is taken to be a 0/Fail transition. Thus if there are fail
3413            transitions at the front of the node then the .base offset will point
3414            somewhere inside the previous nodes data (or maybe even into a node
3415            even earlier), but the .check field determines if the transition is
3416            valid.
3417 
3418            XXX - wrong maybe?
3419            The following process inplace converts the table to the compressed
3420            table: We first do not compress the root node 1,and mark all its
3421            .check pointers as 1 and set its .base pointer as 1 as well. This
3422            allows us to do a DFA construction from the compressed table later,
3423            and ensures that any .base pointers we calculate later are greater
3424            than 0.
3425 
3426            - We set 'pos' to indicate the first entry of the second node.
3427 
3428            - We then iterate over the columns of the node, finding the first and
3429            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3430            and set the .check pointers accordingly, and advance pos
3431            appropriately and repreat for the next node. Note that when we copy
3432            the next pointers we have to convert them from the original
3433            NODEIDX form to NODENUM form as the former is not valid post
3434            compression.
3435 
3436            - If a node has no transitions used we mark its base as 0 and do not
3437            advance the pos pointer.
3438 
3439            - If a node only has one transition we use a second pointer into the
3440            structure to fill in allocated fail transitions from other states.
3441            This pointer is independent of the main pointer and scans forward
3442            looking for null transitions that are allocated to a state. When it
3443            finds one it writes the single transition into the "hole".  If the
3444            pointer doesnt find one the single transition is appended as normal.
3445 
3446            - Once compressed we can Renew/realloc the structures to release the
3447            excess space.
3448 
3449            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3450            specifically Fig 3.47 and the associated pseudocode.
3451 
3452            demq
3453         */
3454         const U32 laststate = TRIE_NODENUM( next_alloc );
3455 	U32 state, charid;
3456         U32 pos = 0, zp=0;
3457         trie->statecount = laststate;
3458 
3459         for ( state = 1 ; state < laststate ; state++ ) {
3460             U8 flag = 0;
3461 	    const U32 stateidx = TRIE_NODEIDX( state );
3462 	    const U32 o_used = trie->trans[ stateidx ].check;
3463 	    U32 used = trie->trans[ stateidx ].check;
3464             trie->trans[ stateidx ].check = 0;
3465 
3466             for ( charid = 0;
3467                   used && charid < trie->uniquecharcount;
3468                   charid++ )
3469             {
3470                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3471                     if ( trie->trans[ stateidx + charid ].next ) {
3472                         if (o_used == 1) {
3473                             for ( ; zp < pos ; zp++ ) {
3474                                 if ( ! trie->trans[ zp ].next ) {
3475                                     break;
3476                                 }
3477                             }
3478                             trie->states[ state ].trans.base
3479                                                     = zp
3480                                                       + trie->uniquecharcount
3481                                                       - charid ;
3482                             trie->trans[ zp ].next
3483                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3484                                                              + charid ].next );
3485                             trie->trans[ zp ].check = state;
3486                             if ( ++zp > pos ) pos = zp;
3487                             break;
3488                         }
3489                         used--;
3490                     }
3491                     if ( !flag ) {
3492                         flag = 1;
3493                         trie->states[ state ].trans.base
3494                                        = pos + trie->uniquecharcount - charid ;
3495                     }
3496                     trie->trans[ pos ].next
3497                         = SAFE_TRIE_NODENUM(
3498                                        trie->trans[ stateidx + charid ].next );
3499                     trie->trans[ pos ].check = state;
3500                     pos++;
3501                 }
3502             }
3503         }
3504         trie->lasttrans = pos + 1;
3505         trie->states = (reg_trie_state *)
3506 	    PerlMemShared_realloc( trie->states, laststate
3507 				   * sizeof(reg_trie_state) );
3508         DEBUG_TRIE_COMPILE_MORE_r(
3509             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3510                 depth+1,
3511                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3512                        + 1 ),
3513                 (IV)next_alloc,
3514                 (IV)pos,
3515                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3516             );
3517 
3518         } /* end table compress */
3519     }
3520     DEBUG_TRIE_COMPILE_MORE_r(
3521             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3522                 depth+1,
3523                 (UV)trie->statecount,
3524                 (UV)trie->lasttrans)
3525     );
3526     /* resize the trans array to remove unused space */
3527     trie->trans = (reg_trie_trans *)
3528 	PerlMemShared_realloc( trie->trans, trie->lasttrans
3529 			       * sizeof(reg_trie_trans) );
3530 
3531     {   /* Modify the program and insert the new TRIE node */
3532         U8 nodetype =(U8)(flags & 0xFF);
3533         char *str=NULL;
3534 
3535 #ifdef DEBUGGING
3536         regnode *optimize = NULL;
3537 #ifdef RE_TRACK_PATTERN_OFFSETS
3538 
3539         U32 mjd_offset = 0;
3540         U32 mjd_nodelen = 0;
3541 #endif /* RE_TRACK_PATTERN_OFFSETS */
3542 #endif /* DEBUGGING */
3543         /*
3544            This means we convert either the first branch or the first Exact,
3545            depending on whether the thing following (in 'last') is a branch
3546            or not and whther first is the startbranch (ie is it a sub part of
3547            the alternation or is it the whole thing.)
3548            Assuming its a sub part we convert the EXACT otherwise we convert
3549            the whole branch sequence, including the first.
3550          */
3551         /* Find the node we are going to overwrite */
3552         if ( first != startbranch || OP( last ) == BRANCH ) {
3553             /* branch sub-chain */
3554             NEXT_OFF( first ) = (U16)(last - first);
3555 #ifdef RE_TRACK_PATTERN_OFFSETS
3556             DEBUG_r({
3557                 mjd_offset= Node_Offset((convert));
3558                 mjd_nodelen= Node_Length((convert));
3559             });
3560 #endif
3561             /* whole branch chain */
3562         }
3563 #ifdef RE_TRACK_PATTERN_OFFSETS
3564         else {
3565             DEBUG_r({
3566                 const  regnode *nop = NEXTOPER( convert );
3567                 mjd_offset= Node_Offset((nop));
3568                 mjd_nodelen= Node_Length((nop));
3569             });
3570         }
3571         DEBUG_OPTIMISE_r(
3572             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3573                 depth+1,
3574                 (UV)mjd_offset, (UV)mjd_nodelen)
3575         );
3576 #endif
3577         /* But first we check to see if there is a common prefix we can
3578            split out as an EXACT and put in front of the TRIE node.  */
3579         trie->startstate= 1;
3580         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3581             /* we want to find the first state that has more than
3582              * one transition, if that state is not the first state
3583              * then we have a common prefix which we can remove.
3584              */
3585             U32 state;
3586             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3587                 U32 ofs = 0;
3588                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3589                                        transition, -1 means none */
3590                 U32 count = 0;
3591                 const U32 base = trie->states[ state ].trans.base;
3592 
3593                 /* does this state terminate an alternation? */
3594                 if ( trie->states[state].wordnum )
3595                         count = 1;
3596 
3597                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3598                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3599                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3600                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3601                     {
3602                         if ( ++count > 1 ) {
3603                             /* we have more than one transition */
3604                             SV **tmp;
3605                             U8 *ch;
3606                             /* if this is the first state there is no common prefix
3607                              * to extract, so we can exit */
3608                             if ( state == 1 ) break;
3609                             tmp = av_fetch( revcharmap, ofs, 0);
3610                             ch = (U8*)SvPV_nolen_const( *tmp );
3611 
3612                             /* if we are on count 2 then we need to initialize the
3613                              * bitmap, and store the previous char if there was one
3614                              * in it*/
3615                             if ( count == 2 ) {
3616                                 /* clear the bitmap */
3617                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3618                                 DEBUG_OPTIMISE_r(
3619                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3620                                         depth+1,
3621                                         (UV)state));
3622                                 if (first_ofs >= 0) {
3623                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3624 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3625 
3626                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3627                                     DEBUG_OPTIMISE_r(
3628                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3629                                     );
3630 				}
3631 			    }
3632                             /* store the current firstchar in the bitmap */
3633                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3634                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3635 			}
3636                         first_ofs = ofs;
3637 		    }
3638                 }
3639                 if ( count == 1 ) {
3640                     /* This state has only one transition, its transition is part
3641                      * of a common prefix - we need to concatenate the char it
3642                      * represents to what we have so far. */
3643                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3644                     STRLEN len;
3645                     char *ch = SvPV( *tmp, len );
3646                     DEBUG_OPTIMISE_r({
3647                         SV *sv=sv_newmortal();
3648                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3649                             depth+1,
3650                             (UV)state, (UV)first_ofs,
3651                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3652 	                        PL_colors[0], PL_colors[1],
3653 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3654 	                        PERL_PV_ESCAPE_FIRSTCHAR
3655                             )
3656                         );
3657                     });
3658                     if ( state==1 ) {
3659                         OP( convert ) = nodetype;
3660                         str=STRING(convert);
3661                         setSTR_LEN(convert, 0);
3662                     }
3663                     assert( ( STR_LEN(convert) + len ) < 256 );
3664                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3665                     while (len--)
3666                         *str++ = *ch++;
3667 		} else {
3668 #ifdef DEBUGGING
3669 		    if (state>1)
3670                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3671 #endif
3672 		    break;
3673 		}
3674 	    }
3675 	    trie->prefixlen = (state-1);
3676             if (str) {
3677                 regnode *n = convert+NODE_SZ_STR(convert);
3678                 assert( NODE_SZ_STR(convert) <= U16_MAX );
3679                 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3680                 trie->startstate = state;
3681                 trie->minlen -= (state - 1);
3682                 trie->maxlen -= (state - 1);
3683 #ifdef DEBUGGING
3684                /* At least the UNICOS C compiler choked on this
3685                 * being argument to DEBUG_r(), so let's just have
3686                 * it right here. */
3687                if (
3688 #ifdef PERL_EXT_RE_BUILD
3689                    1
3690 #else
3691                    DEBUG_r_TEST
3692 #endif
3693                    ) {
3694                    regnode *fix = convert;
3695                    U32 word = trie->wordcount;
3696 #ifdef RE_TRACK_PATTERN_OFFSETS
3697                    mjd_nodelen++;
3698 #endif
3699                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3700                    while( ++fix < n ) {
3701                        Set_Node_Offset_Length(fix, 0, 0);
3702                    }
3703                    while (word--) {
3704                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3705                        if (tmp) {
3706                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3707                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3708                            else
3709                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3710                        }
3711                    }
3712                }
3713 #endif
3714                 if (trie->maxlen) {
3715                     convert = n;
3716 		} else {
3717                     NEXT_OFF(convert) = (U16)(tail - convert);
3718                     DEBUG_r(optimize= n);
3719                 }
3720             }
3721         }
3722         if (!jumper)
3723             jumper = last;
3724         if ( trie->maxlen ) {
3725 	    NEXT_OFF( convert ) = (U16)(tail - convert);
3726 	    ARG_SET( convert, data_slot );
3727 	    /* Store the offset to the first unabsorbed branch in
3728 	       jump[0], which is otherwise unused by the jump logic.
3729 	       We use this when dumping a trie and during optimisation. */
3730 	    if (trie->jump)
3731 	        trie->jump[0] = (U16)(nextbranch - convert);
3732 
3733             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3734 	     *   and there is a bitmap
3735 	     *   and the first "jump target" node we found leaves enough room
3736 	     * then convert the TRIE node into a TRIEC node, with the bitmap
3737 	     * embedded inline in the opcode - this is hypothetically faster.
3738 	     */
3739             if ( !trie->states[trie->startstate].wordnum
3740 		 && trie->bitmap
3741 		 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3742             {
3743                 OP( convert ) = TRIEC;
3744                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3745                 PerlMemShared_free(trie->bitmap);
3746                 trie->bitmap= NULL;
3747             } else
3748                 OP( convert ) = TRIE;
3749 
3750             /* store the type in the flags */
3751             convert->flags = nodetype;
3752             DEBUG_r({
3753             optimize = convert
3754                       + NODE_STEP_REGNODE
3755                       + regarglen[ OP( convert ) ];
3756             });
3757             /* XXX We really should free up the resource in trie now,
3758                    as we won't use them - (which resources?) dmq */
3759         }
3760         /* needed for dumping*/
3761         DEBUG_r(if (optimize) {
3762             regnode *opt = convert;
3763 
3764             while ( ++opt < optimize) {
3765                 Set_Node_Offset_Length(opt, 0, 0);
3766             }
3767             /*
3768                 Try to clean up some of the debris left after the
3769                 optimisation.
3770              */
3771             while( optimize < jumper ) {
3772                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3773                 OP( optimize ) = OPTIMIZED;
3774                 Set_Node_Offset_Length(optimize, 0, 0);
3775                 optimize++;
3776             }
3777             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3778         });
3779     } /* end node insert */
3780 
3781     /*  Finish populating the prev field of the wordinfo array.  Walk back
3782      *  from each accept state until we find another accept state, and if
3783      *  so, point the first word's .prev field at the second word. If the
3784      *  second already has a .prev field set, stop now. This will be the
3785      *  case either if we've already processed that word's accept state,
3786      *  or that state had multiple words, and the overspill words were
3787      *  already linked up earlier.
3788      */
3789     {
3790 	U16 word;
3791 	U32 state;
3792 	U16 prev;
3793 
3794 	for (word=1; word <= trie->wordcount; word++) {
3795 	    prev = 0;
3796 	    if (trie->wordinfo[word].prev)
3797 		continue;
3798 	    state = trie->wordinfo[word].accept;
3799 	    while (state) {
3800 		state = prev_states[state];
3801 		if (!state)
3802 		    break;
3803 		prev = trie->states[state].wordnum;
3804 		if (prev)
3805 		    break;
3806 	    }
3807 	    trie->wordinfo[word].prev = prev;
3808 	}
3809 	Safefree(prev_states);
3810     }
3811 
3812 
3813     /* and now dump out the compressed format */
3814     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3815 
3816     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3817 #ifdef DEBUGGING
3818     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3819     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3820 #else
3821     SvREFCNT_dec_NN(revcharmap);
3822 #endif
3823     return trie->jump
3824            ? MADE_JUMP_TRIE
3825            : trie->startstate>1
3826              ? MADE_EXACT_TRIE
3827              : MADE_TRIE;
3828 }
3829 
3830 STATIC regnode *
S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t * pRExC_state,regnode * source,U32 depth)3831 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3832 {
3833 /* The Trie is constructed and compressed now so we can build a fail array if
3834  * it's needed
3835 
3836    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3837    3.32 in the
3838    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3839    Ullman 1985/88
3840    ISBN 0-201-10088-6
3841 
3842    We find the fail state for each state in the trie, this state is the longest
3843    proper suffix of the current state's 'word' that is also a proper prefix of
3844    another word in our trie. State 1 represents the word '' and is thus the
3845    default fail state. This allows the DFA not to have to restart after its
3846    tried and failed a word at a given point, it simply continues as though it
3847    had been matching the other word in the first place.
3848    Consider
3849       'abcdgu'=~/abcdefg|cdgu/
3850    When we get to 'd' we are still matching the first word, we would encounter
3851    'g' which would fail, which would bring us to the state representing 'd' in
3852    the second word where we would try 'g' and succeed, proceeding to match
3853    'cdgu'.
3854  */
3855  /* add a fail transition */
3856     const U32 trie_offset = ARG(source);
3857     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3858     U32 *q;
3859     const U32 ucharcount = trie->uniquecharcount;
3860     const U32 numstates = trie->statecount;
3861     const U32 ubound = trie->lasttrans + ucharcount;
3862     U32 q_read = 0;
3863     U32 q_write = 0;
3864     U32 charid;
3865     U32 base = trie->states[ 1 ].trans.base;
3866     U32 *fail;
3867     reg_ac_data *aho;
3868     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3869     regnode *stclass;
3870     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3871 
3872     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3873     PERL_UNUSED_CONTEXT;
3874 #ifndef DEBUGGING
3875     PERL_UNUSED_ARG(depth);
3876 #endif
3877 
3878     if ( OP(source) == TRIE ) {
3879         struct regnode_1 *op = (struct regnode_1 *)
3880             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3881         StructCopy(source, op, struct regnode_1);
3882         stclass = (regnode *)op;
3883     } else {
3884         struct regnode_charclass *op = (struct regnode_charclass *)
3885             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3886         StructCopy(source, op, struct regnode_charclass);
3887         stclass = (regnode *)op;
3888     }
3889     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3890 
3891     ARG_SET( stclass, data_slot );
3892     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3893     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3894     aho->trie=trie_offset;
3895     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3896     Copy( trie->states, aho->states, numstates, reg_trie_state );
3897     Newx( q, numstates, U32);
3898     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3899     aho->refcount = 1;
3900     fail = aho->fail;
3901     /* initialize fail[0..1] to be 1 so that we always have
3902        a valid final fail state */
3903     fail[ 0 ] = fail[ 1 ] = 1;
3904 
3905     for ( charid = 0; charid < ucharcount ; charid++ ) {
3906 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3907 	if ( newstate ) {
3908             q[ q_write ] = newstate;
3909             /* set to point at the root */
3910             fail[ q[ q_write++ ] ]=1;
3911         }
3912     }
3913     while ( q_read < q_write) {
3914 	const U32 cur = q[ q_read++ % numstates ];
3915         base = trie->states[ cur ].trans.base;
3916 
3917         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3918 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3919 	    if (ch_state) {
3920                 U32 fail_state = cur;
3921                 U32 fail_base;
3922                 do {
3923                     fail_state = fail[ fail_state ];
3924                     fail_base = aho->states[ fail_state ].trans.base;
3925                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3926 
3927                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3928                 fail[ ch_state ] = fail_state;
3929                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3930                 {
3931                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3932                 }
3933                 q[ q_write++ % numstates] = ch_state;
3934             }
3935         }
3936     }
3937     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3938        when we fail in state 1, this allows us to use the
3939        charclass scan to find a valid start char. This is based on the principle
3940        that theres a good chance the string being searched contains lots of stuff
3941        that cant be a start char.
3942      */
3943     fail[ 0 ] = fail[ 1 ] = 0;
3944     DEBUG_TRIE_COMPILE_r({
3945         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3946                       depth, (UV)numstates
3947         );
3948         for( q_read=1; q_read<numstates; q_read++ ) {
3949             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3950         }
3951         Perl_re_printf( aTHX_  "\n");
3952     });
3953     Safefree(q);
3954     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3955     return stclass;
3956 }
3957 
3958 
3959 /* The below joins as many adjacent EXACTish nodes as possible into a single
3960  * one.  The regop may be changed if the node(s) contain certain sequences that
3961  * require special handling.  The joining is only done if:
3962  * 1) there is room in the current conglomerated node to entirely contain the
3963  *    next one.
3964  * 2) they are compatible node types
3965  *
3966  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3967  * these get optimized out
3968  *
3969  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3970  * as possible, even if that means splitting an existing node so that its first
3971  * part is moved to the preceeding node.  This would maximise the efficiency of
3972  * memEQ during matching.
3973  *
3974  * If a node is to match under /i (folded), the number of characters it matches
3975  * can be different than its character length if it contains a multi-character
3976  * fold.  *min_subtract is set to the total delta number of characters of the
3977  * input nodes.
3978  *
3979  * And *unfolded_multi_char is set to indicate whether or not the node contains
3980  * an unfolded multi-char fold.  This happens when it won't be known until
3981  * runtime whether the fold is valid or not; namely
3982  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3983  *      target string being matched against turns out to be UTF-8 is that fold
3984  *      valid; or
3985  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3986  *      runtime.
3987  * (Multi-char folds whose components are all above the Latin1 range are not
3988  * run-time locale dependent, and have already been folded by the time this
3989  * function is called.)
3990  *
3991  * This is as good a place as any to discuss the design of handling these
3992  * multi-character fold sequences.  It's been wrong in Perl for a very long
3993  * time.  There are three code points in Unicode whose multi-character folds
3994  * were long ago discovered to mess things up.  The previous designs for
3995  * dealing with these involved assigning a special node for them.  This
3996  * approach doesn't always work, as evidenced by this example:
3997  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3998  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3999  * would match just the \xDF, it won't be able to handle the case where a
4000  * successful match would have to cross the node's boundary.  The new approach
4001  * that hopefully generally solves the problem generates an EXACTFUP node
4002  * that is "sss" in this case.
4003  *
4004  * It turns out that there are problems with all multi-character folds, and not
4005  * just these three.  Now the code is general, for all such cases.  The
4006  * approach taken is:
4007  * 1)   This routine examines each EXACTFish node that could contain multi-
4008  *      character folded sequences.  Since a single character can fold into
4009  *      such a sequence, the minimum match length for this node is less than
4010  *      the number of characters in the node.  This routine returns in
4011  *      *min_subtract how many characters to subtract from the actual
4012  *      length of the string to get a real minimum match length; it is 0 if
4013  *      there are no multi-char foldeds.  This delta is used by the caller to
4014  *      adjust the min length of the match, and the delta between min and max,
4015  *      so that the optimizer doesn't reject these possibilities based on size
4016  *      constraints.
4017  *
4018  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4019  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
4020  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4021  *      EXACTFU nodes.  The node type of such nodes is then changed to
4022  *      EXACTFUP, indicating it is problematic, and needs careful handling.
4023  *      (The procedures in step 1) above are sufficient to handle this case in
4024  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
4025  *      the only case where there is a possible fold length change in non-UTF-8
4026  *      patterns.  By reserving a special node type for problematic cases, the
4027  *      far more common regular EXACTFU nodes can be processed faster.
4028  *      regexec.c takes advantage of this.
4029  *
4030  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4031  *      problematic cases.   These all only occur when the pattern is not
4032  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
4033  *      length change, it handles the situation where the string cannot be
4034  *      entirely folded.  The strings in an EXACTFish node are folded as much
4035  *      as possible during compilation in regcomp.c.  This saves effort in
4036  *      regex matching.  By using an EXACTFUP node when it is not possible to
4037  *      fully fold at compile time, regexec.c can know that everything in an
4038  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
4039  *      case where folding in EXACTFU nodes can't be done at compile time is
4040  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
4041  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
4042  *      handle two very different cases.  Alternatively, there could have been
4043  *      a node type where there are length changes, one for unfolded, and one
4044  *      for both.  If yet another special case needed to be created, the number
4045  *      of required node types would have to go to 7.  khw figures that even
4046  *      though there are plenty of node types to spare, that the maintenance
4047  *      cost wasn't worth the small speedup of doing it that way, especially
4048  *      since he thinks the MICRO SIGN is rarely encountered in practice.
4049  *
4050  *      There are other cases where folding isn't done at compile time, but
4051  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
4052  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
4053  *      changes.  Some folds in EXACTF depend on if the runtime target string
4054  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
4055  *      when no fold in it depends on the UTF-8ness of the target string.)
4056  *
4057  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
4058  *      validity of the fold won't be known until runtime, and so must remain
4059  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
4060  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
4061  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
4062  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4063  *      The reason this is a problem is that the optimizer part of regexec.c
4064  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4065  *      that a character in the pattern corresponds to at most a single
4066  *      character in the target string.  (And I do mean character, and not byte
4067  *      here, unlike other parts of the documentation that have never been
4068  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
4069  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4070  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
4071  *      EXACTFL nodes, violate the assumption, and they are the only instances
4072  *      where it is violated.  I'm reluctant to try to change the assumption,
4073  *      as the code involved is impenetrable to me (khw), so instead the code
4074  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
4075  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4076  *      boolean indicating whether or not the node contains such a fold.  When
4077  *      it is true, the caller sets a flag that later causes the optimizer in
4078  *      this file to not set values for the floating and fixed string lengths,
4079  *      and thus avoids the optimizer code in regexec.c that makes the invalid
4080  *      assumption.  Thus, there is no optimization based on string lengths for
4081  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4082  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
4083  *      assumption is wrong only in these cases is that all other non-UTF-8
4084  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4085  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
4086  *      EXACTF nodes because we don't know at compile time if it actually
4087  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
4088  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
4089  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
4090  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4091  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4092  *      string would require the pattern to be forced into UTF-8, the overhead
4093  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
4094  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4095  *      locale.)
4096  *
4097  *      Similarly, the code that generates tries doesn't currently handle
4098  *      not-already-folded multi-char folds, and it looks like a pain to change
4099  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4100  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4101  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4102  *      using /iaa matching will be doing so almost entirely with ASCII
4103  *      strings, so this should rarely be encountered in practice */
4104 
4105 STATIC U32
S_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)4106 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4107                    UV *min_subtract, bool *unfolded_multi_char,
4108                    U32 flags, regnode *val, U32 depth)
4109 {
4110     /* Merge several consecutive EXACTish nodes into one. */
4111 
4112     regnode *n = regnext(scan);
4113     U32 stringok = 1;
4114     regnode *next = scan + NODE_SZ_STR(scan);
4115     U32 merged = 0;
4116     U32 stopnow = 0;
4117 #ifdef DEBUGGING
4118     regnode *stop = scan;
4119     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4120 #else
4121     PERL_UNUSED_ARG(depth);
4122 #endif
4123 
4124     PERL_ARGS_ASSERT_JOIN_EXACT;
4125 #ifndef EXPERIMENTAL_INPLACESCAN
4126     PERL_UNUSED_ARG(flags);
4127     PERL_UNUSED_ARG(val);
4128 #endif
4129     DEBUG_PEEP("join", scan, depth, 0);
4130 
4131     assert(PL_regkind[OP(scan)] == EXACT);
4132 
4133     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4134      * EXACT ones that are mergeable to the current one. */
4135     while (    n
4136            && (    PL_regkind[OP(n)] == NOTHING
4137                || (stringok && PL_regkind[OP(n)] == EXACT))
4138            && NEXT_OFF(n)
4139            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4140     {
4141 
4142         if (OP(n) == TAIL || n > next)
4143             stringok = 0;
4144         if (PL_regkind[OP(n)] == NOTHING) {
4145             DEBUG_PEEP("skip:", n, depth, 0);
4146             NEXT_OFF(scan) += NEXT_OFF(n);
4147             next = n + NODE_STEP_REGNODE;
4148 #ifdef DEBUGGING
4149             if (stringok)
4150                 stop = n;
4151 #endif
4152             n = regnext(n);
4153         }
4154         else if (stringok) {
4155             const unsigned int oldl = STR_LEN(scan);
4156             regnode * const nnext = regnext(n);
4157 
4158             /* XXX I (khw) kind of doubt that this works on platforms (should
4159              * Perl ever run on one) where U8_MAX is above 255 because of lots
4160              * of other assumptions */
4161             /* Don't join if the sum can't fit into a single node */
4162             if (oldl + STR_LEN(n) > U8_MAX)
4163                 break;
4164 
4165             /* Joining something that requires UTF-8 with something that
4166              * doesn't, means the result requires UTF-8. */
4167             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4168                 OP(scan) = EXACT_REQ8;
4169             }
4170             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4171                 ;   /* join is compatible, no need to change OP */
4172             }
4173             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4174                 OP(scan) = EXACTFU_REQ8;
4175             }
4176             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4177                 ;   /* join is compatible, no need to change OP */
4178             }
4179             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4180                 ;   /* join is compatible, no need to change OP */
4181             }
4182             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4183 
4184                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4185                   * which can join with EXACTFU ones.  We check for this case
4186                   * here.  These need to be resolved to either EXACTFU or
4187                   * EXACTF at joining time.  They have nothing in them that
4188                   * would forbid them from being the more desirable EXACTFU
4189                   * nodes except that they begin and/or end with a single [Ss].
4190                   * The reason this is problematic is because they could be
4191                   * joined in this loop with an adjacent node that ends and/or
4192                   * begins with [Ss] which would then form the sequence 'ss',
4193                   * which matches differently under /di than /ui, in which case
4194                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4195                   * formed, the nodes get absorbed into any adjacent EXACTFU
4196                   * node.  And if the only adjacent node is EXACTF, they get
4197                   * absorbed into that, under the theory that a longer node is
4198                   * better than two shorter ones, even if one is EXACTFU.  Note
4199                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4200                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4201 
4202                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4203 
4204                     /* Here the joined node would end with 's'.  If the node
4205                      * following the combination is an EXACTF one, it's better to
4206                      * join this trailing edge 's' node with that one, leaving the
4207                      * current one in 'scan' be the more desirable EXACTFU */
4208                     if (OP(nnext) == EXACTF) {
4209                         break;
4210                     }
4211 
4212                     OP(scan) = EXACTFU_S_EDGE;
4213 
4214                 }   /* Otherwise, the beginning 's' of the 2nd node just
4215                        becomes an interior 's' in 'scan' */
4216             }
4217             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4218                 ;   /* join is compatible, no need to change OP */
4219             }
4220             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4221 
4222                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4223                  * nodes.  But the latter nodes can be also joined with EXACTFU
4224                  * ones, and that is a better outcome, so if the node following
4225                  * 'n' is EXACTFU, quit now so that those two can be joined
4226                  * later */
4227                 if (OP(nnext) == EXACTFU) {
4228                     break;
4229                 }
4230 
4231                 /* The join is compatible, and the combined node will be
4232                  * EXACTF.  (These don't care if they begin or end with 's' */
4233             }
4234             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4235                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4236                     && STRING(n)[0] == 's')
4237                 {
4238                     /* When combined, we have the sequence 'ss', which means we
4239                      * have to remain /di */
4240                     OP(scan) = EXACTF;
4241                 }
4242             }
4243             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4244                 if (STRING(n)[0] == 's') {
4245                     ;   /* Here the join is compatible and the combined node
4246                            starts with 's', no need to change OP */
4247                 }
4248                 else {  /* Now the trailing 's' is in the interior */
4249                     OP(scan) = EXACTFU;
4250                 }
4251             }
4252             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4253 
4254                 /* The join is compatible, and the combined node will be
4255                  * EXACTF.  (These don't care if they begin or end with 's' */
4256                 OP(scan) = EXACTF;
4257             }
4258             else if (OP(scan) != OP(n)) {
4259 
4260                 /* The only other compatible joinings are the same node type */
4261                 break;
4262             }
4263 
4264             DEBUG_PEEP("merg", n, depth, 0);
4265             merged++;
4266 
4267             NEXT_OFF(scan) += NEXT_OFF(n);
4268             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4269             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4270             next = n + NODE_SZ_STR(n);
4271             /* Now we can overwrite *n : */
4272             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4273 #ifdef DEBUGGING
4274             stop = next - 1;
4275 #endif
4276             n = nnext;
4277             if (stopnow) break;
4278         }
4279 
4280 #ifdef EXPERIMENTAL_INPLACESCAN
4281 	if (flags && !NEXT_OFF(n)) {
4282 	    DEBUG_PEEP("atch", val, depth, 0);
4283 	    if (reg_off_by_arg[OP(n)]) {
4284 		ARG_SET(n, val - n);
4285 	    }
4286 	    else {
4287 		NEXT_OFF(n) = val - n;
4288 	    }
4289 	    stopnow = 1;
4290 	}
4291 #endif
4292     }
4293 
4294     /* This temporary node can now be turned into EXACTFU, and must, as
4295      * regexec.c doesn't handle it */
4296     if (OP(scan) == EXACTFU_S_EDGE) {
4297         OP(scan) = EXACTFU;
4298     }
4299 
4300     *min_subtract = 0;
4301     *unfolded_multi_char = FALSE;
4302 
4303     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4304      * can now analyze for sequences of problematic code points.  (Prior to
4305      * this final joining, sequences could have been split over boundaries, and
4306      * hence missed).  The sequences only happen in folding, hence for any
4307      * non-EXACT EXACTish node */
4308     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4309         U8* s0 = (U8*) STRING(scan);
4310         U8* s = s0;
4311         U8* s_end = s0 + STR_LEN(scan);
4312 
4313         int total_count_delta = 0;  /* Total delta number of characters that
4314                                        multi-char folds expand to */
4315 
4316 	/* One pass is made over the node's string looking for all the
4317 	 * possibilities.  To avoid some tests in the loop, there are two main
4318 	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4319 	 * non-UTF-8 */
4320 	if (UTF) {
4321             U8* folded = NULL;
4322 
4323             if (OP(scan) == EXACTFL) {
4324                 U8 *d;
4325 
4326                 /* An EXACTFL node would already have been changed to another
4327                  * node type unless there is at least one character in it that
4328                  * is problematic; likely a character whose fold definition
4329                  * won't be known until runtime, and so has yet to be folded.
4330                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4331                  * to handle the UTF-8 case, we need to create a temporary
4332                  * folded copy using UTF-8 locale rules in order to analyze it.
4333                  * This is because our macros that look to see if a sequence is
4334                  * a multi-char fold assume everything is folded (otherwise the
4335                  * tests in those macros would be too complicated and slow).
4336                  * Note that here, the non-problematic folds will have already
4337                  * been done, so we can just copy such characters.  We actually
4338                  * don't completely fold the EXACTFL string.  We skip the
4339                  * unfolded multi-char folds, as that would just create work
4340                  * below to figure out the size they already are */
4341 
4342                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4343                 d = folded;
4344                 while (s < s_end) {
4345                     STRLEN s_len = UTF8SKIP(s);
4346                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4347                         Copy(s, d, s_len, U8);
4348                         d += s_len;
4349                     }
4350                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4351                         *unfolded_multi_char = TRUE;
4352                         Copy(s, d, s_len, U8);
4353                         d += s_len;
4354                     }
4355                     else if (isASCII(*s)) {
4356                         *(d++) = toFOLD(*s);
4357                     }
4358                     else {
4359                         STRLEN len;
4360                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4361                         d += len;
4362                     }
4363                     s += s_len;
4364                 }
4365 
4366                 /* Point the remainder of the routine to look at our temporary
4367                  * folded copy */
4368                 s = folded;
4369                 s_end = d;
4370             } /* End of creating folded copy of EXACTFL string */
4371 
4372             /* Examine the string for a multi-character fold sequence.  UTF-8
4373              * patterns have all characters pre-folded by the time this code is
4374              * executed */
4375             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4376                                      length sequence we are looking for is 2 */
4377 	    {
4378                 int count = 0;  /* How many characters in a multi-char fold */
4379                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4380                 if (! len) {    /* Not a multi-char fold: get next char */
4381                     s += UTF8SKIP(s);
4382                     continue;
4383                 }
4384 
4385                 { /* Here is a generic multi-char fold. */
4386                     U8* multi_end  = s + len;
4387 
4388                     /* Count how many characters are in it.  In the case of
4389                      * /aa, no folds which contain ASCII code points are
4390                      * allowed, so check for those, and skip if found. */
4391                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4392                         count = utf8_length(s, multi_end);
4393                         s = multi_end;
4394                     }
4395                     else {
4396                         while (s < multi_end) {
4397                             if (isASCII(*s)) {
4398                                 s++;
4399                                 goto next_iteration;
4400                             }
4401                             else {
4402                                 s += UTF8SKIP(s);
4403                             }
4404                             count++;
4405                         }
4406                     }
4407                 }
4408 
4409                 /* The delta is how long the sequence is minus 1 (1 is how long
4410                  * the character that folds to the sequence is) */
4411                 total_count_delta += count - 1;
4412               next_iteration: ;
4413 	    }
4414 
4415             /* We created a temporary folded copy of the string in EXACTFL
4416              * nodes.  Therefore we need to be sure it doesn't go below zero,
4417              * as the real string could be shorter */
4418             if (OP(scan) == EXACTFL) {
4419                 int total_chars = utf8_length((U8*) STRING(scan),
4420                                            (U8*) STRING(scan) + STR_LEN(scan));
4421                 if (total_count_delta > total_chars) {
4422                     total_count_delta = total_chars;
4423                 }
4424             }
4425 
4426             *min_subtract += total_count_delta;
4427             Safefree(folded);
4428 	}
4429 	else if (OP(scan) == EXACTFAA) {
4430 
4431             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4432              * fold to the ASCII range (and there are no existing ones in the
4433              * upper latin1 range).  But, as outlined in the comments preceding
4434              * this function, we need to flag any occurrences of the sharp s.
4435              * This character forbids trie formation (because of added
4436              * complexity) */
4437 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4438    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4439                                       || UNICODE_DOT_DOT_VERSION > 0)
4440 	    while (s < s_end) {
4441                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4442                     OP(scan) = EXACTFAA_NO_TRIE;
4443                     *unfolded_multi_char = TRUE;
4444                     break;
4445                 }
4446                 s++;
4447             }
4448         }
4449 	else if (OP(scan) != EXACTFAA_NO_TRIE) {
4450 
4451             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4452              * folds that are all Latin1.  As explained in the comments
4453              * preceding this function, we look also for the sharp s in EXACTF
4454              * and EXACTFL nodes; it can be in the final position.  Otherwise
4455              * we can stop looking 1 byte earlier because have to find at least
4456              * two characters for a multi-fold */
4457 	    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4458                               ? s_end
4459                               : s_end -1;
4460 
4461 	    while (s < upper) {
4462                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4463                 if (! len) {    /* Not a multi-char fold. */
4464                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4465                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4466                     {
4467                         *unfolded_multi_char = TRUE;
4468                     }
4469                     s++;
4470                     continue;
4471                 }
4472 
4473                 if (len == 2
4474                     && isALPHA_FOLD_EQ(*s, 's')
4475                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4476                 {
4477 
4478                     /* EXACTF nodes need to know that the minimum length
4479                      * changed so that a sharp s in the string can match this
4480                      * ss in the pattern, but they remain EXACTF nodes, as they
4481                      * won't match this unless the target string is in UTF-8,
4482                      * which we don't know until runtime.  EXACTFL nodes can't
4483                      * transform into EXACTFU nodes */
4484                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4485                         OP(scan) = EXACTFUP;
4486                     }
4487 		}
4488 
4489                 *min_subtract += len - 1;
4490                 s += len;
4491 	    }
4492 #endif
4493 	}
4494     }
4495 
4496 #ifdef DEBUGGING
4497     /* Allow dumping but overwriting the collection of skipped
4498      * ops and/or strings with fake optimized ops */
4499     n = scan + NODE_SZ_STR(scan);
4500     while (n <= stop) {
4501 	OP(n) = OPTIMIZED;
4502 	FLAGS(n) = 0;
4503 	NEXT_OFF(n) = 0;
4504         n++;
4505     }
4506 #endif
4507     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4508     return stopnow;
4509 }
4510 
4511 /* REx optimizer.  Converts nodes into quicker variants "in place".
4512    Finds fixed substrings.  */
4513 
4514 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4515    to the position after last scanned or to NULL. */
4516 
4517 #define INIT_AND_WITHP \
4518     assert(!and_withp); \
4519     Newx(and_withp, 1, regnode_ssc); \
4520     SAVEFREEPV(and_withp)
4521 
4522 
4523 static void
S_unwind_scan_frames(pTHX_ const void * p)4524 S_unwind_scan_frames(pTHX_ const void *p)
4525 {
4526     scan_frame *f= (scan_frame *)p;
4527     do {
4528         scan_frame *n= f->next_frame;
4529         Safefree(f);
4530         f= n;
4531     } while (f);
4532 }
4533 
4534 /* Follow the next-chain of the current node and optimize away
4535    all the NOTHINGs from it.
4536  */
4537 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)4538 S_rck_elide_nothing(pTHX_ regnode *node)
4539 {
4540     dVAR;
4541 
4542     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4543 
4544     if (OP(node) != CURLYX) {
4545         const int max = (reg_off_by_arg[OP(node)]
4546                         ? I32_MAX
4547                           /* I32 may be smaller than U16 on CRAYs! */
4548                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4549         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4550         int noff;
4551         regnode *n = node;
4552 
4553         /* Skip NOTHING and LONGJMP. */
4554         while (
4555             (n = regnext(n))
4556             && (
4557                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4558                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4559             )
4560             && off + noff < max
4561         ) {
4562             off += noff;
4563         }
4564         if (reg_off_by_arg[OP(node)])
4565             ARG(node) = off;
4566         else
4567             NEXT_OFF(node) = off;
4568     }
4569     return;
4570 }
4571 
4572 /* the return from this sub is the minimum length that could possibly match */
4573 STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)4574 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4575                         SSize_t *minlenp, SSize_t *deltap,
4576 			regnode *last,
4577 			scan_data_t *data,
4578 			I32 stopparen,
4579                         U32 recursed_depth,
4580 			regnode_ssc *and_withp,
4581 			U32 flags, U32 depth, bool was_mutate_ok)
4582 			/* scanp: Start here (read-write). */
4583 			/* deltap: Write maxlen-minlen here. */
4584 			/* last: Stop before this one. */
4585 			/* data: string data about the pattern */
4586 			/* stopparen: treat close N as END */
4587 			/* recursed: which subroutines have we recursed into */
4588 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4589 {
4590     dVAR;
4591     SSize_t final_minlen;
4592     /* There must be at least this number of characters to match */
4593     SSize_t min = 0;
4594     I32 pars = 0, code;
4595     regnode *scan = *scanp, *next;
4596     SSize_t delta = 0;
4597     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4598     int is_inf_internal = 0;		/* The studied chunk is infinite */
4599     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4600     scan_data_t data_fake;
4601     SV *re_trie_maxbuff = NULL;
4602     regnode *first_non_open = scan;
4603     SSize_t stopmin = OPTIMIZE_INFTY;
4604     scan_frame *frame = NULL;
4605     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4606 
4607     PERL_ARGS_ASSERT_STUDY_CHUNK;
4608     RExC_study_started= 1;
4609 
4610     Zero(&data_fake, 1, scan_data_t);
4611 
4612     if ( depth == 0 ) {
4613         while (first_non_open && OP(first_non_open) == OPEN)
4614             first_non_open=regnext(first_non_open);
4615     }
4616 
4617 
4618   fake_study_recurse:
4619     DEBUG_r(
4620         RExC_study_chunk_recursed_count++;
4621     );
4622     DEBUG_OPTIMISE_MORE_r(
4623     {
4624         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4625             depth, (long)stopparen,
4626             (unsigned long)RExC_study_chunk_recursed_count,
4627             (unsigned long)depth, (unsigned long)recursed_depth,
4628             scan,
4629             last);
4630         if (recursed_depth) {
4631             U32 i;
4632             U32 j;
4633             for ( j = 0 ; j < recursed_depth ; j++ ) {
4634                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4635                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4636                         Perl_re_printf( aTHX_ " %d",(int)i);
4637                         break;
4638                     }
4639                 }
4640                 if ( j + 1 < recursed_depth ) {
4641                     Perl_re_printf( aTHX_  ",");
4642                 }
4643             }
4644         }
4645         Perl_re_printf( aTHX_ "\n");
4646     }
4647     );
4648     while ( scan && OP(scan) != END && scan < last ){
4649         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4650                                    node length to get a real minimum (because
4651                                    the folded version may be shorter) */
4652 	bool unfolded_multi_char = FALSE;
4653         /* avoid mutating ops if we are anywhere within the recursed or
4654          * enframed handling for a GOSUB: the outermost level will handle it.
4655          */
4656         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4657 	/* Peephole optimizer: */
4658         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4659         DEBUG_PEEP("Peep", scan, depth, flags);
4660 
4661 
4662         /* The reason we do this here is that we need to deal with things like
4663          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4664          * parsing code, as each (?:..) is handled by a different invocation of
4665          * reg() -- Yves
4666          */
4667         if (PL_regkind[OP(scan)] == EXACT
4668             && OP(scan) != LEXACT
4669             && OP(scan) != LEXACT_REQ8
4670             && mutate_ok
4671         ) {
4672             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4673                     0, NULL, depth + 1);
4674         }
4675 
4676         /* Follow the next-chain of the current node and optimize
4677            away all the NOTHINGs from it.
4678          */
4679         rck_elide_nothing(scan);
4680 
4681         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4682          * several different things.  */
4683         if ( OP(scan) == DEFINEP ) {
4684             SSize_t minlen = 0;
4685             SSize_t deltanext = 0;
4686             SSize_t fake_last_close = 0;
4687             I32 f = SCF_IN_DEFINE;
4688 
4689             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4690             scan = regnext(scan);
4691             assert( OP(scan) == IFTHEN );
4692             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4693 
4694             data_fake.last_closep= &fake_last_close;
4695             minlen = *minlenp;
4696             next = regnext(scan);
4697             scan = NEXTOPER(NEXTOPER(scan));
4698             DEBUG_PEEP("scan", scan, depth, flags);
4699             DEBUG_PEEP("next", next, depth, flags);
4700 
4701             /* we suppose the run is continuous, last=next...
4702              * NOTE we dont use the return here! */
4703             /* DEFINEP study_chunk() recursion */
4704             (void)study_chunk(pRExC_state, &scan, &minlen,
4705                               &deltanext, next, &data_fake, stopparen,
4706                               recursed_depth, NULL, f, depth+1, mutate_ok);
4707 
4708             scan = next;
4709         } else
4710         if (
4711             OP(scan) == BRANCH  ||
4712             OP(scan) == BRANCHJ ||
4713             OP(scan) == IFTHEN
4714         ) {
4715 	    next = regnext(scan);
4716 	    code = OP(scan);
4717 
4718             /* The op(next)==code check below is to see if we
4719              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4720              * IFTHEN is special as it might not appear in pairs.
4721              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4722              * we dont handle it cleanly. */
4723 	    if (OP(next) == code || code == IFTHEN) {
4724                 /* NOTE - There is similar code to this block below for
4725                  * handling TRIE nodes on a re-study.  If you change stuff here
4726                  * check there too. */
4727 		SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4728 		regnode_ssc accum;
4729 		regnode * const startbranch=scan;
4730 
4731                 if (flags & SCF_DO_SUBSTR) {
4732                     /* Cannot merge strings after this. */
4733                     scan_commit(pRExC_state, data, minlenp, is_inf);
4734                 }
4735 
4736                 if (flags & SCF_DO_STCLASS)
4737 		    ssc_init_zero(pRExC_state, &accum);
4738 
4739 		while (OP(scan) == code) {
4740 		    SSize_t deltanext, minnext, fake;
4741 		    I32 f = 0;
4742 		    regnode_ssc this_class;
4743 
4744                     DEBUG_PEEP("Branch", scan, depth, flags);
4745 
4746 		    num++;
4747                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4748 		    if (data) {
4749 			data_fake.whilem_c = data->whilem_c;
4750 			data_fake.last_closep = data->last_closep;
4751 		    }
4752 		    else
4753 			data_fake.last_closep = &fake;
4754 
4755 		    data_fake.pos_delta = delta;
4756 		    next = regnext(scan);
4757 
4758                     scan = NEXTOPER(scan); /* everything */
4759                     if (code != BRANCH)    /* everything but BRANCH */
4760 			scan = NEXTOPER(scan);
4761 
4762 		    if (flags & SCF_DO_STCLASS) {
4763 			ssc_init(pRExC_state, &this_class);
4764 			data_fake.start_class = &this_class;
4765 			f = SCF_DO_STCLASS_AND;
4766 		    }
4767 		    if (flags & SCF_WHILEM_VISITED_POS)
4768 			f |= SCF_WHILEM_VISITED_POS;
4769 
4770 		    /* we suppose the run is continuous, last=next...*/
4771                     /* recurse study_chunk() for each BRANCH in an alternation */
4772 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
4773                                       &deltanext, next, &data_fake, stopparen,
4774                                       recursed_depth, NULL, f, depth+1,
4775                                       mutate_ok);
4776 
4777 		    if (min1 > minnext)
4778 			min1 = minnext;
4779 		    if (deltanext == OPTIMIZE_INFTY) {
4780 			is_inf = is_inf_internal = 1;
4781 			max1 = OPTIMIZE_INFTY;
4782 		    } else if (max1 < minnext + deltanext)
4783 			max1 = minnext + deltanext;
4784 		    scan = next;
4785 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4786 			pars++;
4787 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
4788 	                if ( stopmin > minnext)
4789 	                    stopmin = min + min1;
4790 	                flags &= ~SCF_DO_SUBSTR;
4791 	                if (data)
4792 	                    data->flags |= SCF_SEEN_ACCEPT;
4793 	            }
4794 		    if (data) {
4795 			if (data_fake.flags & SF_HAS_EVAL)
4796 			    data->flags |= SF_HAS_EVAL;
4797 			data->whilem_c = data_fake.whilem_c;
4798 		    }
4799 		    if (flags & SCF_DO_STCLASS)
4800 			ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4801 		}
4802 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4803 		    min1 = 0;
4804 		if (flags & SCF_DO_SUBSTR) {
4805 		    data->pos_min += min1;
4806 		    if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4807 		        data->pos_delta = OPTIMIZE_INFTY;
4808 		    else
4809 		        data->pos_delta += max1 - min1;
4810 		    if (max1 != min1 || is_inf)
4811 			data->cur_is_floating = 1;
4812 		}
4813 		min += min1;
4814 		if (delta == OPTIMIZE_INFTY
4815 		 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4816 		    delta = OPTIMIZE_INFTY;
4817 		else
4818 		    delta += max1 - min1;
4819 		if (flags & SCF_DO_STCLASS_OR) {
4820 		    ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4821 		    if (min1) {
4822 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4823 			flags &= ~SCF_DO_STCLASS;
4824 		    }
4825 		}
4826 		else if (flags & SCF_DO_STCLASS_AND) {
4827 		    if (min1) {
4828 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4829 			flags &= ~SCF_DO_STCLASS;
4830 		    }
4831 		    else {
4832 			/* Switch to OR mode: cache the old value of
4833 			 * data->start_class */
4834 			INIT_AND_WITHP;
4835 			StructCopy(data->start_class, and_withp, regnode_ssc);
4836 			flags &= ~SCF_DO_STCLASS_AND;
4837 			StructCopy(&accum, data->start_class, regnode_ssc);
4838 			flags |= SCF_DO_STCLASS_OR;
4839 		    }
4840 		}
4841 
4842                 if (PERL_ENABLE_TRIE_OPTIMISATION
4843                     && OP(startbranch) == BRANCH
4844                     && mutate_ok
4845                 ) {
4846 		/* demq.
4847 
4848                    Assuming this was/is a branch we are dealing with: 'scan'
4849                    now points at the item that follows the branch sequence,
4850                    whatever it is. We now start at the beginning of the
4851                    sequence and look for subsequences of
4852 
4853 		   BRANCH->EXACT=>x1
4854 		   BRANCH->EXACT=>x2
4855 		   tail
4856 
4857                    which would be constructed from a pattern like
4858                    /A|LIST|OF|WORDS/
4859 
4860 		   If we can find such a subsequence we need to turn the first
4861 		   element into a trie and then add the subsequent branch exact
4862 		   strings to the trie.
4863 
4864 		   We have two cases
4865 
4866                      1. patterns where the whole set of branches can be
4867                         converted.
4868 
4869 		     2. patterns where only a subset can be converted.
4870 
4871 		   In case 1 we can replace the whole set with a single regop
4872 		   for the trie. In case 2 we need to keep the start and end
4873 		   branches so
4874 
4875 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4876 		     becomes BRANCH TRIE; BRANCH X;
4877 
4878 		  There is an additional case, that being where there is a
4879 		  common prefix, which gets split out into an EXACT like node
4880 		  preceding the TRIE node.
4881 
4882 		  If x(1..n)==tail then we can do a simple trie, if not we make
4883 		  a "jump" trie, such that when we match the appropriate word
4884 		  we "jump" to the appropriate tail node. Essentially we turn
4885 		  a nested if into a case structure of sorts.
4886 
4887 		*/
4888 
4889 		    int made=0;
4890 		    if (!re_trie_maxbuff) {
4891 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4892 			if (!SvIOK(re_trie_maxbuff))
4893 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4894 		    }
4895                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4896                         regnode *cur;
4897                         regnode *first = (regnode *)NULL;
4898                         regnode *prev = (regnode *)NULL;
4899                         regnode *tail = scan;
4900                         U8 trietype = 0;
4901                         U32 count=0;
4902 
4903                         /* var tail is used because there may be a TAIL
4904                            regop in the way. Ie, the exacts will point to the
4905                            thing following the TAIL, but the last branch will
4906                            point at the TAIL. So we advance tail. If we
4907                            have nested (?:) we may have to move through several
4908                            tails.
4909                          */
4910 
4911                         while ( OP( tail ) == TAIL ) {
4912                             /* this is the TAIL generated by (?:) */
4913                             tail = regnext( tail );
4914                         }
4915 
4916 
4917                         DEBUG_TRIE_COMPILE_r({
4918                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4919                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4920                               depth+1,
4921                               "Looking for TRIE'able sequences. Tail node is ",
4922                               (UV) REGNODE_OFFSET(tail),
4923                               SvPV_nolen_const( RExC_mysv )
4924                             );
4925                         });
4926 
4927                         /*
4928 
4929                             Step through the branches
4930                                 cur represents each branch,
4931                                 noper is the first thing to be matched as part
4932                                       of that branch
4933                                 noper_next is the regnext() of that node.
4934 
4935                             We normally handle a case like this
4936                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4937                             support building with NOJUMPTRIE, which restricts
4938                             the trie logic to structures like /FOO|BAR/.
4939 
4940                             If noper is a trieable nodetype then the branch is
4941                             a possible optimization target. If we are building
4942                             under NOJUMPTRIE then we require that noper_next is
4943                             the same as scan (our current position in the regex
4944                             program).
4945 
4946                             Once we have two or more consecutive such branches
4947                             we can create a trie of the EXACT's contents and
4948                             stitch it in place into the program.
4949 
4950                             If the sequence represents all of the branches in
4951                             the alternation we replace the entire thing with a
4952                             single TRIE node.
4953 
4954                             Otherwise when it is a subsequence we need to
4955                             stitch it in place and replace only the relevant
4956                             branches. This means the first branch has to remain
4957                             as it is used by the alternation logic, and its
4958                             next pointer, and needs to be repointed at the item
4959                             on the branch chain following the last branch we
4960                             have optimized away.
4961 
4962                             This could be either a BRANCH, in which case the
4963                             subsequence is internal, or it could be the item
4964                             following the branch sequence in which case the
4965                             subsequence is at the end (which does not
4966                             necessarily mean the first node is the start of the
4967                             alternation).
4968 
4969                             TRIE_TYPE(X) is a define which maps the optype to a
4970                             trietype.
4971 
4972                                 optype          |  trietype
4973                                 ----------------+-----------
4974                                 NOTHING         | NOTHING
4975                                 EXACT           | EXACT
4976                                 EXACT_REQ8     | EXACT
4977                                 EXACTFU         | EXACTFU
4978                                 EXACTFU_REQ8   | EXACTFU
4979                                 EXACTFUP        | EXACTFU
4980                                 EXACTFAA        | EXACTFAA
4981                                 EXACTL          | EXACTL
4982                                 EXACTFLU8       | EXACTFLU8
4983 
4984 
4985                         */
4986 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4987                        ? NOTHING                                            \
4988                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4989                          ? EXACT                                            \
4990                          : (     EXACTFU == (X)                             \
4991                               || EXACTFU_REQ8 == (X)                       \
4992                               || EXACTFUP == (X) )                          \
4993                            ? EXACTFU                                        \
4994                            : ( EXACTFAA == (X) )                            \
4995                              ? EXACTFAA                                     \
4996                              : ( EXACTL == (X) )                            \
4997                                ? EXACTL                                     \
4998                                : ( EXACTFLU8 == (X) )                       \
4999                                  ? EXACTFLU8                                \
5000                                  : 0 )
5001 
5002                         /* dont use tail as the end marker for this traverse */
5003                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
5004                             regnode * const noper = NEXTOPER( cur );
5005                             U8 noper_type = OP( noper );
5006                             U8 noper_trietype = TRIE_TYPE( noper_type );
5007 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
5008                             regnode * const noper_next = regnext( noper );
5009                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5010                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
5011 #endif
5012 
5013                             DEBUG_TRIE_COMPILE_r({
5014                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5015                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
5016                                    depth+1,
5017                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5018 
5019                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5020                                 Perl_re_printf( aTHX_  " -> %d:%s",
5021                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5022 
5023                                 if ( noper_next ) {
5024                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5025                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5026                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5027                                 }
5028                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5029                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5030 				   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5031 				);
5032                             });
5033 
5034                             /* Is noper a trieable nodetype that can be merged
5035                              * with the current trie (if there is one)? */
5036                             if ( noper_trietype
5037                                   &&
5038                                   (
5039                                         ( noper_trietype == NOTHING )
5040                                         || ( trietype == NOTHING )
5041                                         || ( trietype == noper_trietype )
5042                                   )
5043 #ifdef NOJUMPTRIE
5044                                   && noper_next >= tail
5045 #endif
5046                                   && count < U16_MAX)
5047                             {
5048                                 /* Handle mergable triable node Either we are
5049                                  * the first node in a new trieable sequence,
5050                                  * in which case we do some bookkeeping,
5051                                  * otherwise we update the end pointer. */
5052                                 if ( !first ) {
5053                                     first = cur;
5054 				    if ( noper_trietype == NOTHING ) {
5055 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5056 					regnode * const noper_next = regnext( noper );
5057                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5058 					U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5059 #endif
5060 
5061                                         if ( noper_next_trietype ) {
5062 					    trietype = noper_next_trietype;
5063                                         } else if (noper_next_type)  {
5064                                             /* a NOTHING regop is 1 regop wide.
5065                                              * We need at least two for a trie
5066                                              * so we can't merge this in */
5067                                             first = NULL;
5068                                         }
5069                                     } else {
5070                                         trietype = noper_trietype;
5071                                     }
5072                                 } else {
5073                                     if ( trietype == NOTHING )
5074                                         trietype = noper_trietype;
5075                                     prev = cur;
5076                                 }
5077 				if (first)
5078 				    count++;
5079                             } /* end handle mergable triable node */
5080                             else {
5081                                 /* handle unmergable node -
5082                                  * noper may either be a triable node which can
5083                                  * not be tried together with the current trie,
5084                                  * or a non triable node */
5085                                 if ( prev ) {
5086                                     /* If last is set and trietype is not
5087                                      * NOTHING then we have found at least two
5088                                      * triable branch sequences in a row of a
5089                                      * similar trietype so we can turn them
5090                                      * into a trie. If/when we allow NOTHING to
5091                                      * start a trie sequence this condition
5092                                      * will be required, and it isn't expensive
5093                                      * so we leave it in for now. */
5094                                     if ( trietype && trietype != NOTHING )
5095                                         make_trie( pRExC_state,
5096                                                 startbranch, first, cur, tail,
5097                                                 count, trietype, depth+1 );
5098                                     prev = NULL; /* note: we clear/update
5099                                                     first, trietype etc below,
5100                                                     so we dont do it here */
5101                                 }
5102                                 if ( noper_trietype
5103 #ifdef NOJUMPTRIE
5104                                      && noper_next >= tail
5105 #endif
5106                                 ){
5107                                     /* noper is triable, so we can start a new
5108                                      * trie sequence */
5109                                     count = 1;
5110                                     first = cur;
5111                                     trietype = noper_trietype;
5112                                 } else if (first) {
5113                                     /* if we already saw a first but the
5114                                      * current node is not triable then we have
5115                                      * to reset the first information. */
5116                                     count = 0;
5117                                     first = NULL;
5118                                     trietype = 0;
5119                                 }
5120                             } /* end handle unmergable node */
5121                         } /* loop over branches */
5122                         DEBUG_TRIE_COMPILE_r({
5123                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5124                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5125                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5126                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5127                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5128                                PL_reg_name[trietype]
5129                             );
5130 
5131                         });
5132                         if ( prev && trietype ) {
5133                             if ( trietype != NOTHING ) {
5134                                 /* the last branch of the sequence was part of
5135                                  * a trie, so we have to construct it here
5136                                  * outside of the loop */
5137                                 made= make_trie( pRExC_state, startbranch,
5138                                                  first, scan, tail, count,
5139                                                  trietype, depth+1 );
5140 #ifdef TRIE_STUDY_OPT
5141                                 if ( ((made == MADE_EXACT_TRIE &&
5142                                      startbranch == first)
5143                                      || ( first_non_open == first )) &&
5144                                      depth==0 ) {
5145                                     flags |= SCF_TRIE_RESTUDY;
5146                                     if ( startbranch == first
5147                                          && scan >= tail )
5148                                     {
5149                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5150                                     }
5151                                 }
5152 #endif
5153                             } else {
5154                                 /* at this point we know whatever we have is a
5155                                  * NOTHING sequence/branch AND if 'startbranch'
5156                                  * is 'first' then we can turn the whole thing
5157                                  * into a NOTHING
5158                                  */
5159                                 if ( startbranch == first ) {
5160                                     regnode *opt;
5161                                     /* the entire thing is a NOTHING sequence,
5162                                      * something like this: (?:|) So we can
5163                                      * turn it into a plain NOTHING op. */
5164                                     DEBUG_TRIE_COMPILE_r({
5165                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5166                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5167                                           depth+1,
5168                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5169 
5170                                     });
5171                                     OP(startbranch)= NOTHING;
5172                                     NEXT_OFF(startbranch)= tail - startbranch;
5173                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5174                                         OP(opt)= OPTIMIZED;
5175                                 }
5176                             }
5177                         } /* end if ( prev) */
5178                     } /* TRIE_MAXBUF is non zero */
5179                 } /* do trie */
5180 
5181 	    }
5182 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5183 		scan = NEXTOPER(NEXTOPER(scan));
5184 	    } else			/* single branch is optimized. */
5185 		scan = NEXTOPER(scan);
5186 	    continue;
5187         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5188             I32 paren = 0;
5189             regnode *start = NULL;
5190             regnode *end = NULL;
5191             U32 my_recursed_depth= recursed_depth;
5192 
5193             if (OP(scan) != SUSPEND) { /* GOSUB */
5194                 /* Do setup, note this code has side effects beyond
5195                  * the rest of this block. Specifically setting
5196                  * RExC_recurse[] must happen at least once during
5197                  * study_chunk(). */
5198                 paren = ARG(scan);
5199                 RExC_recurse[ARG2L(scan)] = scan;
5200                 start = REGNODE_p(RExC_open_parens[paren]);
5201                 end   = REGNODE_p(RExC_close_parens[paren]);
5202 
5203                 /* NOTE we MUST always execute the above code, even
5204                  * if we do nothing with a GOSUB */
5205                 if (
5206                     ( flags & SCF_IN_DEFINE )
5207                     ||
5208                     (
5209                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5210                         &&
5211                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5212                     )
5213                 ) {
5214                     /* no need to do anything here if we are in a define. */
5215                     /* or we are after some kind of infinite construct
5216                      * so we can skip recursing into this item.
5217                      * Since it is infinite we will not change the maxlen
5218                      * or delta, and if we miss something that might raise
5219                      * the minlen it will merely pessimise a little.
5220                      *
5221                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5222                      * might result in a minlen of 1 and not of 4,
5223                      * but this doesn't make us mismatch, just try a bit
5224                      * harder than we should.
5225                      *
5226                      * However we must assume this GOSUB is infinite, to
5227                      * avoid wrongly applying other optimizations in the
5228                      * enclosing scope - see GH 18096, for example.
5229                      */
5230                     is_inf = is_inf_internal = 1;
5231                     scan= regnext(scan);
5232                     continue;
5233                 }
5234 
5235                 if (
5236                     !recursed_depth
5237                     || !PAREN_TEST(recursed_depth - 1, paren)
5238                 ) {
5239                     /* it is quite possible that there are more efficient ways
5240                      * to do this. We maintain a bitmap per level of recursion
5241                      * of which patterns we have entered so we can detect if a
5242                      * pattern creates a possible infinite loop. When we
5243                      * recurse down a level we copy the previous levels bitmap
5244                      * down. When we are at recursion level 0 we zero the top
5245                      * level bitmap. It would be nice to implement a different
5246                      * more efficient way of doing this. In particular the top
5247                      * level bitmap may be unnecessary.
5248                      */
5249                     if (!recursed_depth) {
5250                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5251                     } else {
5252                         Copy(PAREN_OFFSET(recursed_depth - 1),
5253                              PAREN_OFFSET(recursed_depth),
5254                              RExC_study_chunk_recursed_bytes, U8);
5255                     }
5256                     /* we havent recursed into this paren yet, so recurse into it */
5257                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5258                     PAREN_SET(recursed_depth, paren);
5259                     my_recursed_depth= recursed_depth + 1;
5260                 } else {
5261                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5262                     /* some form of infinite recursion, assume infinite length
5263                      * */
5264                     if (flags & SCF_DO_SUBSTR) {
5265                         scan_commit(pRExC_state, data, minlenp, is_inf);
5266                         data->cur_is_floating = 1;
5267                     }
5268                     is_inf = is_inf_internal = 1;
5269                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5270                         ssc_anything(data->start_class);
5271                     flags &= ~SCF_DO_STCLASS;
5272 
5273                     start= NULL; /* reset start so we dont recurse later on. */
5274 	        }
5275             } else {
5276 	        paren = stopparen;
5277                 start = scan + 2;
5278 	        end = regnext(scan);
5279 	    }
5280             if (start) {
5281                 scan_frame *newframe;
5282                 assert(end);
5283                 if (!RExC_frame_last) {
5284                     Newxz(newframe, 1, scan_frame);
5285                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5286                     RExC_frame_head= newframe;
5287                     RExC_frame_count++;
5288                 } else if (!RExC_frame_last->next_frame) {
5289                     Newxz(newframe, 1, scan_frame);
5290                     RExC_frame_last->next_frame= newframe;
5291                     newframe->prev_frame= RExC_frame_last;
5292                     RExC_frame_count++;
5293                 } else {
5294                     newframe= RExC_frame_last->next_frame;
5295                 }
5296                 RExC_frame_last= newframe;
5297 
5298                 newframe->next_regnode = regnext(scan);
5299                 newframe->last_regnode = last;
5300                 newframe->stopparen = stopparen;
5301                 newframe->prev_recursed_depth = recursed_depth;
5302                 newframe->this_prev_frame= frame;
5303                 newframe->in_gosub = (
5304                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5305                 );
5306 
5307                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5308                 DEBUG_PEEP("fnew", scan, depth, flags);
5309 
5310 	        frame = newframe;
5311 	        scan =  start;
5312 	        stopparen = paren;
5313 	        last = end;
5314                 depth = depth + 1;
5315                 recursed_depth= my_recursed_depth;
5316 
5317 	        continue;
5318 	    }
5319 	}
5320 	else if (   OP(scan) == EXACT
5321                  || OP(scan) == LEXACT
5322                  || OP(scan) == EXACT_REQ8
5323                  || OP(scan) == LEXACT_REQ8
5324                  || OP(scan) == EXACTL)
5325         {
5326 	    SSize_t bytelen = STR_LEN(scan), charlen;
5327 	    UV uc;
5328             assert(bytelen);
5329 	    if (UTF) {
5330 		const U8 * const s = (U8*)STRING(scan);
5331 		uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5332 		charlen = utf8_length(s, s + bytelen);
5333 	    } else {
5334 		uc = *((U8*)STRING(scan));
5335                 charlen = bytelen;
5336 	    }
5337 	    min += charlen;
5338 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5339 		/* The code below prefers earlier match for fixed
5340 		   offset, later match for variable offset.  */
5341 		if (data->last_end == -1) { /* Update the start info. */
5342 		    data->last_start_min = data->pos_min;
5343                     data->last_start_max =
5344                         is_inf ? OPTIMIZE_INFTY
5345                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5346                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5347 		}
5348 		sv_catpvn(data->last_found, STRING(scan), bytelen);
5349 		if (UTF)
5350 		    SvUTF8_on(data->last_found);
5351 		{
5352 		    SV * const sv = data->last_found;
5353 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5354 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
5355 		    if (mg && mg->mg_len >= 0)
5356 			mg->mg_len += charlen;
5357 		}
5358 		data->last_end = data->pos_min + charlen;
5359 		data->pos_min += charlen; /* As in the first entry. */
5360 		data->flags &= ~SF_BEFORE_EOL;
5361 	    }
5362 
5363             /* ANDing the code point leaves at most it, and not in locale, and
5364              * can't match null string */
5365 	    if (flags & SCF_DO_STCLASS_AND) {
5366                 ssc_cp_and(data->start_class, uc);
5367                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5368                 ssc_clear_locale(data->start_class);
5369 	    }
5370 	    else if (flags & SCF_DO_STCLASS_OR) {
5371                 ssc_add_cp(data->start_class, uc);
5372 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5373 
5374                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5375                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5376 	    }
5377 	    flags &= ~SCF_DO_STCLASS;
5378 	}
5379         else if (PL_regkind[OP(scan)] == EXACT) {
5380             /* But OP != EXACT!, so is EXACTFish */
5381 	    SSize_t bytelen = STR_LEN(scan), charlen;
5382             const U8 * s = (U8*)STRING(scan);
5383 
5384             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5385              * with the mask set to the complement of the bit that differs
5386              * between upper and lower case, and the lowest code point of the
5387              * pair (which the '&' forces) */
5388             if (     bytelen == 1
5389                 &&   isALPHA_A(*s)
5390                 &&  (         OP(scan) == EXACTFAA
5391                      || (     OP(scan) == EXACTFU
5392                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5393                 &&   mutate_ok
5394             ) {
5395                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5396 
5397                 OP(scan) = ANYOFM;
5398                 ARG_SET(scan, *s & mask);
5399                 FLAGS(scan) = mask;
5400                 /* we're not EXACTFish any more, so restudy */
5401                 continue;
5402             }
5403 
5404 	    /* Search for fixed substrings supports EXACT only. */
5405 	    if (flags & SCF_DO_SUBSTR) {
5406 		assert(data);
5407                 scan_commit(pRExC_state, data, minlenp, is_inf);
5408 	    }
5409             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5410 	    if (unfolded_multi_char) {
5411                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5412 	    }
5413 	    min += charlen - min_subtract;
5414             assert (min >= 0);
5415             delta += min_subtract;
5416 	    if (flags & SCF_DO_SUBSTR) {
5417 		data->pos_min += charlen - min_subtract;
5418 		if (data->pos_min < 0) {
5419                     data->pos_min = 0;
5420                 }
5421                 data->pos_delta += min_subtract;
5422 		if (min_subtract) {
5423 		    data->cur_is_floating = 1; /* float */
5424 		}
5425 	    }
5426 
5427             if (flags & SCF_DO_STCLASS) {
5428                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5429 
5430                 assert(EXACTF_invlist);
5431                 if (flags & SCF_DO_STCLASS_AND) {
5432                     if (OP(scan) != EXACTFL)
5433                         ssc_clear_locale(data->start_class);
5434                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5435                     ANYOF_POSIXL_ZERO(data->start_class);
5436                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5437                 }
5438                 else {  /* SCF_DO_STCLASS_OR */
5439                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5440                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5441 
5442                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5443                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5444                 }
5445                 flags &= ~SCF_DO_STCLASS;
5446                 SvREFCNT_dec(EXACTF_invlist);
5447             }
5448 	}
5449 	else if (REGNODE_VARIES(OP(scan))) {
5450 	    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5451 	    I32 fl = 0, f = flags;
5452 	    regnode * const oscan = scan;
5453 	    regnode_ssc this_class;
5454 	    regnode_ssc *oclass = NULL;
5455 	    I32 next_is_eval = 0;
5456 
5457 	    switch (PL_regkind[OP(scan)]) {
5458 	    case WHILEM:		/* End of (?:...)* . */
5459 		scan = NEXTOPER(scan);
5460 		goto finish;
5461 	    case PLUS:
5462 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5463 		    next = NEXTOPER(scan);
5464 		    if (   OP(next) == EXACT
5465                         || OP(next) == LEXACT
5466                         || OP(next) == EXACT_REQ8
5467                         || OP(next) == LEXACT_REQ8
5468                         || OP(next) == EXACTL
5469                         || (flags & SCF_DO_STCLASS))
5470                     {
5471 			mincount = 1;
5472 			maxcount = REG_INFTY;
5473 			next = regnext(scan);
5474 			scan = NEXTOPER(scan);
5475 			goto do_curly;
5476 		    }
5477 		}
5478 		if (flags & SCF_DO_SUBSTR)
5479 		    data->pos_min++;
5480 		min++;
5481 		/* FALLTHROUGH */
5482 	    case STAR:
5483                 next = NEXTOPER(scan);
5484 
5485                 /* This temporary node can now be turned into EXACTFU, and
5486                  * must, as regexec.c doesn't handle it */
5487                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5488                     OP(next) = EXACTFU;
5489                 }
5490 
5491                 if (     STR_LEN(next) == 1
5492                     &&   isALPHA_A(* STRING(next))
5493                     && (         OP(next) == EXACTFAA
5494                         || (     OP(next) == EXACTFU
5495                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5496                     &&   mutate_ok
5497                 ) {
5498                     /* These differ in just one bit */
5499                     U8 mask = ~ ('A' ^ 'a');
5500 
5501                     assert(isALPHA_A(* STRING(next)));
5502 
5503                     /* Then replace it by an ANYOFM node, with
5504                     * the mask set to the complement of the
5505                     * bit that differs between upper and lower
5506                     * case, and the lowest code point of the
5507                     * pair (which the '&' forces) */
5508                     OP(next) = ANYOFM;
5509                     ARG_SET(next, *STRING(next) & mask);
5510                     FLAGS(next) = mask;
5511                 }
5512 
5513 		if (flags & SCF_DO_STCLASS) {
5514 		    mincount = 0;
5515 		    maxcount = REG_INFTY;
5516 		    next = regnext(scan);
5517 		    scan = NEXTOPER(scan);
5518 		    goto do_curly;
5519 		}
5520 		if (flags & SCF_DO_SUBSTR) {
5521                     scan_commit(pRExC_state, data, minlenp, is_inf);
5522                     /* Cannot extend fixed substrings */
5523 		    data->cur_is_floating = 1; /* float */
5524 		}
5525                 is_inf = is_inf_internal = 1;
5526                 scan = regnext(scan);
5527 		goto optimize_curly_tail;
5528 	    case CURLY:
5529 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5530 	            && (scan->flags == stopparen))
5531 		{
5532 		    mincount = 1;
5533 		    maxcount = 1;
5534 		} else {
5535 		    mincount = ARG1(scan);
5536 		    maxcount = ARG2(scan);
5537 		}
5538 		next = regnext(scan);
5539 		if (OP(scan) == CURLYX) {
5540 		    I32 lp = (data ? *(data->last_closep) : 0);
5541 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5542 		}
5543 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5544 		next_is_eval = (OP(scan) == EVAL);
5545 	      do_curly:
5546 		if (flags & SCF_DO_SUBSTR) {
5547                     if (mincount == 0)
5548                         scan_commit(pRExC_state, data, minlenp, is_inf);
5549                     /* Cannot extend fixed substrings */
5550 		    pos_before = data->pos_min;
5551 		}
5552 		if (data) {
5553 		    fl = data->flags;
5554 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5555 		    if (is_inf)
5556 			data->flags |= SF_IS_INF;
5557 		}
5558 		if (flags & SCF_DO_STCLASS) {
5559 		    ssc_init(pRExC_state, &this_class);
5560 		    oclass = data->start_class;
5561 		    data->start_class = &this_class;
5562 		    f |= SCF_DO_STCLASS_AND;
5563 		    f &= ~SCF_DO_STCLASS_OR;
5564 		}
5565 	        /* Exclude from super-linear cache processing any {n,m}
5566 		   regops for which the combination of input pos and regex
5567 		   pos is not enough information to determine if a match
5568 		   will be possible.
5569 
5570 		   For example, in the regex /foo(bar\s*){4,8}baz/ with the
5571 		   regex pos at the \s*, the prospects for a match depend not
5572 		   only on the input position but also on how many (bar\s*)
5573 		   repeats into the {4,8} we are. */
5574                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5575 		    f &= ~SCF_WHILEM_VISITED_POS;
5576 
5577 		/* This will finish on WHILEM, setting scan, or on NULL: */
5578                 /* recurse study_chunk() on loop bodies */
5579 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5580                                   last, data, stopparen, recursed_depth, NULL,
5581                                   (mincount == 0
5582                                    ? (f & ~SCF_DO_SUBSTR)
5583                                    : f)
5584                                   , depth+1, mutate_ok);
5585 
5586 		if (flags & SCF_DO_STCLASS)
5587 		    data->start_class = oclass;
5588 		if (mincount == 0 || minnext == 0) {
5589 		    if (flags & SCF_DO_STCLASS_OR) {
5590 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5591 		    }
5592 		    else if (flags & SCF_DO_STCLASS_AND) {
5593 			/* Switch to OR mode: cache the old value of
5594 			 * data->start_class */
5595 			INIT_AND_WITHP;
5596 			StructCopy(data->start_class, and_withp, regnode_ssc);
5597 			flags &= ~SCF_DO_STCLASS_AND;
5598 			StructCopy(&this_class, data->start_class, regnode_ssc);
5599 			flags |= SCF_DO_STCLASS_OR;
5600                         ANYOF_FLAGS(data->start_class)
5601                                                 |= SSC_MATCHES_EMPTY_STRING;
5602 		    }
5603 		} else {		/* Non-zero len */
5604 		    if (flags & SCF_DO_STCLASS_OR) {
5605 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5606 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5607 		    }
5608 		    else if (flags & SCF_DO_STCLASS_AND)
5609 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5610 		    flags &= ~SCF_DO_STCLASS;
5611 		}
5612 		if (!scan) 		/* It was not CURLYX, but CURLY. */
5613 		    scan = next;
5614 		if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5615 		    /* ? quantifier ok, except for (?{ ... }) */
5616 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
5617 		    && (minnext == 0) && (deltanext == 0)
5618 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5619                     && maxcount <= REG_INFTY/3) /* Complement check for big
5620                                                    count */
5621 		{
5622 		    _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5623                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5624                             "Quantifier unexpected on zero-length expression "
5625                             "in regex m/%" UTF8f "/",
5626 			     UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5627 				  RExC_precomp)));
5628                 }
5629 
5630                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5631                     || min >= SSize_t_MAX - minnext * mincount )
5632                 {
5633                     FAIL("Regexp out of space");
5634                 }
5635 
5636 		min += minnext * mincount;
5637 		is_inf_internal |= deltanext == OPTIMIZE_INFTY
5638                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5639 		is_inf |= is_inf_internal;
5640                 if (is_inf) {
5641 		    delta = OPTIMIZE_INFTY;
5642                 } else {
5643 		    delta += (minnext + deltanext) * maxcount
5644                              - minnext * mincount;
5645                 }
5646 		/* Try powerful optimization CURLYX => CURLYN. */
5647 		if (  OP(oscan) == CURLYX && data
5648 		      && data->flags & SF_IN_PAR
5649 		      && !(data->flags & SF_HAS_EVAL)
5650 		      && !deltanext && minnext == 1
5651                       && mutate_ok
5652                 ) {
5653 		    /* Try to optimize to CURLYN.  */
5654 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5655 		    regnode * const nxt1 = nxt;
5656 #ifdef DEBUGGING
5657 		    regnode *nxt2;
5658 #endif
5659 
5660 		    /* Skip open. */
5661 		    nxt = regnext(nxt);
5662 		    if (!REGNODE_SIMPLE(OP(nxt))
5663 			&& !(PL_regkind[OP(nxt)] == EXACT
5664 			     && STR_LEN(nxt) == 1))
5665 			goto nogo;
5666 #ifdef DEBUGGING
5667 		    nxt2 = nxt;
5668 #endif
5669 		    nxt = regnext(nxt);
5670 		    if (OP(nxt) != CLOSE)
5671 			goto nogo;
5672 		    if (RExC_open_parens) {
5673 
5674                         /*open->CURLYM*/
5675                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5676 
5677                         /*close->while*/
5678                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5679 		    }
5680 		    /* Now we know that nxt2 is the only contents: */
5681 		    oscan->flags = (U8)ARG(nxt);
5682 		    OP(oscan) = CURLYN;
5683 		    OP(nxt1) = NOTHING;	/* was OPEN. */
5684 
5685 #ifdef DEBUGGING
5686 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5687 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5688 		    NEXT_OFF(nxt2) = 0;	/* just for consistency with CURLY. */
5689 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5690 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
5691 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5692 #endif
5693 		}
5694 	      nogo:
5695 
5696 		/* Try optimization CURLYX => CURLYM. */
5697 		if (  OP(oscan) == CURLYX && data
5698 		      && !(data->flags & SF_HAS_PAR)
5699 		      && !(data->flags & SF_HAS_EVAL)
5700 		      && !deltanext	/* atom is fixed width */
5701 		      && minnext != 0	/* CURLYM can't handle zero width */
5702                          /* Nor characters whose fold at run-time may be
5703                           * multi-character */
5704                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5705                       && mutate_ok
5706 		) {
5707 		    /* XXXX How to optimize if data == 0? */
5708 		    /* Optimize to a simpler form.  */
5709 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5710 		    regnode *nxt2;
5711 
5712 		    OP(oscan) = CURLYM;
5713 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5714 			    && (OP(nxt2) != WHILEM))
5715 			nxt = nxt2;
5716 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5717 		    /* Need to optimize away parenths. */
5718 		    if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5719 			/* Set the parenth number.  */
5720 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5721 
5722 			oscan->flags = (U8)ARG(nxt);
5723 			if (RExC_open_parens) {
5724                              /*open->CURLYM*/
5725                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5726 
5727                             /*close->NOTHING*/
5728                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5729                                                          + 1;
5730 			}
5731 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
5732 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5733 
5734 #ifdef DEBUGGING
5735 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5736 			OP(nxt + 1) = OPTIMIZED; /* was count. */
5737 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5738 			NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5739 #endif
5740 #if 0
5741 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
5742 			    regnode *nnxt = regnext(nxt1);
5743 			    if (nnxt == nxt) {
5744 				if (reg_off_by_arg[OP(nxt1)])
5745 				    ARG_SET(nxt1, nxt2 - nxt1);
5746 				else if (nxt2 - nxt1 < U16_MAX)
5747 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
5748 				else
5749 				    OP(nxt) = NOTHING;	/* Cannot beautify */
5750 			    }
5751 			    nxt1 = nnxt;
5752 			}
5753 #endif
5754 			/* Optimize again: */
5755                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5756 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5757                                     NULL, stopparen, recursed_depth, NULL, 0,
5758                                     depth+1, mutate_ok);
5759 		    }
5760 		    else
5761 			oscan->flags = 0;
5762 		}
5763 		else if ((OP(oscan) == CURLYX)
5764 			 && (flags & SCF_WHILEM_VISITED_POS)
5765 			 /* See the comment on a similar expression above.
5766 			    However, this time it's not a subexpression
5767 			    we care about, but the expression itself. */
5768 			 && (maxcount == REG_INFTY)
5769 			 && data) {
5770 		    /* This stays as CURLYX, we can put the count/of pair. */
5771 		    /* Find WHILEM (as in regexec.c) */
5772 		    regnode *nxt = oscan + NEXT_OFF(oscan);
5773 
5774 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5775 			nxt += ARG(nxt);
5776                     nxt = PREVOPER(nxt);
5777                     if (nxt->flags & 0xf) {
5778                         /* we've already set whilem count on this node */
5779                     } else if (++data->whilem_c < 16) {
5780                         assert(data->whilem_c <= RExC_whilem_seen);
5781                         nxt->flags = (U8)(data->whilem_c
5782                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5783                     }
5784 		}
5785 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5786 		    pars++;
5787 		if (flags & SCF_DO_SUBSTR) {
5788 		    SV *last_str = NULL;
5789                     STRLEN last_chrs = 0;
5790 		    int counted = mincount != 0;
5791 
5792                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5793                                                                   string. */
5794 			SSize_t b = pos_before >= data->last_start_min
5795 			    ? pos_before : data->last_start_min;
5796 			STRLEN l;
5797 			const char * const s = SvPV_const(data->last_found, l);
5798 			SSize_t old = b - data->last_start_min;
5799                         assert(old >= 0);
5800 
5801 			if (UTF)
5802 			    old = utf8_hop_forward((U8*)s, old,
5803                                                (U8 *) SvEND(data->last_found))
5804                                 - (U8*)s;
5805 			l -= old;
5806 			/* Get the added string: */
5807 			last_str = newSVpvn_utf8(s  + old, l, UTF);
5808                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5809                                             (U8*)(s + old + l)) : l;
5810 			if (deltanext == 0 && pos_before == b) {
5811 			    /* What was added is a constant string */
5812 			    if (mincount > 1) {
5813 
5814 				SvGROW(last_str, (mincount * l) + 1);
5815 				repeatcpy(SvPVX(last_str) + l,
5816 					  SvPVX_const(last_str), l,
5817                                           mincount - 1);
5818 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
5819 				/* Add additional parts. */
5820 				SvCUR_set(data->last_found,
5821 					  SvCUR(data->last_found) - l);
5822 				sv_catsv(data->last_found, last_str);
5823 				{
5824 				    SV * sv = data->last_found;
5825 				    MAGIC *mg =
5826 					SvUTF8(sv) && SvMAGICAL(sv) ?
5827 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
5828 				    if (mg && mg->mg_len >= 0)
5829 					mg->mg_len += last_chrs * (mincount-1);
5830 				}
5831                                 last_chrs *= mincount;
5832 				data->last_end += l * (mincount - 1);
5833 			    }
5834 			} else {
5835 			    /* start offset must point into the last copy */
5836 			    data->last_start_min += minnext * (mincount - 1);
5837 			    data->last_start_max =
5838                               is_inf
5839                                ? OPTIMIZE_INFTY
5840 			       : data->last_start_max +
5841                                  (maxcount - 1) * (minnext + data->pos_delta);
5842 			}
5843 		    }
5844 		    /* It is counted once already... */
5845 		    data->pos_min += minnext * (mincount - counted);
5846 #if 0
5847 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5848                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5849                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5850     (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5851     (UV)mincount);
5852 if (deltanext != OPTIMIZE_INFTY)
5853 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5854     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5855           - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5856 #endif
5857 		    if (deltanext == OPTIMIZE_INFTY
5858                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5859 		        data->pos_delta = OPTIMIZE_INFTY;
5860 		    else
5861 		        data->pos_delta += - counted * deltanext +
5862 			(minnext + deltanext) * maxcount - minnext * mincount;
5863 		    if (mincount != maxcount) {
5864 			 /* Cannot extend fixed substrings found inside
5865 			    the group.  */
5866                         scan_commit(pRExC_state, data, minlenp, is_inf);
5867 			if (mincount && last_str) {
5868 			    SV * const sv = data->last_found;
5869 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5870 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
5871 
5872 			    if (mg)
5873 				mg->mg_len = -1;
5874 			    sv_setsv(sv, last_str);
5875 			    data->last_end = data->pos_min;
5876 			    data->last_start_min = data->pos_min - last_chrs;
5877 			    data->last_start_max = is_inf
5878 				? OPTIMIZE_INFTY
5879 				: data->pos_min + data->pos_delta - last_chrs;
5880 			}
5881 			data->cur_is_floating = 1; /* float */
5882 		    }
5883 		    SvREFCNT_dec(last_str);
5884 		}
5885 		if (data && (fl & SF_HAS_EVAL))
5886 		    data->flags |= SF_HAS_EVAL;
5887 	      optimize_curly_tail:
5888 		rck_elide_nothing(oscan);
5889 		continue;
5890 
5891 	    default:
5892                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5893                                                                     OP(scan));
5894             case REF:
5895             case CLUMP:
5896 		if (flags & SCF_DO_SUBSTR) {
5897                     /* Cannot expect anything... */
5898                     scan_commit(pRExC_state, data, minlenp, is_inf);
5899 		    data->cur_is_floating = 1; /* float */
5900 		}
5901 		is_inf = is_inf_internal = 1;
5902 		if (flags & SCF_DO_STCLASS_OR) {
5903                     if (OP(scan) == CLUMP) {
5904                         /* Actually is any start char, but very few code points
5905                          * aren't start characters */
5906                         ssc_match_all_cp(data->start_class);
5907                     }
5908                     else {
5909                         ssc_anything(data->start_class);
5910                     }
5911                 }
5912 		flags &= ~SCF_DO_STCLASS;
5913 		break;
5914 	    }
5915 	}
5916 	else if (OP(scan) == LNBREAK) {
5917 	    if (flags & SCF_DO_STCLASS) {
5918     	        if (flags & SCF_DO_STCLASS_AND) {
5919                     ssc_intersection(data->start_class,
5920                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5921                     ssc_clear_locale(data->start_class);
5922                     ANYOF_FLAGS(data->start_class)
5923                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5924                 }
5925                 else if (flags & SCF_DO_STCLASS_OR) {
5926                     ssc_union(data->start_class,
5927                               PL_XPosix_ptrs[_CC_VERTSPACE],
5928                               FALSE);
5929 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5930 
5931                     /* See commit msg for
5932                      * 749e076fceedeb708a624933726e7989f2302f6a */
5933                     ANYOF_FLAGS(data->start_class)
5934                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5935                 }
5936 		flags &= ~SCF_DO_STCLASS;
5937             }
5938 	    min++;
5939             if (delta != OPTIMIZE_INFTY)
5940                 delta++;    /* Because of the 2 char string cr-lf */
5941             if (flags & SCF_DO_SUBSTR) {
5942                 /* Cannot expect anything... */
5943                 scan_commit(pRExC_state, data, minlenp, is_inf);
5944     	        data->pos_min += 1;
5945                 if (data->pos_delta != OPTIMIZE_INFTY) {
5946                     data->pos_delta += 1;
5947                 }
5948 		data->cur_is_floating = 1; /* float */
5949     	    }
5950 	}
5951 	else if (REGNODE_SIMPLE(OP(scan))) {
5952 
5953 	    if (flags & SCF_DO_SUBSTR) {
5954                 scan_commit(pRExC_state, data, minlenp, is_inf);
5955 		data->pos_min++;
5956 	    }
5957 	    min++;
5958 	    if (flags & SCF_DO_STCLASS) {
5959                 bool invert = 0;
5960                 SV* my_invlist = NULL;
5961                 U8 namedclass;
5962 
5963                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5964                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5965 
5966 		/* Some of the logic below assumes that switching
5967 		   locale on will only add false positives. */
5968 		switch (OP(scan)) {
5969 
5970 		default:
5971 #ifdef DEBUGGING
5972                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5973                                                                      OP(scan));
5974 #endif
5975 		case SANY:
5976 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5977 			ssc_match_all_cp(data->start_class);
5978 		    break;
5979 
5980 		case REG_ANY:
5981                     {
5982                         SV* REG_ANY_invlist = _new_invlist(2);
5983                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5984                                                             '\n');
5985                         if (flags & SCF_DO_STCLASS_OR) {
5986                             ssc_union(data->start_class,
5987                                       REG_ANY_invlist,
5988                                       TRUE /* TRUE => invert, hence all but \n
5989                                             */
5990                                       );
5991                         }
5992                         else if (flags & SCF_DO_STCLASS_AND) {
5993                             ssc_intersection(data->start_class,
5994                                              REG_ANY_invlist,
5995                                              TRUE  /* TRUE => invert */
5996                                              );
5997                             ssc_clear_locale(data->start_class);
5998                         }
5999                         SvREFCNT_dec_NN(REG_ANY_invlist);
6000 		    }
6001 		    break;
6002 
6003                 case ANYOFD:
6004                 case ANYOFL:
6005                 case ANYOFPOSIXL:
6006                 case ANYOFH:
6007                 case ANYOFHb:
6008                 case ANYOFHr:
6009                 case ANYOFHs:
6010                 case ANYOF:
6011 		    if (flags & SCF_DO_STCLASS_AND)
6012 			ssc_and(pRExC_state, data->start_class,
6013                                 (regnode_charclass *) scan);
6014 		    else
6015 			ssc_or(pRExC_state, data->start_class,
6016                                                           (regnode_charclass *) scan);
6017 		    break;
6018 
6019                 case NANYOFM: /* NANYOFM already contains the inversion of the
6020                                  input ANYOF data, so, unlike things like
6021                                  NPOSIXA, don't change 'invert' to TRUE */
6022                     /* FALLTHROUGH */
6023                 case ANYOFM:
6024                   {
6025                     SV* cp_list = get_ANYOFM_contents(scan);
6026 
6027                     if (flags & SCF_DO_STCLASS_OR) {
6028                         ssc_union(data->start_class, cp_list, invert);
6029                     }
6030                     else if (flags & SCF_DO_STCLASS_AND) {
6031                         ssc_intersection(data->start_class, cp_list, invert);
6032                     }
6033 
6034                     SvREFCNT_dec_NN(cp_list);
6035                     break;
6036                   }
6037 
6038                 case ANYOFR:
6039                 case ANYOFRb:
6040                   {
6041                     SV* cp_list = NULL;
6042 
6043                     cp_list = _add_range_to_invlist(cp_list,
6044                                         ANYOFRbase(scan),
6045                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
6046 
6047                     if (flags & SCF_DO_STCLASS_OR) {
6048                         ssc_union(data->start_class, cp_list, invert);
6049                     }
6050                     else if (flags & SCF_DO_STCLASS_AND) {
6051                         ssc_intersection(data->start_class, cp_list, invert);
6052                     }
6053 
6054                     SvREFCNT_dec_NN(cp_list);
6055                     break;
6056                   }
6057 
6058 		case NPOSIXL:
6059                     invert = 1;
6060                     /* FALLTHROUGH */
6061 
6062 		case POSIXL:
6063                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6064                     if (flags & SCF_DO_STCLASS_AND) {
6065                         bool was_there = cBOOL(
6066                                           ANYOF_POSIXL_TEST(data->start_class,
6067                                                                  namedclass));
6068                         ANYOF_POSIXL_ZERO(data->start_class);
6069                         if (was_there) {    /* Do an AND */
6070                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6071                         }
6072                         /* No individual code points can now match */
6073                         data->start_class->invlist
6074                                                 = sv_2mortal(_new_invlist(0));
6075                     }
6076                     else {
6077                         int complement = namedclass + ((invert) ? -1 : 1);
6078 
6079                         assert(flags & SCF_DO_STCLASS_OR);
6080 
6081                         /* If the complement of this class was already there,
6082                          * the result is that they match all code points,
6083                          * (\d + \D == everything).  Remove the classes from
6084                          * future consideration.  Locale is not relevant in
6085                          * this case */
6086                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6087                             ssc_match_all_cp(data->start_class);
6088                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6089                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
6090                         }
6091                         else {  /* The usual case; just add this class to the
6092                                    existing set */
6093                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6094                         }
6095                     }
6096                     break;
6097 
6098                 case NPOSIXA:   /* For these, we always know the exact set of
6099                                    what's matched */
6100                     invert = 1;
6101                     /* FALLTHROUGH */
6102 		case POSIXA:
6103                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6104                     goto join_posix_and_ascii;
6105 
6106 		case NPOSIXD:
6107 		case NPOSIXU:
6108                     invert = 1;
6109                     /* FALLTHROUGH */
6110 		case POSIXD:
6111 		case POSIXU:
6112                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6113 
6114                     /* NPOSIXD matches all upper Latin1 code points unless the
6115                      * target string being matched is UTF-8, which is
6116                      * unknowable until match time.  Since we are going to
6117                      * invert, we want to get rid of all of them so that the
6118                      * inversion will match all */
6119                     if (OP(scan) == NPOSIXD) {
6120                         _invlist_subtract(my_invlist, PL_UpperLatin1,
6121                                           &my_invlist);
6122                     }
6123 
6124                   join_posix_and_ascii:
6125 
6126                     if (flags & SCF_DO_STCLASS_AND) {
6127                         ssc_intersection(data->start_class, my_invlist, invert);
6128                         ssc_clear_locale(data->start_class);
6129                     }
6130                     else {
6131                         assert(flags & SCF_DO_STCLASS_OR);
6132                         ssc_union(data->start_class, my_invlist, invert);
6133                     }
6134                     SvREFCNT_dec(my_invlist);
6135 		}
6136 		if (flags & SCF_DO_STCLASS_OR)
6137 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6138 		flags &= ~SCF_DO_STCLASS;
6139 	    }
6140 	}
6141 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6142 	    data->flags |= (OP(scan) == MEOL
6143 			    ? SF_BEFORE_MEOL
6144 			    : SF_BEFORE_SEOL);
6145             scan_commit(pRExC_state, data, minlenp, is_inf);
6146 
6147 	}
6148 	else if (  PL_regkind[OP(scan)] == BRANCHJ
6149 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
6150 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
6151 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6152         {
6153             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6154                 || OP(scan) == UNLESSM )
6155             {
6156                 /* Negative Lookahead/lookbehind
6157                    In this case we can't do fixed string optimisation.
6158                 */
6159 
6160                 SSize_t deltanext, minnext, fake = 0;
6161                 regnode *nscan;
6162                 regnode_ssc intrnl;
6163                 int f = 0;
6164 
6165                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6166                 if (data) {
6167                     data_fake.whilem_c = data->whilem_c;
6168                     data_fake.last_closep = data->last_closep;
6169 		}
6170                 else
6171                     data_fake.last_closep = &fake;
6172 		data_fake.pos_delta = delta;
6173                 if ( flags & SCF_DO_STCLASS && !scan->flags
6174                      && OP(scan) == IFMATCH ) { /* Lookahead */
6175                     ssc_init(pRExC_state, &intrnl);
6176                     data_fake.start_class = &intrnl;
6177                     f |= SCF_DO_STCLASS_AND;
6178 		}
6179                 if (flags & SCF_WHILEM_VISITED_POS)
6180                     f |= SCF_WHILEM_VISITED_POS;
6181                 next = regnext(scan);
6182                 nscan = NEXTOPER(NEXTOPER(scan));
6183 
6184                 /* recurse study_chunk() for lookahead body */
6185                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6186                                       last, &data_fake, stopparen,
6187                                       recursed_depth, NULL, f, depth+1,
6188                                       mutate_ok);
6189                 if (scan->flags) {
6190                     if (   deltanext < 0
6191                         || deltanext > (I32) U8_MAX
6192                         || minnext > (I32)U8_MAX
6193                         || minnext + deltanext > (I32)U8_MAX)
6194                     {
6195 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6196                               (UV)U8_MAX);
6197                     }
6198 
6199                     /* The 'next_off' field has been repurposed to count the
6200                      * additional starting positions to try beyond the initial
6201                      * one.  (This leaves it at 0 for non-variable length
6202                      * matches to avoid breakage for those not using this
6203                      * extension) */
6204                     if (deltanext) {
6205                         scan->next_off = deltanext;
6206                         ckWARNexperimental(RExC_parse,
6207                             WARN_EXPERIMENTAL__VLB,
6208                             "Variable length lookbehind is experimental");
6209                     }
6210                     scan->flags = (U8)minnext + deltanext;
6211                 }
6212                 if (data) {
6213                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6214                         pars++;
6215                     if (data_fake.flags & SF_HAS_EVAL)
6216                         data->flags |= SF_HAS_EVAL;
6217                     data->whilem_c = data_fake.whilem_c;
6218                 }
6219                 if (f & SCF_DO_STCLASS_AND) {
6220 		    if (flags & SCF_DO_STCLASS_OR) {
6221 			/* OR before, AND after: ideally we would recurse with
6222 			 * data_fake to get the AND applied by study of the
6223 			 * remainder of the pattern, and then derecurse;
6224 			 * *** HACK *** for now just treat as "no information".
6225 			 * See [perl #56690].
6226 			 */
6227 			ssc_init(pRExC_state, data->start_class);
6228 		    }  else {
6229                         /* AND before and after: combine and continue.  These
6230                          * assertions are zero-length, so can match an EMPTY
6231                          * string */
6232 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6233                         ANYOF_FLAGS(data->start_class)
6234                                                    |= SSC_MATCHES_EMPTY_STRING;
6235 		    }
6236                 }
6237 	    }
6238 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6239             else {
6240                 /* Positive Lookahead/lookbehind
6241                    In this case we can do fixed string optimisation,
6242                    but we must be careful about it. Note in the case of
6243                    lookbehind the positions will be offset by the minimum
6244                    length of the pattern, something we won't know about
6245                    until after the recurse.
6246                 */
6247                 SSize_t deltanext, fake = 0;
6248                 regnode *nscan;
6249                 regnode_ssc intrnl;
6250                 int f = 0;
6251                 /* We use SAVEFREEPV so that when the full compile
6252                     is finished perl will clean up the allocated
6253                     minlens when it's all done. This way we don't
6254                     have to worry about freeing them when we know
6255                     they wont be used, which would be a pain.
6256                  */
6257                 SSize_t *minnextp;
6258                 Newx( minnextp, 1, SSize_t );
6259                 SAVEFREEPV(minnextp);
6260 
6261                 if (data) {
6262                     StructCopy(data, &data_fake, scan_data_t);
6263                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6264                         f |= SCF_DO_SUBSTR;
6265                         if (scan->flags)
6266                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6267                         data_fake.last_found=newSVsv(data->last_found);
6268                     }
6269                 }
6270                 else
6271                     data_fake.last_closep = &fake;
6272                 data_fake.flags = 0;
6273                 data_fake.substrs[0].flags = 0;
6274                 data_fake.substrs[1].flags = 0;
6275 		data_fake.pos_delta = delta;
6276                 if (is_inf)
6277 	            data_fake.flags |= SF_IS_INF;
6278                 if ( flags & SCF_DO_STCLASS && !scan->flags
6279                      && OP(scan) == IFMATCH ) { /* Lookahead */
6280                     ssc_init(pRExC_state, &intrnl);
6281                     data_fake.start_class = &intrnl;
6282                     f |= SCF_DO_STCLASS_AND;
6283                 }
6284                 if (flags & SCF_WHILEM_VISITED_POS)
6285                     f |= SCF_WHILEM_VISITED_POS;
6286                 next = regnext(scan);
6287                 nscan = NEXTOPER(NEXTOPER(scan));
6288 
6289                 /* positive lookahead study_chunk() recursion */
6290                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6291                                         &deltanext, last, &data_fake,
6292                                         stopparen, recursed_depth, NULL,
6293                                         f, depth+1, mutate_ok);
6294                 if (scan->flags) {
6295                     assert(0);  /* This code has never been tested since this
6296                                    is normally not compiled */
6297                     if (   deltanext < 0
6298                         || deltanext > (I32) U8_MAX
6299                         || *minnextp > (I32)U8_MAX
6300                         || *minnextp + deltanext > (I32)U8_MAX)
6301                     {
6302 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6303                               (UV)U8_MAX);
6304                     }
6305 
6306                     if (deltanext) {
6307                         scan->next_off = deltanext;
6308                     }
6309                     scan->flags = (U8)*minnextp + deltanext;
6310                 }
6311 
6312                 *minnextp += min;
6313 
6314                 if (f & SCF_DO_STCLASS_AND) {
6315                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6316                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6317                 }
6318                 if (data) {
6319                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6320                         pars++;
6321                     if (data_fake.flags & SF_HAS_EVAL)
6322                         data->flags |= SF_HAS_EVAL;
6323                     data->whilem_c = data_fake.whilem_c;
6324                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6325                         int i;
6326                         if (RExC_rx->minlen<*minnextp)
6327                             RExC_rx->minlen=*minnextp;
6328                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6329                         SvREFCNT_dec_NN(data_fake.last_found);
6330 
6331                         for (i = 0; i < 2; i++) {
6332                             if (data_fake.substrs[i].minlenp != minlenp) {
6333                                 data->substrs[i].min_offset =
6334                                             data_fake.substrs[i].min_offset;
6335                                 data->substrs[i].max_offset =
6336                                             data_fake.substrs[i].max_offset;
6337                                 data->substrs[i].minlenp =
6338                                             data_fake.substrs[i].minlenp;
6339                                 data->substrs[i].lookbehind += scan->flags;
6340                             }
6341                         }
6342                     }
6343                 }
6344 	    }
6345 #endif
6346 	}
6347 	else if (OP(scan) == OPEN) {
6348 	    if (stopparen != (I32)ARG(scan))
6349 	        pars++;
6350 	}
6351 	else if (OP(scan) == CLOSE) {
6352 	    if (stopparen == (I32)ARG(scan)) {
6353 	        break;
6354 	    }
6355 	    if ((I32)ARG(scan) == is_par) {
6356 		next = regnext(scan);
6357 
6358 		if ( next && (OP(next) != WHILEM) && next < last)
6359 		    is_par = 0;		/* Disable optimization */
6360 	    }
6361 	    if (data)
6362 		*(data->last_closep) = ARG(scan);
6363 	}
6364 	else if (OP(scan) == EVAL) {
6365 		if (data)
6366 		    data->flags |= SF_HAS_EVAL;
6367 	}
6368 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6369 	    if (flags & SCF_DO_SUBSTR) {
6370                 scan_commit(pRExC_state, data, minlenp, is_inf);
6371 		flags &= ~SCF_DO_SUBSTR;
6372 	    }
6373 	    if (data && OP(scan)==ACCEPT) {
6374 	        data->flags |= SCF_SEEN_ACCEPT;
6375 	        if (stopmin > min)
6376 	            stopmin = min;
6377 	    }
6378 	}
6379 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6380 	{
6381 		if (flags & SCF_DO_SUBSTR) {
6382                     scan_commit(pRExC_state, data, minlenp, is_inf);
6383 		    data->cur_is_floating = 1; /* float */
6384 		}
6385 		is_inf = is_inf_internal = 1;
6386 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6387 		    ssc_anything(data->start_class);
6388 		flags &= ~SCF_DO_STCLASS;
6389 	}
6390 	else if (OP(scan) == GPOS) {
6391             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6392 	        !(delta || is_inf || (data && data->pos_delta)))
6393 	    {
6394                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6395                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6396 	        if (RExC_rx->gofs < (STRLEN)min)
6397 		    RExC_rx->gofs = min;
6398             } else {
6399                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6400                 RExC_rx->gofs = 0;
6401             }
6402 	}
6403 #ifdef TRIE_STUDY_OPT
6404 #ifdef FULL_TRIE_STUDY
6405         else if (PL_regkind[OP(scan)] == TRIE) {
6406             /* NOTE - There is similar code to this block above for handling
6407                BRANCH nodes on the initial study.  If you change stuff here
6408                check there too. */
6409             regnode *trie_node= scan;
6410             regnode *tail= regnext(scan);
6411             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6412             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6413             regnode_ssc accum;
6414 
6415             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6416                 /* Cannot merge strings after this. */
6417                 scan_commit(pRExC_state, data, minlenp, is_inf);
6418             }
6419             if (flags & SCF_DO_STCLASS)
6420                 ssc_init_zero(pRExC_state, &accum);
6421 
6422             if (!trie->jump) {
6423                 min1= trie->minlen;
6424                 max1= trie->maxlen;
6425             } else {
6426                 const regnode *nextbranch= NULL;
6427                 U32 word;
6428 
6429                 for ( word=1 ; word <= trie->wordcount ; word++)
6430                 {
6431                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6432                     regnode_ssc this_class;
6433 
6434                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6435                     if (data) {
6436                         data_fake.whilem_c = data->whilem_c;
6437                         data_fake.last_closep = data->last_closep;
6438                     }
6439                     else
6440                         data_fake.last_closep = &fake;
6441 		    data_fake.pos_delta = delta;
6442                     if (flags & SCF_DO_STCLASS) {
6443                         ssc_init(pRExC_state, &this_class);
6444                         data_fake.start_class = &this_class;
6445                         f = SCF_DO_STCLASS_AND;
6446                     }
6447                     if (flags & SCF_WHILEM_VISITED_POS)
6448                         f |= SCF_WHILEM_VISITED_POS;
6449 
6450                     if (trie->jump[word]) {
6451                         if (!nextbranch)
6452                             nextbranch = trie_node + trie->jump[0];
6453                         scan= trie_node + trie->jump[word];
6454                         /* We go from the jump point to the branch that follows
6455                            it. Note this means we need the vestigal unused
6456                            branches even though they arent otherwise used. */
6457                         /* optimise study_chunk() for TRIE */
6458                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6459                             &deltanext, (regnode *)nextbranch, &data_fake,
6460                             stopparen, recursed_depth, NULL, f, depth+1,
6461                             mutate_ok);
6462                     }
6463                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6464                         nextbranch= regnext((regnode*)nextbranch);
6465 
6466                     if (min1 > (SSize_t)(minnext + trie->minlen))
6467                         min1 = minnext + trie->minlen;
6468                     if (deltanext == OPTIMIZE_INFTY) {
6469                         is_inf = is_inf_internal = 1;
6470                         max1 = OPTIMIZE_INFTY;
6471                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6472                         max1 = minnext + deltanext + trie->maxlen;
6473 
6474                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6475                         pars++;
6476                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6477                         if ( stopmin > min + min1)
6478 	                    stopmin = min + min1;
6479 	                flags &= ~SCF_DO_SUBSTR;
6480 	                if (data)
6481 	                    data->flags |= SCF_SEEN_ACCEPT;
6482 	            }
6483                     if (data) {
6484                         if (data_fake.flags & SF_HAS_EVAL)
6485                             data->flags |= SF_HAS_EVAL;
6486                         data->whilem_c = data_fake.whilem_c;
6487                     }
6488                     if (flags & SCF_DO_STCLASS)
6489                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6490                 }
6491             }
6492             if (flags & SCF_DO_SUBSTR) {
6493                 data->pos_min += min1;
6494                 data->pos_delta += max1 - min1;
6495                 if (max1 != min1 || is_inf)
6496                     data->cur_is_floating = 1; /* float */
6497             }
6498             min += min1;
6499             if (delta != OPTIMIZE_INFTY) {
6500                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6501                     delta += max1 - min1;
6502                 else
6503                     delta = OPTIMIZE_INFTY;
6504             }
6505             if (flags & SCF_DO_STCLASS_OR) {
6506                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6507                 if (min1) {
6508                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6509                     flags &= ~SCF_DO_STCLASS;
6510                 }
6511             }
6512             else if (flags & SCF_DO_STCLASS_AND) {
6513                 if (min1) {
6514                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6515                     flags &= ~SCF_DO_STCLASS;
6516                 }
6517                 else {
6518                     /* Switch to OR mode: cache the old value of
6519                      * data->start_class */
6520 		    INIT_AND_WITHP;
6521                     StructCopy(data->start_class, and_withp, regnode_ssc);
6522                     flags &= ~SCF_DO_STCLASS_AND;
6523                     StructCopy(&accum, data->start_class, regnode_ssc);
6524                     flags |= SCF_DO_STCLASS_OR;
6525                 }
6526             }
6527             scan= tail;
6528             continue;
6529         }
6530 #else
6531 	else if (PL_regkind[OP(scan)] == TRIE) {
6532 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6533 	    U8*bang=NULL;
6534 
6535 	    min += trie->minlen;
6536 	    delta += (trie->maxlen - trie->minlen);
6537 	    flags &= ~SCF_DO_STCLASS; /* xxx */
6538             if (flags & SCF_DO_SUBSTR) {
6539                 /* Cannot expect anything... */
6540                 scan_commit(pRExC_state, data, minlenp, is_inf);
6541     	        data->pos_min += trie->minlen;
6542     	        data->pos_delta += (trie->maxlen - trie->minlen);
6543 		if (trie->maxlen != trie->minlen)
6544 		    data->cur_is_floating = 1; /* float */
6545     	    }
6546     	    if (trie->jump) /* no more substrings -- for now /grr*/
6547                flags &= ~SCF_DO_SUBSTR;
6548 	}
6549         else if (OP(scan) == REGEX_SET) {
6550             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6551                              " before optimization", reg_name[REGEX_SET]);
6552         }
6553 
6554 #endif /* old or new */
6555 #endif /* TRIE_STUDY_OPT */
6556 
6557 	/* Else: zero-length, ignore. */
6558 	scan = regnext(scan);
6559     }
6560 
6561   finish:
6562     if (frame) {
6563         /* we need to unwind recursion. */
6564         depth = depth - 1;
6565 
6566         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6567         DEBUG_PEEP("fend", scan, depth, flags);
6568 
6569         /* restore previous context */
6570         last = frame->last_regnode;
6571         scan = frame->next_regnode;
6572         stopparen = frame->stopparen;
6573         recursed_depth = frame->prev_recursed_depth;
6574 
6575         RExC_frame_last = frame->prev_frame;
6576         frame = frame->this_prev_frame;
6577         goto fake_study_recurse;
6578     }
6579 
6580     assert(!frame);
6581     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6582 
6583     *scanp = scan;
6584     *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6585 
6586     if (flags & SCF_DO_SUBSTR && is_inf)
6587 	data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6588     if (is_par > (I32)U8_MAX)
6589 	is_par = 0;
6590     if (is_par && pars==1 && data) {
6591 	data->flags |= SF_IN_PAR;
6592 	data->flags &= ~SF_HAS_PAR;
6593     }
6594     else if (pars && data) {
6595 	data->flags |= SF_HAS_PAR;
6596 	data->flags &= ~SF_IN_PAR;
6597     }
6598     if (flags & SCF_DO_STCLASS_OR)
6599 	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6600     if (flags & SCF_TRIE_RESTUDY)
6601         data->flags |= 	SCF_TRIE_RESTUDY;
6602 
6603     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6604 
6605     final_minlen = min < stopmin
6606             ? min : stopmin;
6607 
6608     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6609         if (final_minlen > OPTIMIZE_INFTY - delta)
6610             RExC_maxlen = OPTIMIZE_INFTY;
6611         else if (RExC_maxlen < final_minlen + delta)
6612             RExC_maxlen = final_minlen + delta;
6613     }
6614     return final_minlen;
6615 }
6616 
6617 STATIC U32
S_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)6618 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6619 {
6620     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6621 
6622     PERL_ARGS_ASSERT_ADD_DATA;
6623 
6624     Renewc(RExC_rxi->data,
6625 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6626 	   char, struct reg_data);
6627     if(count)
6628 	Renew(RExC_rxi->data->what, count + n, U8);
6629     else
6630 	Newx(RExC_rxi->data->what, n, U8);
6631     RExC_rxi->data->count = count + n;
6632     Copy(s, RExC_rxi->data->what + count, n, U8);
6633     return count;
6634 }
6635 
6636 /*XXX: todo make this not included in a non debugging perl, but appears to be
6637  * used anyway there, in 'use re' */
6638 #ifndef PERL_IN_XSUB_RE
6639 void
Perl_reginitcolors(pTHX)6640 Perl_reginitcolors(pTHX)
6641 {
6642     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6643     if (s) {
6644 	char *t = savepv(s);
6645 	int i = 0;
6646 	PL_colors[0] = t;
6647 	while (++i < 6) {
6648 	    t = strchr(t, '\t');
6649 	    if (t) {
6650 		*t = '\0';
6651 		PL_colors[i] = ++t;
6652 	    }
6653 	    else
6654 		PL_colors[i] = t = (char *)"";
6655 	}
6656     } else {
6657 	int i = 0;
6658 	while (i < 6)
6659 	    PL_colors[i++] = (char *)"";
6660     }
6661     PL_colorset = 1;
6662 }
6663 #endif
6664 
6665 
6666 #ifdef TRIE_STUDY_OPT
6667 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6668     STMT_START {                                            \
6669         if (                                                \
6670               (data.flags & SCF_TRIE_RESTUDY)               \
6671               && ! restudied++                              \
6672         ) {                                                 \
6673             dOsomething;                                    \
6674             goto reStudy;                                   \
6675         }                                                   \
6676     } STMT_END
6677 #else
6678 #define CHECK_RESTUDY_GOTO_butfirst
6679 #endif
6680 
6681 /*
6682  * pregcomp - compile a regular expression into internal code
6683  *
6684  * Decides which engine's compiler to call based on the hint currently in
6685  * scope
6686  */
6687 
6688 #ifndef PERL_IN_XSUB_RE
6689 
6690 /* return the currently in-scope regex engine (or the default if none)  */
6691 
6692 regexp_engine const *
Perl_current_re_engine(pTHX)6693 Perl_current_re_engine(pTHX)
6694 {
6695     if (IN_PERL_COMPILETIME) {
6696 	HV * const table = GvHV(PL_hintgv);
6697 	SV **ptr;
6698 
6699 	if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6700 	    return &PL_core_reg_engine;
6701 	ptr = hv_fetchs(table, "regcomp", FALSE);
6702 	if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6703 	    return &PL_core_reg_engine;
6704 	return INT2PTR(regexp_engine*, SvIV(*ptr));
6705     }
6706     else {
6707 	SV *ptr;
6708 	if (!PL_curcop->cop_hints_hash)
6709 	    return &PL_core_reg_engine;
6710 	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6711 	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6712 	    return &PL_core_reg_engine;
6713 	return INT2PTR(regexp_engine*, SvIV(ptr));
6714     }
6715 }
6716 
6717 
6718 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)6719 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6720 {
6721     regexp_engine const *eng = current_re_engine();
6722     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6723 
6724     PERL_ARGS_ASSERT_PREGCOMP;
6725 
6726     /* Dispatch a request to compile a regexp to correct regexp engine. */
6727     DEBUG_COMPILE_r({
6728         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6729 			PTR2UV(eng));
6730     });
6731     return CALLREGCOMP_ENG(eng, pattern, flags);
6732 }
6733 #endif
6734 
6735 /* public(ish) entry point for the perl core's own regex compiling code.
6736  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6737  * pattern rather than a list of OPs, and uses the internal engine rather
6738  * than the current one */
6739 
6740 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)6741 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6742 {
6743     SV *pat = pattern; /* defeat constness! */
6744 
6745     PERL_ARGS_ASSERT_RE_COMPILE;
6746 
6747     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6748 #ifdef PERL_IN_XSUB_RE
6749                                 &my_reg_engine,
6750 #else
6751                                 &PL_core_reg_engine,
6752 #endif
6753                                 NULL, NULL, rx_flags, 0);
6754 }
6755 
6756 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)6757 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6758 {
6759     int n;
6760 
6761     if (--cbs->refcnt > 0)
6762         return;
6763     for (n = 0; n < cbs->count; n++) {
6764         REGEXP *rx = cbs->cb[n].src_regex;
6765         if (rx) {
6766             cbs->cb[n].src_regex = NULL;
6767             SvREFCNT_dec_NN(rx);
6768         }
6769     }
6770     Safefree(cbs->cb);
6771     Safefree(cbs);
6772 }
6773 
6774 
6775 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)6776 S_alloc_code_blocks(pTHX_  int ncode)
6777 {
6778      struct reg_code_blocks *cbs;
6779     Newx(cbs, 1, struct reg_code_blocks);
6780     cbs->count = ncode;
6781     cbs->refcnt = 1;
6782     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6783     if (ncode)
6784         Newx(cbs->cb, ncode, struct reg_code_block);
6785     else
6786         cbs->cb = NULL;
6787     return cbs;
6788 }
6789 
6790 
6791 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6792  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6793  * point to the realloced string and length.
6794  *
6795  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6796  * stuff added */
6797 
6798 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)6799 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6800 		    char **pat_p, STRLEN *plen_p, int num_code_blocks)
6801 {
6802     U8 *const src = (U8*)*pat_p;
6803     U8 *dst, *d;
6804     int n=0;
6805     STRLEN s = 0;
6806     bool do_end = 0;
6807     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6808 
6809     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6810         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6811 
6812     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6813     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6814     d = dst;
6815 
6816     while (s < *plen_p) {
6817         append_utf8_from_native_byte(src[s], &d);
6818 
6819         if (n < num_code_blocks) {
6820             assert(pRExC_state->code_blocks);
6821             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6822                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6823                 assert(*(d - 1) == '(');
6824                 do_end = 1;
6825             }
6826             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6827                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6828                 assert(*(d - 1) == ')');
6829                 do_end = 0;
6830                 n++;
6831             }
6832         }
6833         s++;
6834     }
6835     *d = '\0';
6836     *plen_p = d - dst;
6837     *pat_p = (char*) dst;
6838     SAVEFREEPV(*pat_p);
6839     RExC_orig_utf8 = RExC_utf8 = 1;
6840 }
6841 
6842 
6843 
6844 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6845  * while recording any code block indices, and handling overloading,
6846  * nested qr// objects etc.  If pat is null, it will allocate a new
6847  * string, or just return the first arg, if there's only one.
6848  *
6849  * Returns the malloced/updated pat.
6850  * patternp and pat_count is the array of SVs to be concatted;
6851  * oplist is the optional list of ops that generated the SVs;
6852  * recompile_p is a pointer to a boolean that will be set if
6853  *   the regex will need to be recompiled.
6854  * delim, if non-null is an SV that will be inserted between each element
6855  */
6856 
6857 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)6858 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6859                 SV *pat, SV ** const patternp, int pat_count,
6860                 OP *oplist, bool *recompile_p, SV *delim)
6861 {
6862     SV **svp;
6863     int n = 0;
6864     bool use_delim = FALSE;
6865     bool alloced = FALSE;
6866 
6867     /* if we know we have at least two args, create an empty string,
6868      * then concatenate args to that. For no args, return an empty string */
6869     if (!pat && pat_count != 1) {
6870         pat = newSVpvs("");
6871         SAVEFREESV(pat);
6872         alloced = TRUE;
6873     }
6874 
6875     for (svp = patternp; svp < patternp + pat_count; svp++) {
6876         SV *sv;
6877         SV *rx  = NULL;
6878         STRLEN orig_patlen = 0;
6879         bool code = 0;
6880         SV *msv = use_delim ? delim : *svp;
6881         if (!msv) msv = &PL_sv_undef;
6882 
6883         /* if we've got a delimiter, we go round the loop twice for each
6884          * svp slot (except the last), using the delimiter the second
6885          * time round */
6886         if (use_delim) {
6887             svp--;
6888             use_delim = FALSE;
6889         }
6890         else if (delim)
6891             use_delim = TRUE;
6892 
6893         if (SvTYPE(msv) == SVt_PVAV) {
6894             /* we've encountered an interpolated array within
6895              * the pattern, e.g. /...@a..../. Expand the list of elements,
6896              * then recursively append elements.
6897              * The code in this block is based on S_pushav() */
6898 
6899             AV *const av = (AV*)msv;
6900             const SSize_t maxarg = AvFILL(av) + 1;
6901             SV **array;
6902 
6903             if (oplist) {
6904                 assert(oplist->op_type == OP_PADAV
6905                     || oplist->op_type == OP_RV2AV);
6906                 oplist = OpSIBLING(oplist);
6907             }
6908 
6909             if (SvRMAGICAL(av)) {
6910                 SSize_t i;
6911 
6912                 Newx(array, maxarg, SV*);
6913                 SAVEFREEPV(array);
6914                 for (i=0; i < maxarg; i++) {
6915                     SV ** const svp = av_fetch(av, i, FALSE);
6916                     array[i] = svp ? *svp : &PL_sv_undef;
6917                 }
6918             }
6919             else
6920                 array = AvARRAY(av);
6921 
6922             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6923                                 array, maxarg, NULL, recompile_p,
6924                                 /* $" */
6925                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6926 
6927             continue;
6928         }
6929 
6930 
6931         /* we make the assumption here that each op in the list of
6932          * op_siblings maps to one SV pushed onto the stack,
6933          * except for code blocks, with have both an OP_NULL and
6934          * an OP_CONST.
6935          * This allows us to match up the list of SVs against the
6936          * list of OPs to find the next code block.
6937          *
6938          * Note that       PUSHMARK PADSV PADSV ..
6939          * is optimised to
6940          *                 PADRANGE PADSV  PADSV  ..
6941          * so the alignment still works. */
6942 
6943         if (oplist) {
6944             if (oplist->op_type == OP_NULL
6945                 && (oplist->op_flags & OPf_SPECIAL))
6946             {
6947                 assert(n < pRExC_state->code_blocks->count);
6948                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6949                 pRExC_state->code_blocks->cb[n].block = oplist;
6950                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6951                 n++;
6952                 code = 1;
6953                 oplist = OpSIBLING(oplist); /* skip CONST */
6954                 assert(oplist);
6955             }
6956             oplist = OpSIBLING(oplist);;
6957         }
6958 
6959 	/* apply magic and QR overloading to arg */
6960 
6961         SvGETMAGIC(msv);
6962         if (SvROK(msv) && SvAMAGIC(msv)) {
6963             SV *sv = AMG_CALLunary(msv, regexp_amg);
6964             if (sv) {
6965                 if (SvROK(sv))
6966                     sv = SvRV(sv);
6967                 if (SvTYPE(sv) != SVt_REGEXP)
6968                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6969                 msv = sv;
6970             }
6971         }
6972 
6973         /* try concatenation overload ... */
6974         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6975                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6976         {
6977             sv_setsv(pat, sv);
6978             /* overloading involved: all bets are off over literal
6979              * code. Pretend we haven't seen it */
6980             if (n)
6981                 pRExC_state->code_blocks->count -= n;
6982             n = 0;
6983         }
6984         else {
6985             /* ... or failing that, try "" overload */
6986             while (SvAMAGIC(msv)
6987                     && (sv = AMG_CALLunary(msv, string_amg))
6988                     && sv != msv
6989                     &&  !(   SvROK(msv)
6990                           && SvROK(sv)
6991                           && SvRV(msv) == SvRV(sv))
6992             ) {
6993                 msv = sv;
6994                 SvGETMAGIC(msv);
6995             }
6996             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6997                 msv = SvRV(msv);
6998 
6999             if (pat) {
7000                 /* this is a partially unrolled
7001                  *     sv_catsv_nomg(pat, msv);
7002                  * that allows us to adjust code block indices if
7003                  * needed */
7004                 STRLEN dlen;
7005                 char *dst = SvPV_force_nomg(pat, dlen);
7006                 orig_patlen = dlen;
7007                 if (SvUTF8(msv) && !SvUTF8(pat)) {
7008                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
7009                     sv_setpvn(pat, dst, dlen);
7010                     SvUTF8_on(pat);
7011                 }
7012                 sv_catsv_nomg(pat, msv);
7013                 rx = msv;
7014             }
7015             else {
7016                 /* We have only one SV to process, but we need to verify
7017                  * it is properly null terminated or we will fail asserts
7018                  * later. In theory we probably shouldn't get such SV's,
7019                  * but if we do we should handle it gracefully. */
7020                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7021                     /* not a string, or a string with a trailing null */
7022                     pat = msv;
7023                 } else {
7024                     /* a string with no trailing null, we need to copy it
7025                      * so it has a trailing null */
7026                     pat = sv_2mortal(newSVsv(msv));
7027                 }
7028             }
7029 
7030             if (code)
7031                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7032         }
7033 
7034         /* extract any code blocks within any embedded qr//'s */
7035         if (rx && SvTYPE(rx) == SVt_REGEXP
7036             && RX_ENGINE((REGEXP*)rx)->op_comp)
7037         {
7038 
7039             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7040             if (ri->code_blocks && ri->code_blocks->count) {
7041                 int i;
7042                 /* the presence of an embedded qr// with code means
7043                  * we should always recompile: the text of the
7044                  * qr// may not have changed, but it may be a
7045                  * different closure than last time */
7046                 *recompile_p = 1;
7047                 if (pRExC_state->code_blocks) {
7048                     int new_count = pRExC_state->code_blocks->count
7049                             + ri->code_blocks->count;
7050                     Renew(pRExC_state->code_blocks->cb,
7051                             new_count, struct reg_code_block);
7052                     pRExC_state->code_blocks->count = new_count;
7053                 }
7054                 else
7055                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7056                                                     ri->code_blocks->count);
7057 
7058                 for (i=0; i < ri->code_blocks->count; i++) {
7059                     struct reg_code_block *src, *dst;
7060                     STRLEN offset =  orig_patlen
7061                         + ReANY((REGEXP *)rx)->pre_prefix;
7062                     assert(n < pRExC_state->code_blocks->count);
7063                     src = &ri->code_blocks->cb[i];
7064                     dst = &pRExC_state->code_blocks->cb[n];
7065                     dst->start	    = src->start + offset;
7066                     dst->end	    = src->end   + offset;
7067                     dst->block	    = src->block;
7068                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
7069                                             src->src_regex
7070                                                 ? src->src_regex
7071                                                 : (REGEXP*)rx);
7072                     n++;
7073                 }
7074             }
7075         }
7076     }
7077     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7078     if (alloced)
7079         SvSETMAGIC(pat);
7080 
7081     return pat;
7082 }
7083 
7084 
7085 
7086 /* see if there are any run-time code blocks in the pattern.
7087  * False positives are allowed */
7088 
7089 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)7090 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7091 		    char *pat, STRLEN plen)
7092 {
7093     int n = 0;
7094     STRLEN s;
7095 
7096     PERL_UNUSED_CONTEXT;
7097 
7098     for (s = 0; s < plen; s++) {
7099 	if (   pRExC_state->code_blocks
7100             && n < pRExC_state->code_blocks->count
7101 	    && s == pRExC_state->code_blocks->cb[n].start)
7102 	{
7103 	    s = pRExC_state->code_blocks->cb[n].end;
7104 	    n++;
7105 	    continue;
7106 	}
7107 	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
7108 	 * positives here */
7109 	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7110 	    (pat[s+2] == '{'
7111                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7112 	)
7113 	    return 1;
7114     }
7115     return 0;
7116 }
7117 
7118 /* Handle run-time code blocks. We will already have compiled any direct
7119  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7120  * copy of it, but with any literal code blocks blanked out and
7121  * appropriate chars escaped; then feed it into
7122  *
7123  *    eval "qr'modified_pattern'"
7124  *
7125  * For example,
7126  *
7127  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7128  *
7129  * becomes
7130  *
7131  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7132  *
7133  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7134  * and merge them with any code blocks of the original regexp.
7135  *
7136  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7137  * instead, just save the qr and return FALSE; this tells our caller that
7138  * the original pattern needs upgrading to utf8.
7139  */
7140 
7141 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)7142 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7143     char *pat, STRLEN plen)
7144 {
7145     SV *qr;
7146 
7147     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7148 
7149     if (pRExC_state->runtime_code_qr) {
7150 	/* this is the second time we've been called; this should
7151 	 * only happen if the main pattern got upgraded to utf8
7152 	 * during compilation; re-use the qr we compiled first time
7153 	 * round (which should be utf8 too)
7154 	 */
7155 	qr = pRExC_state->runtime_code_qr;
7156 	pRExC_state->runtime_code_qr = NULL;
7157 	assert(RExC_utf8 && SvUTF8(qr));
7158     }
7159     else {
7160 	int n = 0;
7161 	STRLEN s;
7162 	char *p, *newpat;
7163 	int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7164 	SV *sv, *qr_ref;
7165 	dSP;
7166 
7167 	/* determine how many extra chars we need for ' and \ escaping */
7168 	for (s = 0; s < plen; s++) {
7169 	    if (pat[s] == '\'' || pat[s] == '\\')
7170 		newlen++;
7171 	}
7172 
7173 	Newx(newpat, newlen, char);
7174 	p = newpat;
7175 	*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7176 
7177 	for (s = 0; s < plen; s++) {
7178 	    if (   pRExC_state->code_blocks
7179 	        && n < pRExC_state->code_blocks->count
7180 		&& s == pRExC_state->code_blocks->cb[n].start)
7181 	    {
7182 		/* blank out literal code block so that they aren't
7183                  * recompiled: eg change from/to:
7184                  *     /(?{xyz})/
7185                  *     /(?=====)/
7186                  * and
7187                  *     /(??{xyz})/
7188                  *     /(?======)/
7189                  * and
7190                  *     /(?(?{xyz}))/
7191                  *     /(?(?=====))/
7192                 */
7193 		assert(pat[s]   == '(');
7194 		assert(pat[s+1] == '?');
7195                 *p++ = '(';
7196                 *p++ = '?';
7197                 s += 2;
7198 		while (s < pRExC_state->code_blocks->cb[n].end) {
7199 		    *p++ = '=';
7200 		    s++;
7201 		}
7202                 *p++ = ')';
7203 		n++;
7204 		continue;
7205 	    }
7206 	    if (pat[s] == '\'' || pat[s] == '\\')
7207 		*p++ = '\\';
7208 	    *p++ = pat[s];
7209 	}
7210 	*p++ = '\'';
7211 	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7212 	    *p++ = 'x';
7213             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7214                 *p++ = 'x';
7215             }
7216         }
7217 	*p++ = '\0';
7218 	DEBUG_COMPILE_r({
7219             Perl_re_printf( aTHX_
7220 		"%sre-parsing pattern for runtime code:%s %s\n",
7221 		PL_colors[4], PL_colors[5], newpat);
7222 	});
7223 
7224 	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7225 	Safefree(newpat);
7226 
7227 	ENTER;
7228 	SAVETMPS;
7229 	save_re_context();
7230 	PUSHSTACKi(PERLSI_REQUIRE);
7231         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7232          * parsing qr''; normally only q'' does this. It also alters
7233          * hints handling */
7234 	eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7235 	SvREFCNT_dec_NN(sv);
7236 	SPAGAIN;
7237 	qr_ref = POPs;
7238 	PUTBACK;
7239 	{
7240 	    SV * const errsv = ERRSV;
7241 	    if (SvTRUE_NN(errsv))
7242                 /* use croak_sv ? */
7243 		Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7244 	}
7245 	assert(SvROK(qr_ref));
7246 	qr = SvRV(qr_ref);
7247 	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7248 	/* the leaving below frees the tmp qr_ref.
7249 	 * Give qr a life of its own */
7250 	SvREFCNT_inc(qr);
7251 	POPSTACK;
7252 	FREETMPS;
7253 	LEAVE;
7254 
7255     }
7256 
7257     if (!RExC_utf8 && SvUTF8(qr)) {
7258 	/* first time through; the pattern got upgraded; save the
7259 	 * qr for the next time through */
7260 	assert(!pRExC_state->runtime_code_qr);
7261 	pRExC_state->runtime_code_qr = qr;
7262 	return 0;
7263     }
7264 
7265 
7266     /* extract any code blocks within the returned qr//  */
7267 
7268 
7269     /* merge the main (r1) and run-time (r2) code blocks into one */
7270     {
7271 	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7272 	struct reg_code_block *new_block, *dst;
7273 	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7274 	int i1 = 0, i2 = 0;
7275         int r1c, r2c;
7276 
7277 	if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7278 	{
7279 	    SvREFCNT_dec_NN(qr);
7280 	    return 1;
7281 	}
7282 
7283         if (!r1->code_blocks)
7284             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7285 
7286         r1c = r1->code_blocks->count;
7287         r2c = r2->code_blocks->count;
7288 
7289 	Newx(new_block, r1c + r2c, struct reg_code_block);
7290 
7291 	dst = new_block;
7292 
7293 	while (i1 < r1c || i2 < r2c) {
7294 	    struct reg_code_block *src;
7295 	    bool is_qr = 0;
7296 
7297 	    if (i1 == r1c) {
7298 		src = &r2->code_blocks->cb[i2++];
7299 		is_qr = 1;
7300 	    }
7301 	    else if (i2 == r2c)
7302 		src = &r1->code_blocks->cb[i1++];
7303 	    else if (  r1->code_blocks->cb[i1].start
7304 	             < r2->code_blocks->cb[i2].start)
7305 	    {
7306 		src = &r1->code_blocks->cb[i1++];
7307 		assert(src->end < r2->code_blocks->cb[i2].start);
7308 	    }
7309 	    else {
7310 		assert(  r1->code_blocks->cb[i1].start
7311 		       > r2->code_blocks->cb[i2].start);
7312 		src = &r2->code_blocks->cb[i2++];
7313 		is_qr = 1;
7314 		assert(src->end < r1->code_blocks->cb[i1].start);
7315 	    }
7316 
7317 	    assert(pat[src->start] == '(');
7318 	    assert(pat[src->end]   == ')');
7319 	    dst->start	    = src->start;
7320 	    dst->end	    = src->end;
7321 	    dst->block	    = src->block;
7322 	    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7323 				    : src->src_regex;
7324 	    dst++;
7325 	}
7326 	r1->code_blocks->count += r2c;
7327 	Safefree(r1->code_blocks->cb);
7328 	r1->code_blocks->cb = new_block;
7329     }
7330 
7331     SvREFCNT_dec_NN(qr);
7332     return 1;
7333 }
7334 
7335 
7336 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)7337 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7338                       struct reg_substr_datum  *rsd,
7339                       struct scan_data_substrs *sub,
7340                       STRLEN longest_length)
7341 {
7342     /* This is the common code for setting up the floating and fixed length
7343      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7344      * as to whether succeeded or not */
7345 
7346     I32 t;
7347     SSize_t ml;
7348     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7349     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7350 
7351     if (! (longest_length
7352            || (eol /* Can't have SEOL and MULTI */
7353                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7354           )
7355             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7356         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7357     {
7358         return FALSE;
7359     }
7360 
7361     /* copy the information about the longest from the reg_scan_data
7362         over to the program. */
7363     if (SvUTF8(sub->str)) {
7364         rsd->substr      = NULL;
7365         rsd->utf8_substr = sub->str;
7366     } else {
7367         rsd->substr      = sub->str;
7368         rsd->utf8_substr = NULL;
7369     }
7370     /* end_shift is how many chars that must be matched that
7371         follow this item. We calculate it ahead of time as once the
7372         lookbehind offset is added in we lose the ability to correctly
7373         calculate it.*/
7374     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7375     rsd->end_shift = ml - sub->min_offset
7376         - longest_length
7377             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7378              * intead? - DAPM
7379             + (SvTAIL(sub->str) != 0)
7380             */
7381         + sub->lookbehind;
7382 
7383     t = (eol/* Can't have SEOL and MULTI */
7384          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7385     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7386 
7387     return TRUE;
7388 }
7389 
7390 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)7391 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7392 {
7393     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7394      * properly wrapped with the right modifiers */
7395 
7396     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7397     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7398                                                 != REGEX_DEPENDS_CHARSET);
7399 
7400     /* The caret is output if there are any defaults: if not all the STD
7401         * flags are set, or if no character set specifier is needed */
7402     bool has_default =
7403                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7404                 || ! has_charset);
7405     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7406                                                 == REG_RUN_ON_COMMENT_SEEN);
7407     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7408                         >> RXf_PMf_STD_PMMOD_SHIFT);
7409     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7410     char *p;
7411     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7412 
7413     /* We output all the necessary flags; we never output a minus, as all
7414         * those are defaults, so are
7415         * covered by the caret */
7416     const STRLEN wraplen = pat_len + has_p + has_runon
7417         + has_default       /* If needs a caret */
7418         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7419 
7420             /* If needs a character set specifier */
7421         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7422         + (sizeof("(?:)") - 1);
7423 
7424     PERL_ARGS_ASSERT_SET_REGEX_PV;
7425 
7426     /* make sure PL_bitcount bounds not exceeded */
7427     assert(sizeof(STD_PAT_MODS) <= 8);
7428 
7429     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7430     SvPOK_on(Rx);
7431     if (RExC_utf8)
7432         SvFLAGS(Rx) |= SVf_UTF8;
7433     *p++='('; *p++='?';
7434 
7435     /* If a default, cover it using the caret */
7436     if (has_default) {
7437         *p++= DEFAULT_PAT_MOD;
7438     }
7439     if (has_charset) {
7440         STRLEN len;
7441         const char* name;
7442 
7443         name = get_regex_charset_name(RExC_rx->extflags, &len);
7444         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7445             assert(RExC_utf8);
7446             name = UNICODE_PAT_MODS;
7447             len = sizeof(UNICODE_PAT_MODS) - 1;
7448         }
7449         Copy(name, p, len, char);
7450         p += len;
7451     }
7452     if (has_p)
7453         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7454     {
7455         char ch;
7456         while((ch = *fptr++)) {
7457             if(reganch & 1)
7458                 *p++ = ch;
7459             reganch >>= 1;
7460         }
7461     }
7462 
7463     *p++ = ':';
7464     Copy(RExC_precomp, p, pat_len, char);
7465     assert ((RX_WRAPPED(Rx) - p) < 16);
7466     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7467     p += pat_len;
7468 
7469     /* Adding a trailing \n causes this to compile properly:
7470             my $R = qr / A B C # D E/x; /($R)/
7471         Otherwise the parens are considered part of the comment */
7472     if (has_runon)
7473         *p++ = '\n';
7474     *p++ = ')';
7475     *p = 0;
7476     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7477 }
7478 
7479 /*
7480  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7481  * regular expression into internal code.
7482  * The pattern may be passed either as:
7483  *    a list of SVs (patternp plus pat_count)
7484  *    a list of OPs (expr)
7485  * If both are passed, the SV list is used, but the OP list indicates
7486  * which SVs are actually pre-compiled code blocks
7487  *
7488  * The SVs in the list have magic and qr overloading applied to them (and
7489  * the list may be modified in-place with replacement SVs in the latter
7490  * case).
7491  *
7492  * If the pattern hasn't changed from old_re, then old_re will be
7493  * returned.
7494  *
7495  * eng is the current engine. If that engine has an op_comp method, then
7496  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7497  * do the initial concatenation of arguments and pass on to the external
7498  * engine.
7499  *
7500  * If is_bare_re is not null, set it to a boolean indicating whether the
7501  * arg list reduced (after overloading) to a single bare regex which has
7502  * been returned (i.e. /$qr/).
7503  *
7504  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7505  *
7506  * pm_flags contains the PMf_* flags, typically based on those from the
7507  * pm_flags field of the related PMOP. Currently we're only interested in
7508  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7509  *
7510  * For many years this code had an initial sizing pass that calculated
7511  * (sometimes incorrectly, leading to security holes) the size needed for the
7512  * compiled pattern.  That was changed by commit
7513  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7514  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7515  * references to this sizing pass.
7516  *
7517  * Now, an initial crude guess as to the size needed is made, based on the
7518  * length of the pattern.  Patches welcome to improve that guess.  That amount
7519  * of space is malloc'd and then immediately freed, and then clawed back node
7520  * by node.  This design is to minimze, to the extent possible, memory churn
7521  * when doing the reallocs.
7522  *
7523  * A separate parentheses counting pass may be needed in some cases.
7524  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7525  * of these cases.
7526  *
7527  * The existence of a sizing pass necessitated design decisions that are no
7528  * longer needed.  There are potential areas of simplification.
7529  *
7530  * Beware that the optimization-preparation code in here knows about some
7531  * of the structure of the compiled regexp.  [I'll say.]
7532  */
7533 
7534 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)7535 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7536 		    OP *expr, const regexp_engine* eng, REGEXP *old_re,
7537 		     bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7538 {
7539     dVAR;
7540     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7541     STRLEN plen;
7542     char *exp;
7543     regnode *scan;
7544     I32 flags;
7545     SSize_t minlen = 0;
7546     U32 rx_flags;
7547     SV *pat;
7548     SV** new_patternp = patternp;
7549 
7550     /* these are all flags - maybe they should be turned
7551      * into a single int with different bit masks */
7552     I32 sawlookahead = 0;
7553     I32 sawplus = 0;
7554     I32 sawopen = 0;
7555     I32 sawminmod = 0;
7556 
7557     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7558     bool recompile = 0;
7559     bool runtime_code = 0;
7560     scan_data_t data;
7561     RExC_state_t RExC_state;
7562     RExC_state_t * const pRExC_state = &RExC_state;
7563 #ifdef TRIE_STUDY_OPT
7564     int restudied = 0;
7565     RExC_state_t copyRExC_state;
7566 #endif
7567     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7568 
7569     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7570 
7571     DEBUG_r(if (!PL_colorset) reginitcolors());
7572 
7573 
7574     pRExC_state->warn_text = NULL;
7575     pRExC_state->unlexed_names = NULL;
7576     pRExC_state->code_blocks = NULL;
7577 
7578     if (is_bare_re)
7579 	*is_bare_re = FALSE;
7580 
7581     if (expr && (expr->op_type == OP_LIST ||
7582 		(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7583 	/* allocate code_blocks if needed */
7584 	OP *o;
7585 	int ncode = 0;
7586 
7587 	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7588 	    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7589 		ncode++; /* count of DO blocks */
7590 
7591 	if (ncode)
7592             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7593     }
7594 
7595     if (!pat_count) {
7596         /* compile-time pattern with just OP_CONSTs and DO blocks */
7597 
7598         int n;
7599         OP *o;
7600 
7601         /* find how many CONSTs there are */
7602         assert(expr);
7603         n = 0;
7604         if (expr->op_type == OP_CONST)
7605             n = 1;
7606         else
7607             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7608                 if (o->op_type == OP_CONST)
7609                     n++;
7610             }
7611 
7612         /* fake up an SV array */
7613 
7614         assert(!new_patternp);
7615         Newx(new_patternp, n, SV*);
7616         SAVEFREEPV(new_patternp);
7617         pat_count = n;
7618 
7619         n = 0;
7620         if (expr->op_type == OP_CONST)
7621             new_patternp[n] = cSVOPx_sv(expr);
7622         else
7623             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7624                 if (o->op_type == OP_CONST)
7625                     new_patternp[n++] = cSVOPo_sv;
7626             }
7627 
7628     }
7629 
7630     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7631         "Assembling pattern from %d elements%s\n", pat_count,
7632             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7633 
7634     /* set expr to the first arg op */
7635 
7636     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7637          && expr->op_type != OP_CONST)
7638     {
7639             expr = cLISTOPx(expr)->op_first;
7640             assert(   expr->op_type == OP_PUSHMARK
7641                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7642                    || expr->op_type == OP_PADRANGE);
7643             expr = OpSIBLING(expr);
7644     }
7645 
7646     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7647                         expr, &recompile, NULL);
7648 
7649     /* handle bare (possibly after overloading) regex: foo =~ $re */
7650     {
7651         SV *re = pat;
7652         if (SvROK(re))
7653             re = SvRV(re);
7654         if (SvTYPE(re) == SVt_REGEXP) {
7655             if (is_bare_re)
7656                 *is_bare_re = TRUE;
7657             SvREFCNT_inc(re);
7658             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7659                 "Precompiled pattern%s\n",
7660                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7661 
7662             return (REGEXP*)re;
7663         }
7664     }
7665 
7666     exp = SvPV_nomg(pat, plen);
7667 
7668     if (!eng->op_comp) {
7669 	if ((SvUTF8(pat) && IN_BYTES)
7670 		|| SvGMAGICAL(pat) || SvAMAGIC(pat))
7671 	{
7672 	    /* make a temporary copy; either to convert to bytes,
7673 	     * or to avoid repeating get-magic / overloaded stringify */
7674 	    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7675 					(IN_BYTES ? 0 : SvUTF8(pat)));
7676 	}
7677 	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7678     }
7679 
7680     /* ignore the utf8ness if the pattern is 0 length */
7681     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7682     RExC_uni_semantics = 0;
7683     RExC_contains_locale = 0;
7684     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7685     RExC_in_script_run = 0;
7686     RExC_study_started = 0;
7687     pRExC_state->runtime_code_qr = NULL;
7688     RExC_frame_head= NULL;
7689     RExC_frame_last= NULL;
7690     RExC_frame_count= 0;
7691     RExC_latest_warn_offset = 0;
7692     RExC_use_BRANCHJ = 0;
7693     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7694     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7695     RExC_total_parens = 0;
7696     RExC_open_parens = NULL;
7697     RExC_close_parens = NULL;
7698     RExC_paren_names = NULL;
7699     RExC_size = 0;
7700     RExC_seen_d_op = FALSE;
7701 #ifdef DEBUGGING
7702     RExC_paren_name_list = NULL;
7703 #endif
7704 
7705     DEBUG_r({
7706         RExC_mysv1= sv_newmortal();
7707         RExC_mysv2= sv_newmortal();
7708     });
7709 
7710     DEBUG_COMPILE_r({
7711             SV *dsv= sv_newmortal();
7712             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7713             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7714                           PL_colors[4], PL_colors[5], s);
7715         });
7716 
7717     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7718      * to utf8 */
7719 
7720     if ((pm_flags & PMf_USE_RE_EVAL)
7721 		/* this second condition covers the non-regex literal case,
7722 		 * i.e.  $foo =~ '(?{})'. */
7723 		|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7724     )
7725 	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7726 
7727   redo_parse:
7728     /* return old regex if pattern hasn't changed */
7729     /* XXX: note in the below we have to check the flags as well as the
7730      * pattern.
7731      *
7732      * Things get a touch tricky as we have to compare the utf8 flag
7733      * independently from the compile flags.  */
7734 
7735     if (   old_re
7736         && !recompile
7737         && !!RX_UTF8(old_re) == !!RExC_utf8
7738         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7739 	&& RX_PRECOMP(old_re)
7740 	&& RX_PRELEN(old_re) == plen
7741         && memEQ(RX_PRECOMP(old_re), exp, plen)
7742 	&& !runtime_code /* with runtime code, always recompile */ )
7743     {
7744         DEBUG_COMPILE_r({
7745             SV *dsv= sv_newmortal();
7746             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7747             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7748                           PL_colors[4], PL_colors[5], s);
7749         });
7750         return old_re;
7751     }
7752 
7753     /* Allocate the pattern's SV */
7754     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7755     RExC_rx = ReANY(Rx);
7756     if ( RExC_rx == NULL )
7757         FAIL("Regexp out of space");
7758 
7759     rx_flags = orig_rx_flags;
7760 
7761     if (   toUSE_UNI_CHARSET_NOT_DEPENDS
7762         && initial_charset == REGEX_DEPENDS_CHARSET)
7763     {
7764 
7765 	/* Set to use unicode semantics if the pattern is in utf8 and has the
7766 	 * 'depends' charset specified, as it means unicode when utf8  */
7767 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7768         RExC_uni_semantics = 1;
7769     }
7770 
7771     RExC_pm_flags = pm_flags;
7772 
7773     if (runtime_code) {
7774         assert(TAINTING_get || !TAINT_get);
7775 	if (TAINT_get)
7776 	    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7777 
7778 	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7779 	    /* whoops, we have a non-utf8 pattern, whilst run-time code
7780 	     * got compiled as utf8. Try again with a utf8 pattern */
7781             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7782                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7783             goto redo_parse;
7784 	}
7785     }
7786     assert(!pRExC_state->runtime_code_qr);
7787 
7788     RExC_sawback = 0;
7789 
7790     RExC_seen = 0;
7791     RExC_maxlen = 0;
7792     RExC_in_lookaround = 0;
7793     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7794     RExC_recode_x_to_native = 0;
7795     RExC_in_multi_char_class = 0;
7796 
7797     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7798     RExC_precomp_end = RExC_end = exp + plen;
7799     RExC_nestroot = 0;
7800     RExC_whilem_seen = 0;
7801     RExC_end_op = NULL;
7802     RExC_recurse = NULL;
7803     RExC_study_chunk_recursed = NULL;
7804     RExC_study_chunk_recursed_bytes= 0;
7805     RExC_recurse_count = 0;
7806     RExC_sets_depth = 0;
7807     pRExC_state->code_index = 0;
7808 
7809     /* Initialize the string in the compiled pattern.  This is so that there is
7810      * something to output if necessary */
7811     set_regex_pv(pRExC_state, Rx);
7812 
7813     DEBUG_PARSE_r({
7814         Perl_re_printf( aTHX_
7815             "Starting parse and generation\n");
7816         RExC_lastnum=0;
7817         RExC_lastparse=NULL;
7818     });
7819 
7820     /* Allocate space and zero-initialize. Note, the two step process
7821        of zeroing when in debug mode, thus anything assigned has to
7822        happen after that */
7823     if (!  RExC_size) {
7824 
7825         /* On the first pass of the parse, we guess how big this will be.  Then
7826          * we grow in one operation to that amount and then give it back.  As
7827          * we go along, we re-allocate what we need.
7828          *
7829          * XXX Currently the guess is essentially that the pattern will be an
7830          * EXACT node with one byte input, one byte output.  This is crude, and
7831          * better heuristics are welcome.
7832          *
7833          * On any subsequent passes, we guess what we actually computed in the
7834          * latest earlier pass.  Such a pass probably didn't complete so is
7835          * missing stuff.  We could improve those guesses by knowing where the
7836          * parse stopped, and use the length so far plus apply the above
7837          * assumption to what's left. */
7838         RExC_size = STR_SZ(RExC_end - RExC_start);
7839     }
7840 
7841     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7842     if ( RExC_rxi == NULL )
7843         FAIL("Regexp out of space");
7844 
7845     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7846     RXi_SET( RExC_rx, RExC_rxi );
7847 
7848     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7849      * node parsed will give back any excess memory we have allocated so far).
7850      * */
7851     RExC_size = 0;
7852 
7853     /* non-zero initialization begins here */
7854     RExC_rx->engine= eng;
7855     RExC_rx->extflags = rx_flags;
7856     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7857 
7858     if (pm_flags & PMf_IS_QR) {
7859 	RExC_rxi->code_blocks = pRExC_state->code_blocks;
7860         if (RExC_rxi->code_blocks) {
7861             RExC_rxi->code_blocks->refcnt++;
7862         }
7863     }
7864 
7865     RExC_rx->intflags = 0;
7866 
7867     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
7868     RExC_parse = exp;
7869 
7870     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7871      * code makes sure the final byte is an uncounted NUL.  But should this
7872      * ever not be the case, lots of things could read beyond the end of the
7873      * buffer: loops like
7874      *      while(isFOO(*RExC_parse)) RExC_parse++;
7875      *      strchr(RExC_parse, "foo");
7876      * etc.  So it is worth noting. */
7877     assert(*RExC_end == '\0');
7878 
7879     RExC_naughty = 0;
7880     RExC_npar = 1;
7881     RExC_parens_buf_size = 0;
7882     RExC_emit_start = RExC_rxi->program;
7883     pRExC_state->code_index = 0;
7884 
7885     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7886     RExC_emit = 1;
7887 
7888     /* Do the parse */
7889     if (reg(pRExC_state, 0, &flags, 1)) {
7890 
7891         /* Success!, But we may need to redo the parse knowing how many parens
7892          * there actually are */
7893         if (IN_PARENS_PASS) {
7894             flags |= RESTART_PARSE;
7895         }
7896 
7897         /* We have that number in RExC_npar */
7898         RExC_total_parens = RExC_npar;
7899 
7900         /* XXX For backporting, use long jumps if there is any possibility of
7901          * overflow */
7902         if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
7903             RExC_use_BRANCHJ = TRUE;
7904             flags |= RESTART_PARSE;
7905         }
7906     }
7907     else if (! MUST_RESTART(flags)) {
7908 	ReREFCNT_dec(Rx);
7909         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7910     }
7911 
7912     /* Here, we either have success, or we have to redo the parse for some reason */
7913     if (MUST_RESTART(flags)) {
7914 
7915         /* It's possible to write a regexp in ascii that represents Unicode
7916         codepoints outside of the byte range, such as via \x{100}. If we
7917         detect such a sequence we have to convert the entire pattern to utf8
7918         and then recompile, as our sizing calculation will have been based
7919         on 1 byte == 1 character, but we will need to use utf8 to encode
7920         at least some part of the pattern, and therefore must convert the whole
7921         thing.
7922         -- dmq */
7923         if (flags & NEED_UTF8) {
7924 
7925             /* We have stored the offset of the final warning output so far.
7926              * That must be adjusted.  Any variant characters between the start
7927              * of the pattern and this warning count for 2 bytes in the final,
7928              * so just add them again */
7929             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7930                 RExC_latest_warn_offset +=
7931                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7932                                                 + RExC_latest_warn_offset);
7933             }
7934             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7935             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7936             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7937         }
7938         else {
7939             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7940         }
7941 
7942         if (ALL_PARENS_COUNTED) {
7943             /* Make enough room for all the known parens, and zero it */
7944             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7945             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7946             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7947 
7948             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7949             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7950         }
7951         else { /* Parse did not complete.  Reinitialize the parentheses
7952                   structures */
7953             RExC_total_parens = 0;
7954             if (RExC_open_parens) {
7955                 Safefree(RExC_open_parens);
7956                 RExC_open_parens = NULL;
7957             }
7958             if (RExC_close_parens) {
7959                 Safefree(RExC_close_parens);
7960                 RExC_close_parens = NULL;
7961             }
7962         }
7963 
7964         /* Clean up what we did in this parse */
7965         SvREFCNT_dec_NN(RExC_rx_sv);
7966 
7967         goto redo_parse;
7968     }
7969 
7970     /* Here, we have successfully parsed and generated the pattern's program
7971      * for the regex engine.  We are ready to finish things up and look for
7972      * optimizations. */
7973 
7974     /* Update the string to compile, with correct modifiers, etc */
7975     set_regex_pv(pRExC_state, Rx);
7976 
7977     RExC_rx->nparens = RExC_total_parens - 1;
7978 
7979     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7980     if (RExC_whilem_seen > 15)
7981         RExC_whilem_seen = 15;
7982 
7983     DEBUG_PARSE_r({
7984         Perl_re_printf( aTHX_
7985             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7986         RExC_lastnum=0;
7987         RExC_lastparse=NULL;
7988     });
7989 
7990 #ifdef RE_TRACK_PATTERN_OFFSETS
7991     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7992                           "%s %" UVuf " bytes for offset annotations.\n",
7993                           RExC_offsets ? "Got" : "Couldn't get",
7994                           (UV)((RExC_offsets[0] * 2 + 1))));
7995     DEBUG_OFFSETS_r(if (RExC_offsets) {
7996         const STRLEN len = RExC_offsets[0];
7997         STRLEN i;
7998         DECLARE_AND_GET_RE_DEBUG_FLAGS;
7999         Perl_re_printf( aTHX_
8000                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
8001         for (i = 1; i <= len; i++) {
8002             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
8003                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
8004                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
8005         }
8006         Perl_re_printf( aTHX_  "\n");
8007     });
8008 
8009 #else
8010     SetProgLen(RExC_rxi,RExC_size);
8011 #endif
8012 
8013     DEBUG_DUMP_PRE_OPTIMIZE_r({
8014         SV * const sv = sv_newmortal();
8015         RXi_GET_DECL(RExC_rx, ri);
8016         DEBUG_RExC_seen();
8017         Perl_re_printf( aTHX_ "Program before optimization:\n");
8018 
8019         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8020                         sv, 0, 0);
8021     });
8022 
8023     DEBUG_OPTIMISE_r(
8024         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
8025     );
8026 
8027     /* XXXX To minimize changes to RE engine we always allocate
8028        3-units-long substrs field. */
8029     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8030     if (RExC_recurse_count) {
8031         Newx(RExC_recurse, RExC_recurse_count, regnode *);
8032         SAVEFREEPV(RExC_recurse);
8033     }
8034 
8035     if (RExC_seen & REG_RECURSE_SEEN) {
8036         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8037          * So its 1 if there are no parens. */
8038         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8039                                          ((RExC_total_parens & 0x07) != 0);
8040         Newx(RExC_study_chunk_recursed,
8041              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8042         SAVEFREEPV(RExC_study_chunk_recursed);
8043     }
8044 
8045   reStudy:
8046     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8047     DEBUG_r(
8048         RExC_study_chunk_recursed_count= 0;
8049     );
8050     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8051     if (RExC_study_chunk_recursed) {
8052         Zero(RExC_study_chunk_recursed,
8053              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8054     }
8055 
8056 
8057 #ifdef TRIE_STUDY_OPT
8058     if (!restudied) {
8059         StructCopy(&zero_scan_data, &data, scan_data_t);
8060         copyRExC_state = RExC_state;
8061     } else {
8062         U32 seen=RExC_seen;
8063         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8064 
8065         RExC_state = copyRExC_state;
8066         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8067             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8068         else
8069             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8070 	StructCopy(&zero_scan_data, &data, scan_data_t);
8071     }
8072 #else
8073     StructCopy(&zero_scan_data, &data, scan_data_t);
8074 #endif
8075 
8076     /* Dig out information for optimizations. */
8077     RExC_rx->extflags = RExC_flags; /* was pm_op */
8078     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8079 
8080     if (UTF)
8081 	SvUTF8_on(Rx);	/* Unicode in it? */
8082     RExC_rxi->regstclass = NULL;
8083     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
8084 	RExC_rx->intflags |= PREGf_NAUGHTY;
8085     scan = RExC_rxi->program + 1;		/* First BRANCH. */
8086 
8087     /* testing for BRANCH here tells us whether there is "must appear"
8088        data in the pattern. If there is then we can use it for optimisations */
8089     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
8090                                                   */
8091 	SSize_t fake;
8092 	STRLEN longest_length[2];
8093 	regnode_ssc ch_class; /* pointed to by data */
8094 	int stclass_flag;
8095 	SSize_t last_close = 0; /* pointed to by data */
8096         regnode *first= scan;
8097         regnode *first_next= regnext(first);
8098         int i;
8099 
8100 	/*
8101 	 * Skip introductions and multiplicators >= 1
8102 	 * so that we can extract the 'meat' of the pattern that must
8103 	 * match in the large if() sequence following.
8104 	 * NOTE that EXACT is NOT covered here, as it is normally
8105 	 * picked up by the optimiser separately.
8106 	 *
8107 	 * This is unfortunate as the optimiser isnt handling lookahead
8108 	 * properly currently.
8109 	 *
8110 	 */
8111 	while ((OP(first) == OPEN && (sawopen = 1)) ||
8112 	       /* An OR of *one* alternative - should not happen now. */
8113 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8114 	    /* for now we can't handle lookbehind IFMATCH*/
8115 	    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8116 	    (OP(first) == PLUS) ||
8117 	    (OP(first) == MINMOD) ||
8118 	       /* An {n,m} with n>0 */
8119 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8120 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8121 	{
8122 		/*
8123 		 * the only op that could be a regnode is PLUS, all the rest
8124 		 * will be regnode_1 or regnode_2.
8125 		 *
8126                  * (yves doesn't think this is true)
8127 		 */
8128 		if (OP(first) == PLUS)
8129 		    sawplus = 1;
8130                 else {
8131                     if (OP(first) == MINMOD)
8132                         sawminmod = 1;
8133 		    first += regarglen[OP(first)];
8134                 }
8135 		first = NEXTOPER(first);
8136 		first_next= regnext(first);
8137 	}
8138 
8139 	/* Starting-point info. */
8140       again:
8141         DEBUG_PEEP("first:", first, 0, 0);
8142         /* Ignore EXACT as we deal with it later. */
8143 	if (PL_regkind[OP(first)] == EXACT) {
8144 	    if (   OP(first) == EXACT
8145 	        || OP(first) == LEXACT
8146                 || OP(first) == EXACT_REQ8
8147                 || OP(first) == LEXACT_REQ8
8148                 || OP(first) == EXACTL)
8149             {
8150 		NOOP;	/* Empty, get anchored substr later. */
8151             }
8152 	    else
8153 		RExC_rxi->regstclass = first;
8154 	}
8155 #ifdef TRIE_STCLASS
8156 	else if (PL_regkind[OP(first)] == TRIE &&
8157 	        ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8158 	{
8159             /* this can happen only on restudy */
8160             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8161 	}
8162 #endif
8163 	else if (REGNODE_SIMPLE(OP(first)))
8164 	    RExC_rxi->regstclass = first;
8165 	else if (PL_regkind[OP(first)] == BOUND ||
8166 		 PL_regkind[OP(first)] == NBOUND)
8167 	    RExC_rxi->regstclass = first;
8168 	else if (PL_regkind[OP(first)] == BOL) {
8169             RExC_rx->intflags |= (OP(first) == MBOL
8170                            ? PREGf_ANCH_MBOL
8171                            : PREGf_ANCH_SBOL);
8172 	    first = NEXTOPER(first);
8173 	    goto again;
8174 	}
8175 	else if (OP(first) == GPOS) {
8176             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8177 	    first = NEXTOPER(first);
8178 	    goto again;
8179 	}
8180 	else if ((!sawopen || !RExC_sawback) &&
8181             !sawlookahead &&
8182 	    (OP(first) == STAR &&
8183 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8184             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8185 	{
8186 	    /* turn .* into ^.* with an implied $*=1 */
8187 	    const int type =
8188 		(OP(NEXTOPER(first)) == REG_ANY)
8189                     ? PREGf_ANCH_MBOL
8190                     : PREGf_ANCH_SBOL;
8191             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8192 	    first = NEXTOPER(first);
8193 	    goto again;
8194 	}
8195         if (sawplus && !sawminmod && !sawlookahead
8196             && (!sawopen || !RExC_sawback)
8197 	    && !pRExC_state->code_blocks) /* May examine pos and $& */
8198 	    /* x+ must match at the 1st pos of run of x's */
8199 	    RExC_rx->intflags |= PREGf_SKIP;
8200 
8201 	/* Scan is after the zeroth branch, first is atomic matcher. */
8202 #ifdef TRIE_STUDY_OPT
8203 	DEBUG_PARSE_r(
8204 	    if (!restudied)
8205                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8206 			      (IV)(first - scan + 1))
8207         );
8208 #else
8209 	DEBUG_PARSE_r(
8210             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8211 	        (IV)(first - scan + 1))
8212         );
8213 #endif
8214 
8215 
8216 	/*
8217 	* If there's something expensive in the r.e., find the
8218 	* longest literal string that must appear and make it the
8219 	* regmust.  Resolve ties in favor of later strings, since
8220 	* the regstart check works with the beginning of the r.e.
8221 	* and avoiding duplication strengthens checking.  Not a
8222 	* strong reason, but sufficient in the absence of others.
8223 	* [Now we resolve ties in favor of the earlier string if
8224 	* it happens that c_offset_min has been invalidated, since the
8225 	* earlier string may buy us something the later one won't.]
8226 	*/
8227 
8228 	data.substrs[0].str = newSVpvs("");
8229 	data.substrs[1].str = newSVpvs("");
8230 	data.last_found = newSVpvs("");
8231 	data.cur_is_floating = 0; /* initially any found substring is fixed */
8232 	ENTER_with_name("study_chunk");
8233 	SAVEFREESV(data.substrs[0].str);
8234 	SAVEFREESV(data.substrs[1].str);
8235 	SAVEFREESV(data.last_found);
8236 	first = scan;
8237 	if (!RExC_rxi->regstclass) {
8238 	    ssc_init(pRExC_state, &ch_class);
8239 	    data.start_class = &ch_class;
8240 	    stclass_flag = SCF_DO_STCLASS_AND;
8241 	} else				/* XXXX Check for BOUND? */
8242 	    stclass_flag = 0;
8243 	data.last_closep = &last_close;
8244 
8245         DEBUG_RExC_seen();
8246         /*
8247          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8248          * (NO top level branches)
8249          */
8250 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8251                              scan + RExC_size, /* Up to end */
8252             &data, -1, 0, NULL,
8253             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8254                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8255             0, TRUE);
8256 
8257 
8258         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8259 
8260 
8261 	if ( RExC_total_parens == 1 && !data.cur_is_floating
8262 	     && data.last_start_min == 0 && data.last_end > 0
8263 	     && !RExC_seen_zerolen
8264              && !(RExC_seen & REG_VERBARG_SEEN)
8265              && !(RExC_seen & REG_GPOS_SEEN)
8266         ){
8267 	    RExC_rx->extflags |= RXf_CHECK_ALL;
8268         }
8269 	scan_commit(pRExC_state, &data,&minlen, 0);
8270 
8271 
8272         /* XXX this is done in reverse order because that's the way the
8273          * code was before it was parameterised. Don't know whether it
8274          * actually needs doing in reverse order. DAPM */
8275         for (i = 1; i >= 0; i--) {
8276             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8277 
8278             if (   !(   i
8279                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8280                      &&    data.substrs[0].min_offset
8281                         == data.substrs[1].min_offset
8282                      &&    SvCUR(data.substrs[0].str)
8283                         == SvCUR(data.substrs[1].str)
8284                     )
8285                 && S_setup_longest (aTHX_ pRExC_state,
8286                                         &(RExC_rx->substrs->data[i]),
8287                                         &(data.substrs[i]),
8288                                         longest_length[i]))
8289             {
8290                 RExC_rx->substrs->data[i].min_offset =
8291                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8292 
8293                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8294                 /* Don't offset infinity */
8295                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8296                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8297                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8298             }
8299             else {
8300                 RExC_rx->substrs->data[i].substr      = NULL;
8301                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8302                 longest_length[i] = 0;
8303             }
8304         }
8305 
8306 	LEAVE_with_name("study_chunk");
8307 
8308 	if (RExC_rxi->regstclass
8309 	    && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8310 	    RExC_rxi->regstclass = NULL;
8311 
8312 	if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8313               || RExC_rx->substrs->data[0].min_offset)
8314 	    && stclass_flag
8315             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8316 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8317 	{
8318 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8319 
8320             ssc_finalize(pRExC_state, data.start_class);
8321 
8322 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8323 	    StructCopy(data.start_class,
8324 		       (regnode_ssc*)RExC_rxi->data->data[n],
8325 		       regnode_ssc);
8326 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8327 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8328 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8329                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8330                       Perl_re_printf( aTHX_
8331 				    "synthetic stclass \"%s\".\n",
8332 				    SvPVX_const(sv));});
8333             data.start_class = NULL;
8334 	}
8335 
8336         /* A temporary algorithm prefers floated substr to fixed one of
8337          * same length to dig more info. */
8338 	i = (longest_length[0] <= longest_length[1]);
8339         RExC_rx->substrs->check_ix = i;
8340         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8341         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8342         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8343         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8344         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8345         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8346             RExC_rx->intflags |= PREGf_NOSCAN;
8347 
8348 	if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8349 	    RExC_rx->extflags |= RXf_USE_INTUIT;
8350 	    if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8351 		RExC_rx->extflags |= RXf_INTUIT_TAIL;
8352 	}
8353 
8354 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8355 	if ( (STRLEN)minlen < longest_length[1] )
8356             minlen= longest_length[1];
8357         if ( (STRLEN)minlen < longest_length[0] )
8358             minlen= longest_length[0];
8359         */
8360     }
8361     else {
8362 	/* Several toplevels. Best we can is to set minlen. */
8363 	SSize_t fake;
8364 	regnode_ssc ch_class;
8365 	SSize_t last_close = 0;
8366 
8367         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8368 
8369 	scan = RExC_rxi->program + 1;
8370 	ssc_init(pRExC_state, &ch_class);
8371 	data.start_class = &ch_class;
8372 	data.last_closep = &last_close;
8373 
8374         DEBUG_RExC_seen();
8375         /*
8376          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8377          * (patterns WITH top level branches)
8378          */
8379 	minlen = study_chunk(pRExC_state,
8380             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8381             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8382                                                       ? SCF_TRIE_DOING_RESTUDY
8383                                                       : 0),
8384             0, TRUE);
8385 
8386         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8387 
8388 	RExC_rx->check_substr = NULL;
8389         RExC_rx->check_utf8 = NULL;
8390         RExC_rx->substrs->data[0].substr      = NULL;
8391         RExC_rx->substrs->data[0].utf8_substr = NULL;
8392         RExC_rx->substrs->data[1].substr      = NULL;
8393         RExC_rx->substrs->data[1].utf8_substr = NULL;
8394 
8395         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8396 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8397         {
8398 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8399 
8400             ssc_finalize(pRExC_state, data.start_class);
8401 
8402 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8403 	    StructCopy(data.start_class,
8404 		       (regnode_ssc*)RExC_rxi->data->data[n],
8405 		       regnode_ssc);
8406 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8407 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8408 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8409                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8410                       Perl_re_printf( aTHX_
8411 				    "synthetic stclass \"%s\".\n",
8412 				    SvPVX_const(sv));});
8413             data.start_class = NULL;
8414 	}
8415     }
8416 
8417     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8418         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8419         RExC_rx->maxlen = REG_INFTY;
8420     }
8421     else {
8422         RExC_rx->maxlen = RExC_maxlen;
8423     }
8424 
8425     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8426        the "real" pattern. */
8427     DEBUG_OPTIMISE_r({
8428         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8429                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8430     });
8431     RExC_rx->minlenret = minlen;
8432     if (RExC_rx->minlen < minlen)
8433         RExC_rx->minlen = minlen;
8434 
8435     if (RExC_seen & REG_RECURSE_SEEN ) {
8436         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8437         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8438     }
8439     if (RExC_seen & REG_GPOS_SEEN)
8440         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8441     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8442         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8443                                                 lookbehind */
8444     if (pRExC_state->code_blocks)
8445 	RExC_rx->extflags |= RXf_EVAL_SEEN;
8446     if (RExC_seen & REG_VERBARG_SEEN)
8447     {
8448 	RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8449         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8450     }
8451     if (RExC_seen & REG_CUTGROUP_SEEN)
8452 	RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8453     if (pm_flags & PMf_USE_RE_EVAL)
8454 	RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8455     if (RExC_paren_names)
8456         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8457     else
8458         RXp_PAREN_NAMES(RExC_rx) = NULL;
8459 
8460     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8461      * so it can be used in pp.c */
8462     if (RExC_rx->intflags & PREGf_ANCH)
8463         RExC_rx->extflags |= RXf_IS_ANCHORED;
8464 
8465 
8466     {
8467         /* this is used to identify "special" patterns that might result
8468          * in Perl NOT calling the regex engine and instead doing the match "itself",
8469          * particularly special cases in split//. By having the regex compiler
8470          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8471          * we avoid weird issues with equivalent patterns resulting in different behavior,
8472          * AND we allow non Perl engines to get the same optimizations by the setting the
8473          * flags appropriately - Yves */
8474         regnode *first = RExC_rxi->program + 1;
8475         U8 fop = OP(first);
8476         regnode *next = regnext(first);
8477         U8 nop = OP(next);
8478 
8479         if (PL_regkind[fop] == NOTHING && nop == END)
8480             RExC_rx->extflags |= RXf_NULL;
8481         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8482             /* when fop is SBOL first->flags will be true only when it was
8483              * produced by parsing /\A/, and not when parsing /^/. This is
8484              * very important for the split code as there we want to
8485              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8486              * See rt #122761 for more details. -- Yves */
8487             RExC_rx->extflags |= RXf_START_ONLY;
8488         else if (fop == PLUS
8489                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8490                  && nop == END)
8491             RExC_rx->extflags |= RXf_WHITE;
8492         else if ( RExC_rx->extflags & RXf_SPLIT
8493                   && (   fop == EXACT || fop == LEXACT
8494                       || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8495                       || fop == EXACTL)
8496                   && STR_LEN(first) == 1
8497                   && *(STRING(first)) == ' '
8498                   && nop == END )
8499             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8500 
8501     }
8502 
8503     if (RExC_contains_locale) {
8504         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8505     }
8506 
8507 #ifdef DEBUGGING
8508     if (RExC_paren_names) {
8509         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8510         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8511                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8512     } else
8513 #endif
8514     RExC_rxi->name_list_idx = 0;
8515 
8516     while ( RExC_recurse_count > 0 ) {
8517         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8518         /*
8519          * This data structure is set up in study_chunk() and is used
8520          * to calculate the distance between a GOSUB regopcode and
8521          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8522          * it refers to.
8523          *
8524          * If for some reason someone writes code that optimises
8525          * away a GOSUB opcode then the assert should be changed to
8526          * an if(scan) to guard the ARG2L_SET() - Yves
8527          *
8528          */
8529         assert(scan && OP(scan) == GOSUB);
8530         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8531     }
8532 
8533     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8534     /* assume we don't need to swap parens around before we match */
8535     DEBUG_TEST_r({
8536         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8537             (unsigned long)RExC_study_chunk_recursed_count);
8538     });
8539     DEBUG_DUMP_r({
8540         DEBUG_RExC_seen();
8541         Perl_re_printf( aTHX_ "Final program:\n");
8542         regdump(RExC_rx);
8543     });
8544 
8545     if (RExC_open_parens) {
8546         Safefree(RExC_open_parens);
8547         RExC_open_parens = NULL;
8548     }
8549     if (RExC_close_parens) {
8550         Safefree(RExC_close_parens);
8551         RExC_close_parens = NULL;
8552     }
8553 
8554 #ifdef USE_ITHREADS
8555     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8556      * by setting the regexp SV to readonly-only instead. If the
8557      * pattern's been recompiled, the USEDness should remain. */
8558     if (old_re && SvREADONLY(old_re))
8559         SvREADONLY_on(Rx);
8560 #endif
8561     return Rx;
8562 }
8563 
8564 
8565 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)8566 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8567                     const U32 flags)
8568 {
8569     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8570 
8571     PERL_UNUSED_ARG(value);
8572 
8573     if (flags & RXapif_FETCH) {
8574         return reg_named_buff_fetch(rx, key, flags);
8575     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8576         Perl_croak_no_modify();
8577         return NULL;
8578     } else if (flags & RXapif_EXISTS) {
8579         return reg_named_buff_exists(rx, key, flags)
8580             ? &PL_sv_yes
8581             : &PL_sv_no;
8582     } else if (flags & RXapif_REGNAMES) {
8583         return reg_named_buff_all(rx, flags);
8584     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8585         return reg_named_buff_scalar(rx, flags);
8586     } else {
8587         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8588         return NULL;
8589     }
8590 }
8591 
8592 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)8593 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8594                          const U32 flags)
8595 {
8596     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8597     PERL_UNUSED_ARG(lastkey);
8598 
8599     if (flags & RXapif_FIRSTKEY)
8600         return reg_named_buff_firstkey(rx, flags);
8601     else if (flags & RXapif_NEXTKEY)
8602         return reg_named_buff_nextkey(rx, flags);
8603     else {
8604         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8605                                             (int)flags);
8606         return NULL;
8607     }
8608 }
8609 
8610 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)8611 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8612 			  const U32 flags)
8613 {
8614     SV *ret;
8615     struct regexp *const rx = ReANY(r);
8616 
8617     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8618 
8619     if (rx && RXp_PAREN_NAMES(rx)) {
8620         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8621         if (he_str) {
8622             IV i;
8623             SV* sv_dat=HeVAL(he_str);
8624             I32 *nums=(I32*)SvPVX(sv_dat);
8625             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8626             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8627                 if ((I32)(rx->nparens) >= nums[i]
8628                     && rx->offs[nums[i]].start != -1
8629                     && rx->offs[nums[i]].end != -1)
8630                 {
8631                     ret = newSVpvs("");
8632                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8633                     if (!retarray)
8634                         return ret;
8635                 } else {
8636                     if (retarray)
8637                         ret = newSVsv(&PL_sv_undef);
8638                 }
8639                 if (retarray)
8640                     av_push(retarray, ret);
8641             }
8642             if (retarray)
8643                 return newRV_noinc(MUTABLE_SV(retarray));
8644         }
8645     }
8646     return NULL;
8647 }
8648 
8649 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)8650 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8651                            const U32 flags)
8652 {
8653     struct regexp *const rx = ReANY(r);
8654 
8655     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8656 
8657     if (rx && RXp_PAREN_NAMES(rx)) {
8658         if (flags & RXapif_ALL) {
8659             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8660         } else {
8661 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8662             if (sv) {
8663 		SvREFCNT_dec_NN(sv);
8664                 return TRUE;
8665             } else {
8666                 return FALSE;
8667             }
8668         }
8669     } else {
8670         return FALSE;
8671     }
8672 }
8673 
8674 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)8675 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8676 {
8677     struct regexp *const rx = ReANY(r);
8678 
8679     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8680 
8681     if ( rx && RXp_PAREN_NAMES(rx) ) {
8682 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
8683 
8684 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8685     } else {
8686 	return FALSE;
8687     }
8688 }
8689 
8690 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)8691 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8692 {
8693     struct regexp *const rx = ReANY(r);
8694     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8695 
8696     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8697 
8698     if (rx && RXp_PAREN_NAMES(rx)) {
8699         HV *hv = RXp_PAREN_NAMES(rx);
8700         HE *temphe;
8701         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8702             IV i;
8703             IV parno = 0;
8704             SV* sv_dat = HeVAL(temphe);
8705             I32 *nums = (I32*)SvPVX(sv_dat);
8706             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8707                 if ((I32)(rx->lastparen) >= nums[i] &&
8708                     rx->offs[nums[i]].start != -1 &&
8709                     rx->offs[nums[i]].end != -1)
8710                 {
8711                     parno = nums[i];
8712                     break;
8713                 }
8714             }
8715             if (parno || flags & RXapif_ALL) {
8716 		return newSVhek(HeKEY_hek(temphe));
8717             }
8718         }
8719     }
8720     return NULL;
8721 }
8722 
8723 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)8724 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8725 {
8726     SV *ret;
8727     AV *av;
8728     SSize_t length;
8729     struct regexp *const rx = ReANY(r);
8730 
8731     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8732 
8733     if (rx && RXp_PAREN_NAMES(rx)) {
8734         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8735             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8736         } else if (flags & RXapif_ONE) {
8737             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8738             av = MUTABLE_AV(SvRV(ret));
8739             length = av_tindex(av);
8740 	    SvREFCNT_dec_NN(ret);
8741             return newSViv(length + 1);
8742         } else {
8743             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8744                                                 (int)flags);
8745             return NULL;
8746         }
8747     }
8748     return &PL_sv_undef;
8749 }
8750 
8751 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)8752 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8753 {
8754     struct regexp *const rx = ReANY(r);
8755     AV *av = newAV();
8756 
8757     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8758 
8759     if (rx && RXp_PAREN_NAMES(rx)) {
8760         HV *hv= RXp_PAREN_NAMES(rx);
8761         HE *temphe;
8762         (void)hv_iterinit(hv);
8763         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8764             IV i;
8765             IV parno = 0;
8766             SV* sv_dat = HeVAL(temphe);
8767             I32 *nums = (I32*)SvPVX(sv_dat);
8768             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8769                 if ((I32)(rx->lastparen) >= nums[i] &&
8770                     rx->offs[nums[i]].start != -1 &&
8771                     rx->offs[nums[i]].end != -1)
8772                 {
8773                     parno = nums[i];
8774                     break;
8775                 }
8776             }
8777             if (parno || flags & RXapif_ALL) {
8778                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8779             }
8780         }
8781     }
8782 
8783     return newRV_noinc(MUTABLE_SV(av));
8784 }
8785 
8786 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r,const I32 paren,SV * const sv)8787 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8788 			     SV * const sv)
8789 {
8790     struct regexp *const rx = ReANY(r);
8791     char *s = NULL;
8792     SSize_t i = 0;
8793     SSize_t s1, t1;
8794     I32 n = paren;
8795 
8796     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8797 
8798     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8799            || n == RX_BUFF_IDX_CARET_FULLMATCH
8800            || n == RX_BUFF_IDX_CARET_POSTMATCH
8801        )
8802     {
8803         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8804         if (!keepcopy) {
8805             /* on something like
8806              *    $r = qr/.../;
8807              *    /$qr/p;
8808              * the KEEPCOPY is set on the PMOP rather than the regex */
8809             if (PL_curpm && r == PM_GETRE(PL_curpm))
8810                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8811         }
8812         if (!keepcopy)
8813             goto ret_undef;
8814     }
8815 
8816     if (!rx->subbeg)
8817         goto ret_undef;
8818 
8819     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8820         /* no need to distinguish between them any more */
8821         n = RX_BUFF_IDX_FULLMATCH;
8822 
8823     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8824         && rx->offs[0].start != -1)
8825     {
8826         /* $`, ${^PREMATCH} */
8827 	i = rx->offs[0].start;
8828 	s = rx->subbeg;
8829     }
8830     else
8831     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8832         && rx->offs[0].end != -1)
8833     {
8834         /* $', ${^POSTMATCH} */
8835 	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8836 	i = rx->sublen + rx->suboffset - rx->offs[0].end;
8837     }
8838     else
8839     if (inRANGE(n, 0, (I32)rx->nparens) &&
8840         (s1 = rx->offs[n].start) != -1  &&
8841         (t1 = rx->offs[n].end) != -1)
8842     {
8843         /* $&, ${^MATCH},  $1 ... */
8844         i = t1 - s1;
8845         s = rx->subbeg + s1 - rx->suboffset;
8846     } else {
8847         goto ret_undef;
8848     }
8849 
8850     assert(s >= rx->subbeg);
8851     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8852     if (i >= 0) {
8853 #ifdef NO_TAINT_SUPPORT
8854         sv_setpvn(sv, s, i);
8855 #else
8856         const int oldtainted = TAINT_get;
8857         TAINT_NOT;
8858         sv_setpvn(sv, s, i);
8859         TAINT_set(oldtainted);
8860 #endif
8861         if (RXp_MATCH_UTF8(rx))
8862             SvUTF8_on(sv);
8863         else
8864             SvUTF8_off(sv);
8865         if (TAINTING_get) {
8866             if (RXp_MATCH_TAINTED(rx)) {
8867                 if (SvTYPE(sv) >= SVt_PVMG) {
8868                     MAGIC* const mg = SvMAGIC(sv);
8869                     MAGIC* mgt;
8870                     TAINT;
8871                     SvMAGIC_set(sv, mg->mg_moremagic);
8872                     SvTAINT(sv);
8873                     if ((mgt = SvMAGIC(sv))) {
8874                         mg->mg_moremagic = mgt;
8875                         SvMAGIC_set(sv, mg);
8876                     }
8877                 } else {
8878                     TAINT;
8879                     SvTAINT(sv);
8880                 }
8881             } else
8882                 SvTAINTED_off(sv);
8883         }
8884     } else {
8885       ret_undef:
8886         sv_set_undef(sv);
8887         return;
8888     }
8889 }
8890 
8891 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)8892 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8893 							 SV const * const value)
8894 {
8895     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8896 
8897     PERL_UNUSED_ARG(rx);
8898     PERL_UNUSED_ARG(paren);
8899     PERL_UNUSED_ARG(value);
8900 
8901     if (!PL_localizing)
8902         Perl_croak_no_modify();
8903 }
8904 
8905 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)8906 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8907                               const I32 paren)
8908 {
8909     struct regexp *const rx = ReANY(r);
8910     I32 i;
8911     I32 s1, t1;
8912 
8913     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8914 
8915     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8916         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8917         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8918     )
8919     {
8920         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8921         if (!keepcopy) {
8922             /* on something like
8923              *    $r = qr/.../;
8924              *    /$qr/p;
8925              * the KEEPCOPY is set on the PMOP rather than the regex */
8926             if (PL_curpm && r == PM_GETRE(PL_curpm))
8927                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8928         }
8929         if (!keepcopy)
8930             goto warn_undef;
8931     }
8932 
8933     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8934     switch (paren) {
8935       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8936       case RX_BUFF_IDX_PREMATCH:       /* $` */
8937         if (rx->offs[0].start != -1) {
8938 			i = rx->offs[0].start;
8939 			if (i > 0) {
8940 				s1 = 0;
8941 				t1 = i;
8942 				goto getlen;
8943 			}
8944 	    }
8945         return 0;
8946 
8947       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8948       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8949 	    if (rx->offs[0].end != -1) {
8950 			i = rx->sublen - rx->offs[0].end;
8951 			if (i > 0) {
8952 				s1 = rx->offs[0].end;
8953 				t1 = rx->sublen;
8954 				goto getlen;
8955 			}
8956 	    }
8957         return 0;
8958 
8959       default: /* $& / ${^MATCH}, $1, $2, ... */
8960 	    if (paren <= (I32)rx->nparens &&
8961             (s1 = rx->offs[paren].start) != -1 &&
8962             (t1 = rx->offs[paren].end) != -1)
8963 	    {
8964             i = t1 - s1;
8965             goto getlen;
8966         } else {
8967           warn_undef:
8968             if (ckWARN(WARN_UNINITIALIZED))
8969                 report_uninit((const SV *)sv);
8970             return 0;
8971         }
8972     }
8973   getlen:
8974     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8975         const char * const s = rx->subbeg - rx->suboffset + s1;
8976         const U8 *ep;
8977         STRLEN el;
8978 
8979         i = t1 - s1;
8980         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8981             i = el;
8982     }
8983     return i;
8984 }
8985 
8986 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)8987 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8988 {
8989     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8990 	PERL_UNUSED_ARG(rx);
8991 	if (0)
8992 	    return NULL;
8993 	else
8994 	    return newSVpvs("Regexp");
8995 }
8996 
8997 /* Scans the name of a named buffer from the pattern.
8998  * If flags is REG_RSN_RETURN_NULL returns null.
8999  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
9000  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
9001  * to the parsed name as looked up in the RExC_paren_names hash.
9002  * If there is an error throws a vFAIL().. type exception.
9003  */
9004 
9005 #define REG_RSN_RETURN_NULL    0
9006 #define REG_RSN_RETURN_NAME    1
9007 #define REG_RSN_RETURN_DATA    2
9008 
9009 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)9010 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
9011 {
9012     char *name_start = RExC_parse;
9013     SV* sv_name;
9014 
9015     PERL_ARGS_ASSERT_REG_SCAN_NAME;
9016 
9017     assert (RExC_parse <= RExC_end);
9018     if (RExC_parse == RExC_end) NOOP;
9019     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
9020          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
9021           * using do...while */
9022 	if (UTF)
9023 	    do {
9024 		RExC_parse += UTF8SKIP(RExC_parse);
9025 	    } while (   RExC_parse < RExC_end
9026                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9027 	else
9028 	    do {
9029 		RExC_parse++;
9030 	    } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9031     } else {
9032         RExC_parse++; /* so the <- from the vFAIL is after the offending
9033                          character */
9034         vFAIL("Group name must start with a non-digit word character");
9035     }
9036     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9037 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9038     if ( flags == REG_RSN_RETURN_NAME)
9039         return sv_name;
9040     else if (flags==REG_RSN_RETURN_DATA) {
9041         HE *he_str = NULL;
9042         SV *sv_dat = NULL;
9043         if ( ! sv_name )      /* should not happen*/
9044             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9045         if (RExC_paren_names)
9046             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9047         if ( he_str )
9048             sv_dat = HeVAL(he_str);
9049         if ( ! sv_dat ) {   /* Didn't find group */
9050 
9051             /* It might be a forward reference; we can't fail until we
9052                 * know, by completing the parse to get all the groups, and
9053                 * then reparsing */
9054             if (ALL_PARENS_COUNTED)  {
9055                 vFAIL("Reference to nonexistent named group");
9056             }
9057             else {
9058                 REQUIRE_PARENS_PASS;
9059             }
9060         }
9061         return sv_dat;
9062     }
9063 
9064     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9065                      (unsigned long) flags);
9066 }
9067 
9068 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
9069     if (RExC_lastparse!=RExC_parse) {                           \
9070         Perl_re_printf( aTHX_  "%s",                            \
9071             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
9072                 RExC_end - RExC_parse, 16,                      \
9073                 "", "",                                         \
9074                 PERL_PV_ESCAPE_UNI_DETECT |                     \
9075                 PERL_PV_PRETTY_ELLIPSES   |                     \
9076                 PERL_PV_PRETTY_LTGT       |                     \
9077                 PERL_PV_ESCAPE_RE         |                     \
9078                 PERL_PV_PRETTY_EXACTSIZE                        \
9079             )                                                   \
9080         );                                                      \
9081     } else                                                      \
9082         Perl_re_printf( aTHX_ "%16s","");                       \
9083                                                                 \
9084     if (RExC_lastnum!=RExC_emit)                                \
9085        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
9086     else                                                        \
9087        Perl_re_printf( aTHX_ "|%4s","");                        \
9088     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
9089         (int)((depth*2)), "",                                   \
9090         (funcname)                                              \
9091     );                                                          \
9092     RExC_lastnum=RExC_emit;                                     \
9093     RExC_lastparse=RExC_parse;                                  \
9094 })
9095 
9096 
9097 
9098 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
9099     DEBUG_PARSE_MSG((funcname));                            \
9100     Perl_re_printf( aTHX_ "%4s","\n");                                  \
9101 })
9102 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
9103     DEBUG_PARSE_MSG((funcname));                            \
9104     Perl_re_printf( aTHX_ fmt "\n",args);                               \
9105 })
9106 
9107 /* This section of code defines the inversion list object and its methods.  The
9108  * interfaces are highly subject to change, so as much as possible is static to
9109  * this file.  An inversion list is here implemented as a malloc'd C UV array
9110  * as an SVt_INVLIST scalar.
9111  *
9112  * An inversion list for Unicode is an array of code points, sorted by ordinal
9113  * number.  Each element gives the code point that begins a range that extends
9114  * up-to but not including the code point given by the next element.  The final
9115  * element gives the first code point of a range that extends to the platform's
9116  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
9117  * ...) give ranges whose code points are all in the inversion list.  We say
9118  * that those ranges are in the set.  The odd-numbered elements give ranges
9119  * whose code points are not in the inversion list, and hence not in the set.
9120  * Thus, element [0] is the first code point in the list.  Element [1]
9121  * is the first code point beyond that not in the list; and element [2] is the
9122  * first code point beyond that that is in the list.  In other words, the first
9123  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9124  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
9125  * all code points in that range are not in the inversion list.  The third
9126  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9127  * list, and so forth.  Thus every element whose index is divisible by two
9128  * gives the beginning of a range that is in the list, and every element whose
9129  * index is not divisible by two gives the beginning of a range not in the
9130  * list.  If the final element's index is divisible by two, the inversion list
9131  * extends to the platform's infinity; otherwise the highest code point in the
9132  * inversion list is the contents of that element minus 1.
9133  *
9134  * A range that contains just a single code point N will look like
9135  *  invlist[i]   == N
9136  *  invlist[i+1] == N+1
9137  *
9138  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9139  * impossible to represent, so element [i+1] is omitted.  The single element
9140  * inversion list
9141  *  invlist[0] == UV_MAX
9142  * contains just UV_MAX, but is interpreted as matching to infinity.
9143  *
9144  * Taking the complement (inverting) an inversion list is quite simple, if the
9145  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9146  * This implementation reserves an element at the beginning of each inversion
9147  * list to always contain 0; there is an additional flag in the header which
9148  * indicates if the list begins at the 0, or is offset to begin at the next
9149  * element.  This means that the inversion list can be inverted without any
9150  * copying; just flip the flag.
9151  *
9152  * More about inversion lists can be found in "Unicode Demystified"
9153  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9154  *
9155  * The inversion list data structure is currently implemented as an SV pointing
9156  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9157  * array of UV whose memory management is automatically handled by the existing
9158  * facilities for SV's.
9159  *
9160  * Some of the methods should always be private to the implementation, and some
9161  * should eventually be made public */
9162 
9163 /* The header definitions are in F<invlist_inline.h> */
9164 
9165 #ifndef PERL_IN_XSUB_RE
9166 
9167 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)9168 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9169 {
9170     /* Returns a pointer to the first element in the inversion list's array.
9171      * This is called upon initialization of an inversion list.  Where the
9172      * array begins depends on whether the list has the code point U+0000 in it
9173      * or not.  The other parameter tells it whether the code that follows this
9174      * call is about to put a 0 in the inversion list or not.  The first
9175      * element is either the element reserved for 0, if TRUE, or the element
9176      * after it, if FALSE */
9177 
9178     bool* offset = get_invlist_offset_addr(invlist);
9179     UV* zero_addr = (UV *) SvPVX(invlist);
9180 
9181     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9182 
9183     /* Must be empty */
9184     assert(! _invlist_len(invlist));
9185 
9186     *zero_addr = 0;
9187 
9188     /* 1^1 = 0; 1^0 = 1 */
9189     *offset = 1 ^ will_have_0;
9190     return zero_addr + *offset;
9191 }
9192 
9193 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)9194 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9195 {
9196     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9197      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9198      * is similar to what SvSetMagicSV() would do, if it were implemented on
9199      * inversion lists, though this routine avoids a copy */
9200 
9201     const UV src_len          = _invlist_len(src);
9202     const bool src_offset     = *get_invlist_offset_addr(src);
9203     const STRLEN src_byte_len = SvLEN(src);
9204     char * array              = SvPVX(src);
9205 
9206     const int oldtainted = TAINT_get;
9207 
9208     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9209 
9210     assert(is_invlist(src));
9211     assert(is_invlist(dest));
9212     assert(! invlist_is_iterating(src));
9213     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9214 
9215     /* Make sure it ends in the right place with a NUL, as our inversion list
9216      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9217      * asserts it */
9218     array[src_byte_len - 1] = '\0';
9219 
9220     TAINT_NOT;      /* Otherwise it breaks */
9221     sv_usepvn_flags(dest,
9222                     (char *) array,
9223                     src_byte_len - 1,
9224 
9225                     /* This flag is documented to cause a copy to be avoided */
9226                     SV_HAS_TRAILING_NUL);
9227     TAINT_set(oldtainted);
9228     SvPV_set(src, 0);
9229     SvLEN_set(src, 0);
9230     SvCUR_set(src, 0);
9231 
9232     /* Finish up copying over the other fields in an inversion list */
9233     *get_invlist_offset_addr(dest) = src_offset;
9234     invlist_set_len(dest, src_len, src_offset);
9235     *get_invlist_previous_index_addr(dest) = 0;
9236     invlist_iterfinish(dest);
9237 }
9238 
9239 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)9240 S_get_invlist_previous_index_addr(SV* invlist)
9241 {
9242     /* Return the address of the IV that is reserved to hold the cached index
9243      * */
9244     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9245 
9246     assert(is_invlist(invlist));
9247 
9248     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9249 }
9250 
9251 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)9252 S_invlist_previous_index(SV* const invlist)
9253 {
9254     /* Returns cached index of previous search */
9255 
9256     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9257 
9258     return *get_invlist_previous_index_addr(invlist);
9259 }
9260 
9261 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)9262 S_invlist_set_previous_index(SV* const invlist, const IV index)
9263 {
9264     /* Caches <index> for later retrieval */
9265 
9266     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9267 
9268     assert(index == 0 || index < (int) _invlist_len(invlist));
9269 
9270     *get_invlist_previous_index_addr(invlist) = index;
9271 }
9272 
9273 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)9274 S_invlist_trim(SV* invlist)
9275 {
9276     /* Free the not currently-being-used space in an inversion list */
9277 
9278     /* But don't free up the space needed for the 0 UV that is always at the
9279      * beginning of the list, nor the trailing NUL */
9280     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9281 
9282     PERL_ARGS_ASSERT_INVLIST_TRIM;
9283 
9284     assert(is_invlist(invlist));
9285 
9286     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9287 }
9288 
9289 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)9290 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9291 {
9292     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9293 
9294     assert(is_invlist(invlist));
9295 
9296     invlist_set_len(invlist, 0, 0);
9297     invlist_trim(invlist);
9298 }
9299 
9300 #endif /* ifndef PERL_IN_XSUB_RE */
9301 
9302 PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV * const invlist)9303 S_invlist_is_iterating(SV* const invlist)
9304 {
9305     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9306 
9307     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9308 }
9309 
9310 #ifndef PERL_IN_XSUB_RE
9311 
9312 PERL_STATIC_INLINE UV
S_invlist_max(SV * const invlist)9313 S_invlist_max(SV* const invlist)
9314 {
9315     /* Returns the maximum number of elements storable in the inversion list's
9316      * array, without having to realloc() */
9317 
9318     PERL_ARGS_ASSERT_INVLIST_MAX;
9319 
9320     assert(is_invlist(invlist));
9321 
9322     /* Assumes worst case, in which the 0 element is not counted in the
9323      * inversion list, so subtracts 1 for that */
9324     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9325            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9326            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9327 }
9328 
9329 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)9330 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9331 {
9332     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9333 
9334     /* First 1 is in case the zero element isn't in the list; second 1 is for
9335      * trailing NUL */
9336     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9337     invlist_set_len(invlist, 0, 0);
9338 
9339     /* Force iterinit() to be used to get iteration to work */
9340     invlist_iterfinish(invlist);
9341 
9342     *get_invlist_previous_index_addr(invlist) = 0;
9343     SvPOK_on(invlist);  /* This allows B to extract the PV */
9344 }
9345 
9346 SV*
Perl__new_invlist(pTHX_ IV initial_size)9347 Perl__new_invlist(pTHX_ IV initial_size)
9348 {
9349 
9350     /* Return a pointer to a newly constructed inversion list, with enough
9351      * space to store 'initial_size' elements.  If that number is negative, a
9352      * system default is used instead */
9353 
9354     SV* new_list;
9355 
9356     if (initial_size < 0) {
9357 	initial_size = 10;
9358     }
9359 
9360     new_list = newSV_type(SVt_INVLIST);
9361     initialize_invlist_guts(new_list, initial_size);
9362 
9363     return new_list;
9364 }
9365 
9366 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)9367 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9368 {
9369     /* Return a pointer to a newly constructed inversion list, initialized to
9370      * point to <list>, which has to be in the exact correct inversion list
9371      * form, including internal fields.  Thus this is a dangerous routine that
9372      * should not be used in the wrong hands.  The passed in 'list' contains
9373      * several header fields at the beginning that are not part of the
9374      * inversion list body proper */
9375 
9376     const STRLEN length = (STRLEN) list[0];
9377     const UV version_id =          list[1];
9378     const bool offset   =    cBOOL(list[2]);
9379 #define HEADER_LENGTH 3
9380     /* If any of the above changes in any way, you must change HEADER_LENGTH
9381      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9382      *      perl -E 'say int(rand 2**31-1)'
9383      */
9384 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9385                                         data structure type, so that one being
9386                                         passed in can be validated to be an
9387                                         inversion list of the correct vintage.
9388                                        */
9389 
9390     SV* invlist = newSV_type(SVt_INVLIST);
9391 
9392     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9393 
9394     if (version_id != INVLIST_VERSION_ID) {
9395         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9396     }
9397 
9398     /* The generated array passed in includes header elements that aren't part
9399      * of the list proper, so start it just after them */
9400     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9401 
9402     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9403 			       shouldn't touch it */
9404 
9405     *(get_invlist_offset_addr(invlist)) = offset;
9406 
9407     /* The 'length' passed to us is the physical number of elements in the
9408      * inversion list.  But if there is an offset the logical number is one
9409      * less than that */
9410     invlist_set_len(invlist, length  - offset, offset);
9411 
9412     invlist_set_previous_index(invlist, 0);
9413 
9414     /* Initialize the iteration pointer. */
9415     invlist_iterfinish(invlist);
9416 
9417     SvREADONLY_on(invlist);
9418     SvPOK_on(invlist);
9419 
9420     return invlist;
9421 }
9422 
9423 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)9424 S__append_range_to_invlist(pTHX_ SV* const invlist,
9425                                  const UV start, const UV end)
9426 {
9427    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9428     * the end of the inversion list.  The range must be above any existing
9429     * ones. */
9430 
9431     UV* array;
9432     UV max = invlist_max(invlist);
9433     UV len = _invlist_len(invlist);
9434     bool offset;
9435 
9436     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9437 
9438     if (len == 0) { /* Empty lists must be initialized */
9439         offset = start != 0;
9440         array = _invlist_array_init(invlist, ! offset);
9441     }
9442     else {
9443 	/* Here, the existing list is non-empty. The current max entry in the
9444 	 * list is generally the first value not in the set, except when the
9445 	 * set extends to the end of permissible values, in which case it is
9446 	 * the first entry in that final set, and so this call is an attempt to
9447 	 * append out-of-order */
9448 
9449 	UV final_element = len - 1;
9450 	array = invlist_array(invlist);
9451 	if (   array[final_element] > start
9452 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9453 	{
9454 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9455 		     array[final_element], start,
9456 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9457 	}
9458 
9459         /* Here, it is a legal append.  If the new range begins 1 above the end
9460          * of the range below it, it is extending the range below it, so the
9461          * new first value not in the set is one greater than the newly
9462          * extended range.  */
9463         offset = *get_invlist_offset_addr(invlist);
9464 	if (array[final_element] == start) {
9465 	    if (end != UV_MAX) {
9466 		array[final_element] = end + 1;
9467 	    }
9468 	    else {
9469 		/* But if the end is the maximum representable on the machine,
9470                  * assume that infinity was actually what was meant.  Just let
9471                  * the range that this would extend to have no end */
9472 		invlist_set_len(invlist, len - 1, offset);
9473 	    }
9474 	    return;
9475 	}
9476     }
9477 
9478     /* Here the new range doesn't extend any existing set.  Add it */
9479 
9480     len += 2;	/* Includes an element each for the start and end of range */
9481 
9482     /* If wll overflow the existing space, extend, which may cause the array to
9483      * be moved */
9484     if (max < len) {
9485 	invlist_extend(invlist, len);
9486 
9487         /* Have to set len here to avoid assert failure in invlist_array() */
9488         invlist_set_len(invlist, len, offset);
9489 
9490 	array = invlist_array(invlist);
9491     }
9492     else {
9493 	invlist_set_len(invlist, len, offset);
9494     }
9495 
9496     /* The next item on the list starts the range, the one after that is
9497      * one past the new range.  */
9498     array[len - 2] = start;
9499     if (end != UV_MAX) {
9500 	array[len - 1] = end + 1;
9501     }
9502     else {
9503 	/* But if the end is the maximum representable on the machine, just let
9504 	 * the range have no end */
9505 	invlist_set_len(invlist, len - 1, offset);
9506     }
9507 }
9508 
9509 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)9510 Perl__invlist_search(SV* const invlist, const UV cp)
9511 {
9512     /* Searches the inversion list for the entry that contains the input code
9513      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9514      * return value is the index into the list's array of the range that
9515      * contains <cp>, that is, 'i' such that
9516      *	array[i] <= cp < array[i+1]
9517      */
9518 
9519     IV low = 0;
9520     IV mid;
9521     IV high = _invlist_len(invlist);
9522     const IV highest_element = high - 1;
9523     const UV* array;
9524 
9525     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9526 
9527     /* If list is empty, return failure. */
9528     if (high == 0) {
9529 	return -1;
9530     }
9531 
9532     /* (We can't get the array unless we know the list is non-empty) */
9533     array = invlist_array(invlist);
9534 
9535     mid = invlist_previous_index(invlist);
9536     assert(mid >=0);
9537     if (mid > highest_element) {
9538         mid = highest_element;
9539     }
9540 
9541     /* <mid> contains the cache of the result of the previous call to this
9542      * function (0 the first time).  See if this call is for the same result,
9543      * or if it is for mid-1.  This is under the theory that calls to this
9544      * function will often be for related code points that are near each other.
9545      * And benchmarks show that caching gives better results.  We also test
9546      * here if the code point is within the bounds of the list.  These tests
9547      * replace others that would have had to be made anyway to make sure that
9548      * the array bounds were not exceeded, and these give us extra information
9549      * at the same time */
9550     if (cp >= array[mid]) {
9551         if (cp >= array[highest_element]) {
9552             return highest_element;
9553         }
9554 
9555         /* Here, array[mid] <= cp < array[highest_element].  This means that
9556          * the final element is not the answer, so can exclude it; it also
9557          * means that <mid> is not the final element, so can refer to 'mid + 1'
9558          * safely */
9559         if (cp < array[mid + 1]) {
9560             return mid;
9561         }
9562         high--;
9563         low = mid + 1;
9564     }
9565     else { /* cp < aray[mid] */
9566         if (cp < array[0]) { /* Fail if outside the array */
9567             return -1;
9568         }
9569         high = mid;
9570         if (cp >= array[mid - 1]) {
9571             goto found_entry;
9572         }
9573     }
9574 
9575     /* Binary search.  What we are looking for is <i> such that
9576      *	array[i] <= cp < array[i+1]
9577      * The loop below converges on the i+1.  Note that there may not be an
9578      * (i+1)th element in the array, and things work nonetheless */
9579     while (low < high) {
9580 	mid = (low + high) / 2;
9581         assert(mid <= highest_element);
9582 	if (array[mid] <= cp) { /* cp >= array[mid] */
9583 	    low = mid + 1;
9584 
9585 	    /* We could do this extra test to exit the loop early.
9586 	    if (cp < array[low]) {
9587 		return mid;
9588 	    }
9589 	    */
9590 	}
9591 	else { /* cp < array[mid] */
9592 	    high = mid;
9593 	}
9594     }
9595 
9596   found_entry:
9597     high--;
9598     invlist_set_previous_index(invlist, high);
9599     return high;
9600 }
9601 
9602 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)9603 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9604                                          const bool complement_b, SV** output)
9605 {
9606     /* Take the union of two inversion lists and point '*output' to it.  On
9607      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9608      * even 'a' or 'b').  If to an inversion list, the contents of the original
9609      * list will be replaced by the union.  The first list, 'a', may be
9610      * NULL, in which case a copy of the second list is placed in '*output'.
9611      * If 'complement_b' is TRUE, the union is taken of the complement
9612      * (inversion) of 'b' instead of b itself.
9613      *
9614      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9615      * Richard Gillam, published by Addison-Wesley, and explained at some
9616      * length there.  The preface says to incorporate its examples into your
9617      * code at your own risk.
9618      *
9619      * The algorithm is like a merge sort. */
9620 
9621     const UV* array_a;    /* a's array */
9622     const UV* array_b;
9623     UV len_a;	    /* length of a's array */
9624     UV len_b;
9625 
9626     SV* u;			/* the resulting union */
9627     UV* array_u;
9628     UV len_u = 0;
9629 
9630     UV i_a = 0;		    /* current index into a's array */
9631     UV i_b = 0;
9632     UV i_u = 0;
9633 
9634     /* running count, as explained in the algorithm source book; items are
9635      * stopped accumulating and are output when the count changes to/from 0.
9636      * The count is incremented when we start a range that's in an input's set,
9637      * and decremented when we start a range that's not in a set.  So this
9638      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9639      * and hence nothing goes into the union; 1, just one of the inputs is in
9640      * its set (and its current range gets added to the union); and 2 when both
9641      * inputs are in their sets.  */
9642     UV count = 0;
9643 
9644     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9645     assert(a != b);
9646     assert(*output == NULL || is_invlist(*output));
9647 
9648     len_b = _invlist_len(b);
9649     if (len_b == 0) {
9650 
9651         /* Here, 'b' is empty, hence it's complement is all possible code
9652          * points.  So if the union includes the complement of 'b', it includes
9653          * everything, and we need not even look at 'a'.  It's easiest to
9654          * create a new inversion list that matches everything.  */
9655         if (complement_b) {
9656             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9657 
9658             if (*output == NULL) { /* If the output didn't exist, just point it
9659                                       at the new list */
9660                 *output = everything;
9661             }
9662             else { /* Otherwise, replace its contents with the new list */
9663                 invlist_replace_list_destroys_src(*output, everything);
9664                 SvREFCNT_dec_NN(everything);
9665             }
9666 
9667             return;
9668         }
9669 
9670         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9671          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9672          * output will be empty */
9673 
9674         if (a == NULL || _invlist_len(a) == 0) {
9675             if (*output == NULL) {
9676                 *output = _new_invlist(0);
9677             }
9678             else {
9679                 invlist_clear(*output);
9680             }
9681             return;
9682         }
9683 
9684         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9685          * union.  We can just return a copy of 'a' if '*output' doesn't point
9686          * to an existing list */
9687         if (*output == NULL) {
9688             *output = invlist_clone(a, NULL);
9689             return;
9690         }
9691 
9692         /* If the output is to overwrite 'a', we have a no-op, as it's
9693          * already in 'a' */
9694         if (*output == a) {
9695             return;
9696         }
9697 
9698         /* Here, '*output' is to be overwritten by 'a' */
9699         u = invlist_clone(a, NULL);
9700         invlist_replace_list_destroys_src(*output, u);
9701         SvREFCNT_dec_NN(u);
9702 
9703         return;
9704     }
9705 
9706     /* Here 'b' is not empty.  See about 'a' */
9707 
9708     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9709 
9710         /* Here, 'a' is empty (and b is not).  That means the union will come
9711          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9712          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9713          * the clone */
9714 
9715         SV ** dest = (*output == NULL) ? output : &u;
9716         *dest = invlist_clone(b, NULL);
9717         if (complement_b) {
9718             _invlist_invert(*dest);
9719         }
9720 
9721         if (dest == &u) {
9722             invlist_replace_list_destroys_src(*output, u);
9723             SvREFCNT_dec_NN(u);
9724         }
9725 
9726 	return;
9727     }
9728 
9729     /* Here both lists exist and are non-empty */
9730     array_a = invlist_array(a);
9731     array_b = invlist_array(b);
9732 
9733     /* If are to take the union of 'a' with the complement of b, set it
9734      * up so are looking at b's complement. */
9735     if (complement_b) {
9736 
9737 	/* To complement, we invert: if the first element is 0, remove it.  To
9738 	 * do this, we just pretend the array starts one later */
9739         if (array_b[0] == 0) {
9740             array_b++;
9741             len_b--;
9742         }
9743         else {
9744 
9745             /* But if the first element is not zero, we pretend the list starts
9746              * at the 0 that is always stored immediately before the array. */
9747             array_b--;
9748             len_b++;
9749         }
9750     }
9751 
9752     /* Size the union for the worst case: that the sets are completely
9753      * disjoint */
9754     u = _new_invlist(len_a + len_b);
9755 
9756     /* Will contain U+0000 if either component does */
9757     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9758                                       || (len_b > 0 && array_b[0] == 0));
9759 
9760     /* Go through each input list item by item, stopping when have exhausted
9761      * one of them */
9762     while (i_a < len_a && i_b < len_b) {
9763 	UV cp;	    /* The element to potentially add to the union's array */
9764 	bool cp_in_set;   /* is it in the input list's set or not */
9765 
9766 	/* We need to take one or the other of the two inputs for the union.
9767 	 * Since we are merging two sorted lists, we take the smaller of the
9768          * next items.  In case of a tie, we take first the one that is in its
9769          * set.  If we first took the one not in its set, it would decrement
9770          * the count, possibly to 0 which would cause it to be output as ending
9771          * the range, and the next time through we would take the same number,
9772          * and output it again as beginning the next range.  By doing it the
9773          * opposite way, there is no possibility that the count will be
9774          * momentarily decremented to 0, and thus the two adjoining ranges will
9775          * be seamlessly merged.  (In a tie and both are in the set or both not
9776          * in the set, it doesn't matter which we take first.) */
9777 	if (       array_a[i_a] < array_b[i_b]
9778 	    || (   array_a[i_a] == array_b[i_b]
9779 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9780 	{
9781 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9782 	    cp = array_a[i_a++];
9783 	}
9784 	else {
9785 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9786 	    cp = array_b[i_b++];
9787 	}
9788 
9789 	/* Here, have chosen which of the two inputs to look at.  Only output
9790 	 * if the running count changes to/from 0, which marks the
9791 	 * beginning/end of a range that's in the set */
9792 	if (cp_in_set) {
9793 	    if (count == 0) {
9794 		array_u[i_u++] = cp;
9795 	    }
9796 	    count++;
9797 	}
9798 	else {
9799 	    count--;
9800 	    if (count == 0) {
9801 		array_u[i_u++] = cp;
9802 	    }
9803 	}
9804     }
9805 
9806 
9807     /* The loop above increments the index into exactly one of the input lists
9808      * each iteration, and ends when either index gets to its list end.  That
9809      * means the other index is lower than its end, and so something is
9810      * remaining in that one.  We decrement 'count', as explained below, if
9811      * that list is in its set.  (i_a and i_b each currently index the element
9812      * beyond the one we care about.) */
9813     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9814 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9815     {
9816 	count--;
9817     }
9818 
9819     /* Above we decremented 'count' if the list that had unexamined elements in
9820      * it was in its set.  This has made it so that 'count' being non-zero
9821      * means there isn't anything left to output; and 'count' equal to 0 means
9822      * that what is left to output is precisely that which is left in the
9823      * non-exhausted input list.
9824      *
9825      * To see why, note first that the exhausted input obviously has nothing
9826      * left to add to the union.  If it was in its set at its end, that means
9827      * the set extends from here to the platform's infinity, and hence so does
9828      * the union and the non-exhausted set is irrelevant.  The exhausted set
9829      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9830      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9831      * 'count' remains at 1.  This is consistent with the decremented 'count'
9832      * != 0 meaning there's nothing left to add to the union.
9833      *
9834      * But if the exhausted input wasn't in its set, it contributed 0 to
9835      * 'count', and the rest of the union will be whatever the other input is.
9836      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9837      * otherwise it gets decremented to 0.  This is consistent with 'count'
9838      * == 0 meaning the remainder of the union is whatever is left in the
9839      * non-exhausted list. */
9840     if (count != 0) {
9841         len_u = i_u;
9842     }
9843     else {
9844         IV copy_count = len_a - i_a;
9845         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9846 	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9847         }
9848         else { /* The non-exhausted input is b */
9849             copy_count = len_b - i_b;
9850 	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9851         }
9852         len_u = i_u + copy_count;
9853     }
9854 
9855     /* Set the result to the final length, which can change the pointer to
9856      * array_u, so re-find it.  (Note that it is unlikely that this will
9857      * change, as we are shrinking the space, not enlarging it) */
9858     if (len_u != _invlist_len(u)) {
9859 	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9860 	invlist_trim(u);
9861 	array_u = invlist_array(u);
9862     }
9863 
9864     if (*output == NULL) {  /* Simply return the new inversion list */
9865         *output = u;
9866     }
9867     else {
9868         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9869          * could instead free '*output', and then set it to 'u', but experience
9870          * has shown [perl #127392] that if the input is a mortal, we can get a
9871          * huge build-up of these during regex compilation before they get
9872          * freed. */
9873         invlist_replace_list_destroys_src(*output, u);
9874         SvREFCNT_dec_NN(u);
9875     }
9876 
9877     return;
9878 }
9879 
9880 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)9881 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9882                                                const bool complement_b, SV** i)
9883 {
9884     /* Take the intersection of two inversion lists and point '*i' to it.  On
9885      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9886      * even 'a' or 'b').  If to an inversion list, the contents of the original
9887      * list will be replaced by the intersection.  The first list, 'a', may be
9888      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9889      * TRUE, the result will be the intersection of 'a' and the complement (or
9890      * inversion) of 'b' instead of 'b' directly.
9891      *
9892      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9893      * Richard Gillam, published by Addison-Wesley, and explained at some
9894      * length there.  The preface says to incorporate its examples into your
9895      * code at your own risk.  In fact, it had bugs
9896      *
9897      * The algorithm is like a merge sort, and is essentially the same as the
9898      * union above
9899      */
9900 
9901     const UV* array_a;		/* a's array */
9902     const UV* array_b;
9903     UV len_a;	/* length of a's array */
9904     UV len_b;
9905 
9906     SV* r;		     /* the resulting intersection */
9907     UV* array_r;
9908     UV len_r = 0;
9909 
9910     UV i_a = 0;		    /* current index into a's array */
9911     UV i_b = 0;
9912     UV i_r = 0;
9913 
9914     /* running count of how many of the two inputs are postitioned at ranges
9915      * that are in their sets.  As explained in the algorithm source book,
9916      * items are stopped accumulating and are output when the count changes
9917      * to/from 2.  The count is incremented when we start a range that's in an
9918      * input's set, and decremented when we start a range that's not in a set.
9919      * Only when it is 2 are we in the intersection. */
9920     UV count = 0;
9921 
9922     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9923     assert(a != b);
9924     assert(*i == NULL || is_invlist(*i));
9925 
9926     /* Special case if either one is empty */
9927     len_a = (a == NULL) ? 0 : _invlist_len(a);
9928     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9929         if (len_a != 0 && complement_b) {
9930 
9931             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9932              * must be empty.  Here, also we are using 'b's complement, which
9933              * hence must be every possible code point.  Thus the intersection
9934              * is simply 'a'. */
9935 
9936             if (*i == a) {  /* No-op */
9937                 return;
9938             }
9939 
9940             if (*i == NULL) {
9941                 *i = invlist_clone(a, NULL);
9942                 return;
9943             }
9944 
9945             r = invlist_clone(a, NULL);
9946             invlist_replace_list_destroys_src(*i, r);
9947             SvREFCNT_dec_NN(r);
9948             return;
9949         }
9950 
9951         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9952          * intersection must be empty */
9953         if (*i == NULL) {
9954             *i = _new_invlist(0);
9955             return;
9956         }
9957 
9958         invlist_clear(*i);
9959 	return;
9960     }
9961 
9962     /* Here both lists exist and are non-empty */
9963     array_a = invlist_array(a);
9964     array_b = invlist_array(b);
9965 
9966     /* If are to take the intersection of 'a' with the complement of b, set it
9967      * up so are looking at b's complement. */
9968     if (complement_b) {
9969 
9970 	/* To complement, we invert: if the first element is 0, remove it.  To
9971 	 * do this, we just pretend the array starts one later */
9972         if (array_b[0] == 0) {
9973             array_b++;
9974             len_b--;
9975         }
9976         else {
9977 
9978             /* But if the first element is not zero, we pretend the list starts
9979              * at the 0 that is always stored immediately before the array. */
9980             array_b--;
9981             len_b++;
9982         }
9983     }
9984 
9985     /* Size the intersection for the worst case: that the intersection ends up
9986      * fragmenting everything to be completely disjoint */
9987     r= _new_invlist(len_a + len_b);
9988 
9989     /* Will contain U+0000 iff both components do */
9990     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9991                                      && len_b > 0 && array_b[0] == 0);
9992 
9993     /* Go through each list item by item, stopping when have exhausted one of
9994      * them */
9995     while (i_a < len_a && i_b < len_b) {
9996 	UV cp;	    /* The element to potentially add to the intersection's
9997 		       array */
9998 	bool cp_in_set;	/* Is it in the input list's set or not */
9999 
10000 	/* We need to take one or the other of the two inputs for the
10001 	 * intersection.  Since we are merging two sorted lists, we take the
10002          * smaller of the next items.  In case of a tie, we take first the one
10003          * that is not in its set (a difference from the union algorithm).  If
10004          * we first took the one in its set, it would increment the count,
10005          * possibly to 2 which would cause it to be output as starting a range
10006          * in the intersection, and the next time through we would take that
10007          * same number, and output it again as ending the set.  By doing the
10008          * opposite of this, there is no possibility that the count will be
10009          * momentarily incremented to 2.  (In a tie and both are in the set or
10010          * both not in the set, it doesn't matter which we take first.) */
10011 	if (       array_a[i_a] < array_b[i_b]
10012 	    || (   array_a[i_a] == array_b[i_b]
10013 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
10014 	{
10015 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
10016 	    cp = array_a[i_a++];
10017 	}
10018 	else {
10019 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10020 	    cp= array_b[i_b++];
10021 	}
10022 
10023 	/* Here, have chosen which of the two inputs to look at.  Only output
10024 	 * if the running count changes to/from 2, which marks the
10025 	 * beginning/end of a range that's in the intersection */
10026 	if (cp_in_set) {
10027 	    count++;
10028 	    if (count == 2) {
10029 		array_r[i_r++] = cp;
10030 	    }
10031 	}
10032 	else {
10033 	    if (count == 2) {
10034 		array_r[i_r++] = cp;
10035 	    }
10036 	    count--;
10037 	}
10038 
10039     }
10040 
10041     /* The loop above increments the index into exactly one of the input lists
10042      * each iteration, and ends when either index gets to its list end.  That
10043      * means the other index is lower than its end, and so something is
10044      * remaining in that one.  We increment 'count', as explained below, if the
10045      * exhausted list was in its set.  (i_a and i_b each currently index the
10046      * element beyond the one we care about.) */
10047     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10048         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10049     {
10050 	count++;
10051     }
10052 
10053     /* Above we incremented 'count' if the exhausted list was in its set.  This
10054      * has made it so that 'count' being below 2 means there is nothing left to
10055      * output; otheriwse what's left to add to the intersection is precisely
10056      * that which is left in the non-exhausted input list.
10057      *
10058      * To see why, note first that the exhausted input obviously has nothing
10059      * left to affect the intersection.  If it was in its set at its end, that
10060      * means the set extends from here to the platform's infinity, and hence
10061      * anything in the non-exhausted's list will be in the intersection, and
10062      * anything not in it won't be.  Hence, the rest of the intersection is
10063      * precisely what's in the non-exhausted list  The exhausted set also
10064      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
10065      * it means 'count' is now at least 2.  This is consistent with the
10066      * incremented 'count' being >= 2 means to add the non-exhausted list to
10067      * the intersection.
10068      *
10069      * But if the exhausted input wasn't in its set, it contributed 0 to
10070      * 'count', and the intersection can't include anything further; the
10071      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
10072      * incremented.  This is consistent with 'count' being < 2 meaning nothing
10073      * further to add to the intersection. */
10074     if (count < 2) { /* Nothing left to put in the intersection. */
10075         len_r = i_r;
10076     }
10077     else { /* copy the non-exhausted list, unchanged. */
10078         IV copy_count = len_a - i_a;
10079         if (copy_count > 0) {   /* a is the one with stuff left */
10080 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10081         }
10082         else {  /* b is the one with stuff left */
10083             copy_count = len_b - i_b;
10084 	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10085         }
10086         len_r = i_r + copy_count;
10087     }
10088 
10089     /* Set the result to the final length, which can change the pointer to
10090      * array_r, so re-find it.  (Note that it is unlikely that this will
10091      * change, as we are shrinking the space, not enlarging it) */
10092     if (len_r != _invlist_len(r)) {
10093 	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10094 	invlist_trim(r);
10095 	array_r = invlist_array(r);
10096     }
10097 
10098     if (*i == NULL) { /* Simply return the calculated intersection */
10099         *i = r;
10100     }
10101     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
10102               instead free '*i', and then set it to 'r', but experience has
10103               shown [perl #127392] that if the input is a mortal, we can get a
10104               huge build-up of these during regex compilation before they get
10105               freed. */
10106         if (len_r) {
10107             invlist_replace_list_destroys_src(*i, r);
10108         }
10109         else {
10110             invlist_clear(*i);
10111         }
10112         SvREFCNT_dec_NN(r);
10113     }
10114 
10115     return;
10116 }
10117 
10118 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)10119 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10120 {
10121     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10122      * set.  A pointer to the inversion list is returned.  This may actually be
10123      * a new list, in which case the passed in one has been destroyed.  The
10124      * passed-in inversion list can be NULL, in which case a new one is created
10125      * with just the one range in it.  The new list is not necessarily
10126      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10127      * result of this function.  The gain would not be large, and in many
10128      * cases, this is called multiple times on a single inversion list, so
10129      * anything freed may almost immediately be needed again.
10130      *
10131      * This used to mostly call the 'union' routine, but that is much more
10132      * heavyweight than really needed for a single range addition */
10133 
10134     UV* array;              /* The array implementing the inversion list */
10135     UV len;                 /* How many elements in 'array' */
10136     SSize_t i_s;            /* index into the invlist array where 'start'
10137                                should go */
10138     SSize_t i_e = 0;        /* And the index where 'end' should go */
10139     UV cur_highest;         /* The highest code point in the inversion list
10140                                upon entry to this function */
10141 
10142     /* This range becomes the whole inversion list if none already existed */
10143     if (invlist == NULL) {
10144 	invlist = _new_invlist(2);
10145         _append_range_to_invlist(invlist, start, end);
10146         return invlist;
10147     }
10148 
10149     /* Likewise, if the inversion list is currently empty */
10150     len = _invlist_len(invlist);
10151     if (len == 0) {
10152         _append_range_to_invlist(invlist, start, end);
10153         return invlist;
10154     }
10155 
10156     /* Starting here, we have to know the internals of the list */
10157     array = invlist_array(invlist);
10158 
10159     /* If the new range ends higher than the current highest ... */
10160     cur_highest = invlist_highest(invlist);
10161     if (end > cur_highest) {
10162 
10163         /* If the whole range is higher, we can just append it */
10164         if (start > cur_highest) {
10165             _append_range_to_invlist(invlist, start, end);
10166             return invlist;
10167         }
10168 
10169         /* Otherwise, add the portion that is higher ... */
10170         _append_range_to_invlist(invlist, cur_highest + 1, end);
10171 
10172         /* ... and continue on below to handle the rest.  As a result of the
10173          * above append, we know that the index of the end of the range is the
10174          * final even numbered one of the array.  Recall that the final element
10175          * always starts a range that extends to infinity.  If that range is in
10176          * the set (meaning the set goes from here to infinity), it will be an
10177          * even index, but if it isn't in the set, it's odd, and the final
10178          * range in the set is one less, which is even. */
10179         if (end == UV_MAX) {
10180             i_e = len;
10181         }
10182         else {
10183             i_e = len - 2;
10184         }
10185     }
10186 
10187     /* We have dealt with appending, now see about prepending.  If the new
10188      * range starts lower than the current lowest ... */
10189     if (start < array[0]) {
10190 
10191         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10192          * Let the union code handle it, rather than having to know the
10193          * trickiness in two code places.  */
10194         if (UNLIKELY(start == 0)) {
10195             SV* range_invlist;
10196 
10197             range_invlist = _new_invlist(2);
10198             _append_range_to_invlist(range_invlist, start, end);
10199 
10200             _invlist_union(invlist, range_invlist, &invlist);
10201 
10202             SvREFCNT_dec_NN(range_invlist);
10203 
10204             return invlist;
10205         }
10206 
10207         /* If the whole new range comes before the first entry, and doesn't
10208          * extend it, we have to insert it as an additional range */
10209         if (end < array[0] - 1) {
10210             i_s = i_e = -1;
10211             goto splice_in_new_range;
10212         }
10213 
10214         /* Here the new range adjoins the existing first range, extending it
10215          * downwards. */
10216         array[0] = start;
10217 
10218         /* And continue on below to handle the rest.  We know that the index of
10219          * the beginning of the range is the first one of the array */
10220         i_s = 0;
10221     }
10222     else { /* Not prepending any part of the new range to the existing list.
10223             * Find where in the list it should go.  This finds i_s, such that:
10224             *     invlist[i_s] <= start < array[i_s+1]
10225             */
10226         i_s = _invlist_search(invlist, start);
10227     }
10228 
10229     /* At this point, any extending before the beginning of the inversion list
10230      * and/or after the end has been done.  This has made it so that, in the
10231      * code below, each endpoint of the new range is either in a range that is
10232      * in the set, or is in a gap between two ranges that are.  This means we
10233      * don't have to worry about exceeding the array bounds.
10234      *
10235      * Find where in the list the new range ends (but we can skip this if we
10236      * have already determined what it is, or if it will be the same as i_s,
10237      * which we already have computed) */
10238     if (i_e == 0) {
10239         i_e = (start == end)
10240               ? i_s
10241               : _invlist_search(invlist, end);
10242     }
10243 
10244     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10245      * is a range that goes to infinity there is no element at invlist[i_e+1],
10246      * so only the first relation holds. */
10247 
10248     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10249 
10250         /* Here, the ranges on either side of the beginning of the new range
10251          * are in the set, and this range starts in the gap between them.
10252          *
10253          * The new range extends the range above it downwards if the new range
10254          * ends at or above that range's start */
10255         const bool extends_the_range_above = (   end == UV_MAX
10256                                               || end + 1 >= array[i_s+1]);
10257 
10258         /* The new range extends the range below it upwards if it begins just
10259          * after where that range ends */
10260         if (start == array[i_s]) {
10261 
10262             /* If the new range fills the entire gap between the other ranges,
10263              * they will get merged together.  Other ranges may also get
10264              * merged, depending on how many of them the new range spans.  In
10265              * the general case, we do the merge later, just once, after we
10266              * figure out how many to merge.  But in the case where the new
10267              * range exactly spans just this one gap (possibly extending into
10268              * the one above), we do the merge here, and an early exit.  This
10269              * is done here to avoid having to special case later. */
10270             if (i_e - i_s <= 1) {
10271 
10272                 /* If i_e - i_s == 1, it means that the new range terminates
10273                  * within the range above, and hence 'extends_the_range_above'
10274                  * must be true.  (If the range above it extends to infinity,
10275                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10276                  * will be 0, so no harm done.) */
10277                 if (extends_the_range_above) {
10278                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10279                     invlist_set_len(invlist,
10280                                     len - 2,
10281                                     *(get_invlist_offset_addr(invlist)));
10282                     return invlist;
10283                 }
10284 
10285                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10286                  * to the same range, and below we are about to decrement i_s
10287                  * */
10288                 i_e--;
10289             }
10290 
10291             /* Here, the new range is adjacent to the one below.  (It may also
10292              * span beyond the range above, but that will get resolved later.)
10293              * Extend the range below to include this one. */
10294             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10295             i_s--;
10296             start = array[i_s];
10297         }
10298         else if (extends_the_range_above) {
10299 
10300             /* Here the new range only extends the range above it, but not the
10301              * one below.  It merges with the one above.  Again, we keep i_e
10302              * and i_s in sync if they point to the same range */
10303             if (i_e == i_s) {
10304                 i_e++;
10305             }
10306             i_s++;
10307             array[i_s] = start;
10308         }
10309     }
10310 
10311     /* Here, we've dealt with the new range start extending any adjoining
10312      * existing ranges.
10313      *
10314      * If the new range extends to infinity, it is now the final one,
10315      * regardless of what was there before */
10316     if (UNLIKELY(end == UV_MAX)) {
10317         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10318         return invlist;
10319     }
10320 
10321     /* If i_e started as == i_s, it has also been dealt with,
10322      * and been updated to the new i_s, which will fail the following if */
10323     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10324 
10325         /* Here, the ranges on either side of the end of the new range are in
10326          * the set, and this range ends in the gap between them.
10327          *
10328          * If this range is adjacent to (hence extends) the range above it, it
10329          * becomes part of that range; likewise if it extends the range below,
10330          * it becomes part of that range */
10331         if (end + 1 == array[i_e+1]) {
10332             i_e++;
10333             array[i_e] = start;
10334         }
10335         else if (start <= array[i_e]) {
10336             array[i_e] = end + 1;
10337             i_e--;
10338         }
10339     }
10340 
10341     if (i_s == i_e) {
10342 
10343         /* If the range fits entirely in an existing range (as possibly already
10344          * extended above), it doesn't add anything new */
10345         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10346             return invlist;
10347         }
10348 
10349         /* Here, no part of the range is in the list.  Must add it.  It will
10350          * occupy 2 more slots */
10351       splice_in_new_range:
10352 
10353         invlist_extend(invlist, len + 2);
10354         array = invlist_array(invlist);
10355         /* Move the rest of the array down two slots. Don't include any
10356          * trailing NUL */
10357         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10358 
10359         /* Do the actual splice */
10360         array[i_e+1] = start;
10361         array[i_e+2] = end + 1;
10362         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10363         return invlist;
10364     }
10365 
10366     /* Here the new range crossed the boundaries of a pre-existing range.  The
10367      * code above has adjusted things so that both ends are in ranges that are
10368      * in the set.  This means everything in between must also be in the set.
10369      * Just squash things together */
10370     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10371     invlist_set_len(invlist,
10372                     len - i_e + i_s,
10373                     *(get_invlist_offset_addr(invlist)));
10374 
10375     return invlist;
10376 }
10377 
10378 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)10379 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10380                                  UV** other_elements_ptr)
10381 {
10382     /* Create and return an inversion list whose contents are to be populated
10383      * by the caller.  The caller gives the number of elements (in 'size') and
10384      * the very first element ('element0').  This function will set
10385      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10386      * are to be placed.
10387      *
10388      * Obviously there is some trust involved that the caller will properly
10389      * fill in the other elements of the array.
10390      *
10391      * (The first element needs to be passed in, as the underlying code does
10392      * things differently depending on whether it is zero or non-zero) */
10393 
10394     SV* invlist = _new_invlist(size);
10395     bool offset;
10396 
10397     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10398 
10399     invlist = add_cp_to_invlist(invlist, element0);
10400     offset = *get_invlist_offset_addr(invlist);
10401 
10402     invlist_set_len(invlist, size, offset);
10403     *other_elements_ptr = invlist_array(invlist) + 1;
10404     return invlist;
10405 }
10406 
10407 #endif
10408 
10409 #ifndef PERL_IN_XSUB_RE
10410 void
Perl__invlist_invert(pTHX_ SV * const invlist)10411 Perl__invlist_invert(pTHX_ SV* const invlist)
10412 {
10413     /* Complement the input inversion list.  This adds a 0 if the list didn't
10414      * have a zero; removes it otherwise.  As described above, the data
10415      * structure is set up so that this is very efficient */
10416 
10417     PERL_ARGS_ASSERT__INVLIST_INVERT;
10418 
10419     assert(! invlist_is_iterating(invlist));
10420 
10421     /* The inverse of matching nothing is matching everything */
10422     if (_invlist_len(invlist) == 0) {
10423 	_append_range_to_invlist(invlist, 0, UV_MAX);
10424 	return;
10425     }
10426 
10427     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10428 }
10429 
10430 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)10431 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10432 {
10433     /* Return a new inversion list that is a copy of the input one, which is
10434      * unchanged.  The new list will not be mortal even if the old one was. */
10435 
10436     const STRLEN nominal_length = _invlist_len(invlist);
10437     const STRLEN physical_length = SvCUR(invlist);
10438     const bool offset = *(get_invlist_offset_addr(invlist));
10439 
10440     PERL_ARGS_ASSERT_INVLIST_CLONE;
10441 
10442     if (new_invlist == NULL) {
10443         new_invlist = _new_invlist(nominal_length);
10444     }
10445     else {
10446         sv_upgrade(new_invlist, SVt_INVLIST);
10447         initialize_invlist_guts(new_invlist, nominal_length);
10448     }
10449 
10450     *(get_invlist_offset_addr(new_invlist)) = offset;
10451     invlist_set_len(new_invlist, nominal_length, offset);
10452     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10453 
10454     return new_invlist;
10455 }
10456 
10457 #endif
10458 
10459 PERL_STATIC_INLINE UV
S_invlist_lowest(SV * const invlist)10460 S_invlist_lowest(SV* const invlist)
10461 {
10462     /* Returns the lowest code point that matches an inversion list.  This API
10463      * has an ambiguity, as it returns 0 under either the lowest is actually
10464      * 0, or if the list is empty.  If this distinction matters to you, check
10465      * for emptiness before calling this function */
10466 
10467     UV len = _invlist_len(invlist);
10468     UV *array;
10469 
10470     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10471 
10472     if (len == 0) {
10473         return 0;
10474     }
10475 
10476     array = invlist_array(invlist);
10477 
10478     return array[0];
10479 }
10480 
10481 STATIC SV *
S_invlist_contents(pTHX_ SV * const invlist,const bool traditional_style)10482 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10483 {
10484     /* Get the contents of an inversion list into a string SV so that they can
10485      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10486      * traditionally done for debug tracing; otherwise it uses a format
10487      * suitable for just copying to the output, with blanks between ranges and
10488      * a dash between range components */
10489 
10490     UV start, end;
10491     SV* output;
10492     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10493     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10494 
10495     if (traditional_style) {
10496         output = newSVpvs("\n");
10497     }
10498     else {
10499         output = newSVpvs("");
10500     }
10501 
10502     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10503 
10504     assert(! invlist_is_iterating(invlist));
10505 
10506     invlist_iterinit(invlist);
10507     while (invlist_iternext(invlist, &start, &end)) {
10508 	if (end == UV_MAX) {
10509 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10510                                           start, intra_range_delimiter,
10511                                                  inter_range_delimiter);
10512 	}
10513 	else if (end != start) {
10514 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10515 		                          start,
10516                                                    intra_range_delimiter,
10517                                                   end, inter_range_delimiter);
10518 	}
10519 	else {
10520 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10521                                           start, inter_range_delimiter);
10522 	}
10523     }
10524 
10525     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10526         SvCUR_set(output, SvCUR(output) - 1);
10527     }
10528 
10529     return output;
10530 }
10531 
10532 #ifndef PERL_IN_XSUB_RE
10533 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)10534 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10535                          const char * const indent, SV* const invlist)
10536 {
10537     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10538      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10539      * the string 'indent'.  The output looks like this:
10540          [0] 0x000A .. 0x000D
10541          [2] 0x0085
10542          [4] 0x2028 .. 0x2029
10543          [6] 0x3104 .. INFTY
10544      * This means that the first range of code points matched by the list are
10545      * 0xA through 0xD; the second range contains only the single code point
10546      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10547      * are used to define each range (except if the final range extends to
10548      * infinity, only a single element is needed).  The array index of the
10549      * first element for the corresponding range is given in brackets. */
10550 
10551     UV start, end;
10552     STRLEN count = 0;
10553 
10554     PERL_ARGS_ASSERT__INVLIST_DUMP;
10555 
10556     if (invlist_is_iterating(invlist)) {
10557         Perl_dump_indent(aTHX_ level, file,
10558              "%sCan't dump inversion list because is in middle of iterating\n",
10559              indent);
10560         return;
10561     }
10562 
10563     invlist_iterinit(invlist);
10564     while (invlist_iternext(invlist, &start, &end)) {
10565 	if (end == UV_MAX) {
10566 	    Perl_dump_indent(aTHX_ level, file,
10567                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10568                                    indent, (UV)count, start);
10569 	}
10570 	else if (end != start) {
10571 	    Perl_dump_indent(aTHX_ level, file,
10572                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10573 		                indent, (UV)count, start,         end);
10574 	}
10575 	else {
10576 	    Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10577                                             indent, (UV)count, start);
10578 	}
10579         count += 2;
10580     }
10581 }
10582 
10583 #endif
10584 
10585 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10586 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)10587 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10588 {
10589     /* Return a boolean as to if the two passed in inversion lists are
10590      * identical.  The final argument, if TRUE, says to take the complement of
10591      * the second inversion list before doing the comparison */
10592 
10593     const UV len_a = _invlist_len(a);
10594     UV len_b = _invlist_len(b);
10595 
10596     const UV* array_a = NULL;
10597     const UV* array_b = NULL;
10598 
10599     PERL_ARGS_ASSERT__INVLISTEQ;
10600 
10601     /* This code avoids accessing the arrays unless it knows the length is
10602      * non-zero */
10603 
10604     if (len_a == 0) {
10605         if (len_b == 0) {
10606             return ! complement_b;
10607         }
10608     }
10609     else {
10610         array_a = invlist_array(a);
10611     }
10612 
10613     if (len_b != 0) {
10614         array_b = invlist_array(b);
10615     }
10616 
10617     /* If are to compare 'a' with the complement of b, set it
10618      * up so are looking at b's complement. */
10619     if (complement_b) {
10620 
10621         /* The complement of nothing is everything, so <a> would have to have
10622          * just one element, starting at zero (ending at infinity) */
10623         if (len_b == 0) {
10624             return (len_a == 1 && array_a[0] == 0);
10625         }
10626         if (array_b[0] == 0) {
10627 
10628             /* Otherwise, to complement, we invert.  Here, the first element is
10629              * 0, just remove it.  To do this, we just pretend the array starts
10630              * one later */
10631 
10632             array_b++;
10633             len_b--;
10634         }
10635         else {
10636 
10637             /* But if the first element is not zero, we pretend the list starts
10638              * at the 0 that is always stored immediately before the array. */
10639             array_b--;
10640             len_b++;
10641         }
10642     }
10643 
10644     return    len_a == len_b
10645            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10646 
10647 }
10648 #endif
10649 
10650 /*
10651  * As best we can, determine the characters that can match the start of
10652  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10653  * can be false positive matches
10654  *
10655  * Returns the invlist as a new SV*; it is the caller's responsibility to
10656  * call SvREFCNT_dec() when done with it.
10657  */
10658 STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)10659 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10660 {
10661     dVAR;
10662     const U8 * s = (U8*)STRING(node);
10663     SSize_t bytelen = STR_LEN(node);
10664     UV uc;
10665     /* Start out big enough for 2 separate code points */
10666     SV* invlist = _new_invlist(4);
10667 
10668     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10669 
10670     if (! UTF) {
10671         uc = *s;
10672 
10673         /* We punt and assume can match anything if the node begins
10674          * with a multi-character fold.  Things are complicated.  For
10675          * example, /ffi/i could match any of:
10676          *  "\N{LATIN SMALL LIGATURE FFI}"
10677          *  "\N{LATIN SMALL LIGATURE FF}I"
10678          *  "F\N{LATIN SMALL LIGATURE FI}"
10679          *  plus several other things; and making sure we have all the
10680          *  possibilities is hard. */
10681         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10682             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10683         }
10684         else {
10685             /* Any Latin1 range character can potentially match any
10686              * other depending on the locale, and in Turkic locales, U+130 and
10687              * U+131 */
10688             if (OP(node) == EXACTFL) {
10689                 _invlist_union(invlist, PL_Latin1, &invlist);
10690                 invlist = add_cp_to_invlist(invlist,
10691                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10692                 invlist = add_cp_to_invlist(invlist,
10693                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10694             }
10695             else {
10696                 /* But otherwise, it matches at least itself.  We can
10697                  * quickly tell if it has a distinct fold, and if so,
10698                  * it matches that as well */
10699                 invlist = add_cp_to_invlist(invlist, uc);
10700                 if (IS_IN_SOME_FOLD_L1(uc))
10701                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10702             }
10703 
10704             /* Some characters match above-Latin1 ones under /i.  This
10705              * is true of EXACTFL ones when the locale is UTF-8 */
10706             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10707                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10708                                     && OP(node) != EXACTFAA_NO_TRIE)))
10709             {
10710                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10711             }
10712         }
10713     }
10714     else {  /* Pattern is UTF-8 */
10715         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10716         const U8* e = s + bytelen;
10717         IV fc;
10718 
10719         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10720 
10721         /* The only code points that aren't folded in a UTF EXACTFish
10722          * node are the problematic ones in EXACTFL nodes */
10723         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10724             /* We need to check for the possibility that this EXACTFL
10725              * node begins with a multi-char fold.  Therefore we fold
10726              * the first few characters of it so that we can make that
10727              * check */
10728             U8 *d = folded;
10729             int i;
10730 
10731             fc = -1;
10732             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10733                 if (isASCII(*s)) {
10734                     *(d++) = (U8) toFOLD(*s);
10735                     if (fc < 0) {       /* Save the first fold */
10736                         fc = *(d-1);
10737                     }
10738                     s++;
10739                 }
10740                 else {
10741                     STRLEN len;
10742                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10743                     if (fc < 0) {       /* Save the first fold */
10744                         fc = fold;
10745                     }
10746                     d += len;
10747                     s += UTF8SKIP(s);
10748                 }
10749             }
10750 
10751             /* And set up so the code below that looks in this folded
10752              * buffer instead of the node's string */
10753             e = d;
10754             s = folded;
10755         }
10756 
10757         /* When we reach here 's' points to the fold of the first
10758          * character(s) of the node; and 'e' points to far enough along
10759          * the folded string to be just past any possible multi-char
10760          * fold.
10761          *
10762          * Unlike the non-UTF-8 case, the macro for determining if a
10763          * string is a multi-char fold requires all the characters to
10764          * already be folded.  This is because of all the complications
10765          * if not.  Note that they are folded anyway, except in EXACTFL
10766          * nodes.  Like the non-UTF case above, we punt if the node
10767          * begins with a multi-char fold  */
10768 
10769         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10770             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10771         }
10772         else {  /* Single char fold */
10773             unsigned int k;
10774             U32 first_fold;
10775             const U32 * remaining_folds;
10776             Size_t folds_count;
10777 
10778             /* It matches itself */
10779             invlist = add_cp_to_invlist(invlist, fc);
10780 
10781             /* ... plus all the things that fold to it, which are found in
10782              * PL_utf8_foldclosures */
10783             folds_count = _inverse_folds(fc, &first_fold,
10784                                                 &remaining_folds);
10785             for (k = 0; k < folds_count; k++) {
10786                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10787 
10788                 /* /aa doesn't allow folds between ASCII and non- */
10789                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10790                     && isASCII(c) != isASCII(fc))
10791                 {
10792                     continue;
10793                 }
10794 
10795                 invlist = add_cp_to_invlist(invlist, c);
10796             }
10797 
10798             if (OP(node) == EXACTFL) {
10799 
10800                 /* If either [iI] are present in an EXACTFL node the above code
10801                  * should have added its normal case pair, but under a Turkish
10802                  * locale they could match instead the case pairs from it.  Add
10803                  * those as potential matches as well */
10804                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10805                     invlist = add_cp_to_invlist(invlist,
10806                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10807                     invlist = add_cp_to_invlist(invlist,
10808                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10809                 }
10810                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10811                     invlist = add_cp_to_invlist(invlist, 'I');
10812                 }
10813                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10814                     invlist = add_cp_to_invlist(invlist, 'i');
10815                 }
10816             }
10817         }
10818     }
10819 
10820     return invlist;
10821 }
10822 
10823 #undef HEADER_LENGTH
10824 #undef TO_INTERNAL_SIZE
10825 #undef FROM_INTERNAL_SIZE
10826 #undef INVLIST_VERSION_ID
10827 
10828 /* End of inversion list object */
10829 
10830 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)10831 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10832 {
10833     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10834      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10835      * should point to the first flag; it is updated on output to point to the
10836      * final ')' or ':'.  There needs to be at least one flag, or this will
10837      * abort */
10838 
10839     /* for (?g), (?gc), and (?o) warnings; warning
10840        about (?c) will warn about (?g) -- japhy    */
10841 
10842 #define WASTED_O  0x01
10843 #define WASTED_G  0x02
10844 #define WASTED_C  0x04
10845 #define WASTED_GC (WASTED_G|WASTED_C)
10846     I32 wastedflags = 0x00;
10847     U32 posflags = 0, negflags = 0;
10848     U32 *flagsp = &posflags;
10849     char has_charset_modifier = '\0';
10850     regex_charset cs;
10851     bool has_use_defaults = FALSE;
10852     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10853     int x_mod_count = 0;
10854 
10855     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10856 
10857     /* '^' as an initial flag sets certain defaults */
10858     if (UCHARAT(RExC_parse) == '^') {
10859         RExC_parse++;
10860         has_use_defaults = TRUE;
10861         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10862         cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10863              ? REGEX_UNICODE_CHARSET
10864              : REGEX_DEPENDS_CHARSET;
10865         set_regex_charset(&RExC_flags, cs);
10866     }
10867     else {
10868         cs = get_regex_charset(RExC_flags);
10869         if (   cs == REGEX_DEPENDS_CHARSET
10870             && (toUSE_UNI_CHARSET_NOT_DEPENDS))
10871         {
10872             cs = REGEX_UNICODE_CHARSET;
10873         }
10874     }
10875 
10876     while (RExC_parse < RExC_end) {
10877         /* && memCHRs("iogcmsx", *RExC_parse) */
10878         /* (?g), (?gc) and (?o) are useless here
10879            and must be globally applied -- japhy */
10880         if ((RExC_pm_flags & PMf_WILDCARD)) {
10881             if (flagsp == & negflags) {
10882                 if (*RExC_parse == 'm') {
10883                     RExC_parse++;
10884                     /* diag_listed_as: Use of %s is not allowed in Unicode
10885                        property wildcard subpatterns in regex; marked by <--
10886                        HERE in m/%s/ */
10887                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
10888                           " property wildcard subpatterns");
10889                 }
10890             }
10891             else {
10892                 if (*RExC_parse == 's') {
10893                     goto modifier_illegal_in_wildcard;
10894                 }
10895             }
10896         }
10897 
10898         switch (*RExC_parse) {
10899 
10900             /* Code for the imsxn flags */
10901             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10902 
10903             case LOCALE_PAT_MOD:
10904                 if (has_charset_modifier) {
10905                     goto excess_modifier;
10906                 }
10907                 else if (flagsp == &negflags) {
10908                     goto neg_modifier;
10909                 }
10910                 cs = REGEX_LOCALE_CHARSET;
10911                 has_charset_modifier = LOCALE_PAT_MOD;
10912                 break;
10913             case UNICODE_PAT_MOD:
10914                 if (has_charset_modifier) {
10915                     goto excess_modifier;
10916                 }
10917                 else if (flagsp == &negflags) {
10918                     goto neg_modifier;
10919                 }
10920                 cs = REGEX_UNICODE_CHARSET;
10921                 has_charset_modifier = UNICODE_PAT_MOD;
10922                 break;
10923             case ASCII_RESTRICT_PAT_MOD:
10924                 if (flagsp == &negflags) {
10925                     goto neg_modifier;
10926                 }
10927                 if (has_charset_modifier) {
10928                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10929                         goto excess_modifier;
10930                     }
10931                     /* Doubled modifier implies more restricted */
10932                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10933                 }
10934                 else {
10935                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10936                 }
10937                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10938                 break;
10939             case DEPENDS_PAT_MOD:
10940                 if (has_use_defaults) {
10941                     goto fail_modifiers;
10942                 }
10943                 else if (flagsp == &negflags) {
10944                     goto neg_modifier;
10945                 }
10946                 else if (has_charset_modifier) {
10947                     goto excess_modifier;
10948                 }
10949 
10950                 /* The dual charset means unicode semantics if the
10951                  * pattern (or target, not known until runtime) are
10952                  * utf8, or something in the pattern indicates unicode
10953                  * semantics */
10954                 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10955                      ? REGEX_UNICODE_CHARSET
10956                      : REGEX_DEPENDS_CHARSET;
10957                 has_charset_modifier = DEPENDS_PAT_MOD;
10958                 break;
10959               excess_modifier:
10960                 RExC_parse++;
10961                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10962                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10963                 }
10964                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10965                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10966                                         *(RExC_parse - 1));
10967                 }
10968                 else {
10969                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10970                 }
10971                 NOT_REACHED; /*NOTREACHED*/
10972               neg_modifier:
10973                 RExC_parse++;
10974                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10975                                     *(RExC_parse - 1));
10976                 NOT_REACHED; /*NOTREACHED*/
10977             case GLOBAL_PAT_MOD: /* 'g' */
10978                 if (RExC_pm_flags & PMf_WILDCARD) {
10979                     goto modifier_illegal_in_wildcard;
10980                 }
10981                 /*FALLTHROUGH*/
10982             case ONCE_PAT_MOD: /* 'o' */
10983                 if (ckWARN(WARN_REGEXP)) {
10984                     const I32 wflagbit = *RExC_parse == 'o'
10985                                          ? WASTED_O
10986                                          : WASTED_G;
10987                     if (! (wastedflags & wflagbit) ) {
10988                         wastedflags |= wflagbit;
10989 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10990                         vWARN5(
10991                             RExC_parse + 1,
10992                             "Useless (%s%c) - %suse /%c modifier",
10993                             flagsp == &negflags ? "?-" : "?",
10994                             *RExC_parse,
10995                             flagsp == &negflags ? "don't " : "",
10996                             *RExC_parse
10997                         );
10998                     }
10999                 }
11000                 break;
11001 
11002             case CONTINUE_PAT_MOD: /* 'c' */
11003                 if (RExC_pm_flags & PMf_WILDCARD) {
11004                     goto modifier_illegal_in_wildcard;
11005                 }
11006                 if (ckWARN(WARN_REGEXP)) {
11007                     if (! (wastedflags & WASTED_C) ) {
11008                         wastedflags |= WASTED_GC;
11009 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
11010                         vWARN3(
11011                             RExC_parse + 1,
11012                             "Useless (%sc) - %suse /gc modifier",
11013                             flagsp == &negflags ? "?-" : "?",
11014                             flagsp == &negflags ? "don't " : ""
11015                         );
11016                     }
11017                 }
11018                 break;
11019             case KEEPCOPY_PAT_MOD: /* 'p' */
11020                 if (RExC_pm_flags & PMf_WILDCARD) {
11021                     goto modifier_illegal_in_wildcard;
11022                 }
11023                 if (flagsp == &negflags) {
11024                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11025                 } else {
11026                     *flagsp |= RXf_PMf_KEEPCOPY;
11027                 }
11028                 break;
11029             case '-':
11030                 /* A flag is a default iff it is following a minus, so
11031                  * if there is a minus, it means will be trying to
11032                  * re-specify a default which is an error */
11033                 if (has_use_defaults || flagsp == &negflags) {
11034                     goto fail_modifiers;
11035                 }
11036                 flagsp = &negflags;
11037                 wastedflags = 0;  /* reset so (?g-c) warns twice */
11038                 x_mod_count = 0;
11039                 break;
11040             case ':':
11041             case ')':
11042 
11043                 if (  (RExC_pm_flags & PMf_WILDCARD)
11044                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11045                 {
11046                     RExC_parse++;
11047                     /* diag_listed_as: Use of %s is not allowed in Unicode
11048                        property wildcard subpatterns in regex; marked by <--
11049                        HERE in m/%s/ */
11050                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11051                            " property wildcard subpatterns",
11052                            has_charset_modifier);
11053                 }
11054 
11055                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11056                     negflags |= RXf_PMf_EXTENDED_MORE;
11057                 }
11058                 RExC_flags |= posflags;
11059 
11060                 if (negflags & RXf_PMf_EXTENDED) {
11061                     negflags |= RXf_PMf_EXTENDED_MORE;
11062                 }
11063                 RExC_flags &= ~negflags;
11064                 set_regex_charset(&RExC_flags, cs);
11065 
11066                 return;
11067             default:
11068               fail_modifiers:
11069                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11070 		/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11071                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11072                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11073                 NOT_REACHED; /*NOTREACHED*/
11074         }
11075 
11076         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11077     }
11078 
11079     vFAIL("Sequence (?... not terminated");
11080 
11081   modifier_illegal_in_wildcard:
11082     RExC_parse++;
11083     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11084        subpatterns in regex; marked by <-- HERE in m/%s/ */
11085     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11086            " subpatterns", *(RExC_parse - 1));
11087 }
11088 
11089 /*
11090  - reg - regular expression, i.e. main body or parenthesized thing
11091  *
11092  * Caller must absorb opening parenthesis.
11093  *
11094  * Combining parenthesis handling with the base level of regular expression
11095  * is a trifle forced, but the need to tie the tails of the branches to what
11096  * follows makes it hard to avoid.
11097  */
11098 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11099 #ifdef DEBUGGING
11100 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11101 #else
11102 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11103 #endif
11104 
11105 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * parse_start,char ch)11106 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11107                              I32 *flagp,
11108                              char * parse_start,
11109                              char ch
11110                       )
11111 {
11112     regnode_offset ret;
11113     char* name_start = RExC_parse;
11114     U32 num = 0;
11115     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11116     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11117 
11118     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11119 
11120     if (RExC_parse == name_start || *RExC_parse != ch) {
11121         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11122         vFAIL2("Sequence %.3s... not terminated", parse_start);
11123     }
11124 
11125     if (sv_dat) {
11126         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11127         RExC_rxi->data->data[num]=(void*)sv_dat;
11128         SvREFCNT_inc_simple_void_NN(sv_dat);
11129     }
11130     RExC_sawback = 1;
11131     ret = reganode(pRExC_state,
11132                    ((! FOLD)
11133                      ? REFN
11134                      : (ASCII_FOLD_RESTRICTED)
11135                        ? REFFAN
11136                        : (AT_LEAST_UNI_SEMANTICS)
11137                          ? REFFUN
11138                          : (LOC)
11139                            ? REFFLN
11140                            : REFFN),
11141                     num);
11142     *flagp |= HASWIDTH;
11143 
11144     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11145     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11146 
11147     nextchar(pRExC_state);
11148     return ret;
11149 }
11150 
11151 /* On success, returns the offset at which any next node should be placed into
11152  * the regex engine program being compiled.
11153  *
11154  * Returns 0 otherwise, with *flagp set to indicate why:
11155  *  TRYAGAIN        at the end of (?) that only sets flags.
11156  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11157  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11158  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11159  *  happen.  */
11160 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)11161 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11162     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11163      * 2 is like 1, but indicates that nextchar() has been called to advance
11164      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11165      * this flag alerts us to the need to check for that */
11166 {
11167     regnode_offset ret = 0;    /* Will be the head of the group. */
11168     regnode_offset br;
11169     regnode_offset lastbr;
11170     regnode_offset ender = 0;
11171     I32 parno = 0;
11172     I32 flags;
11173     U32 oregflags = RExC_flags;
11174     bool have_branch = 0;
11175     bool is_open = 0;
11176     I32 freeze_paren = 0;
11177     I32 after_freeze = 0;
11178     I32 num; /* numeric backreferences */
11179     SV * max_open;  /* Max number of unclosed parens */
11180     I32 was_in_lookaround = RExC_in_lookaround;
11181 
11182     char * parse_start = RExC_parse; /* MJD */
11183     char * const oregcomp_parse = RExC_parse;
11184 
11185     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11186 
11187     PERL_ARGS_ASSERT_REG;
11188     DEBUG_PARSE("reg ");
11189 
11190     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11191     assert(max_open);
11192     if (!SvIOK(max_open)) {
11193         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11194     }
11195     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11196                                               open paren */
11197         vFAIL("Too many nested open parens");
11198     }
11199 
11200     *flagp = 0;				/* Tentatively. */
11201 
11202     /* Having this true makes it feasible to have a lot fewer tests for the
11203      * parse pointer being in scope.  For example, we can write
11204      *      while(isFOO(*RExC_parse)) RExC_parse++;
11205      * instead of
11206      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11207      */
11208     assert(*RExC_end == '\0');
11209 
11210     /* Make an OPEN node, if parenthesized. */
11211     if (paren) {
11212 
11213         /* Under /x, space and comments can be gobbled up between the '(' and
11214          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11215          * intervening space, as the sequence is a token, and a token should be
11216          * indivisible */
11217         bool has_intervening_patws = (paren == 2)
11218                                   && *(RExC_parse - 1) != '(';
11219 
11220         if (RExC_parse >= RExC_end) {
11221 	    vFAIL("Unmatched (");
11222         }
11223 
11224         if (paren == 'r') {     /* Atomic script run */
11225             paren = '>';
11226             goto parse_rest;
11227         }
11228         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11229 	    char *start_verb = RExC_parse + 1;
11230 	    STRLEN verb_len;
11231 	    char *start_arg = NULL;
11232 	    unsigned char op = 0;
11233             int arg_required = 0;
11234             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11235             bool has_upper = FALSE;
11236 
11237             if (has_intervening_patws) {
11238                 RExC_parse++;   /* past the '*' */
11239 
11240                 /* For strict backwards compatibility, don't change the message
11241                  * now that we also have lowercase operands */
11242                 if (isUPPER(*RExC_parse)) {
11243                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11244                 }
11245                 else {
11246                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11247                 }
11248             }
11249 	    while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11250 	        if ( *RExC_parse == ':' ) {
11251 	            start_arg = RExC_parse + 1;
11252 	            break;
11253 	        }
11254                 else if (! UTF) {
11255                     if (isUPPER(*RExC_parse)) {
11256                         has_upper = TRUE;
11257                     }
11258                     RExC_parse++;
11259                 }
11260                 else {
11261                     RExC_parse += UTF8SKIP(RExC_parse);
11262                 }
11263 	    }
11264 	    verb_len = RExC_parse - start_verb;
11265 	    if ( start_arg ) {
11266                 if (RExC_parse >= RExC_end) {
11267                     goto unterminated_verb_pattern;
11268                 }
11269 
11270 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11271 	        while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11272                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11273                 }
11274 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11275                   unterminated_verb_pattern:
11276                     if (has_upper) {
11277                         vFAIL("Unterminated verb pattern argument");
11278                     }
11279                     else {
11280                         vFAIL("Unterminated '(*...' argument");
11281                     }
11282                 }
11283 	    } else {
11284 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11285                     if (has_upper) {
11286                         vFAIL("Unterminated verb pattern");
11287                     }
11288                     else {
11289                         vFAIL("Unterminated '(*...' construct");
11290                     }
11291                 }
11292 	    }
11293 
11294             /* Here, we know that RExC_parse < RExC_end */
11295 
11296 	    switch ( *start_verb ) {
11297             case 'A':  /* (*ACCEPT) */
11298                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11299 		    op = ACCEPT;
11300 		    internal_argval = RExC_nestroot;
11301 		}
11302 		break;
11303             case 'C':  /* (*COMMIT) */
11304                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11305                     op = COMMIT;
11306                 break;
11307             case 'F':  /* (*FAIL) */
11308                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11309 		    op = OPFAIL;
11310 		}
11311 		break;
11312             case ':':  /* (*:NAME) */
11313 	    case 'M':  /* (*MARK:NAME) */
11314 	        if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11315                     op = MARKPOINT;
11316                     arg_required = 1;
11317                 }
11318                 break;
11319             case 'P':  /* (*PRUNE) */
11320                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11321                     op = PRUNE;
11322                 break;
11323             case 'S':   /* (*SKIP) */
11324                 if ( memEQs(start_verb, verb_len,"SKIP") )
11325                     op = SKIP;
11326                 break;
11327             case 'T':  /* (*THEN) */
11328                 /* [19:06] <TimToady> :: is then */
11329                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11330                     op = CUTGROUP;
11331                     RExC_seen |= REG_CUTGROUP_SEEN;
11332                 }
11333                 break;
11334             case 'a':
11335                 if (   memEQs(start_verb, verb_len, "asr")
11336                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11337                 {
11338                     paren = 'r';        /* Mnemonic: recursed run */
11339                     goto script_run;
11340                 }
11341                 else if (memEQs(start_verb, verb_len, "atomic")) {
11342                     paren = 't';    /* AtOMIC */
11343                     goto alpha_assertions;
11344                 }
11345                 break;
11346             case 'p':
11347                 if (   memEQs(start_verb, verb_len, "plb")
11348                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11349                 {
11350                     paren = 'b';
11351                     goto lookbehind_alpha_assertions;
11352                 }
11353                 else if (   memEQs(start_verb, verb_len, "pla")
11354                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11355                 {
11356                     paren = 'a';
11357                     goto alpha_assertions;
11358                 }
11359                 break;
11360             case 'n':
11361                 if (   memEQs(start_verb, verb_len, "nlb")
11362                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11363                 {
11364                     paren = 'B';
11365                     goto lookbehind_alpha_assertions;
11366                 }
11367                 else if (   memEQs(start_verb, verb_len, "nla")
11368                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11369                 {
11370                     paren = 'A';
11371                     goto alpha_assertions;
11372                 }
11373                 break;
11374             case 's':
11375                 if (   memEQs(start_verb, verb_len, "sr")
11376                     || memEQs(start_verb, verb_len, "script_run"))
11377                 {
11378                     regnode_offset atomic;
11379 
11380                     paren = 's';
11381 
11382                    script_run:
11383 
11384                     /* This indicates Unicode rules. */
11385                     REQUIRE_UNI_RULES(flagp, 0);
11386 
11387                     if (! start_arg) {
11388                         goto no_colon;
11389                     }
11390 
11391                     RExC_parse = start_arg;
11392 
11393                     if (RExC_in_script_run) {
11394 
11395                         /*  Nested script runs are treated as no-ops, because
11396                          *  if the nested one fails, the outer one must as
11397                          *  well.  It could fail sooner, and avoid (??{} with
11398                          *  side effects, but that is explicitly documented as
11399                          *  undefined behavior. */
11400 
11401                         ret = 0;
11402 
11403                         if (paren == 's') {
11404                             paren = ':';
11405                             goto parse_rest;
11406                         }
11407 
11408                         /* But, the atomic part of a nested atomic script run
11409                          * isn't a no-op, but can be treated just like a '(?>'
11410                          * */
11411                         paren = '>';
11412                         goto parse_rest;
11413                     }
11414 
11415                     if (paren == 's') {
11416                         /* Here, we're starting a new regular script run */
11417                         ret = reg_node(pRExC_state, SROPEN);
11418                         RExC_in_script_run = 1;
11419                         is_open = 1;
11420                         goto parse_rest;
11421                     }
11422 
11423                     /* Here, we are starting an atomic script run.  This is
11424                      * handled by recursing to deal with the atomic portion
11425                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11426 
11427                     ret = reg_node(pRExC_state, SROPEN);
11428 
11429                     RExC_in_script_run = 1;
11430 
11431                     atomic = reg(pRExC_state, 'r', &flags, depth);
11432                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11433                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11434                         return 0;
11435                     }
11436 
11437                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11438                         REQUIRE_BRANCHJ(flagp, 0);
11439                     }
11440 
11441                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11442                                                                 SRCLOSE)))
11443                     {
11444                         REQUIRE_BRANCHJ(flagp, 0);
11445                     }
11446 
11447                     RExC_in_script_run = 0;
11448                     return ret;
11449                 }
11450 
11451                 break;
11452 
11453             lookbehind_alpha_assertions:
11454                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11455                 /*FALLTHROUGH*/
11456 
11457             alpha_assertions:
11458 
11459                 RExC_in_lookaround++;
11460                 RExC_seen_zerolen++;
11461 
11462                 if (! start_arg) {
11463                     goto no_colon;
11464                 }
11465 
11466                 /* An empty negative lookahead assertion simply is failure */
11467                 if (paren == 'A' && RExC_parse == start_arg) {
11468                     ret=reganode(pRExC_state, OPFAIL, 0);
11469                     nextchar(pRExC_state);
11470                     return ret;
11471 	        }
11472 
11473                 RExC_parse = start_arg;
11474                 goto parse_rest;
11475 
11476               no_colon:
11477                 vFAIL2utf8f(
11478                 "'(*%" UTF8f "' requires a terminating ':'",
11479                 UTF8fARG(UTF, verb_len, start_verb));
11480 		NOT_REACHED; /*NOTREACHED*/
11481 
11482 	    } /* End of switch */
11483 	    if ( ! op ) {
11484 	        RExC_parse += UTF
11485                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11486                               : 1;
11487                 if (has_upper || verb_len == 0) {
11488                     vFAIL2utf8f(
11489                     "Unknown verb pattern '%" UTF8f "'",
11490                     UTF8fARG(UTF, verb_len, start_verb));
11491                 }
11492                 else {
11493                     vFAIL2utf8f(
11494                     "Unknown '(*...)' construct '%" UTF8f "'",
11495                     UTF8fARG(UTF, verb_len, start_verb));
11496                 }
11497 	    }
11498             if ( RExC_parse == start_arg ) {
11499                 start_arg = NULL;
11500             }
11501             if ( arg_required && !start_arg ) {
11502                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11503                     (int) verb_len, start_verb);
11504             }
11505             if (internal_argval == -1) {
11506                 ret = reganode(pRExC_state, op, 0);
11507             } else {
11508                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11509             }
11510             RExC_seen |= REG_VERBARG_SEEN;
11511             if (start_arg) {
11512                 SV *sv = newSVpvn( start_arg,
11513                                     RExC_parse - start_arg);
11514                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11515                                         STR_WITH_LEN("S"));
11516                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11517                 FLAGS(REGNODE_p(ret)) = 1;
11518             } else {
11519                 FLAGS(REGNODE_p(ret)) = 0;
11520             }
11521             if ( internal_argval != -1 )
11522                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11523 	    nextchar(pRExC_state);
11524 	    return ret;
11525         }
11526         else if (*RExC_parse == '?') { /* (?...) */
11527 	    bool is_logical = 0;
11528 	    const char * const seqstart = RExC_parse;
11529             const char * endptr;
11530             const char non_existent_group_msg[]
11531                                             = "Reference to nonexistent group";
11532             const char impossible_group[] = "Invalid reference to group";
11533 
11534             if (has_intervening_patws) {
11535                 RExC_parse++;
11536                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11537             }
11538 
11539 	    RExC_parse++;           /* past the '?' */
11540             paren = *RExC_parse;    /* might be a trailing NUL, if not
11541                                        well-formed */
11542             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11543             if (RExC_parse > RExC_end) {
11544                 paren = '\0';
11545             }
11546 	    ret = 0;			/* For look-ahead/behind. */
11547 	    switch (paren) {
11548 
11549 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
11550 	        paren = *RExC_parse;
11551 		if ( paren == '<') {    /* (?P<...>) named capture */
11552                     RExC_parse++;
11553                     if (RExC_parse >= RExC_end) {
11554                         vFAIL("Sequence (?P<... not terminated");
11555                     }
11556 		    goto named_capture;
11557                 }
11558                 else if (paren == '>') {   /* (?P>name) named recursion */
11559                     RExC_parse++;
11560                     if (RExC_parse >= RExC_end) {
11561                         vFAIL("Sequence (?P>... not terminated");
11562                     }
11563                     goto named_recursion;
11564                 }
11565                 else if (paren == '=') {   /* (?P=...)  named backref */
11566                     RExC_parse++;
11567                     return handle_named_backref(pRExC_state, flagp,
11568                                                 parse_start, ')');
11569                 }
11570                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11571                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11572 		vFAIL3("Sequence (%.*s...) not recognized",
11573                                 (int) (RExC_parse - seqstart), seqstart);
11574 		NOT_REACHED; /*NOTREACHED*/
11575             case '<':           /* (?<...) */
11576                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11577 		if (*RExC_parse == '!')
11578 		    paren = ',';
11579 		else if (*RExC_parse != '=')
11580               named_capture:
11581 		{               /* (?<...>) */
11582 		    char *name_start;
11583 		    SV *svname;
11584 		    paren= '>';
11585                 /* FALLTHROUGH */
11586             case '\'':          /* (?'...') */
11587                     name_start = RExC_parse;
11588                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11589 		    if (   RExC_parse == name_start
11590                         || RExC_parse >= RExC_end
11591                         || *RExC_parse != paren)
11592                     {
11593 		        vFAIL2("Sequence (?%c... not terminated",
11594 		            paren=='>' ? '<' : (char) paren);
11595                     }
11596 		    {
11597 			HE *he_str;
11598 			SV *sv_dat = NULL;
11599                         if (!svname) /* shouldn't happen */
11600                             Perl_croak(aTHX_
11601                                 "panic: reg_scan_name returned NULL");
11602                         if (!RExC_paren_names) {
11603                             RExC_paren_names= newHV();
11604                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11605 #ifdef DEBUGGING
11606                             RExC_paren_name_list= newAV();
11607                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11608 #endif
11609                         }
11610                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11611                         if ( he_str )
11612                             sv_dat = HeVAL(he_str);
11613                         if ( ! sv_dat ) {
11614                             /* croak baby croak */
11615                             Perl_croak(aTHX_
11616                                 "panic: paren_name hash element allocation failed");
11617                         } else if ( SvPOK(sv_dat) ) {
11618                             /* (?|...) can mean we have dupes so scan to check
11619                                its already been stored. Maybe a flag indicating
11620                                we are inside such a construct would be useful,
11621                                but the arrays are likely to be quite small, so
11622                                for now we punt -- dmq */
11623                             IV count = SvIV(sv_dat);
11624                             I32 *pv = (I32*)SvPVX(sv_dat);
11625                             IV i;
11626                             for ( i = 0 ; i < count ; i++ ) {
11627                                 if ( pv[i] == RExC_npar ) {
11628                                     count = 0;
11629                                     break;
11630                                 }
11631                             }
11632                             if ( count ) {
11633                                 pv = (I32*)SvGROW(sv_dat,
11634                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11635                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11636                                 pv[count] = RExC_npar;
11637                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11638                             }
11639                         } else {
11640                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11641                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11642                                                                 sizeof(I32));
11643                             SvIOK_on(sv_dat);
11644                             SvIV_set(sv_dat, 1);
11645                         }
11646 #ifdef DEBUGGING
11647                         /* Yes this does cause a memory leak in debugging Perls
11648                          * */
11649                         if (!av_store(RExC_paren_name_list,
11650                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11651                             SvREFCNT_dec_NN(svname);
11652 #endif
11653 
11654                         /*sv_dump(sv_dat);*/
11655                     }
11656                     nextchar(pRExC_state);
11657 		    paren = 1;
11658 		    goto capturing_parens;
11659 		}
11660 
11661                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11662 		RExC_in_lookaround++;
11663 		RExC_parse++;
11664                 if (RExC_parse >= RExC_end) {
11665                     vFAIL("Sequence (?... not terminated");
11666                 }
11667                 RExC_seen_zerolen++;
11668                 break;
11669 	    case '=':           /* (?=...) */
11670 		RExC_seen_zerolen++;
11671                 RExC_in_lookaround++;
11672                 break;
11673 	    case '!':           /* (?!...) */
11674 		RExC_seen_zerolen++;
11675 		/* check if we're really just a "FAIL" assertion */
11676                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11677                                         FALSE /* Don't force to /x */ );
11678 	        if (*RExC_parse == ')') {
11679                     ret=reganode(pRExC_state, OPFAIL, 0);
11680 	            nextchar(pRExC_state);
11681 	            return ret;
11682 	        }
11683                 RExC_in_lookaround++;
11684 	        break;
11685 	    case '|':           /* (?|...) */
11686 	        /* branch reset, behave like a (?:...) except that
11687 	           buffers in alternations share the same numbers */
11688 	        paren = ':';
11689 	        after_freeze = freeze_paren = RExC_npar;
11690 
11691                 /* XXX This construct currently requires an extra pass.
11692                  * Investigation would be required to see if that could be
11693                  * changed */
11694                 REQUIRE_PARENS_PASS;
11695 	        break;
11696 	    case ':':           /* (?:...) */
11697 	    case '>':           /* (?>...) */
11698 		break;
11699 	    case '$':           /* (?$...) */
11700 	    case '@':           /* (?@...) */
11701 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11702 		break;
11703 	    case '0' :           /* (?0) */
11704 	    case 'R' :           /* (?R) */
11705                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11706 		    FAIL("Sequence (?R) not terminated");
11707                 num = 0;
11708                 RExC_seen |= REG_RECURSE_SEEN;
11709 
11710                 /* XXX These constructs currently require an extra pass.
11711                  * It probably could be changed */
11712                 REQUIRE_PARENS_PASS;
11713 
11714 		*flagp |= POSTPONED;
11715                 goto gen_recurse_regop;
11716 		/*notreached*/
11717             /* named and numeric backreferences */
11718             case '&':            /* (?&NAME) */
11719                 parse_start = RExC_parse - 1;
11720               named_recursion:
11721                 {
11722                     SV *sv_dat = reg_scan_name(pRExC_state,
11723                                                REG_RSN_RETURN_DATA);
11724                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11725                 }
11726                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11727                     vFAIL("Sequence (?&... not terminated");
11728                 goto gen_recurse_regop;
11729                 /* NOTREACHED */
11730             case '+':
11731                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11732                     RExC_parse++;
11733                     vFAIL("Illegal pattern");
11734                 }
11735                 goto parse_recursion;
11736                 /* NOTREACHED*/
11737             case '-': /* (?-1) */
11738                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11739                     RExC_parse--; /* rewind to let it be handled later */
11740                     goto parse_flags;
11741                 }
11742                 /* FALLTHROUGH */
11743             case '1': case '2': case '3': case '4': /* (?1) */
11744 	    case '5': case '6': case '7': case '8': case '9':
11745 	        RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11746               parse_recursion:
11747                 {
11748                     bool is_neg = FALSE;
11749                     UV unum;
11750                     parse_start = RExC_parse - 1; /* MJD */
11751                     if (*RExC_parse == '-') {
11752                         RExC_parse++;
11753                         is_neg = TRUE;
11754                     }
11755                     endptr = RExC_end;
11756                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11757                         && unum <= I32_MAX
11758                     ) {
11759                         num = (I32)unum;
11760                         RExC_parse = (char*)endptr;
11761                     }
11762                     else {  /* Overflow, or something like that.  Position
11763                                beyond all digits for the message */
11764                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
11765                             RExC_parse++;
11766                         }
11767                         vFAIL(impossible_group);
11768                     }
11769                     if (is_neg) {
11770                         /* -num is always representable on 1 and 2's complement
11771                          * machines */
11772                         num = -num;
11773                     }
11774                 }
11775 	        if (*RExC_parse!=')')
11776 	            vFAIL("Expecting close bracket");
11777 
11778               gen_recurse_regop:
11779                 if (paren == '-' || paren == '+') {
11780 
11781                     /* Don't overflow */
11782                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11783                         RExC_parse++;
11784                         vFAIL(impossible_group);
11785                     }
11786 
11787                     /*
11788                     Diagram of capture buffer numbering.
11789                     Top line is the normal capture buffer numbers
11790                     Bottom line is the negative indexing as from
11791                     the X (the (?-2))
11792 
11793                         1 2    3 4 5 X   Y      6 7
11794                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11795                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11796                     -   5 4    3 2 1 X   Y      x x
11797 
11798                     Resolve to absolute group.  Recall that RExC_npar is +1 of
11799                     the actual parenthesis group number.  For lookahead, we
11800                     have to compensate for that.  Using the above example, when
11801                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
11802                     want 7 for +2, and 4 for -2.
11803                     */
11804                     if ( paren == '+' ) {
11805                         num--;
11806                     }
11807 
11808                     num += RExC_npar;
11809 
11810                     if (paren == '-' && num < 1) {
11811                         RExC_parse++;
11812                         vFAIL(non_existent_group_msg);
11813                     }
11814                 }
11815 
11816                 if (num >= RExC_npar) {
11817 
11818                     /* It might be a forward reference; we can't fail until we
11819                      * know, by completing the parse to get all the groups, and
11820                      * then reparsing */
11821                     if (ALL_PARENS_COUNTED)  {
11822                         if (num >= RExC_total_parens) {
11823                             RExC_parse++;
11824                             vFAIL(non_existent_group_msg);
11825                         }
11826                     }
11827                     else {
11828                         REQUIRE_PARENS_PASS;
11829                     }
11830                 }
11831 
11832                 /* We keep track how many GOSUB items we have produced.
11833                    To start off the ARG2L() of the GOSUB holds its "id",
11834                    which is used later in conjunction with RExC_recurse
11835                    to calculate the offset we need to jump for the GOSUB,
11836                    which it will store in the final representation.
11837                    We have to defer the actual calculation until much later
11838                    as the regop may move.
11839                  */
11840                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11841                 RExC_recurse_count++;
11842                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11843                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11844                             22, "|    |", (int)(depth * 2 + 1), "",
11845                             (UV)ARG(REGNODE_p(ret)),
11846                             (IV)ARG2L(REGNODE_p(ret))));
11847                 RExC_seen |= REG_RECURSE_SEEN;
11848 
11849                 Set_Node_Length(REGNODE_p(ret),
11850                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11851 		Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11852 
11853                 *flagp |= POSTPONED;
11854                 assert(*RExC_parse == ')');
11855                 nextchar(pRExC_state);
11856                 return ret;
11857 
11858             /* NOTREACHED */
11859 
11860 	    case '?':           /* (??...) */
11861 		is_logical = 1;
11862 		if (*RExC_parse != '{') {
11863                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11864                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11865                     vFAIL2utf8f(
11866                         "Sequence (%" UTF8f "...) not recognized",
11867                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11868 		    NOT_REACHED; /*NOTREACHED*/
11869 		}
11870 		*flagp |= POSTPONED;
11871 		paren = '{';
11872                 RExC_parse++;
11873 		/* FALLTHROUGH */
11874 	    case '{':           /* (?{...}) */
11875 	    {
11876 		U32 n = 0;
11877 		struct reg_code_block *cb;
11878                 OP * o;
11879 
11880 		RExC_seen_zerolen++;
11881 
11882 		if (   !pRExC_state->code_blocks
11883 		    || pRExC_state->code_index
11884                                         >= pRExC_state->code_blocks->count
11885 		    || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11886 			!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11887 			    - RExC_start)
11888 		) {
11889 		    if (RExC_pm_flags & PMf_USE_RE_EVAL)
11890 			FAIL("panic: Sequence (?{...}): no code block found\n");
11891 		    FAIL("Eval-group not allowed at runtime, use re 'eval'");
11892 		}
11893 		/* this is a pre-compiled code block (?{...}) */
11894 		cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11895 		RExC_parse = RExC_start + cb->end;
11896 		o = cb->block;
11897                 if (cb->src_regex) {
11898                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11899                     RExC_rxi->data->data[n] =
11900                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11901                     RExC_rxi->data->data[n+1] = (void*)o;
11902                 }
11903                 else {
11904                     n = add_data(pRExC_state,
11905                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11906                     RExC_rxi->data->data[n] = (void*)o;
11907                 }
11908 		pRExC_state->code_index++;
11909 		nextchar(pRExC_state);
11910 
11911 		if (is_logical) {
11912                     regnode_offset eval;
11913 		    ret = reg_node(pRExC_state, LOGICAL);
11914 
11915                     eval = reg2Lanode(pRExC_state, EVAL,
11916                                        n,
11917 
11918                                        /* for later propagation into (??{})
11919                                         * return value */
11920                                        RExC_flags & RXf_PMf_COMPILETIME
11921                                       );
11922                     FLAGS(REGNODE_p(ret)) = 2;
11923                     if (! REGTAIL(pRExC_state, ret, eval)) {
11924                         REQUIRE_BRANCHJ(flagp, 0);
11925                     }
11926                     /* deal with the length of this later - MJD */
11927 		    return ret;
11928 		}
11929 		ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11930 		Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11931 		Set_Node_Offset(REGNODE_p(ret), parse_start);
11932 		return ret;
11933 	    }
11934 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
11935 	    {
11936 	        int is_define= 0;
11937                 const int DEFINE_len = sizeof("DEFINE") - 1;
11938 		if (    RExC_parse < RExC_end - 1
11939                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11940                             && (   RExC_parse[1] == '='
11941                                 || RExC_parse[1] == '!'
11942                                 || RExC_parse[1] == '<'
11943                                 || RExC_parse[1] == '{'))
11944 		        || (       RExC_parse[0] == '*'        /* (?(*...)) */
11945                             && (   memBEGINs(RExC_parse + 1,
11946                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11947                                          "pla:")
11948                                 || memBEGINs(RExC_parse + 1,
11949                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11950                                          "plb:")
11951                                 || memBEGINs(RExC_parse + 1,
11952                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11953                                          "nla:")
11954                                 || memBEGINs(RExC_parse + 1,
11955                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11956                                          "nlb:")
11957                                 || memBEGINs(RExC_parse + 1,
11958                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11959                                          "positive_lookahead:")
11960                                 || memBEGINs(RExC_parse + 1,
11961                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11962                                          "positive_lookbehind:")
11963                                 || memBEGINs(RExC_parse + 1,
11964                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11965                                          "negative_lookahead:")
11966                                 || memBEGINs(RExC_parse + 1,
11967                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11968                                          "negative_lookbehind:"))))
11969                 ) { /* Lookahead or eval. */
11970                     I32 flag;
11971                     regnode_offset tail;
11972 
11973                     ret = reg_node(pRExC_state, LOGICAL);
11974                     FLAGS(REGNODE_p(ret)) = 1;
11975 
11976                     tail = reg(pRExC_state, 1, &flag, depth+1);
11977                     RETURN_FAIL_ON_RESTART(flag, flagp);
11978                     if (! REGTAIL(pRExC_state, ret, tail)) {
11979                         REQUIRE_BRANCHJ(flagp, 0);
11980                     }
11981                     goto insert_if;
11982                 }
11983 		else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11984 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11985 	        {
11986 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
11987 	            char *name_start= RExC_parse++;
11988 	            U32 num = 0;
11989 	            SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11990 	            if (   RExC_parse == name_start
11991                         || RExC_parse >= RExC_end
11992                         || *RExC_parse != ch)
11993                     {
11994                         vFAIL2("Sequence (?(%c... not terminated",
11995                             (ch == '>' ? '<' : ch));
11996                     }
11997                     RExC_parse++;
11998                     if (sv_dat) {
11999                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
12000                         RExC_rxi->data->data[num]=(void*)sv_dat;
12001                         SvREFCNT_inc_simple_void_NN(sv_dat);
12002                     }
12003                     ret = reganode(pRExC_state, GROUPPN, num);
12004                     goto insert_if_check_paren;
12005 		}
12006 		else if (memBEGINs(RExC_parse,
12007                                    (STRLEN) (RExC_end - RExC_parse),
12008                                    "DEFINE"))
12009                 {
12010 		    ret = reganode(pRExC_state, DEFINEP, 0);
12011 		    RExC_parse += DEFINE_len;
12012 		    is_define = 1;
12013 		    goto insert_if_check_paren;
12014 		}
12015 		else if (RExC_parse[0] == 'R') {
12016 		    RExC_parse++;
12017                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
12018                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12019                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12020                      */
12021 		    parno = 0;
12022                     if (RExC_parse[0] == '0') {
12023                         parno = 1;
12024                         RExC_parse++;
12025                     }
12026                     else if (inRANGE(RExC_parse[0], '1', '9')) {
12027                         UV uv;
12028                         endptr = RExC_end;
12029                         if (grok_atoUV(RExC_parse, &uv, &endptr)
12030                             && uv <= I32_MAX
12031                         ) {
12032                             parno = (I32)uv + 1;
12033                             RExC_parse = (char*)endptr;
12034                         }
12035                         /* else "Switch condition not recognized" below */
12036 		    } else if (RExC_parse[0] == '&') {
12037 		        SV *sv_dat;
12038 		        RExC_parse++;
12039 		        sv_dat = reg_scan_name(pRExC_state,
12040                                                REG_RSN_RETURN_DATA);
12041                         if (sv_dat)
12042                             parno = 1 + *((I32 *)SvPVX(sv_dat));
12043 		    }
12044 		    ret = reganode(pRExC_state, INSUBP, parno);
12045 		    goto insert_if_check_paren;
12046 		}
12047                 else if (inRANGE(RExC_parse[0], '1', '9')) {
12048                     /* (?(1)...) */
12049 		    char c;
12050                     UV uv;
12051                     endptr = RExC_end;
12052                     if (grok_atoUV(RExC_parse, &uv, &endptr)
12053                         && uv <= I32_MAX
12054                     ) {
12055                         parno = (I32)uv;
12056                         RExC_parse = (char*)endptr;
12057                     }
12058                     else {
12059                         vFAIL("panic: grok_atoUV returned FALSE");
12060                     }
12061                     ret = reganode(pRExC_state, GROUPP, parno);
12062 
12063                  insert_if_check_paren:
12064 		    if (UCHARAT(RExC_parse) != ')') {
12065                         RExC_parse += UTF
12066                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12067                                       : 1;
12068 			vFAIL("Switch condition not recognized");
12069 		    }
12070 		    nextchar(pRExC_state);
12071 		  insert_if:
12072                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12073                                                              IFTHEN, 0)))
12074                     {
12075                         REQUIRE_BRANCHJ(flagp, 0);
12076                     }
12077                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12078 		    if (br == 0) {
12079                         RETURN_FAIL_ON_RESTART(flags,flagp);
12080                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12081                               (UV) flags);
12082                     } else
12083                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12084                                                              LONGJMP, 0)))
12085                     {
12086                         REQUIRE_BRANCHJ(flagp, 0);
12087                     }
12088 		    c = UCHARAT(RExC_parse);
12089                     nextchar(pRExC_state);
12090 		    if (flags&HASWIDTH)
12091 			*flagp |= HASWIDTH;
12092 		    if (c == '|') {
12093 		        if (is_define)
12094 		            vFAIL("(?(DEFINE)....) does not allow branches");
12095 
12096                         /* Fake one for optimizer.  */
12097                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12098 
12099                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12100                             RETURN_FAIL_ON_RESTART(flags, flagp);
12101                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12102                                   (UV) flags);
12103                         }
12104                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12105                             REQUIRE_BRANCHJ(flagp, 0);
12106                         }
12107 		 	if (flags&HASWIDTH)
12108 			    *flagp |= HASWIDTH;
12109                         c = UCHARAT(RExC_parse);
12110                         nextchar(pRExC_state);
12111 		    }
12112 		    else
12113 			lastbr = 0;
12114                     if (c != ')') {
12115                         if (RExC_parse >= RExC_end)
12116                             vFAIL("Switch (?(condition)... not terminated");
12117                         else
12118                             vFAIL("Switch (?(condition)... contains too many branches");
12119                     }
12120 		    ender = reg_node(pRExC_state, TAIL);
12121                     if (! REGTAIL(pRExC_state, br, ender)) {
12122                         REQUIRE_BRANCHJ(flagp, 0);
12123                     }
12124 		    if (lastbr) {
12125                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12126                             REQUIRE_BRANCHJ(flagp, 0);
12127                         }
12128                         if (! REGTAIL(pRExC_state,
12129                                       REGNODE_OFFSET(
12130                                                  NEXTOPER(
12131                                                  NEXTOPER(REGNODE_p(lastbr)))),
12132                                       ender))
12133                         {
12134                             REQUIRE_BRANCHJ(flagp, 0);
12135                         }
12136 		    }
12137 		    else
12138                         if (! REGTAIL(pRExC_state, ret, ender)) {
12139                             REQUIRE_BRANCHJ(flagp, 0);
12140                         }
12141 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12142                     RExC_size++; /* XXX WHY do we need this?!!
12143                                     For large programs it seems to be required
12144                                     but I can't figure out why. -- dmq*/
12145 #endif
12146 		    return ret;
12147 		}
12148                 RExC_parse += UTF
12149                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12150                               : 1;
12151                 vFAIL("Unknown switch condition (?(...))");
12152 	    }
12153 	    case '[':           /* (?[ ... ]) */
12154                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12155                                          oregcomp_parse);
12156             case 0: /* A NUL */
12157 		RExC_parse--; /* for vFAIL to print correctly */
12158                 vFAIL("Sequence (? incomplete");
12159                 break;
12160 
12161             case ')':
12162                 if (RExC_strict) {  /* [perl #132851] */
12163                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12164                 }
12165                 /* FALLTHROUGH */
12166             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12167 	    /* FALLTHROUGH */
12168 	    default: /* e.g., (?i) */
12169 	        RExC_parse = (char *) seqstart + 1;
12170               parse_flags:
12171 		parse_lparen_question_flags(pRExC_state);
12172                 if (UCHARAT(RExC_parse) != ':') {
12173                     if (RExC_parse < RExC_end)
12174                         nextchar(pRExC_state);
12175                     *flagp = TRYAGAIN;
12176                     return 0;
12177                 }
12178                 paren = ':';
12179                 nextchar(pRExC_state);
12180                 ret = 0;
12181                 goto parse_rest;
12182             } /* end switch */
12183 	}
12184         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12185 	  capturing_parens:
12186 	    parno = RExC_npar;
12187 	    RExC_npar++;
12188             if (! ALL_PARENS_COUNTED) {
12189                 /* If we are in our first pass through (and maybe only pass),
12190                  * we  need to allocate memory for the capturing parentheses
12191                  * data structures.
12192                  */
12193 
12194                 if (!RExC_parens_buf_size) {
12195                     /* first guess at number of parens we might encounter */
12196                     RExC_parens_buf_size = 10;
12197 
12198                     /* setup RExC_open_parens, which holds the address of each
12199                      * OPEN tag, and to make things simpler for the 0 index the
12200                      * start of the program - this is used later for offsets */
12201                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12202                             regnode_offset);
12203                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12204 
12205                     /* setup RExC_close_parens, which holds the address of each
12206                      * CLOSE tag, and to make things simpler for the 0 index
12207                      * the end of the program - this is used later for offsets
12208                      * */
12209                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12210                             regnode_offset);
12211                     /* we dont know where end op starts yet, so we dont need to
12212                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12213                      * above */
12214                 }
12215                 else if (RExC_npar > RExC_parens_buf_size) {
12216                     I32 old_size = RExC_parens_buf_size;
12217 
12218                     RExC_parens_buf_size *= 2;
12219 
12220                     Renew(RExC_open_parens, RExC_parens_buf_size,
12221                             regnode_offset);
12222                     Zero(RExC_open_parens + old_size,
12223                             RExC_parens_buf_size - old_size, regnode_offset);
12224 
12225                     Renew(RExC_close_parens, RExC_parens_buf_size,
12226                             regnode_offset);
12227                     Zero(RExC_close_parens + old_size,
12228                             RExC_parens_buf_size - old_size, regnode_offset);
12229                 }
12230             }
12231 
12232 	    ret = reganode(pRExC_state, OPEN, parno);
12233             if (!RExC_nestroot)
12234                 RExC_nestroot = parno;
12235             if (RExC_open_parens && !RExC_open_parens[parno])
12236             {
12237                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12238                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12239                     22, "|    |", (int)(depth * 2 + 1), "",
12240                     (IV)parno, ret));
12241                 RExC_open_parens[parno]= ret;
12242             }
12243 
12244             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12245             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12246 	    is_open = 1;
12247 	} else {
12248             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12249             paren = ':';
12250 	    ret = 0;
12251         }
12252     }
12253     else                        /* ! paren */
12254 	ret = 0;
12255 
12256    parse_rest:
12257     /* Pick up the branches, linking them together. */
12258     parse_start = RExC_parse;   /* MJD */
12259     br = regbranch(pRExC_state, &flags, 1, depth+1);
12260 
12261     /*     branch_len = (paren != 0); */
12262 
12263     if (br == 0) {
12264         RETURN_FAIL_ON_RESTART(flags, flagp);
12265         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12266     }
12267     if (*RExC_parse == '|') {
12268 	if (RExC_use_BRANCHJ) {
12269 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
12270 	}
12271 	else {                  /* MJD */
12272 	    reginsert(pRExC_state, BRANCH, br, depth+1);
12273             Set_Node_Length(REGNODE_p(br), paren != 0);
12274             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12275         }
12276 	have_branch = 1;
12277     }
12278     else if (paren == ':') {
12279 	*flagp |= flags&SIMPLE;
12280     }
12281     if (is_open) {				/* Starts with OPEN. */
12282         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12283             REQUIRE_BRANCHJ(flagp, 0);
12284         }
12285     }
12286     else if (paren != '?')		/* Not Conditional */
12287 	ret = br;
12288     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12289     lastbr = br;
12290     while (*RExC_parse == '|') {
12291 	if (RExC_use_BRANCHJ) {
12292             bool shut_gcc_up;
12293 
12294 	    ender = reganode(pRExC_state, LONGJMP, 0);
12295 
12296             /* Append to the previous. */
12297             shut_gcc_up = REGTAIL(pRExC_state,
12298                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12299                          ender);
12300             PERL_UNUSED_VAR(shut_gcc_up);
12301 	}
12302 	nextchar(pRExC_state);
12303 	if (freeze_paren) {
12304 	    if (RExC_npar > after_freeze)
12305 	        after_freeze = RExC_npar;
12306             RExC_npar = freeze_paren;
12307         }
12308         br = regbranch(pRExC_state, &flags, 0, depth+1);
12309 
12310 	if (br == 0) {
12311             RETURN_FAIL_ON_RESTART(flags, flagp);
12312             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12313         }
12314         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12315             REQUIRE_BRANCHJ(flagp, 0);
12316         }
12317 	lastbr = br;
12318 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12319     }
12320 
12321     if (have_branch || paren != ':') {
12322         regnode * br;
12323 
12324 	/* Make a closing node, and hook it on the end. */
12325 	switch (paren) {
12326 	case ':':
12327 	    ender = reg_node(pRExC_state, TAIL);
12328 	    break;
12329 	case 1: case 2:
12330 	    ender = reganode(pRExC_state, CLOSE, parno);
12331             if ( RExC_close_parens ) {
12332                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12333                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12334                         22, "|    |", (int)(depth * 2 + 1), "",
12335                         (IV)parno, ender));
12336                 RExC_close_parens[parno]= ender;
12337 	        if (RExC_nestroot == parno)
12338 	            RExC_nestroot = 0;
12339 	    }
12340             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12341             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12342 	    break;
12343 	case 's':
12344 	    ender = reg_node(pRExC_state, SRCLOSE);
12345             RExC_in_script_run = 0;
12346 	    break;
12347 	case '<':
12348         case 'a':
12349         case 'A':
12350         case 'b':
12351         case 'B':
12352 	case ',':
12353 	case '=':
12354 	case '!':
12355 	    *flagp &= ~HASWIDTH;
12356 	    /* FALLTHROUGH */
12357         case 't':   /* aTomic */
12358 	case '>':
12359 	    ender = reg_node(pRExC_state, SUCCEED);
12360 	    break;
12361 	case 0:
12362 	    ender = reg_node(pRExC_state, END);
12363             assert(!RExC_end_op); /* there can only be one! */
12364             RExC_end_op = REGNODE_p(ender);
12365             if (RExC_close_parens) {
12366                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12367                     "%*s%*s Setting close paren #0 (END) to %zu\n",
12368                     22, "|    |", (int)(depth * 2 + 1), "",
12369                     ender));
12370 
12371                 RExC_close_parens[0]= ender;
12372             }
12373 	    break;
12374 	}
12375         DEBUG_PARSE_r({
12376             DEBUG_PARSE_MSG("lsbr");
12377             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12378             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12379             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12380                           SvPV_nolen_const(RExC_mysv1),
12381                           (IV)lastbr,
12382                           SvPV_nolen_const(RExC_mysv2),
12383                           (IV)ender,
12384                           (IV)(ender - lastbr)
12385             );
12386         });
12387         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12388             REQUIRE_BRANCHJ(flagp, 0);
12389         }
12390 
12391 	if (have_branch) {
12392             char is_nothing= 1;
12393 	    if (depth==1)
12394                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12395 
12396 	    /* Hook the tails of the branches to the closing node. */
12397 	    for (br = REGNODE_p(ret); br; br = regnext(br)) {
12398 		const U8 op = PL_regkind[OP(br)];
12399 		if (op == BRANCH) {
12400                     if (! REGTAIL_STUDY(pRExC_state,
12401                                         REGNODE_OFFSET(NEXTOPER(br)),
12402                                         ender))
12403                     {
12404                         REQUIRE_BRANCHJ(flagp, 0);
12405                     }
12406                     if ( OP(NEXTOPER(br)) != NOTHING
12407                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12408                         is_nothing= 0;
12409 		}
12410 		else if (op == BRANCHJ) {
12411                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12412                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12413                                         ender);
12414                     PERL_UNUSED_VAR(shut_gcc_up);
12415                     /* for now we always disable this optimisation * /
12416                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12417                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12418                     */
12419                         is_nothing= 0;
12420 		}
12421 	    }
12422             if (is_nothing) {
12423                 regnode * ret_as_regnode = REGNODE_p(ret);
12424                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12425                                ? regnext(ret_as_regnode)
12426                                : ret_as_regnode;
12427                 DEBUG_PARSE_r({
12428                     DEBUG_PARSE_MSG("NADA");
12429                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12430                                      NULL, pRExC_state);
12431                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12432                                      NULL, pRExC_state);
12433                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12434                                   SvPV_nolen_const(RExC_mysv1),
12435                                   (IV)REG_NODE_NUM(ret_as_regnode),
12436                                   SvPV_nolen_const(RExC_mysv2),
12437                                   (IV)ender,
12438                                   (IV)(ender - ret)
12439                     );
12440                 });
12441                 OP(br)= NOTHING;
12442                 if (OP(REGNODE_p(ender)) == TAIL) {
12443                     NEXT_OFF(br)= 0;
12444                     RExC_emit= REGNODE_OFFSET(br) + 1;
12445                 } else {
12446                     regnode *opt;
12447                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12448                         OP(opt)= OPTIMIZED;
12449                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12450                 }
12451             }
12452 	}
12453     }
12454 
12455     {
12456         const char *p;
12457          /* Even/odd or x=don't care: 010101x10x */
12458         static const char parens[] = "=!aA<,>Bbt";
12459          /* flag below is set to 0 up through 'A'; 1 for larger */
12460 
12461 	if (paren && (p = strchr(parens, paren))) {
12462 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12463 	    int flag = (p - parens) > 3;
12464 
12465 	    if (paren == '>' || paren == 't') {
12466 		node = SUSPEND, flag = 0;
12467             }
12468 
12469 	    reginsert(pRExC_state, node, ret, depth+1);
12470             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12471 	    Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12472 	    FLAGS(REGNODE_p(ret)) = flag;
12473             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12474             {
12475                 REQUIRE_BRANCHJ(flagp, 0);
12476             }
12477 	}
12478     }
12479 
12480     /* Check for proper termination. */
12481     if (paren) {
12482         /* restore original flags, but keep (?p) and, if we've encountered
12483          * something in the parse that changes /d rules into /u, keep the /u */
12484 	RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12485         if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12486             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12487         }
12488 	if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12489 	    RExC_parse = oregcomp_parse;
12490 	    vFAIL("Unmatched (");
12491 	}
12492 	nextchar(pRExC_state);
12493     }
12494     else if (!paren && RExC_parse < RExC_end) {
12495 	if (*RExC_parse == ')') {
12496 	    RExC_parse++;
12497 	    vFAIL("Unmatched )");
12498 	}
12499 	else
12500 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
12501 	NOT_REACHED; /* NOTREACHED */
12502     }
12503 
12504     if (after_freeze > RExC_npar)
12505         RExC_npar = after_freeze;
12506 
12507     RExC_in_lookaround = was_in_lookaround;
12508 
12509     return(ret);
12510 }
12511 
12512 /*
12513  - regbranch - one alternative of an | operator
12514  *
12515  * Implements the concatenation operator.
12516  *
12517  * On success, returns the offset at which any next node should be placed into
12518  * the regex engine program being compiled.
12519  *
12520  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12521  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12522  * UTF-8
12523  */
12524 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)12525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12526 {
12527     regnode_offset ret;
12528     regnode_offset chain = 0;
12529     regnode_offset latest;
12530     I32 flags = 0, c = 0;
12531     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12532 
12533     PERL_ARGS_ASSERT_REGBRANCH;
12534 
12535     DEBUG_PARSE("brnc");
12536 
12537     if (first)
12538 	ret = 0;
12539     else {
12540 	if (RExC_use_BRANCHJ)
12541 	    ret = reganode(pRExC_state, BRANCHJ, 0);
12542 	else {
12543 	    ret = reg_node(pRExC_state, BRANCH);
12544             Set_Node_Length(REGNODE_p(ret), 1);
12545         }
12546     }
12547 
12548     *flagp = WORST;			/* Tentatively. */
12549 
12550     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12551                             FALSE /* Don't force to /x */ );
12552     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12553 	flags &= ~TRYAGAIN;
12554         latest = regpiece(pRExC_state, &flags, depth+1);
12555 	if (latest == 0) {
12556 	    if (flags & TRYAGAIN)
12557 		continue;
12558             RETURN_FAIL_ON_RESTART(flags, flagp);
12559             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12560 	}
12561 	else if (ret == 0)
12562             ret = latest;
12563 	*flagp |= flags&(HASWIDTH|POSTPONED);
12564 	if (chain == 0) 	/* First piece. */
12565 	    *flagp |= flags&SPSTART;
12566 	else {
12567 	    /* FIXME adding one for every branch after the first is probably
12568 	     * excessive now we have TRIE support. (hv) */
12569 	    MARK_NAUGHTY(1);
12570             if (! REGTAIL(pRExC_state, chain, latest)) {
12571                 /* XXX We could just redo this branch, but figuring out what
12572                  * bookkeeping needs to be reset is a pain, and it's likely
12573                  * that other branches that goto END will also be too large */
12574                 REQUIRE_BRANCHJ(flagp, 0);
12575             }
12576 	}
12577 	chain = latest;
12578 	c++;
12579     }
12580     if (chain == 0) {	/* Loop ran zero times. */
12581 	chain = reg_node(pRExC_state, NOTHING);
12582 	if (ret == 0)
12583 	    ret = chain;
12584     }
12585     if (c == 1) {
12586 	*flagp |= flags&SIMPLE;
12587     }
12588 
12589     return ret;
12590 }
12591 
12592 /*
12593  - regpiece - something followed by possible quantifier * + ? {n,m}
12594  *
12595  * Note that the branching code sequences used for ? and the general cases
12596  * of * and + are somewhat optimized:  they use the same NOTHING node as
12597  * both the endmarker for their branch list and the body of the last branch.
12598  * It might seem that this node could be dispensed with entirely, but the
12599  * endmarker role is not redundant.
12600  *
12601  * On success, returns the offset at which any next node should be placed into
12602  * the regex engine program being compiled.
12603  *
12604  * Returns 0 otherwise, with *flagp set to indicate why:
12605  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12606  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12607  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12608  */
12609 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)12610 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12611 {
12612     regnode_offset ret;
12613     char op;
12614     char *next;
12615     I32 flags;
12616     const char * const origparse = RExC_parse;
12617     I32 min;
12618     I32 max = REG_INFTY;
12619 #ifdef RE_TRACK_PATTERN_OFFSETS
12620     char *parse_start;
12621 #endif
12622     const char *maxpos = NULL;
12623     UV uv;
12624 
12625     /* Save the original in case we change the emitted regop to a FAIL. */
12626     const regnode_offset orig_emit = RExC_emit;
12627 
12628     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12629 
12630     PERL_ARGS_ASSERT_REGPIECE;
12631 
12632     DEBUG_PARSE("piec");
12633 
12634     ret = regatom(pRExC_state, &flags, depth+1);
12635     if (ret == 0) {
12636         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12637         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12638     }
12639 
12640     op = *RExC_parse;
12641 
12642     if (op == '{' && regcurly(RExC_parse)) {
12643 	maxpos = NULL;
12644 #ifdef RE_TRACK_PATTERN_OFFSETS
12645         parse_start = RExC_parse; /* MJD */
12646 #endif
12647 	next = RExC_parse + 1;
12648 	while (isDIGIT(*next) || *next == ',') {
12649 	    if (*next == ',') {
12650 		if (maxpos)
12651 		    break;
12652 		else
12653 		    maxpos = next;
12654 	    }
12655 	    next++;
12656 	}
12657 	if (*next == '}') {		/* got one */
12658             const char* endptr;
12659 	    if (!maxpos)
12660 		maxpos = next;
12661 	    RExC_parse++;
12662             if (isDIGIT(*RExC_parse)) {
12663                 endptr = RExC_end;
12664                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12665                     vFAIL("Invalid quantifier in {,}");
12666                 if (uv >= REG_INFTY)
12667                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12668                 min = (I32)uv;
12669             } else {
12670                 min = 0;
12671             }
12672 	    if (*maxpos == ',')
12673 		maxpos++;
12674 	    else
12675 		maxpos = RExC_parse;
12676             if (isDIGIT(*maxpos)) {
12677                 endptr = RExC_end;
12678                 if (!grok_atoUV(maxpos, &uv, &endptr))
12679                     vFAIL("Invalid quantifier in {,}");
12680                 if (uv >= REG_INFTY)
12681                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12682                 max = (I32)uv;
12683             } else {
12684 		max = REG_INFTY;		/* meaning "infinity" */
12685             }
12686 	    RExC_parse = next;
12687 	    nextchar(pRExC_state);
12688             if (max < min) {    /* If can't match, warn and optimize to fail
12689                                    unconditionally */
12690                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12691                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12692                 NEXT_OFF(REGNODE_p(orig_emit)) =
12693                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12694                 return ret;
12695             }
12696             else if (min == max && *RExC_parse == '?')
12697             {
12698                 ckWARN2reg(RExC_parse + 1,
12699                            "Useless use of greediness modifier '%c'",
12700                            *RExC_parse);
12701             }
12702 
12703 	  do_curly:
12704 	    if ((flags&SIMPLE)) {
12705                 if (min == 0 && max == REG_INFTY) {
12706 
12707                     /* Going from 0..inf is currently forbidden in wildcard
12708                      * subpatterns.  The only reason is to make it harder to
12709                      * write patterns that take a long long time to halt, and
12710                      * because the use of this construct isn't necessary in
12711                      * matching Unicode property values */
12712                     if (RExC_pm_flags & PMf_WILDCARD) {
12713                         RExC_parse++;
12714                         /* diag_listed_as: Use of %s is not allowed in Unicode
12715                            property wildcard subpatterns in regex; marked by
12716                            <-- HERE in m/%s/ */
12717                         vFAIL("Use of quantifier '*' is not allowed in"
12718                               " Unicode property wildcard subpatterns");
12719                         /* Note, don't need to worry about {0,}, as a '}' isn't
12720                          * legal at all in wildcards, so wouldn't get this far
12721                          * */
12722                     }
12723                     reginsert(pRExC_state, STAR, ret, depth+1);
12724                     MARK_NAUGHTY(4);
12725                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12726                     goto nest_check;
12727                 }
12728                 if (min == 1 && max == REG_INFTY) {
12729                     reginsert(pRExC_state, PLUS, ret, depth+1);
12730                     MARK_NAUGHTY(3);
12731                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12732                     goto nest_check;
12733                 }
12734                 MARK_NAUGHTY_EXP(2, 2);
12735 		reginsert(pRExC_state, CURLY, ret, depth+1);
12736                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12737                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12738 	    }
12739 	    else {
12740 		const regnode_offset w = reg_node(pRExC_state, WHILEM);
12741 
12742 		FLAGS(REGNODE_p(w)) = 0;
12743                 if (!  REGTAIL(pRExC_state, ret, w)) {
12744                     REQUIRE_BRANCHJ(flagp, 0);
12745                 }
12746 		if (RExC_use_BRANCHJ) {
12747 		    reginsert(pRExC_state, LONGJMP, ret, depth+1);
12748 		    reginsert(pRExC_state, NOTHING, ret, depth+1);
12749 		    NEXT_OFF(REGNODE_p(ret)) = 3;	/* Go over LONGJMP. */
12750 		}
12751 		reginsert(pRExC_state, CURLYX, ret, depth+1);
12752                                 /* MJD hk */
12753                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12754                 Set_Node_Length(REGNODE_p(ret),
12755                                 op == '{' ? (RExC_parse - parse_start) : 1);
12756 
12757 		if (RExC_use_BRANCHJ)
12758                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12759                                                        LONGJMP. */
12760                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12761                                                           NOTHING)))
12762                 {
12763                     REQUIRE_BRANCHJ(flagp, 0);
12764                 }
12765                 RExC_whilem_seen++;
12766                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12767 	    }
12768 	    FLAGS(REGNODE_p(ret)) = 0;
12769 
12770 	    if (min > 0)
12771 		*flagp = WORST;
12772 	    if (max > 0)
12773 		*flagp |= HASWIDTH;
12774             ARG1_SET(REGNODE_p(ret), (U16)min);
12775             ARG2_SET(REGNODE_p(ret), (U16)max);
12776             if (max == REG_INFTY)
12777                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12778 
12779 	    goto nest_check;
12780 	}
12781     }
12782 
12783     if (!ISMULT1(op)) {
12784 	*flagp = flags;
12785 	return(ret);
12786     }
12787 
12788 #if 0				/* Now runtime fix should be reliable. */
12789 
12790     /* if this is reinstated, don't forget to put this back into perldiag:
12791 
12792 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
12793 
12794 	   (F) The part of the regexp subject to either the * or + quantifier
12795            could match an empty string. The {#} shows in the regular
12796            expression about where the problem was discovered.
12797 
12798     */
12799 
12800     if (!(flags&HASWIDTH) && op != '?')
12801       vFAIL("Regexp *+ operand could be empty");
12802 #endif
12803 
12804 #ifdef RE_TRACK_PATTERN_OFFSETS
12805     parse_start = RExC_parse;
12806 #endif
12807     nextchar(pRExC_state);
12808 
12809     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12810 
12811     if (op == '*') {
12812 	min = 0;
12813 	goto do_curly;
12814     }
12815     else if (op == '+') {
12816 	min = 1;
12817 	goto do_curly;
12818     }
12819     else if (op == '?') {
12820 	min = 0; max = 1;
12821 	goto do_curly;
12822     }
12823   nest_check:
12824     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12825         if (origparse[0] == '\\' && origparse[1] == 'K') {
12826             vFAIL2utf8f(
12827                        "%" UTF8f " is forbidden - matches null string many times",
12828                        UTF8fARG(UTF, (RExC_parse >= origparse
12829                                      ? RExC_parse - origparse
12830                                      : 0),
12831                        origparse));
12832             /* NOT-REACHED */
12833         } else {
12834             ckWARN2reg(RExC_parse,
12835                        "%" UTF8f " matches null string many times",
12836                        UTF8fARG(UTF, (RExC_parse >= origparse
12837                                      ? RExC_parse - origparse
12838                                      : 0),
12839                        origparse));
12840         }
12841     }
12842 
12843     if (*RExC_parse == '?') {
12844 	nextchar(pRExC_state);
12845 	reginsert(pRExC_state, MINMOD, ret, depth+1);
12846         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12847             REQUIRE_BRANCHJ(flagp, 0);
12848         }
12849     }
12850     else if (*RExC_parse == '+') {
12851         regnode_offset ender;
12852         nextchar(pRExC_state);
12853         ender = reg_node(pRExC_state, SUCCEED);
12854         if (! REGTAIL(pRExC_state, ret, ender)) {
12855             REQUIRE_BRANCHJ(flagp, 0);
12856         }
12857         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12858         ender = reg_node(pRExC_state, TAIL);
12859         if (! REGTAIL(pRExC_state, ret, ender)) {
12860             REQUIRE_BRANCHJ(flagp, 0);
12861         }
12862     }
12863 
12864     if (ISMULT2(RExC_parse)) {
12865 	RExC_parse++;
12866 	vFAIL("Nested quantifiers");
12867     }
12868 
12869     return(ret);
12870 }
12871 
12872 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)12873 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12874                 regnode_offset * node_p,
12875                 UV * code_point_p,
12876                 int * cp_count,
12877                 I32 * flagp,
12878                 const bool strict,
12879                 const U32 depth
12880     )
12881 {
12882  /* This routine teases apart the various meanings of \N and returns
12883   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12884   * in the current context.
12885   *
12886   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12887   *
12888   * If <code_point_p> is not NULL, the context is expecting the result to be a
12889   * single code point.  If this \N instance turns out to a single code point,
12890   * the function returns TRUE and sets *code_point_p to that code point.
12891   *
12892   * If <node_p> is not NULL, the context is expecting the result to be one of
12893   * the things representable by a regnode.  If this \N instance turns out to be
12894   * one such, the function generates the regnode, returns TRUE and sets *node_p
12895   * to point to the offset of that regnode into the regex engine program being
12896   * compiled.
12897   *
12898   * If this instance of \N isn't legal in any context, this function will
12899   * generate a fatal error and not return.
12900   *
12901   * On input, RExC_parse should point to the first char following the \N at the
12902   * time of the call.  On successful return, RExC_parse will have been updated
12903   * to point to just after the sequence identified by this routine.  Also
12904   * *flagp has been updated as needed.
12905   *
12906   * When there is some problem with the current context and this \N instance,
12907   * the function returns FALSE, without advancing RExC_parse, nor setting
12908   * *node_p, nor *code_point_p, nor *flagp.
12909   *
12910   * If <cp_count> is not NULL, the caller wants to know the length (in code
12911   * points) that this \N sequence matches.  This is set, and the input is
12912   * parsed for errors, even if the function returns FALSE, as detailed below.
12913   *
12914   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12915   *
12916   * Probably the most common case is for the \N to specify a single code point.
12917   * *cp_count will be set to 1, and *code_point_p will be set to that code
12918   * point.
12919   *
12920   * Another possibility is for the input to be an empty \N{}.  This is no
12921   * longer accepted, and will generate a fatal error.
12922   *
12923   * Another possibility is for a custom charnames handler to be in effect which
12924   * translates the input name to an empty string.  *cp_count will be set to 0.
12925   * *node_p will be set to a generated NOTHING node.
12926   *
12927   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12928   * set to 0. *node_p will be set to a generated REG_ANY node.
12929   *
12930   * The fifth possibility is that \N resolves to a sequence of more than one
12931   * code points.  *cp_count will be set to the number of code points in the
12932   * sequence. *node_p will be set to a generated node returned by this
12933   * function calling S_reg().
12934   *
12935   * The final possibility is that it is premature to be calling this function;
12936   * the parse needs to be restarted.  This can happen when this changes from
12937   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12938   * latter occurs only when the fifth possibility would otherwise be in
12939   * effect, and is because one of those code points requires the pattern to be
12940   * recompiled as UTF-8.  The function returns FALSE, and sets the
12941   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12942   * happens, the caller needs to desist from continuing parsing, and return
12943   * this information to its caller.  This is not set for when there is only one
12944   * code point, as this can be called as part of an ANYOF node, and they can
12945   * store above-Latin1 code points without the pattern having to be in UTF-8.
12946   *
12947   * For non-single-quoted regexes, the tokenizer has resolved character and
12948   * sequence names inside \N{...} into their Unicode values, normalizing the
12949   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12950   * hex-represented code points in the sequence.  This is done there because
12951   * the names can vary based on what charnames pragma is in scope at the time,
12952   * so we need a way to take a snapshot of what they resolve to at the time of
12953   * the original parse. [perl #56444].
12954   *
12955   * That parsing is skipped for single-quoted regexes, so here we may get
12956   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12957   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12958   * the native character set for non-ASCII platforms.  The other possibilities
12959   * are already native, so no translation is done. */
12960 
12961     char * endbrace;    /* points to '}' following the name */
12962     char* p = RExC_parse; /* Temporary */
12963 
12964     SV * substitute_parse = NULL;
12965     char *orig_end;
12966     char *save_start;
12967     I32 flags;
12968 
12969     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12970 
12971     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12972 
12973     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12974     assert(! (node_p && cp_count));               /* At most 1 should be set */
12975 
12976     if (cp_count) {     /* Initialize return for the most common case */
12977         *cp_count = 1;
12978     }
12979 
12980     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12981      * modifier.  The other meanings do not, so use a temporary until we find
12982      * out which we are being called with */
12983     skip_to_be_ignored_text(pRExC_state, &p,
12984                             FALSE /* Don't force to /x */ );
12985 
12986     /* Disambiguate between \N meaning a named character versus \N meaning
12987      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12988      * quantifier, or if there is no '{' at all */
12989     if (*p != '{' || regcurly(p)) {
12990         RExC_parse = p;
12991         if (cp_count) {
12992             *cp_count = -1;
12993         }
12994 
12995         if (! node_p) {
12996             return FALSE;
12997         }
12998 
12999         *node_p = reg_node(pRExC_state, REG_ANY);
13000         *flagp |= HASWIDTH|SIMPLE;
13001         MARK_NAUGHTY(1);
13002         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
13003         return TRUE;
13004     }
13005 
13006     /* The test above made sure that the next real character is a '{', but
13007      * under the /x modifier, it could be separated by space (or a comment and
13008      * \n) and this is not allowed (for consistency with \x{...} and the
13009      * tokenizer handling of \N{NAME}). */
13010     if (*RExC_parse != '{') {
13011         vFAIL("Missing braces on \\N{}");
13012     }
13013 
13014     RExC_parse++;       /* Skip past the '{' */
13015 
13016     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13017     if (! endbrace) { /* no trailing brace */
13018         vFAIL2("Missing right brace on \\%c{}", 'N');
13019     }
13020 
13021     /* Here, we have decided it should be a named character or sequence.  These
13022      * imply Unicode semantics */
13023     REQUIRE_UNI_RULES(flagp, FALSE);
13024 
13025     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13026      * nothing at all (not allowed under strict) */
13027     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13028         RExC_parse = endbrace;
13029         if (strict) {
13030             RExC_parse++;   /* Position after the "}" */
13031             vFAIL("Zero length \\N{}");
13032         }
13033 
13034         if (cp_count) {
13035             *cp_count = 0;
13036         }
13037         nextchar(pRExC_state);
13038         if (! node_p) {
13039             return FALSE;
13040         }
13041 
13042         *node_p = reg_node(pRExC_state, NOTHING);
13043         return TRUE;
13044     }
13045 
13046     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13047 
13048         /* Here, the name isn't of the form  U+....  This can happen if the
13049          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
13050          * is the time to find out what the name means */
13051 
13052         const STRLEN name_len = endbrace - RExC_parse;
13053         SV *  value_sv;     /* What does this name evaluate to */
13054         SV ** value_svp;
13055         const U8 * value;   /* string of name's value */
13056         STRLEN value_len;   /* and its length */
13057 
13058         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
13059          *  toke.c, and their values. Make sure is initialized */
13060         if (! RExC_unlexed_names) {
13061             RExC_unlexed_names = newHV();
13062         }
13063 
13064         /* If we have already seen this name in this pattern, use that.  This
13065          * allows us to only call the charnames handler once per name per
13066          * pattern.  A broken or malicious handler could return something
13067          * different each time, which could cause the results to vary depending
13068          * on if something gets added or subtracted from the pattern that
13069          * causes the number of passes to change, for example */
13070         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13071                                                       name_len, 0)))
13072         {
13073             value_sv = *value_svp;
13074         }
13075         else { /* Otherwise we have to go out and get the name */
13076             const char * error_msg = NULL;
13077             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13078                                                       UTF,
13079                                                       &error_msg);
13080             if (error_msg) {
13081                 RExC_parse = endbrace;
13082                 vFAIL(error_msg);
13083             }
13084 
13085             /* If no error message, should have gotten a valid return */
13086             assert (value_sv);
13087 
13088             /* Save the name's meaning for later use */
13089             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13090                            value_sv, 0))
13091             {
13092                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13093             }
13094         }
13095 
13096         /* Here, we have the value the name evaluates to in 'value_sv' */
13097         value = (U8 *) SvPV(value_sv, value_len);
13098 
13099         /* See if the result is one code point vs 0 or multiple */
13100         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13101                                   ? UTF8SKIP(value)
13102                                   : 1)))
13103         {
13104             /* Here, exactly one code point.  If that isn't what is wanted,
13105              * fail */
13106             if (! code_point_p) {
13107                 RExC_parse = p;
13108                 return FALSE;
13109             }
13110 
13111             /* Convert from string to numeric code point */
13112             *code_point_p = (SvUTF8(value_sv))
13113                             ? valid_utf8_to_uvchr(value, NULL)
13114                             : *value;
13115 
13116             /* Have parsed this entire single code point \N{...}.  *cp_count
13117              * has already been set to 1, so don't do it again. */
13118             RExC_parse = endbrace;
13119             nextchar(pRExC_state);
13120             return TRUE;
13121         } /* End of is a single code point */
13122 
13123         /* Count the code points, if caller desires.  The API says to do this
13124          * even if we will later return FALSE */
13125         if (cp_count) {
13126             *cp_count = 0;
13127 
13128             *cp_count = (SvUTF8(value_sv))
13129                         ? utf8_length(value, value + value_len)
13130                         : value_len;
13131         }
13132 
13133         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13134          * But don't back the pointer up if the caller wants to know how many
13135          * code points there are (they need to handle it themselves in this
13136          * case).  */
13137         if (! node_p) {
13138             if (! cp_count) {
13139                 RExC_parse = p;
13140             }
13141             return FALSE;
13142         }
13143 
13144         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13145          * reg recursively to parse it.  That way, it retains its atomicness,
13146          * while not having to worry about any special handling that some code
13147          * points may have. */
13148 
13149         substitute_parse = newSVpvs("?:");
13150         sv_catsv(substitute_parse, value_sv);
13151         sv_catpv(substitute_parse, ")");
13152 
13153         /* The value should already be native, so no need to convert on EBCDIC
13154          * platforms.*/
13155         assert(! RExC_recode_x_to_native);
13156 
13157     }
13158     else {   /* \N{U+...} */
13159         Size_t count = 0;   /* code point count kept internally */
13160 
13161         /* We can get to here when the input is \N{U+...} or when toke.c has
13162          * converted a name to the \N{U+...} form.  This include changing a
13163          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13164 
13165         RExC_parse += 2;    /* Skip past the 'U+' */
13166 
13167         /* Code points are separated by dots.  The '}' terminates the whole
13168          * thing. */
13169 
13170         do {    /* Loop until the ending brace */
13171             I32 flags = PERL_SCAN_SILENT_OVERFLOW
13172                       | PERL_SCAN_SILENT_ILLDIGIT
13173                       | PERL_SCAN_NOTIFY_ILLDIGIT
13174                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13175                       | PERL_SCAN_DISALLOW_PREFIX;
13176             STRLEN len = endbrace - RExC_parse;
13177             NV overflow_value;
13178             char * start_digit = RExC_parse;
13179             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13180 
13181             if (len == 0) {
13182                 RExC_parse++;
13183               bad_NU:
13184                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13185             }
13186 
13187             RExC_parse += len;
13188 
13189             if (cp > MAX_LEGAL_CP) {
13190                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13191             }
13192 
13193             if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13194                 if (count) {
13195                     goto do_concat;
13196                 }
13197 
13198                 /* Here, is a single code point; fail if doesn't want that */
13199                 if (! code_point_p) {
13200                     RExC_parse = p;
13201                     return FALSE;
13202                 }
13203 
13204                 /* A single code point is easy to handle; just return it */
13205                 *code_point_p = UNI_TO_NATIVE(cp);
13206                 RExC_parse = endbrace;
13207                 nextchar(pRExC_state);
13208                 return TRUE;
13209             }
13210 
13211             /* Here, the parse stopped bfore the ending brace.  This is legal
13212              * only if that character is a dot separating code points, like a
13213              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13214              * So the next character must be a dot (and the one after that
13215              * can't be the endbrace, or we'd have something like \N{U+100.} )
13216              * */
13217             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13218                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13219                               ? UTF8SKIP(RExC_parse)
13220                               : 1;
13221                 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13222                                                           malformed utf8 */
13223                 goto bad_NU;
13224             }
13225 
13226             /* Here, looks like its really a multiple character sequence.  Fail
13227              * if that's not what the caller wants.  But continue with counting
13228              * and error checking if they still want a count */
13229             if (! node_p && ! cp_count) {
13230                 return FALSE;
13231             }
13232 
13233             /* What is done here is to convert this to a sub-pattern of the
13234              * form \x{char1}\x{char2}...  and then call reg recursively to
13235              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13236              * atomicness, while not having to worry about special handling
13237              * that some code points may have.  We don't create a subpattern,
13238              * but go through the motions of code point counting and error
13239              * checking, if the caller doesn't want a node returned. */
13240 
13241             if (node_p && ! substitute_parse) {
13242                 substitute_parse = newSVpvs("?:");
13243             }
13244 
13245           do_concat:
13246 
13247             if (node_p) {
13248                 /* Convert to notation the rest of the code understands */
13249                 sv_catpvs(substitute_parse, "\\x{");
13250                 sv_catpvn(substitute_parse, start_digit,
13251                                             RExC_parse - start_digit);
13252                 sv_catpvs(substitute_parse, "}");
13253             }
13254 
13255             /* Move to after the dot (or ending brace the final time through.)
13256              * */
13257             RExC_parse++;
13258             count++;
13259 
13260         } while (RExC_parse < endbrace);
13261 
13262         if (! node_p) { /* Doesn't want the node */
13263             assert (cp_count);
13264 
13265             *cp_count = count;
13266             return FALSE;
13267         }
13268 
13269         sv_catpvs(substitute_parse, ")");
13270 
13271         /* The values are Unicode, and therefore have to be converted to native
13272          * on a non-Unicode (meaning non-ASCII) platform. */
13273         SET_recode_x_to_native(1);
13274     }
13275 
13276     /* Here, we have the string the name evaluates to, ready to be parsed,
13277      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13278      * constructs.  This can be called from within a substitute parse already.
13279      * The error reporting mechanism doesn't work for 2 levels of this, but the
13280      * code above has validated this new construct, so there should be no
13281      * errors generated by the below.  And this isn' an exact copy, so the
13282      * mechanism to seamlessly deal with this won't work, so turn off warnings
13283      * during it */
13284     save_start = RExC_start;
13285     orig_end = RExC_end;
13286 
13287     RExC_parse = RExC_start = SvPVX(substitute_parse);
13288     RExC_end = RExC_parse + SvCUR(substitute_parse);
13289     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13290 
13291     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13292 
13293     /* Restore the saved values */
13294     RESTORE_WARNINGS;
13295     RExC_start = save_start;
13296     RExC_parse = endbrace;
13297     RExC_end = orig_end;
13298     SET_recode_x_to_native(0);
13299 
13300     SvREFCNT_dec_NN(substitute_parse);
13301 
13302     if (! *node_p) {
13303         RETURN_FAIL_ON_RESTART(flags, flagp);
13304         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13305             (UV) flags);
13306     }
13307     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13308 
13309     nextchar(pRExC_state);
13310 
13311     return TRUE;
13312 }
13313 
13314 
13315 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)13316 S_compute_EXACTish(RExC_state_t *pRExC_state)
13317 {
13318     U8 op;
13319 
13320     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13321 
13322     if (! FOLD) {
13323         return (LOC)
13324                 ? EXACTL
13325                 : EXACT;
13326     }
13327 
13328     op = get_regex_charset(RExC_flags);
13329     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13330         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13331                  been, so there is no hole */
13332     }
13333 
13334     return op + EXACTF;
13335 }
13336 
13337 STATIC bool
S_new_regcurly(const char * s,const char * e)13338 S_new_regcurly(const char *s, const char *e)
13339 {
13340     /* This is a temporary function designed to match the most lenient form of
13341      * a {m,n} quantifier we ever envision, with either number omitted, and
13342      * spaces anywhere between/before/after them.
13343      *
13344      * If this function fails, then the string it matches is very unlikely to
13345      * ever be considered a valid quantifier, so we can allow the '{' that
13346      * begins it to be considered as a literal */
13347 
13348     bool has_min = FALSE;
13349     bool has_max = FALSE;
13350 
13351     PERL_ARGS_ASSERT_NEW_REGCURLY;
13352 
13353     if (s >= e || *s++ != '{')
13354 	return FALSE;
13355 
13356     while (s < e && isSPACE(*s)) {
13357         s++;
13358     }
13359     while (s < e && isDIGIT(*s)) {
13360         has_min = TRUE;
13361         s++;
13362     }
13363     while (s < e && isSPACE(*s)) {
13364         s++;
13365     }
13366 
13367     if (*s == ',') {
13368 	s++;
13369         while (s < e && isSPACE(*s)) {
13370             s++;
13371         }
13372         while (s < e && isDIGIT(*s)) {
13373             has_max = TRUE;
13374             s++;
13375         }
13376         while (s < e && isSPACE(*s)) {
13377             s++;
13378         }
13379     }
13380 
13381     return s < e && *s == '}' && (has_min || has_max);
13382 }
13383 
13384 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13385  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13386 
13387 static I32
S_backref_value(char * p,char * e)13388 S_backref_value(char *p, char *e)
13389 {
13390     const char* endptr = e;
13391     UV val;
13392     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13393         return (I32)val;
13394     return I32_MAX;
13395 }
13396 
13397 
13398 /*
13399  - regatom - the lowest level
13400 
13401    Try to identify anything special at the start of the current parse position.
13402    If there is, then handle it as required. This may involve generating a
13403    single regop, such as for an assertion; or it may involve recursing, such as
13404    to handle a () structure.
13405 
13406    If the string doesn't start with something special then we gobble up
13407    as much literal text as we can.  If we encounter a quantifier, we have to
13408    back off the final literal character, as that quantifier applies to just it
13409    and not to the whole string of literals.
13410 
13411    Once we have been able to handle whatever type of thing started the
13412    sequence, we return the offset into the regex engine program being compiled
13413    at which any  next regnode should be placed.
13414 
13415    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13416    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13417    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13418    Otherwise does not return 0.
13419 
13420    Note: we have to be careful with escapes, as they can be both literal
13421    and special, and in the case of \10 and friends, context determines which.
13422 
13423    A summary of the code structure is:
13424 
13425    switch (first_byte) {
13426 	cases for each special:
13427 	    handle this special;
13428 	    break;
13429 	case '\\':
13430 	    switch (2nd byte) {
13431 		cases for each unambiguous special:
13432 		    handle this special;
13433 		    break;
13434 		cases for each ambigous special/literal:
13435 		    disambiguate;
13436 		    if (special)  handle here
13437 		    else goto defchar;
13438 		default: // unambiguously literal:
13439 		    goto defchar;
13440 	    }
13441 	default:  // is a literal char
13442 	    // FALL THROUGH
13443 	defchar:
13444 	    create EXACTish node for literal;
13445 	    while (more input and node isn't full) {
13446 		switch (input_byte) {
13447 		   cases for each special;
13448                        make sure parse pointer is set so that the next call to
13449                            regatom will see this special first
13450                        goto loopdone; // EXACTish node terminated by prev. char
13451 		   default:
13452 		       append char to EXACTISH node;
13453 		}
13454 	        get next input byte;
13455 	    }
13456         loopdone:
13457    }
13458    return the generated node;
13459 
13460    Specifically there are two separate switches for handling
13461    escape sequences, with the one for handling literal escapes requiring
13462    a dummy entry for all of the special escapes that are actually handled
13463    by the other.
13464 
13465 */
13466 
13467 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)13468 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13469 {
13470     dVAR;
13471     regnode_offset ret = 0;
13472     I32 flags = 0;
13473     char *parse_start;
13474     U8 op;
13475     int invert = 0;
13476 
13477     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13478 
13479     *flagp = WORST;		/* Tentatively. */
13480 
13481     DEBUG_PARSE("atom");
13482 
13483     PERL_ARGS_ASSERT_REGATOM;
13484 
13485   tryagain:
13486     parse_start = RExC_parse;
13487     assert(RExC_parse < RExC_end);
13488     switch ((U8)*RExC_parse) {
13489     case '^':
13490 	RExC_seen_zerolen++;
13491 	nextchar(pRExC_state);
13492 	if (RExC_flags & RXf_PMf_MULTILINE)
13493 	    ret = reg_node(pRExC_state, MBOL);
13494 	else
13495 	    ret = reg_node(pRExC_state, SBOL);
13496         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13497 	break;
13498     case '$':
13499 	nextchar(pRExC_state);
13500 	if (*RExC_parse)
13501 	    RExC_seen_zerolen++;
13502 	if (RExC_flags & RXf_PMf_MULTILINE)
13503 	    ret = reg_node(pRExC_state, MEOL);
13504 	else
13505 	    ret = reg_node(pRExC_state, SEOL);
13506         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13507 	break;
13508     case '.':
13509 	nextchar(pRExC_state);
13510 	if (RExC_flags & RXf_PMf_SINGLELINE)
13511 	    ret = reg_node(pRExC_state, SANY);
13512 	else
13513 	    ret = reg_node(pRExC_state, REG_ANY);
13514 	*flagp |= HASWIDTH|SIMPLE;
13515 	MARK_NAUGHTY(1);
13516         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13517 	break;
13518     case '[':
13519     {
13520 	char * const oregcomp_parse = ++RExC_parse;
13521         ret = regclass(pRExC_state, flagp, depth+1,
13522                        FALSE, /* means parse the whole char class */
13523                        TRUE, /* allow multi-char folds */
13524                        FALSE, /* don't silence non-portable warnings. */
13525                        (bool) RExC_strict,
13526                        TRUE, /* Allow an optimized regnode result */
13527                        NULL);
13528         if (ret == 0) {
13529             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13530             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13531                   (UV) *flagp);
13532         }
13533 	if (*RExC_parse != ']') {
13534 	    RExC_parse = oregcomp_parse;
13535 	    vFAIL("Unmatched [");
13536 	}
13537 	nextchar(pRExC_state);
13538         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13539 	break;
13540     }
13541     case '(':
13542 	nextchar(pRExC_state);
13543         ret = reg(pRExC_state, 2, &flags, depth+1);
13544 	if (ret == 0) {
13545 		if (flags & TRYAGAIN) {
13546 		    if (RExC_parse >= RExC_end) {
13547 			 /* Make parent create an empty node if needed. */
13548 			*flagp |= TRYAGAIN;
13549 			return(0);
13550 		    }
13551 		    goto tryagain;
13552 		}
13553                 RETURN_FAIL_ON_RESTART(flags, flagp);
13554                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13555                                                                  (UV) flags);
13556 	}
13557 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13558 	break;
13559     case '|':
13560     case ')':
13561 	if (flags & TRYAGAIN) {
13562 	    *flagp |= TRYAGAIN;
13563 	    return 0;
13564 	}
13565 	vFAIL("Internal urp");
13566 				/* Supposed to be caught earlier. */
13567 	break;
13568     case '?':
13569     case '+':
13570     case '*':
13571 	RExC_parse++;
13572 	vFAIL("Quantifier follows nothing");
13573 	break;
13574     case '\\':
13575 	/* Special Escapes
13576 
13577 	   This switch handles escape sequences that resolve to some kind
13578 	   of special regop and not to literal text. Escape sequences that
13579 	   resolve to literal text are handled below in the switch marked
13580 	   "Literal Escapes".
13581 
13582 	   Every entry in this switch *must* have a corresponding entry
13583 	   in the literal escape switch. However, the opposite is not
13584 	   required, as the default for this switch is to jump to the
13585 	   literal text handling code.
13586 	*/
13587 	RExC_parse++;
13588 	switch ((U8)*RExC_parse) {
13589 	/* Special Escapes */
13590 	case 'A':
13591 	    RExC_seen_zerolen++;
13592             /* Under wildcards, this is changed to match \n; should be
13593              * invisible to the user, as they have to compile under /m */
13594             if (RExC_pm_flags & PMf_WILDCARD) {
13595                 ret = reg_node(pRExC_state, MBOL);
13596             }
13597             else {
13598                 ret = reg_node(pRExC_state, SBOL);
13599                 /* SBOL is shared with /^/ so we set the flags so we can tell
13600                  * /\A/ from /^/ in split. */
13601                 FLAGS(REGNODE_p(ret)) = 1;
13602                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13603             }
13604 	    goto finish_meta_pat;
13605 	case 'G':
13606             if (RExC_pm_flags & PMf_WILDCARD) {
13607                 RExC_parse++;
13608                 /* diag_listed_as: Use of %s is not allowed in Unicode property
13609                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13610                  */
13611                 vFAIL("Use of '\\G' is not allowed in Unicode property"
13612                       " wildcard subpatterns");
13613             }
13614 	    ret = reg_node(pRExC_state, GPOS);
13615             RExC_seen |= REG_GPOS_SEEN;
13616 	    *flagp |= SIMPLE;
13617 	    goto finish_meta_pat;
13618 	case 'K':
13619             if (!RExC_in_lookaround) {
13620                 RExC_seen_zerolen++;
13621                 ret = reg_node(pRExC_state, KEEPS);
13622                 *flagp |= SIMPLE;
13623                 /* XXX:dmq : disabling in-place substitution seems to
13624                  * be necessary here to avoid cases of memory corruption, as
13625                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13626                  */
13627                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13628                 goto finish_meta_pat;
13629             }
13630             else {
13631                 ++RExC_parse; /* advance past the 'K' */
13632                 vFAIL("\\K not permitted in lookahead/lookbehind");
13633             }
13634 	case 'Z':
13635             if (RExC_pm_flags & PMf_WILDCARD) {
13636                 /* See comment under \A above */
13637                 ret = reg_node(pRExC_state, MEOL);
13638             }
13639             else {
13640                 ret = reg_node(pRExC_state, SEOL);
13641                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13642             }
13643 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13644 	    goto finish_meta_pat;
13645 	case 'z':
13646             if (RExC_pm_flags & PMf_WILDCARD) {
13647                 /* See comment under \A above */
13648                 ret = reg_node(pRExC_state, MEOL);
13649             }
13650             else {
13651                 ret = reg_node(pRExC_state, EOS);
13652                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13653             }
13654 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13655 	    goto finish_meta_pat;
13656 	case 'C':
13657 	    vFAIL("\\C no longer supported");
13658 	case 'X':
13659 	    ret = reg_node(pRExC_state, CLUMP);
13660 	    *flagp |= HASWIDTH;
13661 	    goto finish_meta_pat;
13662 
13663 	case 'B':
13664             invert = 1;
13665             /* FALLTHROUGH */
13666 	case 'b':
13667           {
13668             U8 flags = 0;
13669 	    regex_charset charset = get_regex_charset(RExC_flags);
13670 
13671 	    RExC_seen_zerolen++;
13672             RExC_seen |= REG_LOOKBEHIND_SEEN;
13673 	    op = BOUND + charset;
13674 
13675 	    if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13676                 flags = TRADITIONAL_BOUND;
13677                 if (op > BOUNDA) {  /* /aa is same as /a */
13678                     op = BOUNDA;
13679                 }
13680             }
13681             else {
13682                 STRLEN length;
13683                 char name = *RExC_parse;
13684                 char * endbrace = NULL;
13685                 RExC_parse += 2;
13686                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13687 
13688                 if (! endbrace) {
13689                     vFAIL2("Missing right brace on \\%c{}", name);
13690                 }
13691                 /* XXX Need to decide whether to take spaces or not.  Should be
13692                  * consistent with \p{}, but that currently is SPACE, which
13693                  * means vertical too, which seems wrong
13694                  * while (isBLANK(*RExC_parse)) {
13695                     RExC_parse++;
13696                 }*/
13697                 if (endbrace == RExC_parse) {
13698                     RExC_parse++;  /* After the '}' */
13699                     vFAIL2("Empty \\%c{}", name);
13700                 }
13701                 length = endbrace - RExC_parse;
13702                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13703                     length--;
13704                 }*/
13705                 switch (*RExC_parse) {
13706                     case 'g':
13707                         if (    length != 1
13708                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13709                         {
13710                             goto bad_bound_type;
13711                         }
13712                         flags = GCB_BOUND;
13713                         break;
13714                     case 'l':
13715                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13716                             goto bad_bound_type;
13717                         }
13718                         flags = LB_BOUND;
13719                         break;
13720                     case 's':
13721                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13722                             goto bad_bound_type;
13723                         }
13724                         flags = SB_BOUND;
13725                         break;
13726                     case 'w':
13727                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13728                             goto bad_bound_type;
13729                         }
13730                         flags = WB_BOUND;
13731                         break;
13732                     default:
13733                       bad_bound_type:
13734                         RExC_parse = endbrace;
13735 			vFAIL2utf8f(
13736                             "'%" UTF8f "' is an unknown bound type",
13737 			    UTF8fARG(UTF, length, endbrace - length));
13738                         NOT_REACHED; /*NOTREACHED*/
13739                 }
13740                 RExC_parse = endbrace;
13741                 REQUIRE_UNI_RULES(flagp, 0);
13742 
13743                 if (op == BOUND) {
13744                     op = BOUNDU;
13745                 }
13746                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13747                     op = BOUNDU;
13748                     length += 4;
13749 
13750                     /* Don't have to worry about UTF-8, in this message because
13751                      * to get here the contents of the \b must be ASCII */
13752                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13753                               "Using /u for '%.*s' instead of /%s",
13754                               (unsigned) length,
13755                               endbrace - length + 1,
13756                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13757                               ? ASCII_RESTRICT_PAT_MODS
13758                               : ASCII_MORE_RESTRICT_PAT_MODS);
13759                 }
13760 	    }
13761 
13762             if (op == BOUND) {
13763                 RExC_seen_d_op = TRUE;
13764             }
13765             else if (op == BOUNDL) {
13766                 RExC_contains_locale = 1;
13767             }
13768 
13769             if (invert) {
13770                 op += NBOUND - BOUND;
13771             }
13772 
13773 	    ret = reg_node(pRExC_state, op);
13774             FLAGS(REGNODE_p(ret)) = flags;
13775 
13776 	    *flagp |= SIMPLE;
13777 
13778 	    goto finish_meta_pat;
13779           }
13780 
13781 	case 'R':
13782 	    ret = reg_node(pRExC_state, LNBREAK);
13783 	    *flagp |= HASWIDTH|SIMPLE;
13784 	    goto finish_meta_pat;
13785 
13786 	case 'd':
13787 	case 'D':
13788 	case 'h':
13789 	case 'H':
13790 	case 'p':
13791 	case 'P':
13792 	case 's':
13793 	case 'S':
13794 	case 'v':
13795 	case 'V':
13796 	case 'w':
13797 	case 'W':
13798             /* These all have the same meaning inside [brackets], and it knows
13799              * how to do the best optimizations for them.  So, pretend we found
13800              * these within brackets, and let it do the work */
13801             RExC_parse--;
13802 
13803             ret = regclass(pRExC_state, flagp, depth+1,
13804                            TRUE, /* means just parse this element */
13805                            FALSE, /* don't allow multi-char folds */
13806                            FALSE, /* don't silence non-portable warnings.  It
13807                                      would be a bug if these returned
13808                                      non-portables */
13809                            (bool) RExC_strict,
13810                            TRUE, /* Allow an optimized regnode result */
13811                            NULL);
13812             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13813             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13814              * multi-char folds are allowed.  */
13815             if (!ret)
13816                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13817                       (UV) *flagp);
13818 
13819             RExC_parse--;   /* regclass() leaves this one too far ahead */
13820 
13821           finish_meta_pat:
13822                    /* The escapes above that don't take a parameter can't be
13823                     * followed by a '{'.  But 'pX', 'p{foo}' and
13824                     * correspondingly 'P' can be */
13825             if (   RExC_parse - parse_start == 1
13826                 && UCHARAT(RExC_parse + 1) == '{'
13827                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13828             {
13829                 RExC_parse += 2;
13830                 vFAIL("Unescaped left brace in regex is illegal here");
13831             }
13832             Set_Node_Offset(REGNODE_p(ret), parse_start);
13833             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13834             nextchar(pRExC_state);
13835 	    break;
13836         case 'N':
13837             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13838              * \N{...} evaluates to a sequence of more than one code points).
13839              * The function call below returns a regnode, which is our result.
13840              * The parameters cause it to fail if the \N{} evaluates to a
13841              * single code point; we handle those like any other literal.  The
13842              * reason that the multicharacter case is handled here and not as
13843              * part of the EXACtish code is because of quantifiers.  In
13844              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13845              * this way makes that Just Happen. dmq.
13846              * join_exact() will join this up with adjacent EXACTish nodes
13847              * later on, if appropriate. */
13848             ++RExC_parse;
13849             if (grok_bslash_N(pRExC_state,
13850                               &ret,     /* Want a regnode returned */
13851                               NULL,     /* Fail if evaluates to a single code
13852                                            point */
13853                               NULL,     /* Don't need a count of how many code
13854                                            points */
13855                               flagp,
13856                               RExC_strict,
13857                               depth)
13858             ) {
13859                 break;
13860             }
13861 
13862             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13863 
13864             /* Here, evaluates to a single code point.  Go get that */
13865             RExC_parse = parse_start;
13866             goto defchar;
13867 
13868 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
13869       parse_named_seq:
13870         {
13871             char ch;
13872             if (   RExC_parse >= RExC_end - 1
13873                 || ((   ch = RExC_parse[1]) != '<'
13874                                       && ch != '\''
13875                                       && ch != '{'))
13876             {
13877 	        RExC_parse++;
13878 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13879 	        vFAIL2("Sequence %.2s... not terminated", parse_start);
13880 	    } else {
13881 		RExC_parse += 2;
13882                 ret = handle_named_backref(pRExC_state,
13883                                            flagp,
13884                                            parse_start,
13885                                            (ch == '<')
13886                                            ? '>'
13887                                            : (ch == '{')
13888                                              ? '}'
13889                                              : '\'');
13890             }
13891             break;
13892 	}
13893 	case 'g':
13894 	case '1': case '2': case '3': case '4':
13895 	case '5': case '6': case '7': case '8': case '9':
13896 	    {
13897 		I32 num;
13898 		bool hasbrace = 0;
13899 
13900 		if (*RExC_parse == 'g') {
13901                     bool isrel = 0;
13902 
13903 		    RExC_parse++;
13904 		    if (*RExC_parse == '{') {
13905 		        RExC_parse++;
13906 		        hasbrace = 1;
13907 		    }
13908 		    if (*RExC_parse == '-') {
13909 		        RExC_parse++;
13910 		        isrel = 1;
13911 		    }
13912 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
13913 		        if (isrel) RExC_parse--;
13914                         RExC_parse -= 2;
13915 		        goto parse_named_seq;
13916                     }
13917 
13918                     if (RExC_parse >= RExC_end) {
13919                         goto unterminated_g;
13920                     }
13921                     num = S_backref_value(RExC_parse, RExC_end);
13922                     if (num == 0)
13923                         vFAIL("Reference to invalid group 0");
13924                     else if (num == I32_MAX) {
13925                          if (isDIGIT(*RExC_parse))
13926 			    vFAIL("Reference to nonexistent group");
13927                         else
13928                           unterminated_g:
13929                             vFAIL("Unterminated \\g... pattern");
13930                     }
13931 
13932                     if (isrel) {
13933                         num = RExC_npar - num;
13934                         if (num < 1)
13935                             vFAIL("Reference to nonexistent or unclosed group");
13936                     }
13937                 }
13938                 else {
13939                     num = S_backref_value(RExC_parse, RExC_end);
13940                     /* bare \NNN might be backref or octal - if it is larger
13941                      * than or equal RExC_npar then it is assumed to be an
13942                      * octal escape. Note RExC_npar is +1 from the actual
13943                      * number of parens. */
13944                     /* Note we do NOT check if num == I32_MAX here, as that is
13945                      * handled by the RExC_npar check */
13946 
13947                     if (
13948                         /* any numeric escape < 10 is always a backref */
13949                         num > 9
13950                         /* any numeric escape < RExC_npar is a backref */
13951                         && num >= RExC_npar
13952                         /* cannot be an octal escape if it starts with 8 */
13953                         && *RExC_parse != '8'
13954                         /* cannot be an octal escape if it starts with 9 */
13955                         && *RExC_parse != '9'
13956                     ) {
13957                         /* Probably not meant to be a backref, instead likely
13958                          * to be an octal character escape, e.g. \35 or \777.
13959                          * The above logic should make it obvious why using
13960                          * octal escapes in patterns is problematic. - Yves */
13961                         RExC_parse = parse_start;
13962                         goto defchar;
13963                     }
13964                 }
13965 
13966                 /* At this point RExC_parse points at a numeric escape like
13967                  * \12 or \88 or something similar, which we should NOT treat
13968                  * as an octal escape. It may or may not be a valid backref
13969                  * escape. For instance \88888888 is unlikely to be a valid
13970                  * backref. */
13971                 while (isDIGIT(*RExC_parse))
13972                     RExC_parse++;
13973                 if (hasbrace) {
13974                     if (*RExC_parse != '}')
13975                         vFAIL("Unterminated \\g{...} pattern");
13976                     RExC_parse++;
13977                 }
13978                 if (num >= (I32)RExC_npar) {
13979 
13980                     /* It might be a forward reference; we can't fail until we
13981                      * know, by completing the parse to get all the groups, and
13982                      * then reparsing */
13983                     if (ALL_PARENS_COUNTED)  {
13984                         if (num >= RExC_total_parens)  {
13985                             vFAIL("Reference to nonexistent group");
13986                         }
13987                     }
13988                     else {
13989                         REQUIRE_PARENS_PASS;
13990                     }
13991                 }
13992                 RExC_sawback = 1;
13993                 ret = reganode(pRExC_state,
13994                                ((! FOLD)
13995                                  ? REF
13996                                  : (ASCII_FOLD_RESTRICTED)
13997                                    ? REFFA
13998                                    : (AT_LEAST_UNI_SEMANTICS)
13999                                      ? REFFU
14000                                      : (LOC)
14001                                        ? REFFL
14002                                        : REFF),
14003                                 num);
14004                 if (OP(REGNODE_p(ret)) == REFF) {
14005                     RExC_seen_d_op = TRUE;
14006                 }
14007                 *flagp |= HASWIDTH;
14008 
14009                 /* override incorrect value set in reganode MJD */
14010                 Set_Node_Offset(REGNODE_p(ret), parse_start);
14011                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
14012                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14013                                         FALSE /* Don't force to /x */ );
14014 	    }
14015 	    break;
14016 	case '\0':
14017 	    if (RExC_parse >= RExC_end)
14018 		FAIL("Trailing \\");
14019 	    /* FALLTHROUGH */
14020 	default:
14021 	    /* Do not generate "unrecognized" warnings here, we fall
14022 	       back into the quick-grab loop below */
14023             RExC_parse = parse_start;
14024 	    goto defchar;
14025 	} /* end of switch on a \foo sequence */
14026 	break;
14027 
14028     case '#':
14029 
14030         /* '#' comments should have been spaced over before this function was
14031          * called */
14032         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14033 	/*
14034         if (RExC_flags & RXf_PMf_EXTENDED) {
14035 	    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14036 	    if (RExC_parse < RExC_end)
14037 		goto tryagain;
14038 	}
14039         */
14040 
14041 	/* FALLTHROUGH */
14042 
14043     default:
14044 	  defchar: {
14045 
14046             /* Here, we have determined that the next thing is probably a
14047              * literal character.  RExC_parse points to the first byte of its
14048              * definition.  (It still may be an escape sequence that evaluates
14049              * to a single character) */
14050 
14051 	    STRLEN len = 0;
14052 	    UV ender = 0;
14053 	    char *p;
14054 	    char *s, *old_s = NULL, *old_old_s = NULL;
14055 	    char *s0;
14056             U32 max_string_len = 255;
14057 
14058             /* We may have to reparse the node, artificially stopping filling
14059              * it early, based on info gleaned in the first parse.  This
14060              * variable gives where we stop.  Make it above the normal stopping
14061              * place first time through; otherwise it would stop too early */
14062             U32 upper_fill = max_string_len + 1;
14063 
14064             /* We start out as an EXACT node, even if under /i, until we find a
14065              * character which is in a fold.  The algorithm now segregates into
14066              * separate nodes, characters that fold from those that don't under
14067              * /i.  (This hopefully will create nodes that are fixed strings
14068              * even under /i, giving the optimizer something to grab on to.)
14069              * So, if a node has something in it and the next character is in
14070              * the opposite category, that node is closed up, and the function
14071              * returns.  Then regatom is called again, and a new node is
14072              * created for the new category. */
14073             U8 node_type = EXACT;
14074 
14075             /* Assume the node will be fully used; the excess is given back at
14076              * the end.  Under /i, we may need to temporarily add the fold of
14077              * an extra character or two at the end to check for splitting
14078              * multi-char folds, so allocate extra space for that.   We can't
14079              * make any other length assumptions, as a byte input sequence
14080              * could shrink down. */
14081             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14082                                                  + ((! FOLD)
14083                                                     ? 0
14084                                                     : 2 * ((UTF)
14085                                                            ? UTF8_MAXBYTES_CASE
14086                         /* Max non-UTF-8 expansion is 2 */ : 2)));
14087 
14088             bool next_is_quantifier;
14089             char * oldp = NULL;
14090 
14091             /* We can convert EXACTF nodes to EXACTFU if they contain only
14092              * characters that match identically regardless of the target
14093              * string's UTF8ness.  The reason to do this is that EXACTF is not
14094              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14095              * runtime.
14096              *
14097              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14098              * contain only above-Latin1 characters (hence must be in UTF8),
14099              * which don't participate in folds with Latin1-range characters,
14100              * as the latter's folds aren't known until runtime. */
14101             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14102 
14103             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14104              * allows us to override this as encountered */
14105             U8 maybe_SIMPLE = SIMPLE;
14106 
14107             /* Does this node contain something that can't match unless the
14108              * target string is (also) in UTF-8 */
14109             bool requires_utf8_target = FALSE;
14110 
14111             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14112             bool has_ss = FALSE;
14113 
14114             /* So is the MICRO SIGN */
14115             bool has_micro_sign = FALSE;
14116 
14117             /* Set when we fill up the current node and there is still more
14118              * text to process */
14119             bool overflowed;
14120 
14121             /* Allocate an EXACT node.  The node_type may change below to
14122              * another EXACTish node, but since the size of the node doesn't
14123              * change, it works */
14124             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14125                                                                     "exact");
14126             FILL_NODE(ret, node_type);
14127             RExC_emit++;
14128 
14129 	    s = STRING(REGNODE_p(ret));
14130 
14131             s0 = s;
14132 
14133 	  reparse:
14134 
14135             p = RExC_parse;
14136             len = 0;
14137             s = s0;
14138             node_type = EXACT;
14139             oldp = NULL;
14140             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14141             maybe_SIMPLE = SIMPLE;
14142             requires_utf8_target = FALSE;
14143             has_ss = FALSE;
14144             has_micro_sign = FALSE;
14145 
14146           continue_parse:
14147 
14148             /* This breaks under rare circumstances.  If folding, we do not
14149              * want to split a node at a character that is a non-final in a
14150              * multi-char fold, as an input string could just happen to want to
14151              * match across the node boundary.  The code at the end of the loop
14152              * looks for this, and backs off until it finds not such a
14153              * character, but it is possible (though extremely, extremely
14154              * unlikely) for all characters in the node to be non-final fold
14155              * ones, in which case we just leave the node fully filled, and
14156              * hope that it doesn't match the string in just the wrong place */
14157 
14158             assert( ! UTF     /* Is at the beginning of a character */
14159                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14160                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14161 
14162             overflowed = FALSE;
14163 
14164             /* Here, we have a literal character.  Find the maximal string of
14165              * them in the input that we can fit into a single EXACTish node.
14166              * We quit at the first non-literal or when the node gets full, or
14167              * under /i the categorization of folding/non-folding character
14168              * changes */
14169             while (p < RExC_end && len < upper_fill) {
14170 
14171                 /* In most cases each iteration adds one byte to the output.
14172                  * The exceptions override this */
14173                 Size_t added_len = 1;
14174 
14175 		oldp = p;
14176                 old_old_s = old_s;
14177                 old_s = s;
14178 
14179                 /* White space has already been ignored */
14180                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14181                        || ! is_PATWS_safe((p), RExC_end, UTF));
14182 
14183 		switch ((U8)*p) {
14184                   const char* message;
14185                   U32 packed_warn;
14186                   U8 grok_c_char;
14187 
14188 		case '^':
14189 		case '$':
14190 		case '.':
14191 		case '[':
14192 		case '(':
14193 		case ')':
14194 		case '|':
14195 		    goto loopdone;
14196 		case '\\':
14197 		    /* Literal Escapes Switch
14198 
14199 		       This switch is meant to handle escape sequences that
14200 		       resolve to a literal character.
14201 
14202 		       Every escape sequence that represents something
14203 		       else, like an assertion or a char class, is handled
14204 		       in the switch marked 'Special Escapes' above in this
14205 		       routine, but also has an entry here as anything that
14206 		       isn't explicitly mentioned here will be treated as
14207 		       an unescaped equivalent literal.
14208 		    */
14209 
14210 		    switch ((U8)*++p) {
14211 
14212 		    /* These are all the special escapes. */
14213 		    case 'A':             /* Start assertion */
14214 		    case 'b': case 'B':   /* Word-boundary assertion*/
14215 		    case 'C':             /* Single char !DANGEROUS! */
14216 		    case 'd': case 'D':   /* digit class */
14217 		    case 'g': case 'G':   /* generic-backref, pos assertion */
14218 		    case 'h': case 'H':   /* HORIZWS */
14219 		    case 'k': case 'K':   /* named backref, keep marker */
14220 		    case 'p': case 'P':   /* Unicode property */
14221 		              case 'R':   /* LNBREAK */
14222 		    case 's': case 'S':   /* space class */
14223 		    case 'v': case 'V':   /* VERTWS */
14224 		    case 'w': case 'W':   /* word class */
14225                     case 'X':             /* eXtended Unicode "combining
14226                                              character sequence" */
14227 		    case 'z': case 'Z':   /* End of line/string assertion */
14228 			--p;
14229 			goto loopdone;
14230 
14231 	            /* Anything after here is an escape that resolves to a
14232 	               literal. (Except digits, which may or may not)
14233 	             */
14234 		    case 'n':
14235 			ender = '\n';
14236 			p++;
14237 			break;
14238 		    case 'N': /* Handle a single-code point named character. */
14239                         RExC_parse = p + 1;
14240                         if (! grok_bslash_N(pRExC_state,
14241                                             NULL,   /* Fail if evaluates to
14242                                                        anything other than a
14243                                                        single code point */
14244                                             &ender, /* The returned single code
14245                                                        point */
14246                                             NULL,   /* Don't need a count of
14247                                                        how many code points */
14248                                             flagp,
14249                                             RExC_strict,
14250                                             depth)
14251                         ) {
14252                             if (*flagp & NEED_UTF8)
14253                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14254                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14255 
14256                             /* Here, it wasn't a single code point.  Go close
14257                              * up this EXACTish node.  The switch() prior to
14258                              * this switch handles the other cases */
14259                             RExC_parse = p = oldp;
14260                             goto loopdone;
14261                         }
14262                         p = RExC_parse;
14263                         RExC_parse = parse_start;
14264 
14265                         /* The \N{} means the pattern, if previously /d,
14266                          * becomes /u.  That means it can't be an EXACTF node,
14267                          * but an EXACTFU */
14268                         if (node_type == EXACTF) {
14269                             node_type = EXACTFU;
14270 
14271                             /* If the node already contains something that
14272                              * differs between EXACTF and EXACTFU, reparse it
14273                              * as EXACTFU */
14274                             if (! maybe_exactfu) {
14275                                 len = 0;
14276                                 s = s0;
14277                                 goto reparse;
14278                             }
14279                         }
14280 
14281                         break;
14282 		    case 'r':
14283 			ender = '\r';
14284 			p++;
14285 			break;
14286 		    case 't':
14287 			ender = '\t';
14288 			p++;
14289 			break;
14290 		    case 'f':
14291 			ender = '\f';
14292 			p++;
14293 			break;
14294 		    case 'e':
14295 			ender = ESC_NATIVE;
14296 			p++;
14297 			break;
14298 		    case 'a':
14299 			ender = '\a';
14300 			p++;
14301 			break;
14302 		    case 'o':
14303                         if (! grok_bslash_o(&p,
14304                                             RExC_end,
14305                                             &ender,
14306                                             &message,
14307                                             &packed_warn,
14308                                             (bool) RExC_strict,
14309                                             FALSE, /* No illegal cp's */
14310                                             UTF))
14311                         {
14312                             RExC_parse = p; /* going to die anyway; point to
14313                                                exact spot of failure */
14314                             vFAIL(message);
14315                         }
14316 
14317                         if (message && TO_OUTPUT_WARNINGS(p)) {
14318                             warn_non_literal_string(p, packed_warn, message);
14319                         }
14320                         break;
14321 		    case 'x':
14322                         if (! grok_bslash_x(&p,
14323                                             RExC_end,
14324                                             &ender,
14325                                             &message,
14326                                             &packed_warn,
14327                                             (bool) RExC_strict,
14328                                             FALSE, /* No illegal cp's */
14329                                             UTF))
14330                         {
14331                             RExC_parse = p;	/* going to die anyway; point
14332                                                    to exact spot of failure */
14333                             vFAIL(message);
14334                         }
14335 
14336                         if (message && TO_OUTPUT_WARNINGS(p)) {
14337                             warn_non_literal_string(p, packed_warn, message);
14338                         }
14339 
14340 #ifdef EBCDIC
14341                         if (ender < 0x100) {
14342                             if (RExC_recode_x_to_native) {
14343                                 ender = LATIN1_TO_NATIVE(ender);
14344                             }
14345                         }
14346 #endif
14347                         break;
14348 		    case 'c':
14349                         p++;
14350                         if (! grok_bslash_c(*p, &grok_c_char,
14351                                             &message, &packed_warn))
14352                         {
14353                             /* going to die anyway; point to exact spot of
14354                              * failure */
14355                             RExC_parse = p + ((UTF)
14356                                               ? UTF8_SAFE_SKIP(p, RExC_end)
14357                                               : 1);
14358                             vFAIL(message);
14359                         }
14360 
14361                         ender = grok_c_char;
14362                         p++;
14363                         if (message && TO_OUTPUT_WARNINGS(p)) {
14364                             warn_non_literal_string(p, packed_warn, message);
14365                         }
14366 
14367 			break;
14368                     case '8': case '9': /* must be a backreference */
14369                         --p;
14370                         /* we have an escape like \8 which cannot be an octal escape
14371                          * so we exit the loop, and let the outer loop handle this
14372                          * escape which may or may not be a legitimate backref. */
14373                         goto loopdone;
14374                     case '1': case '2': case '3':case '4':
14375 		    case '5': case '6': case '7':
14376                         /* When we parse backslash escapes there is ambiguity
14377                          * between backreferences and octal escapes. Any escape
14378                          * from \1 - \9 is a backreference, any multi-digit
14379                          * escape which does not start with 0 and which when
14380                          * evaluated as decimal could refer to an already
14381                          * parsed capture buffer is a back reference. Anything
14382                          * else is octal.
14383                          *
14384                          * Note this implies that \118 could be interpreted as
14385                          * 118 OR as "\11" . "8" depending on whether there
14386                          * were 118 capture buffers defined already in the
14387                          * pattern.  */
14388 
14389                         /* NOTE, RExC_npar is 1 more than the actual number of
14390                          * parens we have seen so far, hence the "<" as opposed
14391                          * to "<=" */
14392                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14393                         {  /* Not to be treated as an octal constant, go
14394                                    find backref */
14395                             --p;
14396                             goto loopdone;
14397                         }
14398                         /* FALLTHROUGH */
14399                     case '0':
14400 			{
14401 			    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14402                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
14403 			    STRLEN numlen = 3;
14404 			    ender = grok_oct(p, &numlen, &flags, NULL);
14405 			    p += numlen;
14406                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14407                                 && isDIGIT(*p)  /* like \08, \178 */
14408                                 && ckWARN(WARN_REGEXP))
14409                             {
14410 				reg_warn_non_literal_string(
14411                                      p + 1,
14412                                      form_alien_digit_msg(8, numlen, p,
14413                                                         RExC_end, UTF, FALSE));
14414                             }
14415 			}
14416 			break;
14417 		    case '\0':
14418 			if (p >= RExC_end)
14419 			    FAIL("Trailing \\");
14420 			/* FALLTHROUGH */
14421 		    default:
14422 			if (isALPHANUMERIC(*p)) {
14423                             /* An alpha followed by '{' is going to fail next
14424                              * iteration, so don't output this warning in that
14425                              * case */
14426                             if (! isALPHA(*p) || *(p + 1) != '{') {
14427                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14428                                                   " passed through", p);
14429                             }
14430 			}
14431 			goto normal_default;
14432 		    } /* End of switch on '\' */
14433 		    break;
14434 		case '{':
14435                     /* Trying to gain new uses for '{' without breaking too
14436                      * much existing code is hard.  The solution currently
14437                      * adopted is:
14438                      *  1)  If there is no ambiguity that a '{' should always
14439                      *      be taken literally, at the start of a construct, we
14440                      *      just do so.
14441                      *  2)  If the literal '{' conflicts with our desired use
14442                      *      of it as a metacharacter, we die.  The deprecation
14443                      *      cycles for this have come and gone.
14444                      *  3)  If there is ambiguity, we raise a simple warning.
14445                      *      This could happen, for example, if the user
14446                      *      intended it to introduce a quantifier, but slightly
14447                      *      misspelled the quantifier.  Without this warning,
14448                      *      the quantifier would silently be taken as a literal
14449                      *      string of characters instead of a meta construct */
14450 		    if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14451                         if (      RExC_strict
14452                             || (  p > parse_start + 1
14453                                 && isALPHA_A(*(p - 1))
14454                                 && *(p - 2) == '\\')
14455                             || new_regcurly(p, RExC_end))
14456                         {
14457                             RExC_parse = p + 1;
14458                             vFAIL("Unescaped left brace in regex is "
14459                                   "illegal here");
14460                         }
14461                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14462                                          " passed through");
14463 		    }
14464 		    goto normal_default;
14465                 case '}':
14466                 case ']':
14467                     if (p > RExC_parse && RExC_strict) {
14468                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14469                     }
14470 		    /*FALLTHROUGH*/
14471 		default:    /* A literal character */
14472 		  normal_default:
14473 		    if (! UTF8_IS_INVARIANT(*p) && UTF) {
14474 			STRLEN numlen;
14475 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14476 					       &numlen, UTF8_ALLOW_DEFAULT);
14477 			p += numlen;
14478 		    }
14479 		    else
14480 			ender = (U8) *p++;
14481 		    break;
14482 		} /* End of switch on the literal */
14483 
14484 		/* Here, have looked at the literal character, and <ender>
14485                  * contains its ordinal; <p> points to the character after it.
14486                  * */
14487 
14488                 if (ender > 255) {
14489                     REQUIRE_UTF8(flagp);
14490                     if (   UNICODE_IS_PERL_EXTENDED(ender)
14491                         && TO_OUTPUT_WARNINGS(p))
14492                     {
14493                         ckWARN2_non_literal_string(p,
14494                                                    packWARN(WARN_PORTABLE),
14495                                                    PL_extended_cp_format,
14496                                                    ender);
14497                     }
14498                 }
14499 
14500                 /* We need to check if the next non-ignored thing is a
14501                  * quantifier.  Move <p> to after anything that should be
14502                  * ignored, which, as a side effect, positions <p> for the next
14503                  * loop iteration */
14504                 skip_to_be_ignored_text(pRExC_state, &p,
14505                                         FALSE /* Don't force to /x */ );
14506 
14507                 /* If the next thing is a quantifier, it applies to this
14508                  * character only, which means that this character has to be in
14509                  * its own node and can't just be appended to the string in an
14510                  * existing node, so if there are already other characters in
14511                  * the node, close the node with just them, and set up to do
14512                  * this character again next time through, when it will be the
14513                  * only thing in its new node */
14514 
14515                 next_is_quantifier =    LIKELY(p < RExC_end)
14516                                      && UNLIKELY(ISMULT2(p));
14517 
14518                 if (next_is_quantifier && LIKELY(len)) {
14519                     p = oldp;
14520                     goto loopdone;
14521                 }
14522 
14523                 /* Ready to add 'ender' to the node */
14524 
14525                 if (! FOLD) {  /* The simple case, just append the literal */
14526                   not_fold_common:
14527 
14528                     /* Don't output if it would overflow */
14529                     if (UNLIKELY(len > max_string_len - ((UTF)
14530                                                       ? UVCHR_SKIP(ender)
14531                                                       : 1)))
14532                     {
14533                         overflowed = TRUE;
14534                         break;
14535                     }
14536 
14537                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14538                         *(s++) = (char) ender;
14539                     }
14540                     else {
14541                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14542                         added_len = (char *) new_s - s;
14543                         s = (char *) new_s;
14544 
14545                         if (ender > 255)  {
14546                             requires_utf8_target = TRUE;
14547                         }
14548                     }
14549                 }
14550                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14551 
14552                     /* Here are folding under /l, and the code point is
14553                      * problematic.  If this is the first character in the
14554                      * node, change the node type to folding.   Otherwise, if
14555                      * this is the first problematic character, close up the
14556                      * existing node, so can start a new node with this one */
14557                     if (! len) {
14558                         node_type = EXACTFL;
14559                         RExC_contains_locale = 1;
14560                     }
14561                     else if (node_type == EXACT) {
14562                         p = oldp;
14563                         goto loopdone;
14564                     }
14565 
14566                     /* This problematic code point means we can't simplify
14567                      * things */
14568                     maybe_exactfu = FALSE;
14569 
14570                     /* Here, we are adding a problematic fold character.
14571                      * "Problematic" in this context means that its fold isn't
14572                      * known until runtime.  (The non-problematic code points
14573                      * are the above-Latin1 ones that fold to also all
14574                      * above-Latin1.  Their folds don't vary no matter what the
14575                      * locale is.) But here we have characters whose fold
14576                      * depends on the locale.  We just add in the unfolded
14577                      * character, and wait until runtime to fold it */
14578                     goto not_fold_common;
14579                 }
14580                 else /* regular fold; see if actually is in a fold */
14581                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14582                          || (ender > 255
14583                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14584                 {
14585                     /* Here, folding, but the character isn't in a fold.
14586                      *
14587                      * Start a new node if previous characters in the node were
14588                      * folded */
14589                     if (len && node_type != EXACT) {
14590                         p = oldp;
14591                         goto loopdone;
14592                     }
14593 
14594                     /* Here, continuing a node with non-folded characters.  Add
14595                      * this one */
14596                     goto not_fold_common;
14597                 }
14598                 else {  /* Here, does participate in some fold */
14599 
14600                     /* If this is the first character in the node, change its
14601                      * type to folding.  Otherwise, if this is the first
14602                      * folding character in the node, close up the existing
14603                      * node, so can start a new node with this one.  */
14604                     if (! len) {
14605                         node_type = compute_EXACTish(pRExC_state);
14606                     }
14607                     else if (node_type == EXACT) {
14608                         p = oldp;
14609                         goto loopdone;
14610                     }
14611 
14612                     if (UTF) {  /* Alway use the folded value for UTF-8
14613                                    patterns */
14614                         if (UVCHR_IS_INVARIANT(ender)) {
14615                             if (UNLIKELY(len + 1 > max_string_len)) {
14616                                 overflowed = TRUE;
14617                                 break;
14618                             }
14619 
14620                             *(s)++ = (U8) toFOLD(ender);
14621                         }
14622                         else {
14623                             UV folded = _to_uni_fold_flags(
14624                                     ender,
14625                                     (U8 *) s,  /* We have allocated extra space
14626                                                   in 's' so can't run off the
14627                                                   end */
14628                                     &added_len,
14629                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14630                                                     ? FOLD_FLAGS_NOMIX_ASCII
14631                                                     : 0));
14632                             if (UNLIKELY(len + added_len > max_string_len)) {
14633                                 overflowed = TRUE;
14634                                 break;
14635                             }
14636 
14637                             s += added_len;
14638 
14639                             if (   folded > 255
14640                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14641                             {
14642                                 /* U+B5 folds to the MU, so its possible for a
14643                                  * non-UTF-8 target to match it */
14644                                 requires_utf8_target = TRUE;
14645                             }
14646                         }
14647                     }
14648                     else { /* Here is non-UTF8. */
14649 
14650                         /* The fold will be one or (rarely) two characters.
14651                          * Check that there's room for at least a single one
14652                          * before setting any flags, etc.  Because otherwise an
14653                          * overflowing character could cause a flag to be set
14654                          * even though it doesn't end up in this node.  (For
14655                          * the two character fold, we check again, before
14656                          * setting any flags) */
14657                         if (UNLIKELY(len + 1 > max_string_len)) {
14658                             overflowed = TRUE;
14659                             break;
14660                         }
14661 
14662 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14663    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14664                                       || UNICODE_DOT_DOT_VERSION > 0)
14665 
14666                         /* On non-ancient Unicodes, check for the only possible
14667                          * multi-char fold  */
14668                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14669 
14670                             /* This potential multi-char fold means the node
14671                              * can't be simple (because it could match more
14672                              * than a single char).  And in some cases it will
14673                              * match 'ss', so set that flag */
14674                             maybe_SIMPLE = 0;
14675                             has_ss = TRUE;
14676 
14677                             /* It can't change to be an EXACTFU (unless already
14678                              * is one).  We fold it iff under /u rules. */
14679                             if (node_type != EXACTFU) {
14680                                 maybe_exactfu = FALSE;
14681                             }
14682                             else {
14683                                 if (UNLIKELY(len + 2 > max_string_len)) {
14684                                     overflowed = TRUE;
14685                                     break;
14686                                 }
14687 
14688                                 *(s++) = 's';
14689                                 *(s++) = 's';
14690                                 added_len = 2;
14691 
14692                                 goto done_with_this_char;
14693                             }
14694                         }
14695                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14696                                  && LIKELY(len > 0)
14697                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14698                         {
14699                             /* Also, the sequence 'ss' is special when not
14700                              * under /u.  If the target string is UTF-8, it
14701                              * should match SHARP S; otherwise it won't.  So,
14702                              * here we have to exclude the possibility of this
14703                              * node moving to /u.*/
14704                             has_ss = TRUE;
14705                             maybe_exactfu = FALSE;
14706                         }
14707 #endif
14708                         /* Here, the fold will be a single character */
14709 
14710                         if (UNLIKELY(ender == MICRO_SIGN)) {
14711                             has_micro_sign = TRUE;
14712                         }
14713                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14714 
14715                             /* If the character's fold differs between /d and
14716                              * /u, this can't change to be an EXACTFU node */
14717                             maybe_exactfu = FALSE;
14718                         }
14719 
14720                         *(s++) = (DEPENDS_SEMANTICS)
14721                                  ? (char) toFOLD(ender)
14722 
14723                                    /* Under /u, the fold of any character in
14724                                     * the 0-255 range happens to be its
14725                                     * lowercase equivalent, except for LATIN
14726                                     * SMALL LETTER SHARP S, which was handled
14727                                     * above, and the MICRO SIGN, whose fold
14728                                     * requires UTF-8 to represent.  */
14729                                  : (char) toLOWER_L1(ender);
14730                     }
14731 		} /* End of adding current character to the node */
14732 
14733               done_with_this_char:
14734 
14735                 len += added_len;
14736 
14737 		if (next_is_quantifier) {
14738 
14739                     /* Here, the next input is a quantifier, and to get here,
14740                      * the current character is the only one in the node. */
14741                     goto loopdone;
14742 		}
14743 
14744 	    } /* End of loop through literal characters */
14745 
14746             /* Here we have either exhausted the input or run out of room in
14747              * the node.  If the former, we are done.  (If we encountered a
14748              * character that can't be in the node, transfer is made directly
14749              * to <loopdone>, and so we wouldn't have fallen off the end of the
14750              * loop.)  */
14751             if (LIKELY(! overflowed)) {
14752                 goto loopdone;
14753             }
14754 
14755             /* Here we have run out of room.  We can grow plain EXACT and
14756              * LEXACT nodes.  If the pattern is gigantic enough, though,
14757              * eventually we'll have to artificially chunk the pattern into
14758              * multiple nodes. */
14759             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14760                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14761                 Size_t overhead_expansion = 0;
14762                 char temp[256];
14763                 Size_t max_nodes_for_string;
14764                 Size_t achievable;
14765                 SSize_t delta;
14766 
14767                 /* Here we couldn't fit the final character in the current
14768                  * node, so it will have to be reparsed, no matter what else we
14769                  * do */
14770                 p = oldp;
14771 
14772                 /* If would have overflowed a regular EXACT node, switch
14773                  * instead to an LEXACT.  The code below is structured so that
14774                  * the actual growing code is common to changing from an EXACT
14775                  * or just increasing the LEXACT size.  This means that we have
14776                  * to save the string in the EXACT case before growing, and
14777                  * then copy it afterwards to its new location */
14778                 if (node_type == EXACT) {
14779                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14780                     RExC_emit += overhead_expansion;
14781                     Copy(s0, temp, len, char);
14782                 }
14783 
14784                 /* Ready to grow.  If it was a plain EXACT, the string was
14785                  * saved, and the first few bytes of it overwritten by adding
14786                  * an argument field.  We assume, as we do elsewhere in this
14787                  * file, that one byte of remaining input will translate into
14788                  * one byte of output, and if that's too small, we grow again,
14789                  * if too large the excess memory is freed at the end */
14790 
14791                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14792                 achievable = MIN(max_nodes_for_string,
14793                                  current_string_nodes + STR_SZ(RExC_end - p));
14794                 delta = achievable - current_string_nodes;
14795 
14796                 /* If there is just no more room, go finish up this chunk of
14797                  * the pattern. */
14798                 if (delta <= 0) {
14799                     goto loopdone;
14800                 }
14801 
14802                 change_engine_size(pRExC_state, delta + overhead_expansion);
14803                 current_string_nodes += delta;
14804                 max_string_len
14805                            = sizeof(struct regnode) * current_string_nodes;
14806                 upper_fill = max_string_len + 1;
14807 
14808                 /* If the length was small, we know this was originally an
14809                  * EXACT node now converted to LEXACT, and the string has to be
14810                  * restored.  Otherwise the string was untouched.  260 is just
14811                  * a number safely above 255 so don't have to worry about
14812                  * getting it precise */
14813                 if (len < 260) {
14814                     node_type = LEXACT;
14815                     FILL_NODE(ret, node_type);
14816                     s0 = STRING(REGNODE_p(ret));
14817                     Copy(temp, s0, len, char);
14818                     s = s0 + len;
14819                 }
14820 
14821                 goto continue_parse;
14822             }
14823             else if (FOLD) {
14824                 bool splittable = FALSE;
14825                 bool backed_up = FALSE;
14826                 char * e;       /* should this be U8? */
14827                 char * s_start; /* should this be U8? */
14828 
14829                 /* Here is /i.  Running out of room creates a problem if we are
14830                  * folding, and the split happens in the middle of a
14831                  * multi-character fold, as a match that should have occurred,
14832                  * won't, due to the way nodes are matched, and our artificial
14833                  * boundary.  So back off until we aren't splitting such a
14834                  * fold.  If there is no such place to back off to, we end up
14835                  * taking the entire node as-is.  This can happen if the node
14836                  * consists entirely of 'f' or entirely of 's' characters (or
14837                  * things that fold to them) as 'ff' and 'ss' are
14838                  * multi-character folds.
14839                  *
14840                  * The Unicode standard says that multi character folds consist
14841                  * of either two or three characters.  That means we would be
14842                  * splitting one if the final character in the node is at the
14843                  * beginning of either type, or is the second of a three
14844                  * character fold.
14845                  *
14846                  * At this point:
14847                  *  ender     is the code point of the character that won't fit
14848                  *            in the node
14849                  *  s         points to just beyond the final byte in the node.
14850                  *            It's where we would place ender if there were
14851                  *            room, and where in fact we do place ender's fold
14852                  *            in the code below, as we've over-allocated space
14853                  *            for s0 (hence s) to allow for this
14854                  *  e         starts at 's' and advances as we append things.
14855                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14856                  *            have been advanced to beyond it).
14857                  *  old_old_s points to the beginning byte of the final
14858                  *            character in the node
14859                  *  p         points to the beginning byte in the input of the
14860                  *            character beyond 'ender'.
14861                  *  oldp      points to the beginning byte in the input of
14862                  *            'ender'.
14863                  *
14864                  * In the case of /il, we haven't folded anything that could be
14865                  * affected by the locale.  That means only above-Latin1
14866                  * characters that fold to other above-latin1 characters get
14867                  * folded at compile time.  To check where a good place to
14868                  * split nodes is, everything in it will have to be folded.
14869                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14870                  * any unfolded characters in the node. */
14871                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14872 
14873                 /* If we do need to fold the node, we need a place to store the
14874                  * folded copy, and a way to map back to the unfolded original
14875                  * */
14876                 char * locfold_buf = NULL;
14877                 Size_t * loc_correspondence = NULL;
14878 
14879                 if (! need_to_fold_loc) {   /* The normal case.  Just
14880                                                initialize to the actual node */
14881                     e = s;
14882                     s_start = s0;
14883                     s = old_old_s;  /* Point to the beginning of the final char
14884                                        that fits in the node */
14885                 }
14886                 else {
14887 
14888                     /* Here, we have filled a /il node, and there are unfolded
14889                      * characters in it.  If the runtime locale turns out to be
14890                      * UTF-8, there are possible multi-character folds, just
14891                      * like when not under /l.  The node hence can't terminate
14892                      * in the middle of such a fold.  To determine this, we
14893                      * have to create a folded copy of this node.  That means
14894                      * reparsing the node, folding everything assuming a UTF-8
14895                      * locale.  (If at runtime it isn't such a locale, the
14896                      * actions here wouldn't have been necessary, but we have
14897                      * to assume the worst case.)  If we find we need to back
14898                      * off the folded string, we do so, and then map that
14899                      * position back to the original unfolded node, which then
14900                      * gets output, truncated at that spot */
14901 
14902                     char * redo_p = RExC_parse;
14903                     char * redo_e;
14904                     char * old_redo_e;
14905 
14906                     /* Allow enough space assuming a single byte input folds to
14907                      * a single byte output, plus assume that the two unparsed
14908                      * characters (that we may need) fold to the largest number
14909                      * of bytes possible, plus extra for one more worst case
14910                      * scenario.  In the loop below, if we start eating into
14911                      * that final spare space, we enlarge this initial space */
14912                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14913 
14914                     Newxz(locfold_buf, size, char);
14915                     Newxz(loc_correspondence, size, Size_t);
14916 
14917                     /* Redo this node's parse, folding into 'locfold_buf' */
14918                     redo_p = RExC_parse;
14919                     old_redo_e = redo_e = locfold_buf;
14920                     while (redo_p <= oldp) {
14921 
14922                         old_redo_e = redo_e;
14923                         loc_correspondence[redo_e - locfold_buf]
14924                                                         = redo_p - RExC_parse;
14925 
14926                         if (UTF) {
14927                             Size_t added_len;
14928 
14929                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14930                                                        (U8 *) RExC_end,
14931                                                        (U8 *) redo_e,
14932                                                        &added_len,
14933                                                        FOLD_FLAGS_FULL);
14934                             redo_e += added_len;
14935                             redo_p += UTF8SKIP(redo_p);
14936                         }
14937                         else {
14938 
14939                             /* Note that if this code is run on some ancient
14940                              * Unicode versions, SHARP S doesn't fold to 'ss',
14941                              * but rather than clutter the code with #ifdef's,
14942                              * as is done above, we ignore that possibility.
14943                              * This is ok because this code doesn't affect what
14944                              * gets matched, but merely where the node gets
14945                              * split */
14946                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14947                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14948                             }
14949                             else {
14950                                 *redo_e++ = 's';
14951                                 *redo_e++ = 's';
14952                             }
14953                             redo_p++;
14954                         }
14955 
14956 
14957                         /* If we're getting so close to the end that a
14958                          * worst-case fold in the next character would cause us
14959                          * to overflow, increase, assuming one byte output byte
14960                          * per one byte input one, plus room for another worst
14961                          * case fold */
14962                         if (   redo_p <= oldp
14963                             && redo_e > locfold_buf + size
14964                                                     - (UTF8_MAXBYTES_CASE + 1))
14965                         {
14966                             Size_t new_size = size
14967                                             + (oldp - redo_p)
14968                                             + UTF8_MAXBYTES_CASE + 1;
14969                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14970 
14971                             Renew(locfold_buf, new_size, char);
14972                             Renew(loc_correspondence, new_size, Size_t);
14973                             size = new_size;
14974 
14975                             redo_e = locfold_buf + e_offset;
14976                         }
14977                     }
14978 
14979                     /* Set so that things are in terms of the folded, temporary
14980                      * string */
14981                     s = old_redo_e;
14982                     s_start = locfold_buf;
14983                     e = redo_e;
14984 
14985                 }
14986 
14987                 /* Here, we have 's', 's_start' and 'e' set up to point to the
14988                  * input that goes into the node, folded.
14989                  *
14990                  * If the final character of the node and the fold of ender
14991                  * form the first two characters of a three character fold, we
14992                  * need to peek ahead at the next (unparsed) character in the
14993                  * input to determine if the three actually do form such a
14994                  * fold.  Just looking at that character is not generally
14995                  * sufficient, as it could be, for example, an escape sequence
14996                  * that evaluates to something else, and it needs to be folded.
14997                  *
14998                  * khw originally thought to just go through the parse loop one
14999                  * extra time, but that doesn't work easily as that iteration
15000                  * could cause things to think that the parse is over and to
15001                  * goto loopdone.  The character could be a '$' for example, or
15002                  * the character beyond could be a quantifier, and other
15003                  * glitches as well.
15004                  *
15005                  * The solution used here for peeking ahead is to look at that
15006                  * next character.  If it isn't ASCII punctuation, then it will
15007                  * be something that continues in an EXACTish node if there
15008                  * were space.  We append the fold of it to s, having reserved
15009                  * enough room in s0 for the purpose.  If we can't reasonably
15010                  * peek ahead, we instead assume the worst case: that it is
15011                  * something that would form the completion of a multi-char
15012                  * fold.
15013                  *
15014                  * If we can't split between s and ender, we work backwards
15015                  * character-by-character down to s0.  At each current point
15016                  * see if we are at the beginning of a multi-char fold.  If so,
15017                  * that means we would be splitting the fold across nodes, and
15018                  * so we back up one and try again.
15019                  *
15020                  * If we're not at the beginning, we still could be at the
15021                  * final two characters of a (rare) three character fold.  We
15022                  * check if the sequence starting at the character before the
15023                  * current position (and including the current and next
15024                  * characters) is a three character fold.  If not, the node can
15025                  * be split here.  If it is, we have to backup two characters
15026                  * and try again.
15027                  *
15028                  * Otherwise, the node can be split at the current position.
15029                  *
15030                  * The same logic is used for UTF-8 patterns and not */
15031                 if (UTF) {
15032                     Size_t added_len;
15033 
15034                     /* Append the fold of ender */
15035                     (void) _to_uni_fold_flags(
15036                         ender,
15037                         (U8 *) e,
15038                         &added_len,
15039                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15040                                         ? FOLD_FLAGS_NOMIX_ASCII
15041                                         : 0));
15042                     e += added_len;
15043 
15044                     /* 's' and the character folded to by ender may be the
15045                      * first two of a three-character fold, in which case the
15046                      * node should not be split here.  That may mean examining
15047                      * the so-far unparsed character starting at 'p'.  But if
15048                      * ender folded to more than one character, we already have
15049                      * three characters to look at.  Also, we first check if
15050                      * the sequence consisting of s and the next character form
15051                      * the first two of some three character fold.  If not,
15052                      * there's no need to peek ahead. */
15053                     if (   added_len <= UTF8SKIP(e - added_len)
15054                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15055                     {
15056                         /* Here, the two do form the beginning of a potential
15057                          * three character fold.  The unexamined character may
15058                          * or may not complete it.  Peek at it.  It might be
15059                          * something that ends the node or an escape sequence,
15060                          * in which case we don't know without a lot of work
15061                          * what it evaluates to, so we have to assume the worst
15062                          * case: that it does complete the fold, and so we
15063                          * can't split here.  All such instances  will have
15064                          * that character be an ASCII punctuation character,
15065                          * like a backslash.  So, for that case, backup one and
15066                          * drop down to try at that position */
15067                         if (isPUNCT(*p)) {
15068                             s = (char *) utf8_hop_back((U8 *) s, -1,
15069                                        (U8 *) s_start);
15070                             backed_up = TRUE;
15071                         }
15072                         else {
15073                             /* Here, since it's not punctuation, it must be a
15074                              * real character, and we can append its fold to
15075                              * 'e' (having deliberately reserved enough space
15076                              * for this eventuality) and drop down to check if
15077                              * the three actually do form a folded sequence */
15078                             (void) _to_utf8_fold_flags(
15079                                 (U8 *) p, (U8 *) RExC_end,
15080                                 (U8 *) e,
15081                                 &added_len,
15082                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15083                                                 ? FOLD_FLAGS_NOMIX_ASCII
15084                                                 : 0));
15085                             e += added_len;
15086                         }
15087                     }
15088 
15089                     /* Here, we either have three characters available in
15090                      * sequence starting at 's', or we have two characters and
15091                      * know that the following one can't possibly be part of a
15092                      * three character fold.  We go through the node backwards
15093                      * until we find a place where we can split it without
15094                      * breaking apart a multi-character fold.  At any given
15095                      * point we have to worry about if such a fold begins at
15096                      * the current 's', and also if a three-character fold
15097                      * begins at s-1, (containing s and s+1).  Splitting in
15098                      * either case would break apart a fold */
15099                     do {
15100                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15101                                                             (U8 *) s_start);
15102 
15103                         /* If is a multi-char fold, can't split here.  Backup
15104                          * one char and try again */
15105                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15106                             s = prev_s;
15107                             backed_up = TRUE;
15108                             continue;
15109                         }
15110 
15111                         /* If the two characters beginning at 's' are part of a
15112                          * three character fold starting at the character
15113                          * before s, we can't split either before or after s.
15114                          * Backup two chars and try again */
15115                         if (   LIKELY(s > s_start)
15116                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15117                         {
15118                             s = prev_s;
15119                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15120                             backed_up = TRUE;
15121                             continue;
15122                         }
15123 
15124                         /* Here there's no multi-char fold between s and the
15125                          * next character following it.  We can split */
15126                         splittable = TRUE;
15127                         break;
15128 
15129                     } while (s > s_start); /* End of loops backing up through the node */
15130 
15131                     /* Here we either couldn't find a place to split the node,
15132                      * or else we broke out of the loop setting 'splittable' to
15133                      * true.  In the latter case, the place to split is between
15134                      * the first and second characters in the sequence starting
15135                      * at 's' */
15136                     if (splittable) {
15137                         s += UTF8SKIP(s);
15138                     }
15139                 }
15140                 else {  /* Pattern not UTF-8 */
15141                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
15142                         || ASCII_FOLD_RESTRICTED)
15143                     {
15144                         assert( toLOWER_L1(ender) < 256 );
15145                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15146                     }
15147                     else {
15148                         *e++ = 's';
15149                         *e++ = 's';
15150                     }
15151 
15152                     if (   e - s  <= 1
15153                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15154                     {
15155                         if (isPUNCT(*p)) {
15156                             s--;
15157                             backed_up = TRUE;
15158                         }
15159                         else {
15160                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15161                                 || ASCII_FOLD_RESTRICTED)
15162                             {
15163                                 assert( toLOWER_L1(ender) < 256 );
15164                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15165                             }
15166                             else {
15167                                 *e++ = 's';
15168                                 *e++ = 's';
15169                             }
15170                         }
15171                     }
15172 
15173                     do {
15174                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15175                             s--;
15176                             backed_up = TRUE;
15177                             continue;
15178                         }
15179 
15180                         if (   LIKELY(s > s_start)
15181                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15182                         {
15183                             s -= 2;
15184                             backed_up = TRUE;
15185                             continue;
15186                         }
15187 
15188                         splittable = TRUE;
15189                         break;
15190 
15191                     } while (s > s_start);
15192 
15193                     if (splittable) {
15194                         s++;
15195                     }
15196                 }
15197 
15198                 /* Here, we are done backing up.  If we didn't backup at all
15199                  * (the likely case), just proceed */
15200                 if (backed_up) {
15201 
15202                    /* If we did find a place to split, reparse the entire node
15203                     * stopping where we have calculated. */
15204                     if (splittable) {
15205 
15206                        /* If we created a temporary folded string under /l, we
15207                         * have to map that back to the original */
15208                         if (need_to_fold_loc) {
15209                             upper_fill = loc_correspondence[s - s_start];
15210                             if (upper_fill == 0) {
15211                                 FAIL2("panic: loc_correspondence[%d] is 0",
15212                                       (int) (s - s_start));
15213                             }
15214                             Safefree(locfold_buf);
15215                             Safefree(loc_correspondence);
15216                         }
15217                         else {
15218                             upper_fill = s - s0;
15219                         }
15220                         goto reparse;
15221                     }
15222 
15223                     /* Here the node consists entirely of non-final multi-char
15224                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
15225                      * decent place to split it, so give up and just take the
15226                      * whole thing */
15227                     len = old_s - s0;
15228                 }
15229 
15230                 if (need_to_fold_loc) {
15231                     Safefree(locfold_buf);
15232                     Safefree(loc_correspondence);
15233                 }
15234 	    }   /* End of verifying node ends with an appropriate char */
15235 
15236             /* We need to start the next node at the character that didn't fit
15237              * in this one */
15238             p = oldp;
15239 
15240           loopdone:   /* Jumped to when encounters something that shouldn't be
15241                          in the node */
15242 
15243             /* Free up any over-allocated space; cast is to silence bogus
15244              * warning in MS VC */
15245             change_engine_size(pRExC_state,
15246                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15247 
15248             /* I (khw) don't know if you can get here with zero length, but the
15249              * old code handled this situation by creating a zero-length EXACT
15250              * node.  Might as well be NOTHING instead */
15251             if (len == 0) {
15252                 OP(REGNODE_p(ret)) = NOTHING;
15253             }
15254             else {
15255 
15256                 /* If the node type is EXACT here, check to see if it
15257                  * should be EXACTL, or EXACT_REQ8. */
15258                 if (node_type == EXACT) {
15259                     if (LOC) {
15260                         node_type = EXACTL;
15261                     }
15262                     else if (requires_utf8_target) {
15263                         node_type = EXACT_REQ8;
15264                     }
15265                 }
15266                 else if (node_type == LEXACT) {
15267                     if (requires_utf8_target) {
15268                         node_type = LEXACT_REQ8;
15269                     }
15270                 }
15271                 else if (FOLD) {
15272                     if (    UNLIKELY(has_micro_sign || has_ss)
15273                         && (node_type == EXACTFU || (   node_type == EXACTF
15274                                                      && maybe_exactfu)))
15275                     {   /* These two conditions are problematic in non-UTF-8
15276                            EXACTFU nodes. */
15277                         assert(! UTF);
15278                         node_type = EXACTFUP;
15279                     }
15280                     else if (node_type == EXACTFL) {
15281 
15282                         /* 'maybe_exactfu' is deliberately set above to
15283                          * indicate this node type, where all code points in it
15284                          * are above 255 */
15285                         if (maybe_exactfu) {
15286                             node_type = EXACTFLU8;
15287                         }
15288                         else if (UNLIKELY(
15289                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15290                         {
15291                             /* A character that folds to more than one will
15292                              * match multiple characters, so can't be SIMPLE.
15293                              * We don't have to worry about this with EXACTFLU8
15294                              * nodes just above, as they have already been
15295                              * folded (since the fold doesn't vary at run
15296                              * time).  Here, if the final character in the node
15297                              * folds to multiple, it can't be simple.  (This
15298                              * only has an effect if the node has only a single
15299                              * character, hence the final one, as elsewhere we
15300                              * turn off simple for nodes whose length > 1 */
15301                             maybe_SIMPLE = 0;
15302                         }
15303                     }
15304                     else if (node_type == EXACTF) {  /* Means is /di */
15305 
15306                         /* This intermediate variable is needed solely because
15307                          * the asserts in the macro where used exceed Win32's
15308                          * literal string capacity */
15309                         char first_char = * STRING(REGNODE_p(ret));
15310 
15311                         /* If 'maybe_exactfu' is clear, then we need to stay
15312                          * /di.  If it is set, it means there are no code
15313                          * points that match differently depending on UTF8ness
15314                          * of the target string, so it can become an EXACTFU
15315                          * node */
15316                         if (! maybe_exactfu) {
15317                             RExC_seen_d_op = TRUE;
15318                         }
15319                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15320                                  || isALPHA_FOLD_EQ(ender, 's'))
15321                         {
15322                             /* But, if the node begins or ends in an 's' we
15323                              * have to defer changing it into an EXACTFU, as
15324                              * the node could later get joined with another one
15325                              * that ends or begins with 's' creating an 'ss'
15326                              * sequence which would then wrongly match the
15327                              * sharp s without the target being UTF-8.  We
15328                              * create a special node that we resolve later when
15329                              * we join nodes together */
15330 
15331                             node_type = EXACTFU_S_EDGE;
15332                         }
15333                         else {
15334                             node_type = EXACTFU;
15335                         }
15336                     }
15337 
15338                     if (requires_utf8_target && node_type == EXACTFU) {
15339                         node_type = EXACTFU_REQ8;
15340                     }
15341                 }
15342 
15343                 OP(REGNODE_p(ret)) = node_type;
15344                 setSTR_LEN(REGNODE_p(ret), len);
15345                 RExC_emit += STR_SZ(len);
15346 
15347                 /* If the node isn't a single character, it can't be SIMPLE */
15348                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15349                     maybe_SIMPLE = 0;
15350                 }
15351 
15352                 *flagp |= HASWIDTH | maybe_SIMPLE;
15353             }
15354 
15355             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15356             RExC_parse = p;
15357 
15358 	    {
15359 		/* len is STRLEN which is unsigned, need to copy to signed */
15360 		IV iv = len;
15361 		if (iv < 0)
15362 		    vFAIL("Internal disaster");
15363 	    }
15364 
15365 	} /* End of label 'defchar:' */
15366 	break;
15367     } /* End of giant switch on input character */
15368 
15369     /* Position parse to next real character */
15370     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15371                                             FALSE /* Don't force to /x */ );
15372     if (   *RExC_parse == '{'
15373         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15374     {
15375         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15376             RExC_parse++;
15377             vFAIL("Unescaped left brace in regex is illegal here");
15378         }
15379         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15380                                   " passed through");
15381     }
15382 
15383     return(ret);
15384 }
15385 
15386 
15387 STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)15388 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15389 {
15390     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15391      * sets up the bitmap and any flags, removing those code points from the
15392      * inversion list, setting it to NULL should it become completely empty */
15393 
15394     dVAR;
15395 
15396     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15397     assert(PL_regkind[OP(node)] == ANYOF);
15398 
15399     /* There is no bitmap for this node type */
15400     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15401         return;
15402     }
15403 
15404     ANYOF_BITMAP_ZERO(node);
15405     if (*invlist_ptr) {
15406 
15407 	/* This gets set if we actually need to modify things */
15408 	bool change_invlist = FALSE;
15409 
15410 	UV start, end;
15411 
15412 	/* Start looking through *invlist_ptr */
15413 	invlist_iterinit(*invlist_ptr);
15414 	while (invlist_iternext(*invlist_ptr, &start, &end)) {
15415 	    UV high;
15416 	    int i;
15417 
15418             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15419                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15420             }
15421 
15422 	    /* Quit if are above what we should change */
15423 	    if (start >= NUM_ANYOF_CODE_POINTS) {
15424 		break;
15425 	    }
15426 
15427 	    change_invlist = TRUE;
15428 
15429 	    /* Set all the bits in the range, up to the max that we are doing */
15430 	    high = (end < NUM_ANYOF_CODE_POINTS - 1)
15431                    ? end
15432                    : NUM_ANYOF_CODE_POINTS - 1;
15433 	    for (i = start; i <= (int) high; i++) {
15434 		if (! ANYOF_BITMAP_TEST(node, i)) {
15435 		    ANYOF_BITMAP_SET(node, i);
15436 		}
15437 	    }
15438 	}
15439 	invlist_iterfinish(*invlist_ptr);
15440 
15441         /* Done with loop; remove any code points that are in the bitmap from
15442          * *invlist_ptr; similarly for code points above the bitmap if we have
15443          * a flag to match all of them anyways */
15444 	if (change_invlist) {
15445 	    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15446 	}
15447         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15448 	    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15449 	}
15450 
15451 	/* If have completely emptied it, remove it completely */
15452 	if (_invlist_len(*invlist_ptr) == 0) {
15453 	    SvREFCNT_dec_NN(*invlist_ptr);
15454 	    *invlist_ptr = NULL;
15455 	}
15456     }
15457 }
15458 
15459 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15460    Character classes ([:foo:]) can also be negated ([:^foo:]).
15461    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15462    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15463    but trigger failures because they are currently unimplemented. */
15464 
15465 #define POSIXCC_DONE(c)   ((c) == ':')
15466 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15467 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15468 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15469 
15470 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15471 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15472 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15473 
15474 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15475 
15476 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15477  * routine. q.v. */
15478 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15479         if (posix_warnings) {                                               \
15480             if (! RExC_warn_text ) RExC_warn_text =                         \
15481                                          (AV *) sv_2mortal((SV *) newAV()); \
15482             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15483                                              WARNING_PREFIX                 \
15484                                              text                           \
15485                                              REPORT_LOCATION,               \
15486                                              REPORT_LOCATION_ARGS(p)));     \
15487         }                                                                   \
15488     } STMT_END
15489 #define CLEAR_POSIX_WARNINGS()                                              \
15490     STMT_START {                                                            \
15491         if (posix_warnings && RExC_warn_text)                               \
15492             av_clear(RExC_warn_text);                                       \
15493     } STMT_END
15494 
15495 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15496     STMT_START {                                                            \
15497         CLEAR_POSIX_WARNINGS();                                             \
15498         return ret;                                                         \
15499     } STMT_END
15500 
15501 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)15502 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15503 
15504     const char * const s,      /* Where the putative posix class begins.
15505                                   Normally, this is one past the '['.  This
15506                                   parameter exists so it can be somewhere
15507                                   besides RExC_parse. */
15508     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15509                                   NULL */
15510     AV ** posix_warnings,      /* Where to place any generated warnings, or
15511                                   NULL */
15512     const bool check_only      /* Don't die if error */
15513 )
15514 {
15515     /* This parses what the caller thinks may be one of the three POSIX
15516      * constructs:
15517      *  1) a character class, like [:blank:]
15518      *  2) a collating symbol, like [. .]
15519      *  3) an equivalence class, like [= =]
15520      * In the latter two cases, it croaks if it finds a syntactically legal
15521      * one, as these are not handled by Perl.
15522      *
15523      * The main purpose is to look for a POSIX character class.  It returns:
15524      *  a) the class number
15525      *      if it is a completely syntactically and semantically legal class.
15526      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15527      *      closing ']' of the class
15528      *  b) OOB_NAMEDCLASS
15529      *      if it appears that one of the three POSIX constructs was meant, but
15530      *      its specification was somehow defective.  'updated_parse_ptr', if
15531      *      not NULL, is set to point to the character just after the end
15532      *      character of the class.  See below for handling of warnings.
15533      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15534      *      if it  doesn't appear that a POSIX construct was intended.
15535      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15536      *      raised.
15537      *
15538      * In b) there may be errors or warnings generated.  If 'check_only' is
15539      * TRUE, then any errors are discarded.  Warnings are returned to the
15540      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15541      * instead it is NULL, warnings are suppressed.
15542      *
15543      * The reason for this function, and its complexity is that a bracketed
15544      * character class can contain just about anything.  But it's easy to
15545      * mistype the very specific posix class syntax but yielding a valid
15546      * regular bracketed class, so it silently gets compiled into something
15547      * quite unintended.
15548      *
15549      * The solution adopted here maintains backward compatibility except that
15550      * it adds a warning if it looks like a posix class was intended but
15551      * improperly specified.  The warning is not raised unless what is input
15552      * very closely resembles one of the 14 legal posix classes.  To do this,
15553      * it uses fuzzy parsing.  It calculates how many single-character edits it
15554      * would take to transform what was input into a legal posix class.  Only
15555      * if that number is quite small does it think that the intention was a
15556      * posix class.  Obviously these are heuristics, and there will be cases
15557      * where it errs on one side or another, and they can be tweaked as
15558      * experience informs.
15559      *
15560      * The syntax for a legal posix class is:
15561      *
15562      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15563      *
15564      * What this routine considers syntactically to be an intended posix class
15565      * is this (the comments indicate some restrictions that the pattern
15566      * doesn't show):
15567      *
15568      *  qr/(?x: \[?                         # The left bracket, possibly
15569      *                                      # omitted
15570      *          \h*                         # possibly followed by blanks
15571      *          (?: \^ \h* )?               # possibly a misplaced caret
15572      *          [:;]?                       # The opening class character,
15573      *                                      # possibly omitted.  A typo
15574      *                                      # semi-colon can also be used.
15575      *          \h*
15576      *          \^?                         # possibly a correctly placed
15577      *                                      # caret, but not if there was also
15578      *                                      # a misplaced one
15579      *          \h*
15580      *          .{3,15}                     # The class name.  If there are
15581      *                                      # deviations from the legal syntax,
15582      *                                      # its edit distance must be close
15583      *                                      # to a real class name in order
15584      *                                      # for it to be considered to be
15585      *                                      # an intended posix class.
15586      *          \h*
15587      *          [[:punct:]]?                # The closing class character,
15588      *                                      # possibly omitted.  If not a colon
15589      *                                      # nor semi colon, the class name
15590      *                                      # must be even closer to a valid
15591      *                                      # one
15592      *          \h*
15593      *          \]?                         # The right bracket, possibly
15594      *                                      # omitted.
15595      *     )/
15596      *
15597      * In the above, \h must be ASCII-only.
15598      *
15599      * These are heuristics, and can be tweaked as field experience dictates.
15600      * There will be cases when someone didn't intend to specify a posix class
15601      * that this warns as being so.  The goal is to minimize these, while
15602      * maximizing the catching of things intended to be a posix class that
15603      * aren't parsed as such.
15604      */
15605 
15606     const char* p             = s;
15607     const char * const e      = RExC_end;
15608     unsigned complement       = 0;      /* If to complement the class */
15609     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15610     bool has_opening_bracket  = FALSE;
15611     bool has_opening_colon    = FALSE;
15612     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15613                                                    valid class */
15614     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15615     const char* name_start;             /* ptr to class name first char */
15616 
15617     /* If the number of single-character typos the input name is away from a
15618      * legal name is no more than this number, it is considered to have meant
15619      * the legal name */
15620     int max_distance          = 2;
15621 
15622     /* to store the name.  The size determines the maximum length before we
15623      * decide that no posix class was intended.  Should be at least
15624      * sizeof("alphanumeric") */
15625     UV input_text[15];
15626     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15627 
15628     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15629 
15630     CLEAR_POSIX_WARNINGS();
15631 
15632     if (p >= e) {
15633         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15634     }
15635 
15636     if (*(p - 1) != '[') {
15637         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15638         found_problem = TRUE;
15639     }
15640     else {
15641         has_opening_bracket = TRUE;
15642     }
15643 
15644     /* They could be confused and think you can put spaces between the
15645      * components */
15646     if (isBLANK(*p)) {
15647         found_problem = TRUE;
15648 
15649         do {
15650             p++;
15651         } while (p < e && isBLANK(*p));
15652 
15653         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15654     }
15655 
15656     /* For [. .] and [= =].  These are quite different internally from [: :],
15657      * so they are handled separately.  */
15658     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15659                                             and 1 for at least one char in it
15660                                           */
15661     {
15662         const char open_char  = *p;
15663         const char * temp_ptr = p + 1;
15664 
15665         /* These two constructs are not handled by perl, and if we find a
15666          * syntactically valid one, we croak.  khw, who wrote this code, finds
15667          * this explanation of them very unclear:
15668          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15669          * And searching the rest of the internet wasn't very helpful either.
15670          * It looks like just about any byte can be in these constructs,
15671          * depending on the locale.  But unless the pattern is being compiled
15672          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15673          * In that case, it looks like [= =] isn't allowed at all, and that
15674          * [. .] could be any single code point, but for longer strings the
15675          * constituent characters would have to be the ASCII alphabetics plus
15676          * the minus-hyphen.  Any sensible locale definition would limit itself
15677          * to these.  And any portable one definitely should.  Trying to parse
15678          * the general case is a nightmare (see [perl #127604]).  So, this code
15679          * looks only for interiors of these constructs that match:
15680          *      qr/.|[-\w]{2,}/
15681          * Using \w relaxes the apparent rules a little, without adding much
15682          * danger of mistaking something else for one of these constructs.
15683          *
15684          * [. .] in some implementations described on the internet is usable to
15685          * escape a character that otherwise is special in bracketed character
15686          * classes.  For example [.].] means a literal right bracket instead of
15687          * the ending of the class
15688          *
15689          * [= =] can legitimately contain a [. .] construct, but we don't
15690          * handle this case, as that [. .] construct will later get parsed
15691          * itself and croak then.  And [= =] is checked for even when not under
15692          * /l, as Perl has long done so.
15693          *
15694          * The code below relies on there being a trailing NUL, so it doesn't
15695          * have to keep checking if the parse ptr < e.
15696          */
15697         if (temp_ptr[1] == open_char) {
15698             temp_ptr++;
15699         }
15700         else while (    temp_ptr < e
15701                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15702         {
15703             temp_ptr++;
15704         }
15705 
15706         if (*temp_ptr == open_char) {
15707             temp_ptr++;
15708             if (*temp_ptr == ']') {
15709                 temp_ptr++;
15710                 if (! found_problem && ! check_only) {
15711                     RExC_parse = (char *) temp_ptr;
15712                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15713                             "extensions", open_char, open_char);
15714                 }
15715 
15716                 /* Here, the syntax wasn't completely valid, or else the call
15717                  * is to check-only */
15718                 if (updated_parse_ptr) {
15719                     *updated_parse_ptr = (char *) temp_ptr;
15720                 }
15721 
15722                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15723             }
15724         }
15725 
15726         /* If we find something that started out to look like one of these
15727          * constructs, but isn't, we continue below so that it can be checked
15728          * for being a class name with a typo of '.' or '=' instead of a colon.
15729          * */
15730     }
15731 
15732     /* Here, we think there is a possibility that a [: :] class was meant, and
15733      * we have the first real character.  It could be they think the '^' comes
15734      * first */
15735     if (*p == '^') {
15736         found_problem = TRUE;
15737         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15738         complement = 1;
15739         p++;
15740 
15741         if (isBLANK(*p)) {
15742             found_problem = TRUE;
15743 
15744             do {
15745                 p++;
15746             } while (p < e && isBLANK(*p));
15747 
15748             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15749         }
15750     }
15751 
15752     /* But the first character should be a colon, which they could have easily
15753      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15754      * distinguish from a colon, so treat that as a colon).  */
15755     if (*p == ':') {
15756         p++;
15757         has_opening_colon = TRUE;
15758     }
15759     else if (*p == ';') {
15760         found_problem = TRUE;
15761         p++;
15762         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15763         has_opening_colon = TRUE;
15764     }
15765     else {
15766         found_problem = TRUE;
15767         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15768 
15769         /* Consider an initial punctuation (not one of the recognized ones) to
15770          * be a left terminator */
15771         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15772             p++;
15773         }
15774     }
15775 
15776     /* They may think that you can put spaces between the components */
15777     if (isBLANK(*p)) {
15778         found_problem = TRUE;
15779 
15780         do {
15781             p++;
15782         } while (p < e && isBLANK(*p));
15783 
15784         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15785     }
15786 
15787     if (*p == '^') {
15788 
15789         /* We consider something like [^:^alnum:]] to not have been intended to
15790          * be a posix class, but XXX maybe we should */
15791         if (complement) {
15792             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15793         }
15794 
15795         complement = 1;
15796         p++;
15797     }
15798 
15799     /* Again, they may think that you can put spaces between the components */
15800     if (isBLANK(*p)) {
15801         found_problem = TRUE;
15802 
15803         do {
15804             p++;
15805         } while (p < e && isBLANK(*p));
15806 
15807         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15808     }
15809 
15810     if (*p == ']') {
15811 
15812         /* XXX This ']' may be a typo, and something else was meant.  But
15813          * treating it as such creates enough complications, that that
15814          * possibility isn't currently considered here.  So we assume that the
15815          * ']' is what is intended, and if we've already found an initial '[',
15816          * this leaves this construct looking like [:] or [:^], which almost
15817          * certainly weren't intended to be posix classes */
15818         if (has_opening_bracket) {
15819             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15820         }
15821 
15822         /* But this function can be called when we parse the colon for
15823          * something like qr/[alpha:]]/, so we back up to look for the
15824          * beginning */
15825         p--;
15826 
15827         if (*p == ';') {
15828             found_problem = TRUE;
15829             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15830         }
15831         else if (*p != ':') {
15832 
15833             /* XXX We are currently very restrictive here, so this code doesn't
15834              * consider the possibility that, say, /[alpha.]]/ was intended to
15835              * be a posix class. */
15836             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15837         }
15838 
15839         /* Here we have something like 'foo:]'.  There was no initial colon,
15840          * and we back up over 'foo.  XXX Unlike the going forward case, we
15841          * don't handle typos of non-word chars in the middle */
15842         has_opening_colon = FALSE;
15843         p--;
15844 
15845         while (p > RExC_start && isWORDCHAR(*p)) {
15846             p--;
15847         }
15848         p++;
15849 
15850         /* Here, we have positioned ourselves to where we think the first
15851          * character in the potential class is */
15852     }
15853 
15854     /* Now the interior really starts.  There are certain key characters that
15855      * can end the interior, or these could just be typos.  To catch both
15856      * cases, we may have to do two passes.  In the first pass, we keep on
15857      * going unless we come to a sequence that matches
15858      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15859      * This means it takes a sequence to end the pass, so two typos in a row if
15860      * that wasn't what was intended.  If the class is perfectly formed, just
15861      * this one pass is needed.  We also stop if there are too many characters
15862      * being accumulated, but this number is deliberately set higher than any
15863      * real class.  It is set high enough so that someone who thinks that
15864      * 'alphanumeric' is a correct name would get warned that it wasn't.
15865      * While doing the pass, we keep track of where the key characters were in
15866      * it.  If we don't find an end to the class, and one of the key characters
15867      * was found, we redo the pass, but stop when we get to that character.
15868      * Thus the key character was considered a typo in the first pass, but a
15869      * terminator in the second.  If two key characters are found, we stop at
15870      * the second one in the first pass.  Again this can miss two typos, but
15871      * catches a single one
15872      *
15873      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15874      * point to the first key character.  For the second pass, it starts as -1.
15875      * */
15876 
15877     name_start = p;
15878   parse_name:
15879     {
15880         bool has_blank               = FALSE;
15881         bool has_upper               = FALSE;
15882         bool has_terminating_colon   = FALSE;
15883         bool has_terminating_bracket = FALSE;
15884         bool has_semi_colon          = FALSE;
15885         unsigned int name_len        = 0;
15886         int punct_count              = 0;
15887 
15888         while (p < e) {
15889 
15890             /* Squeeze out blanks when looking up the class name below */
15891             if (isBLANK(*p) ) {
15892                 has_blank = TRUE;
15893                 found_problem = TRUE;
15894                 p++;
15895                 continue;
15896             }
15897 
15898             /* The name will end with a punctuation */
15899             if (isPUNCT(*p)) {
15900                 const char * peek = p + 1;
15901 
15902                 /* Treat any non-']' punctuation followed by a ']' (possibly
15903                  * with intervening blanks) as trying to terminate the class.
15904                  * ']]' is very likely to mean a class was intended (but
15905                  * missing the colon), but the warning message that gets
15906                  * generated shows the error position better if we exit the
15907                  * loop at the bottom (eventually), so skip it here. */
15908                 if (*p != ']') {
15909                     if (peek < e && isBLANK(*peek)) {
15910                         has_blank = TRUE;
15911                         found_problem = TRUE;
15912                         do {
15913                             peek++;
15914                         } while (peek < e && isBLANK(*peek));
15915                     }
15916 
15917                     if (peek < e && *peek == ']') {
15918                         has_terminating_bracket = TRUE;
15919                         if (*p == ':') {
15920                             has_terminating_colon = TRUE;
15921                         }
15922                         else if (*p == ';') {
15923                             has_semi_colon = TRUE;
15924                             has_terminating_colon = TRUE;
15925                         }
15926                         else {
15927                             found_problem = TRUE;
15928                         }
15929                         p = peek + 1;
15930                         goto try_posix;
15931                     }
15932                 }
15933 
15934                 /* Here we have punctuation we thought didn't end the class.
15935                  * Keep track of the position of the key characters that are
15936                  * more likely to have been class-enders */
15937                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15938 
15939                     /* Allow just one such possible class-ender not actually
15940                      * ending the class. */
15941                     if (possible_end) {
15942                         break;
15943                     }
15944                     possible_end = p;
15945                 }
15946 
15947                 /* If we have too many punctuation characters, no use in
15948                  * keeping going */
15949                 if (++punct_count > max_distance) {
15950                     break;
15951                 }
15952 
15953                 /* Treat the punctuation as a typo. */
15954                 input_text[name_len++] = *p;
15955                 p++;
15956             }
15957             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15958                 input_text[name_len++] = toLOWER(*p);
15959                 has_upper = TRUE;
15960                 found_problem = TRUE;
15961                 p++;
15962             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15963                 input_text[name_len++] = *p;
15964                 p++;
15965             }
15966             else {
15967                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15968                 p+= UTF8SKIP(p);
15969             }
15970 
15971             /* The declaration of 'input_text' is how long we allow a potential
15972              * class name to be, before saying they didn't mean a class name at
15973              * all */
15974             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15975                 break;
15976             }
15977         }
15978 
15979         /* We get to here when the possible class name hasn't been properly
15980          * terminated before:
15981          *   1) we ran off the end of the pattern; or
15982          *   2) found two characters, each of which might have been intended to
15983          *      be the name's terminator
15984          *   3) found so many punctuation characters in the purported name,
15985          *      that the edit distance to a valid one is exceeded
15986          *   4) we decided it was more characters than anyone could have
15987          *      intended to be one. */
15988 
15989         found_problem = TRUE;
15990 
15991         /* In the final two cases, we know that looking up what we've
15992          * accumulated won't lead to a match, even a fuzzy one. */
15993         if (   name_len >= C_ARRAY_LENGTH(input_text)
15994             || punct_count > max_distance)
15995         {
15996             /* If there was an intermediate key character that could have been
15997              * an intended end, redo the parse, but stop there */
15998             if (possible_end && possible_end != (char *) -1) {
15999                 possible_end = (char *) -1; /* Special signal value to say
16000                                                we've done a first pass */
16001                 p = name_start;
16002                 goto parse_name;
16003             }
16004 
16005             /* Otherwise, it can't have meant to have been a class */
16006             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16007         }
16008 
16009         /* If we ran off the end, and the final character was a punctuation
16010          * one, back up one, to look at that final one just below.  Later, we
16011          * will restore the parse pointer if appropriate */
16012         if (name_len && p == e && isPUNCT(*(p-1))) {
16013             p--;
16014             name_len--;
16015         }
16016 
16017         if (p < e && isPUNCT(*p)) {
16018             if (*p == ']') {
16019                 has_terminating_bracket = TRUE;
16020 
16021                 /* If this is a 2nd ']', and the first one is just below this
16022                  * one, consider that to be the real terminator.  This gives a
16023                  * uniform and better positioning for the warning message  */
16024                 if (   possible_end
16025                     && possible_end != (char *) -1
16026                     && *possible_end == ']'
16027                     && name_len && input_text[name_len - 1] == ']')
16028                 {
16029                     name_len--;
16030                     p = possible_end;
16031 
16032                     /* And this is actually equivalent to having done the 2nd
16033                      * pass now, so set it to not try again */
16034                     possible_end = (char *) -1;
16035                 }
16036             }
16037             else {
16038                 if (*p == ':') {
16039                     has_terminating_colon = TRUE;
16040                 }
16041                 else if (*p == ';') {
16042                     has_semi_colon = TRUE;
16043                     has_terminating_colon = TRUE;
16044                 }
16045                 p++;
16046             }
16047         }
16048 
16049     try_posix:
16050 
16051         /* Here, we have a class name to look up.  We can short circuit the
16052          * stuff below for short names that can't possibly be meant to be a
16053          * class name.  (We can do this on the first pass, as any second pass
16054          * will yield an even shorter name) */
16055         if (name_len < 3) {
16056             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16057         }
16058 
16059         /* Find which class it is.  Initially switch on the length of the name.
16060          * */
16061         switch (name_len) {
16062             case 4:
16063                 if (memEQs(name_start, 4, "word")) {
16064                     /* this is not POSIX, this is the Perl \w */
16065                     class_number = ANYOF_WORDCHAR;
16066                 }
16067                 break;
16068             case 5:
16069                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16070                  *                        graph lower print punct space upper
16071                  * Offset 4 gives the best switch position.  */
16072                 switch (name_start[4]) {
16073                     case 'a':
16074                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
16075                             class_number = ANYOF_ALPHA;
16076                         break;
16077                     case 'e':
16078                         if (memBEGINs(name_start, 5, "spac")) /* space */
16079                             class_number = ANYOF_SPACE;
16080                         break;
16081                     case 'h':
16082                         if (memBEGINs(name_start, 5, "grap")) /* graph */
16083                             class_number = ANYOF_GRAPH;
16084                         break;
16085                     case 'i':
16086                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
16087                             class_number = ANYOF_ASCII;
16088                         break;
16089                     case 'k':
16090                         if (memBEGINs(name_start, 5, "blan")) /* blank */
16091                             class_number = ANYOF_BLANK;
16092                         break;
16093                     case 'l':
16094                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16095                             class_number = ANYOF_CNTRL;
16096                         break;
16097                     case 'm':
16098                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16099                             class_number = ANYOF_ALPHANUMERIC;
16100                         break;
16101                     case 'r':
16102                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
16103                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16104                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16105                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16106                         break;
16107                     case 't':
16108                         if (memBEGINs(name_start, 5, "digi")) /* digit */
16109                             class_number = ANYOF_DIGIT;
16110                         else if (memBEGINs(name_start, 5, "prin")) /* print */
16111                             class_number = ANYOF_PRINT;
16112                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
16113                             class_number = ANYOF_PUNCT;
16114                         break;
16115                 }
16116                 break;
16117             case 6:
16118                 if (memEQs(name_start, 6, "xdigit"))
16119                     class_number = ANYOF_XDIGIT;
16120                 break;
16121         }
16122 
16123         /* If the name exactly matches a posix class name the class number will
16124          * here be set to it, and the input almost certainly was meant to be a
16125          * posix class, so we can skip further checking.  If instead the syntax
16126          * is exactly correct, but the name isn't one of the legal ones, we
16127          * will return that as an error below.  But if neither of these apply,
16128          * it could be that no posix class was intended at all, or that one
16129          * was, but there was a typo.  We tease these apart by doing fuzzy
16130          * matching on the name */
16131         if (class_number == OOB_NAMEDCLASS && found_problem) {
16132             const UV posix_names[][6] = {
16133                                                 { 'a', 'l', 'n', 'u', 'm' },
16134                                                 { 'a', 'l', 'p', 'h', 'a' },
16135                                                 { 'a', 's', 'c', 'i', 'i' },
16136                                                 { 'b', 'l', 'a', 'n', 'k' },
16137                                                 { 'c', 'n', 't', 'r', 'l' },
16138                                                 { 'd', 'i', 'g', 'i', 't' },
16139                                                 { 'g', 'r', 'a', 'p', 'h' },
16140                                                 { 'l', 'o', 'w', 'e', 'r' },
16141                                                 { 'p', 'r', 'i', 'n', 't' },
16142                                                 { 'p', 'u', 'n', 'c', 't' },
16143                                                 { 's', 'p', 'a', 'c', 'e' },
16144                                                 { 'u', 'p', 'p', 'e', 'r' },
16145                                                 { 'w', 'o', 'r', 'd' },
16146                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
16147                                             };
16148             /* The names of the above all have added NULs to make them the same
16149              * size, so we need to also have the real lengths */
16150             const UV posix_name_lengths[] = {
16151                                                 sizeof("alnum") - 1,
16152                                                 sizeof("alpha") - 1,
16153                                                 sizeof("ascii") - 1,
16154                                                 sizeof("blank") - 1,
16155                                                 sizeof("cntrl") - 1,
16156                                                 sizeof("digit") - 1,
16157                                                 sizeof("graph") - 1,
16158                                                 sizeof("lower") - 1,
16159                                                 sizeof("print") - 1,
16160                                                 sizeof("punct") - 1,
16161                                                 sizeof("space") - 1,
16162                                                 sizeof("upper") - 1,
16163                                                 sizeof("word")  - 1,
16164                                                 sizeof("xdigit")- 1
16165                                             };
16166             unsigned int i;
16167             int temp_max = max_distance;    /* Use a temporary, so if we
16168                                                reparse, we haven't changed the
16169                                                outer one */
16170 
16171             /* Use a smaller max edit distance if we are missing one of the
16172              * delimiters */
16173             if (   has_opening_bracket + has_opening_colon < 2
16174                 || has_terminating_bracket + has_terminating_colon < 2)
16175             {
16176                 temp_max--;
16177             }
16178 
16179             /* See if the input name is close to a legal one */
16180             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16181 
16182                 /* Short circuit call if the lengths are too far apart to be
16183                  * able to match */
16184                 if (abs( (int) (name_len - posix_name_lengths[i]))
16185                     > temp_max)
16186                 {
16187                     continue;
16188                 }
16189 
16190                 if (edit_distance(input_text,
16191                                   posix_names[i],
16192                                   name_len,
16193                                   posix_name_lengths[i],
16194                                   temp_max
16195                                  )
16196                     > -1)
16197                 { /* If it is close, it probably was intended to be a class */
16198                     goto probably_meant_to_be;
16199                 }
16200             }
16201 
16202             /* Here the input name is not close enough to a valid class name
16203              * for us to consider it to be intended to be a posix class.  If
16204              * we haven't already done so, and the parse found a character that
16205              * could have been terminators for the name, but which we absorbed
16206              * as typos during the first pass, repeat the parse, signalling it
16207              * to stop at that character */
16208             if (possible_end && possible_end != (char *) -1) {
16209                 possible_end = (char *) -1;
16210                 p = name_start;
16211                 goto parse_name;
16212             }
16213 
16214             /* Here neither pass found a close-enough class name */
16215             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16216         }
16217 
16218     probably_meant_to_be:
16219 
16220         /* Here we think that a posix specification was intended.  Update any
16221          * parse pointer */
16222         if (updated_parse_ptr) {
16223             *updated_parse_ptr = (char *) p;
16224         }
16225 
16226         /* If a posix class name was intended but incorrectly specified, we
16227          * output or return the warnings */
16228         if (found_problem) {
16229 
16230             /* We set flags for these issues in the parse loop above instead of
16231              * adding them to the list of warnings, because we can parse it
16232              * twice, and we only want one warning instance */
16233             if (has_upper) {
16234                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16235             }
16236             if (has_blank) {
16237                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16238             }
16239             if (has_semi_colon) {
16240                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16241             }
16242             else if (! has_terminating_colon) {
16243                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16244             }
16245             if (! has_terminating_bracket) {
16246                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16247             }
16248 
16249             if (   posix_warnings
16250                 && RExC_warn_text
16251                 && av_top_index(RExC_warn_text) > -1)
16252             {
16253                 *posix_warnings = RExC_warn_text;
16254             }
16255         }
16256         else if (class_number != OOB_NAMEDCLASS) {
16257             /* If it is a known class, return the class.  The class number
16258              * #defines are structured so each complement is +1 to the normal
16259              * one */
16260             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16261         }
16262         else if (! check_only) {
16263 
16264             /* Here, it is an unrecognized class.  This is an error (unless the
16265             * call is to check only, which we've already handled above) */
16266             const char * const complement_string = (complement)
16267                                                    ? "^"
16268                                                    : "";
16269             RExC_parse = (char *) p;
16270             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16271                         complement_string,
16272                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16273         }
16274     }
16275 
16276     return OOB_NAMEDCLASS;
16277 }
16278 #undef ADD_POSIX_WARNING
16279 
16280 STATIC unsigned  int
S_regex_set_precedence(const U8 my_operator)16281 S_regex_set_precedence(const U8 my_operator) {
16282 
16283     /* Returns the precedence in the (?[...]) construct of the input operator,
16284      * specified by its character representation.  The precedence follows
16285      * general Perl rules, but it extends this so that ')' and ']' have (low)
16286      * precedence even though they aren't really operators */
16287 
16288     switch (my_operator) {
16289         case '!':
16290             return 5;
16291         case '&':
16292             return 4;
16293         case '^':
16294         case '|':
16295         case '+':
16296         case '-':
16297             return 3;
16298         case ')':
16299             return 2;
16300         case ']':
16301             return 1;
16302     }
16303 
16304     NOT_REACHED; /* NOTREACHED */
16305     return 0;   /* Silence compiler warning */
16306 }
16307 
16308 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth,char * const oregcomp_parse)16309 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16310                     I32 *flagp, U32 depth,
16311                     char * const oregcomp_parse)
16312 {
16313     /* Handle the (?[...]) construct to do set operations */
16314 
16315     U8 curchar;                     /* Current character being parsed */
16316     UV start, end;	            /* End points of code point ranges */
16317     SV* final = NULL;               /* The end result inversion list */
16318     SV* result_string;              /* 'final' stringified */
16319     AV* stack;                      /* stack of operators and operands not yet
16320                                        resolved */
16321     AV* fence_stack = NULL;         /* A stack containing the positions in
16322                                        'stack' of where the undealt-with left
16323                                        parens would be if they were actually
16324                                        put there */
16325     /* The 'volatile' is a workaround for an optimiser bug
16326      * in Solaris Studio 12.3. See RT #127455 */
16327     volatile IV fence = 0;          /* Position of where most recent undealt-
16328                                        with left paren in stack is; -1 if none.
16329                                      */
16330     STRLEN len;                     /* Temporary */
16331     regnode_offset node;            /* Temporary, and final regnode returned by
16332                                        this function */
16333     const bool save_fold = FOLD;    /* Temporary */
16334     char *save_end, *save_parse;    /* Temporaries */
16335     const bool in_locale = LOC;     /* we turn off /l during processing */
16336 
16337     DECLARE_AND_GET_RE_DEBUG_FLAGS;
16338 
16339     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16340     PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16341 
16342     DEBUG_PARSE("xcls");
16343 
16344     if (in_locale) {
16345         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16346     }
16347 
16348     /* The use of this operator implies /u.  This is required so that the
16349      * compile time values are valid in all runtime cases */
16350     REQUIRE_UNI_RULES(flagp, 0);
16351 
16352     ckWARNexperimental(RExC_parse,
16353                        WARN_EXPERIMENTAL__REGEX_SETS,
16354                        "The regex_sets feature is experimental");
16355 
16356     /* Everything in this construct is a metacharacter.  Operands begin with
16357      * either a '\' (for an escape sequence), or a '[' for a bracketed
16358      * character class.  Any other character should be an operator, or
16359      * parenthesis for grouping.  Both types of operands are handled by calling
16360      * regclass() to parse them.  It is called with a parameter to indicate to
16361      * return the computed inversion list.  The parsing here is implemented via
16362      * a stack.  Each entry on the stack is a single character representing one
16363      * of the operators; or else a pointer to an operand inversion list. */
16364 
16365 #define IS_OPERATOR(a) SvIOK(a)
16366 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16367 
16368     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16369      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16370      * with pronouncing it called it Reverse Polish instead, but now that YOU
16371      * know how to pronounce it you can use the correct term, thus giving due
16372      * credit to the person who invented it, and impressing your geek friends.
16373      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16374      * it is now more like an English initial W (as in wonk) than an L.)
16375      *
16376      * This means that, for example, 'a | b & c' is stored on the stack as
16377      *
16378      * c  [4]
16379      * b  [3]
16380      * &  [2]
16381      * a  [1]
16382      * |  [0]
16383      *
16384      * where the numbers in brackets give the stack [array] element number.
16385      * In this implementation, parentheses are not stored on the stack.
16386      * Instead a '(' creates a "fence" so that the part of the stack below the
16387      * fence is invisible except to the corresponding ')' (this allows us to
16388      * replace testing for parens, by using instead subtraction of the fence
16389      * position).  As new operands are processed they are pushed onto the stack
16390      * (except as noted in the next paragraph).  New operators of higher
16391      * precedence than the current final one are inserted on the stack before
16392      * the lhs operand (so that when the rhs is pushed next, everything will be
16393      * in the correct positions shown above.  When an operator of equal or
16394      * lower precedence is encountered in parsing, all the stacked operations
16395      * of equal or higher precedence are evaluated, leaving the result as the
16396      * top entry on the stack.  This makes higher precedence operations
16397      * evaluate before lower precedence ones, and causes operations of equal
16398      * precedence to left associate.
16399      *
16400      * The only unary operator '!' is immediately pushed onto the stack when
16401      * encountered.  When an operand is encountered, if the top of the stack is
16402      * a '!", the complement is immediately performed, and the '!' popped.  The
16403      * resulting value is treated as a new operand, and the logic in the
16404      * previous paragraph is executed.  Thus in the expression
16405      *      [a] + ! [b]
16406      * the stack looks like
16407      *
16408      * !
16409      * a
16410      * +
16411      *
16412      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16413      * becomes
16414      *
16415      * !b
16416      * a
16417      * +
16418      *
16419      * A ')' is treated as an operator with lower precedence than all the
16420      * aforementioned ones, which causes all operations on the stack above the
16421      * corresponding '(' to be evaluated down to a single resultant operand.
16422      * Then the fence for the '(' is removed, and the operand goes through the
16423      * algorithm above, without the fence.
16424      *
16425      * A separate stack is kept of the fence positions, so that the position of
16426      * the latest so-far unbalanced '(' is at the top of it.
16427      *
16428      * The ']' ending the construct is treated as the lowest operator of all,
16429      * so that everything gets evaluated down to a single operand, which is the
16430      * result */
16431 
16432     sv_2mortal((SV *)(stack = newAV()));
16433     sv_2mortal((SV *)(fence_stack = newAV()));
16434 
16435     while (RExC_parse < RExC_end) {
16436         I32 top_index;              /* Index of top-most element in 'stack' */
16437         SV** top_ptr;               /* Pointer to top 'stack' element */
16438         SV* current = NULL;         /* To contain the current inversion list
16439                                        operand */
16440         SV* only_to_avoid_leaks;
16441 
16442         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16443                                 TRUE /* Force /x */ );
16444         if (RExC_parse >= RExC_end) {   /* Fail */
16445             break;
16446         }
16447 
16448         curchar = UCHARAT(RExC_parse);
16449 
16450 redo_curchar:
16451 
16452 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16453                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16454         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16455                                            stack, fence, fence_stack));
16456 #endif
16457 
16458         top_index = av_tindex_skip_len_mg(stack);
16459 
16460         switch (curchar) {
16461             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16462             char stacked_operator;  /* The topmost operator on the 'stack'. */
16463             SV* lhs;                /* Operand to the left of the operator */
16464             SV* rhs;                /* Operand to the right of the operator */
16465             SV* fence_ptr;          /* Pointer to top element of the fence
16466                                        stack */
16467             case '(':
16468 
16469                 if (   RExC_parse < RExC_end - 2
16470                     && UCHARAT(RExC_parse + 1) == '?'
16471                     && UCHARAT(RExC_parse + 2) == '^')
16472                 {
16473                     const regnode_offset orig_emit = RExC_emit;
16474                     SV * resultant_invlist;
16475 
16476                     /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16477                      * This happens when we have some thing like
16478                      *
16479                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16480                      *   ...
16481                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16482                      *
16483                      * Here we would be handling the interpolated
16484                      * '$thai_or_lao'.  We handle this by a recursive call to
16485                      * reg which returns the inversion list the
16486                      * interpolated expression evaluates to.  Actually, the
16487                      * return is a special regnode containing a pointer to that
16488                      * inversion list.  If the return isn't that regnode alone,
16489                      * we know that this wasn't such an interpolation, which is
16490                      * an error: we need to get a single inversion list back
16491                      * from the recursion */
16492 
16493                     RExC_parse++;
16494                     RExC_sets_depth++;
16495 
16496 	            node = reg(pRExC_state, 2, flagp, depth+1);
16497                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16498 
16499                     if (   OP(REGNODE_p(node)) != REGEX_SET
16500                            /* If more than a single node returned, the nested
16501                             * parens evaluated to more than just a (?[...]),
16502                             * which isn't legal */
16503                         || RExC_emit != orig_emit
16504                                       + NODE_STEP_REGNODE
16505                                       + regarglen[REGEX_SET])
16506                     {
16507                         vFAIL("Expecting interpolated extended charclass");
16508                     }
16509                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16510                     current = invlist_clone(resultant_invlist, NULL);
16511                     SvREFCNT_dec(resultant_invlist);
16512 
16513                     RExC_sets_depth--;
16514                     RExC_emit = orig_emit;
16515                     goto handle_operand;
16516                 }
16517 
16518                 /* A regular '('.  Look behind for illegal syntax */
16519                 if (top_index - fence >= 0) {
16520                     /* If the top entry on the stack is an operator, it had
16521                      * better be a '!', otherwise the entry below the top
16522                      * operand should be an operator */
16523                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16524                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16525                         || (   IS_OPERAND(*top_ptr)
16526                             && (   top_index - fence < 1
16527                                 || ! (stacked_ptr = av_fetch(stack,
16528                                                              top_index - 1,
16529                                                              FALSE))
16530                                 || ! IS_OPERATOR(*stacked_ptr))))
16531                     {
16532                         RExC_parse++;
16533                         vFAIL("Unexpected '(' with no preceding operator");
16534                     }
16535                 }
16536 
16537                 /* Stack the position of this undealt-with left paren */
16538                 av_push(fence_stack, newSViv(fence));
16539                 fence = top_index + 1;
16540                 break;
16541 
16542             case '\\':
16543                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16544                  * multi-char folds are allowed.  */
16545                 if (!regclass(pRExC_state, flagp, depth+1,
16546                               TRUE, /* means parse just the next thing */
16547                               FALSE, /* don't allow multi-char folds */
16548                               FALSE, /* don't silence non-portable warnings.  */
16549                               TRUE,  /* strict */
16550                               FALSE, /* Require return to be an ANYOF */
16551                               &current))
16552                 {
16553                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16554                     goto regclass_failed;
16555                 }
16556 
16557                 assert(current);
16558 
16559                 /* regclass() will return with parsing just the \ sequence,
16560                  * leaving the parse pointer at the next thing to parse */
16561                 RExC_parse--;
16562                 goto handle_operand;
16563 
16564             case '[':   /* Is a bracketed character class */
16565             {
16566                 /* See if this is a [:posix:] class. */
16567                 bool is_posix_class = (OOB_NAMEDCLASS
16568                             < handle_possible_posix(pRExC_state,
16569                                                 RExC_parse + 1,
16570                                                 NULL,
16571                                                 NULL,
16572                                                 TRUE /* checking only */));
16573                 /* If it is a posix class, leave the parse pointer at the '['
16574                  * to fool regclass() into thinking it is part of a
16575                  * '[[:posix:]]'. */
16576                 if (! is_posix_class) {
16577                     RExC_parse++;
16578                 }
16579 
16580                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16581                  * multi-char folds are allowed.  */
16582                 if (!regclass(pRExC_state, flagp, depth+1,
16583                                 is_posix_class, /* parse the whole char
16584                                                     class only if not a
16585                                                     posix class */
16586                                 FALSE, /* don't allow multi-char folds */
16587                                 TRUE, /* silence non-portable warnings. */
16588                                 TRUE, /* strict */
16589                                 FALSE, /* Require return to be an ANYOF */
16590                                 &current))
16591                 {
16592                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16593                     goto regclass_failed;
16594                 }
16595 
16596                 assert(current);
16597 
16598                 /* function call leaves parse pointing to the ']', except if we
16599                  * faked it */
16600                 if (is_posix_class) {
16601                     RExC_parse--;
16602                 }
16603 
16604                 goto handle_operand;
16605             }
16606 
16607             case ']':
16608                 if (top_index >= 1) {
16609                     goto join_operators;
16610                 }
16611 
16612                 /* Only a single operand on the stack: are done */
16613                 goto done;
16614 
16615             case ')':
16616                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16617                     if (UCHARAT(RExC_parse - 1) == ']')  {
16618                         break;
16619                     }
16620                     RExC_parse++;
16621                     vFAIL("Unexpected ')'");
16622                 }
16623 
16624                 /* If nothing after the fence, is missing an operand */
16625                 if (top_index - fence < 0) {
16626                     RExC_parse++;
16627                     goto bad_syntax;
16628                 }
16629                 /* If at least two things on the stack, treat this as an
16630                   * operator */
16631                 if (top_index - fence >= 1) {
16632                     goto join_operators;
16633                 }
16634 
16635                 /* Here only a single thing on the fenced stack, and there is a
16636                  * fence.  Get rid of it */
16637                 fence_ptr = av_pop(fence_stack);
16638                 assert(fence_ptr);
16639                 fence = SvIV(fence_ptr);
16640                 SvREFCNT_dec_NN(fence_ptr);
16641                 fence_ptr = NULL;
16642 
16643                 if (fence < 0) {
16644                     fence = 0;
16645                 }
16646 
16647                 /* Having gotten rid of the fence, we pop the operand at the
16648                  * stack top and process it as a newly encountered operand */
16649                 current = av_pop(stack);
16650                 if (IS_OPERAND(current)) {
16651                     goto handle_operand;
16652                 }
16653 
16654                 RExC_parse++;
16655                 goto bad_syntax;
16656 
16657             case '&':
16658             case '|':
16659             case '+':
16660             case '-':
16661             case '^':
16662 
16663                 /* These binary operators should have a left operand already
16664                  * parsed */
16665                 if (   top_index - fence < 0
16666                     || top_index - fence == 1
16667                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16668                     || ! IS_OPERAND(*top_ptr))
16669                 {
16670                     goto unexpected_binary;
16671                 }
16672 
16673                 /* If only the one operand is on the part of the stack visible
16674                  * to us, we just place this operator in the proper position */
16675                 if (top_index - fence < 2) {
16676 
16677                     /* Place the operator before the operand */
16678 
16679                     SV* lhs = av_pop(stack);
16680                     av_push(stack, newSVuv(curchar));
16681                     av_push(stack, lhs);
16682                     break;
16683                 }
16684 
16685                 /* But if there is something else on the stack, we need to
16686                  * process it before this new operator if and only if the
16687                  * stacked operation has equal or higher precedence than the
16688                  * new one */
16689 
16690              join_operators:
16691 
16692                 /* The operator on the stack is supposed to be below both its
16693                  * operands */
16694                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16695                     || IS_OPERAND(*stacked_ptr))
16696                 {
16697                     /* But if not, it's legal and indicates we are completely
16698                      * done if and only if we're currently processing a ']',
16699                      * which should be the final thing in the expression */
16700                     if (curchar == ']') {
16701                         goto done;
16702                     }
16703 
16704                   unexpected_binary:
16705                     RExC_parse++;
16706                     vFAIL2("Unexpected binary operator '%c' with no "
16707                            "preceding operand", curchar);
16708                 }
16709                 stacked_operator = (char) SvUV(*stacked_ptr);
16710 
16711                 if (regex_set_precedence(curchar)
16712                     > regex_set_precedence(stacked_operator))
16713                 {
16714                     /* Here, the new operator has higher precedence than the
16715                      * stacked one.  This means we need to add the new one to
16716                      * the stack to await its rhs operand (and maybe more
16717                      * stuff).  We put it before the lhs operand, leaving
16718                      * untouched the stacked operator and everything below it
16719                      * */
16720                     lhs = av_pop(stack);
16721                     assert(IS_OPERAND(lhs));
16722 
16723                     av_push(stack, newSVuv(curchar));
16724                     av_push(stack, lhs);
16725                     break;
16726                 }
16727 
16728                 /* Here, the new operator has equal or lower precedence than
16729                  * what's already there.  This means the operation already
16730                  * there should be performed now, before the new one. */
16731 
16732                 rhs = av_pop(stack);
16733                 if (! IS_OPERAND(rhs)) {
16734 
16735                     /* This can happen when a ! is not followed by an operand,
16736                      * like in /(?[\t &!])/ */
16737                     goto bad_syntax;
16738                 }
16739 
16740                 lhs = av_pop(stack);
16741 
16742                 if (! IS_OPERAND(lhs)) {
16743 
16744                     /* This can happen when there is an empty (), like in
16745                      * /(?[[0]+()+])/ */
16746                     goto bad_syntax;
16747                 }
16748 
16749                 switch (stacked_operator) {
16750                     case '&':
16751                         _invlist_intersection(lhs, rhs, &rhs);
16752                         break;
16753 
16754                     case '|':
16755                     case '+':
16756                         _invlist_union(lhs, rhs, &rhs);
16757                         break;
16758 
16759                     case '-':
16760                         _invlist_subtract(lhs, rhs, &rhs);
16761                         break;
16762 
16763                     case '^':   /* The union minus the intersection */
16764                     {
16765                         SV* i = NULL;
16766                         SV* u = NULL;
16767 
16768                         _invlist_union(lhs, rhs, &u);
16769                         _invlist_intersection(lhs, rhs, &i);
16770                         _invlist_subtract(u, i, &rhs);
16771                         SvREFCNT_dec_NN(i);
16772                         SvREFCNT_dec_NN(u);
16773                         break;
16774                     }
16775                 }
16776                 SvREFCNT_dec(lhs);
16777 
16778                 /* Here, the higher precedence operation has been done, and the
16779                  * result is in 'rhs'.  We overwrite the stacked operator with
16780                  * the result.  Then we redo this code to either push the new
16781                  * operator onto the stack or perform any higher precedence
16782                  * stacked operation */
16783                 only_to_avoid_leaks = av_pop(stack);
16784                 SvREFCNT_dec(only_to_avoid_leaks);
16785                 av_push(stack, rhs);
16786                 goto redo_curchar;
16787 
16788             case '!':   /* Highest priority, right associative */
16789 
16790                 /* If what's already at the top of the stack is another '!",
16791                  * they just cancel each other out */
16792                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16793                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16794                 {
16795                     only_to_avoid_leaks = av_pop(stack);
16796                     SvREFCNT_dec(only_to_avoid_leaks);
16797                 }
16798                 else { /* Otherwise, since it's right associative, just push
16799                           onto the stack */
16800                     av_push(stack, newSVuv(curchar));
16801                 }
16802                 break;
16803 
16804             default:
16805                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16806                 if (RExC_parse >= RExC_end) {
16807                     break;
16808                 }
16809                 vFAIL("Unexpected character");
16810 
16811           handle_operand:
16812 
16813             /* Here 'current' is the operand.  If something is already on the
16814              * stack, we have to check if it is a !.  But first, the code above
16815              * may have altered the stack in the time since we earlier set
16816              * 'top_index'.  */
16817 
16818             top_index = av_tindex_skip_len_mg(stack);
16819             if (top_index - fence >= 0) {
16820                 /* If the top entry on the stack is an operator, it had better
16821                  * be a '!', otherwise the entry below the top operand should
16822                  * be an operator */
16823                 top_ptr = av_fetch(stack, top_index, FALSE);
16824                 assert(top_ptr);
16825                 if (IS_OPERATOR(*top_ptr)) {
16826 
16827                     /* The only permissible operator at the top of the stack is
16828                      * '!', which is applied immediately to this operand. */
16829                     curchar = (char) SvUV(*top_ptr);
16830                     if (curchar != '!') {
16831                         SvREFCNT_dec(current);
16832                         vFAIL2("Unexpected binary operator '%c' with no "
16833                                 "preceding operand", curchar);
16834                     }
16835 
16836                     _invlist_invert(current);
16837 
16838                     only_to_avoid_leaks = av_pop(stack);
16839                     SvREFCNT_dec(only_to_avoid_leaks);
16840 
16841                     /* And we redo with the inverted operand.  This allows
16842                      * handling multiple ! in a row */
16843                     goto handle_operand;
16844                 }
16845                           /* Single operand is ok only for the non-binary ')'
16846                            * operator */
16847                 else if ((top_index - fence == 0 && curchar != ')')
16848                          || (top_index - fence > 0
16849                              && (! (stacked_ptr = av_fetch(stack,
16850                                                            top_index - 1,
16851                                                            FALSE))
16852                                  || IS_OPERAND(*stacked_ptr))))
16853                 {
16854                     SvREFCNT_dec(current);
16855                     vFAIL("Operand with no preceding operator");
16856                 }
16857             }
16858 
16859             /* Here there was nothing on the stack or the top element was
16860              * another operand.  Just add this new one */
16861             av_push(stack, current);
16862 
16863         } /* End of switch on next parse token */
16864 
16865         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16866     } /* End of loop parsing through the construct */
16867 
16868     vFAIL("Syntax error in (?[...])");
16869 
16870   done:
16871 
16872     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16873         if (RExC_parse < RExC_end) {
16874             RExC_parse++;
16875         }
16876 
16877         vFAIL("Unexpected ']' with no following ')' in (?[...");
16878     }
16879 
16880     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16881         vFAIL("Unmatched (");
16882     }
16883 
16884     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16885         || ((final = av_pop(stack)) == NULL)
16886         || ! IS_OPERAND(final)
16887         || ! is_invlist(final)
16888         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16889     {
16890       bad_syntax:
16891         SvREFCNT_dec(final);
16892         vFAIL("Incomplete expression within '(?[ ])'");
16893     }
16894 
16895     /* Here, 'final' is the resultant inversion list from evaluating the
16896      * expression.  Return it if so requested */
16897     if (return_invlist) {
16898         *return_invlist = final;
16899         return END;
16900     }
16901 
16902     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
16903                                regnode */
16904         RExC_parse++;
16905         node = regpnode(pRExC_state, REGEX_SET, final);
16906     }
16907     else {
16908 
16909         /* Otherwise generate a resultant node, based on 'final'.  regclass()
16910          * is expecting a string of ranges and individual code points */
16911         invlist_iterinit(final);
16912         result_string = newSVpvs("");
16913         while (invlist_iternext(final, &start, &end)) {
16914             if (start == end) {
16915                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16916             }
16917             else {
16918                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16919                                                         UVXf "}", start, end);
16920             }
16921         }
16922 
16923         /* About to generate an ANYOF (or similar) node from the inversion list
16924          * we have calculated */
16925         save_parse = RExC_parse;
16926         RExC_parse = SvPV(result_string, len);
16927         save_end = RExC_end;
16928         RExC_end = RExC_parse + len;
16929         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16930 
16931         /* We turn off folding around the call, as the class we have
16932          * constructed already has all folding taken into consideration, and we
16933          * don't want regclass() to add to that */
16934         RExC_flags &= ~RXf_PMf_FOLD;
16935         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16936          * folds are allowed.  */
16937         node = regclass(pRExC_state, flagp, depth+1,
16938                         FALSE, /* means parse the whole char class */
16939                         FALSE, /* don't allow multi-char folds */
16940                         TRUE, /* silence non-portable warnings.  The above may
16941                                  very well have generated non-portable code
16942                                  points, but they're valid on this machine */
16943                         FALSE, /* similarly, no need for strict */
16944 
16945                         /* We can optimize into something besides an ANYOF,
16946                          * except under /l, which needs to be ANYOF because of
16947                          * runtime checks for locale sanity, etc */
16948                     ! in_locale,
16949                         NULL
16950                     );
16951 
16952         RESTORE_WARNINGS;
16953         RExC_parse = save_parse + 1;
16954         RExC_end = save_end;
16955         SvREFCNT_dec_NN(final);
16956         SvREFCNT_dec_NN(result_string);
16957 
16958         if (save_fold) {
16959             RExC_flags |= RXf_PMf_FOLD;
16960         }
16961 
16962         if (!node) {
16963             RETURN_FAIL_ON_RESTART(*flagp, flagp);
16964             goto regclass_failed;
16965         }
16966 
16967         /* Fix up the node type if we are in locale.  (We have pretended we are
16968          * under /u for the purposes of regclass(), as this construct will only
16969          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
16970          * (so as to cause any warnings about bad locales to be output in
16971          * regexec.c), and add the flag that indicates to check if not in a
16972          * UTF-8 locale.  The reason we above forbid optimization into
16973          * something other than an ANYOF node is simply to minimize the number
16974          * of code changes in regexec.c.  Otherwise we would have to create new
16975          * EXACTish node types and deal with them.  This decision could be
16976          * revisited should this construct become popular.
16977          *
16978          * (One might think we could look at the resulting ANYOF node and
16979          * suppress the flag if everything is above 255, as those would be
16980          * UTF-8 only, but this isn't true, as the components that led to that
16981          * result could have been locale-affected, and just happen to cancel
16982          * each other out under UTF-8 locales.) */
16983         if (in_locale) {
16984             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16985 
16986             assert(OP(REGNODE_p(node)) == ANYOF);
16987 
16988             OP(REGNODE_p(node)) = ANYOFL;
16989             ANYOF_FLAGS(REGNODE_p(node))
16990                     |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16991         }
16992     }
16993 
16994     nextchar(pRExC_state);
16995     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16996     return node;
16997 
16998   regclass_failed:
16999     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17000                                                                 (UV) *flagp);
17001 }
17002 
17003 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17004 
17005 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)17006 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17007                              AV * stack, const IV fence, AV * fence_stack)
17008 {   /* Dumps the stacks in handle_regex_sets() */
17009 
17010     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17011     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17012     SSize_t i;
17013 
17014     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17015 
17016     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17017 
17018     if (stack_top < 0) {
17019         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17020     }
17021     else {
17022         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17023         for (i = stack_top; i >= 0; i--) {
17024             SV ** element_ptr = av_fetch(stack, i, FALSE);
17025             if (! element_ptr) {
17026             }
17027 
17028             if (IS_OPERATOR(*element_ptr)) {
17029                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17030                                             (int) i, (int) SvIV(*element_ptr));
17031             }
17032             else {
17033                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17034                 sv_dump(*element_ptr);
17035             }
17036         }
17037     }
17038 
17039     if (fence_stack_top < 0) {
17040         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17041     }
17042     else {
17043         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17044         for (i = fence_stack_top; i >= 0; i--) {
17045             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17046             if (! element_ptr) {
17047             }
17048 
17049             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17050                                             (int) i, (int) SvIV(*element_ptr));
17051         }
17052     }
17053 }
17054 
17055 #endif
17056 
17057 #undef IS_OPERATOR
17058 #undef IS_OPERAND
17059 
17060 STATIC void
S_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)17061 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17062 {
17063     /* This adds the Latin1/above-Latin1 folding rules.
17064      *
17065      * This should be called only for a Latin1-range code points, cp, which is
17066      * known to be involved in a simple fold with other code points above
17067      * Latin1.  It would give false results if /aa has been specified.
17068      * Multi-char folds are outside the scope of this, and must be handled
17069      * specially. */
17070 
17071     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17072 
17073     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17074 
17075     /* The rules that are valid for all Unicode versions are hard-coded in */
17076     switch (cp) {
17077         case 'k':
17078         case 'K':
17079           *invlist =
17080              add_cp_to_invlist(*invlist, KELVIN_SIGN);
17081             break;
17082         case 's':
17083         case 'S':
17084           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17085             break;
17086         case MICRO_SIGN:
17087           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17088           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17089             break;
17090         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17091         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17092           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17093             break;
17094         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17095           *invlist = add_cp_to_invlist(*invlist,
17096                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17097             break;
17098 
17099         default:    /* Other code points are checked against the data for the
17100                        current Unicode version */
17101           {
17102             Size_t folds_count;
17103             U32 first_fold;
17104             const U32 * remaining_folds;
17105             UV folded_cp;
17106 
17107             if (isASCII(cp)) {
17108                 folded_cp = toFOLD(cp);
17109             }
17110             else {
17111                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17112                 Size_t dummy_len;
17113                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17114             }
17115 
17116             if (folded_cp > 255) {
17117                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17118             }
17119 
17120             folds_count = _inverse_folds(folded_cp, &first_fold,
17121                                                     &remaining_folds);
17122             if (folds_count == 0) {
17123 
17124                 /* Use deprecated warning to increase the chances of this being
17125                  * output */
17126                 ckWARN2reg_d(RExC_parse,
17127                         "Perl folding rules are not up-to-date for 0x%02X;"
17128                         " please use the perlbug utility to report;", cp);
17129             }
17130             else {
17131                 unsigned int i;
17132 
17133                 if (first_fold > 255) {
17134                     *invlist = add_cp_to_invlist(*invlist, first_fold);
17135                 }
17136                 for (i = 0; i < folds_count - 1; i++) {
17137                     if (remaining_folds[i] > 255) {
17138                         *invlist = add_cp_to_invlist(*invlist,
17139                                                     remaining_folds[i]);
17140                     }
17141                 }
17142             }
17143             break;
17144          }
17145     }
17146 }
17147 
17148 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)17149 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17150 {
17151     /* Output the elements of the array given by '*posix_warnings' as REGEXP
17152      * warnings. */
17153 
17154     SV * msg;
17155     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17156 
17157     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17158 
17159     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17160         CLEAR_POSIX_WARNINGS();
17161         return;
17162     }
17163 
17164     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17165         if (first_is_fatal) {           /* Avoid leaking this */
17166             av_undef(posix_warnings);   /* This isn't necessary if the
17167                                             array is mortal, but is a
17168                                             fail-safe */
17169             (void) sv_2mortal(msg);
17170             PREPARE_TO_DIE;
17171         }
17172         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17173         SvREFCNT_dec_NN(msg);
17174     }
17175 
17176     UPDATE_WARNINGS_LOC(RExC_parse);
17177 }
17178 
17179 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)17180 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17181 {
17182     const U8 * const start = s1;
17183     const U8 * const send = start + max;
17184 
17185     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17186 
17187     while (s1 < send && *s1  == *s2) {
17188         s1++; s2++;
17189     }
17190 
17191     return s1 - start;
17192 }
17193 
17194 
17195 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)17196 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17197 {
17198     /* This adds the string scalar <multi_string> to the array
17199      * <multi_char_matches>.  <multi_string> is known to have exactly
17200      * <cp_count> code points in it.  This is used when constructing a
17201      * bracketed character class and we find something that needs to match more
17202      * than a single character.
17203      *
17204      * <multi_char_matches> is actually an array of arrays.  Each top-level
17205      * element is an array that contains all the strings known so far that are
17206      * the same length.  And that length (in number of code points) is the same
17207      * as the index of the top-level array.  Hence, the [2] element is an
17208      * array, each element thereof is a string containing TWO code points;
17209      * while element [3] is for strings of THREE characters, and so on.  Since
17210      * this is for multi-char strings there can never be a [0] nor [1] element.
17211      *
17212      * When we rewrite the character class below, we will do so such that the
17213      * longest strings are written first, so that it prefers the longest
17214      * matching strings first.  This is done even if it turns out that any
17215      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
17216      * Christiansen has agreed that this is ok.  This makes the test for the
17217      * ligature 'ffi' come before the test for 'ff', for example */
17218 
17219     AV* this_array;
17220     AV** this_array_ptr;
17221 
17222     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17223 
17224     if (! multi_char_matches) {
17225         multi_char_matches = newAV();
17226     }
17227 
17228     if (av_exists(multi_char_matches, cp_count)) {
17229         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17230         this_array = *this_array_ptr;
17231     }
17232     else {
17233         this_array = newAV();
17234         av_store(multi_char_matches, cp_count,
17235                  (SV*) this_array);
17236     }
17237     av_push(this_array, multi_string);
17238 
17239     return multi_char_matches;
17240 }
17241 
17242 /* The names of properties whose definitions are not known at compile time are
17243  * stored in this SV, after a constant heading.  So if the length has been
17244  * changed since initialization, then there is a run-time definition. */
17245 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
17246                                         (SvCUR(listsv) != initial_listsv_len)
17247 
17248 /* There is a restricted set of white space characters that are legal when
17249  * ignoring white space in a bracketed character class.  This generates the
17250  * code to skip them.
17251  *
17252  * There is a line below that uses the same white space criteria but is outside
17253  * this macro.  Both here and there must use the same definition */
17254 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
17255     STMT_START {                                                        \
17256         if (do_skip) {                                                  \
17257             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
17258             {                                                           \
17259                 p++;                                                    \
17260             }                                                           \
17261         }                                                               \
17262     } STMT_END
17263 
17264 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)17265 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17266                  const bool stop_at_1,  /* Just parse the next thing, don't
17267                                            look for a full character class */
17268                  bool allow_mutiple_chars,
17269                  const bool silence_non_portable,   /* Don't output warnings
17270                                                        about too large
17271                                                        characters */
17272                  const bool strict,
17273                  bool optimizable,                  /* ? Allow a non-ANYOF return
17274                                                        node */
17275                  SV** ret_invlist  /* Return an inversion list, not a node */
17276           )
17277 {
17278     /* parse a bracketed class specification.  Most of these will produce an
17279      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17280      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17281      * under /i with multi-character folds: it will be rewritten following the
17282      * paradigm of this example, where the <multi-fold>s are characters which
17283      * fold to multiple character sequences:
17284      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17285      * gets effectively rewritten as:
17286      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17287      * reg() gets called (recursively) on the rewritten version, and this
17288      * function will return what it constructs.  (Actually the <multi-fold>s
17289      * aren't physically removed from the [abcdefghi], it's just that they are
17290      * ignored in the recursion by means of a flag:
17291      * <RExC_in_multi_char_class>.)
17292      *
17293      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17294      * characters, with the corresponding bit set if that character is in the
17295      * list.  For characters above this, an inversion list is used.  There
17296      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17297      * determinable at compile time
17298      *
17299      * On success, returns the offset at which any next node should be placed
17300      * into the regex engine program being compiled.
17301      *
17302      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17303      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17304      * UTF-8
17305      */
17306 
17307     dVAR;
17308     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17309     IV range = 0;
17310     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17311     regnode_offset ret = -1;    /* Initialized to an illegal value */
17312     STRLEN numlen;
17313     int namedclass = OOB_NAMEDCLASS;
17314     char *rangebegin = NULL;
17315     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17316                                aren't available at the time this was called */
17317     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17318 				      than just initialized.  */
17319     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17320     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17321                                extended beyond the Latin1 range.  These have to
17322                                be kept separate from other code points for much
17323                                of this function because their handling  is
17324                                different under /i, and for most classes under
17325                                /d as well */
17326     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17327                                separate for a while from the non-complemented
17328                                versions because of complications with /d
17329                                matching */
17330     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17331                                   treated more simply than the general case,
17332                                   leading to less compilation and execution
17333                                   work */
17334     UV element_count = 0;   /* Number of distinct elements in the class.
17335 			       Optimizations may be possible if this is tiny */
17336     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17337                                        character; used under /i */
17338     UV n;
17339     char * stop_ptr = RExC_end;    /* where to stop parsing */
17340 
17341     /* ignore unescaped whitespace? */
17342     const bool skip_white = cBOOL(   ret_invlist
17343                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17344 
17345     /* inversion list of code points this node matches only when the target
17346      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17347      * /d) */
17348     SV* upper_latin1_only_utf8_matches = NULL;
17349 
17350     /* Inversion list of code points this node matches regardless of things
17351      * like locale, folding, utf8ness of the target string */
17352     SV* cp_list = NULL;
17353 
17354     /* Like cp_list, but code points on this list need to be checked for things
17355      * that fold to/from them under /i */
17356     SV* cp_foldable_list = NULL;
17357 
17358     /* Like cp_list, but code points on this list are valid only when the
17359      * runtime locale is UTF-8 */
17360     SV* only_utf8_locale_list = NULL;
17361 
17362     /* In a range, if one of the endpoints is non-character-set portable,
17363      * meaning that it hard-codes a code point that may mean a different
17364      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17365      * mnemonic '\t' which each mean the same character no matter which
17366      * character set the platform is on. */
17367     unsigned int non_portable_endpoint = 0;
17368 
17369     /* Is the range unicode? which means on a platform that isn't 1-1 native
17370      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17371      * to be a Unicode value.  */
17372     bool unicode_range = FALSE;
17373     bool invert = FALSE;    /* Is this class to be complemented */
17374 
17375     bool warn_super = ALWAYS_WARN_SUPER;
17376 
17377     const char * orig_parse = RExC_parse;
17378 
17379     /* This variable is used to mark where the end in the input is of something
17380      * that looks like a POSIX construct but isn't.  During the parse, when
17381      * something looks like it could be such a construct is encountered, it is
17382      * checked for being one, but not if we've already checked this area of the
17383      * input.  Only after this position is reached do we check again */
17384     char *not_posix_region_end = RExC_parse - 1;
17385 
17386     AV* posix_warnings = NULL;
17387     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17388     U8 op = END;    /* The returned node-type, initialized to an impossible
17389                        one.  */
17390     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17391     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17392 
17393 
17394 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17395  * mutually exclusive.) */
17396 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17397                                             haven't been defined as of yet */
17398 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17399                                             UTF-8 or not */
17400 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17401                                             what gets folded */
17402     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17403 
17404     DECLARE_AND_GET_RE_DEBUG_FLAGS;
17405 
17406     PERL_ARGS_ASSERT_REGCLASS;
17407 #ifndef DEBUGGING
17408     PERL_UNUSED_ARG(depth);
17409 #endif
17410 
17411     assert(! (ret_invlist && allow_mutiple_chars));
17412 
17413     /* If wants an inversion list returned, we can't optimize to something
17414      * else. */
17415     if (ret_invlist) {
17416         optimizable = FALSE;
17417     }
17418 
17419     DEBUG_PARSE("clas");
17420 
17421 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17422     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17423                                    && UNICODE_DOT_DOT_VERSION == 0)
17424     allow_mutiple_chars = FALSE;
17425 #endif
17426 
17427     /* We include the /i status at the beginning of this so that we can
17428      * know it at runtime */
17429     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17430     initial_listsv_len = SvCUR(listsv);
17431     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17432 
17433     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17434 
17435     assert(RExC_parse <= RExC_end);
17436 
17437     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
17438 	RExC_parse++;
17439         invert = TRUE;
17440         allow_mutiple_chars = FALSE;
17441         MARK_NAUGHTY(1);
17442         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17443     }
17444 
17445     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17446     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17447         int maybe_class = handle_possible_posix(pRExC_state,
17448                                                 RExC_parse,
17449                                                 &not_posix_region_end,
17450                                                 NULL,
17451                                                 TRUE /* checking only */);
17452         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17453             ckWARN4reg(not_posix_region_end,
17454                     "POSIX syntax [%c %c] belongs inside character classes%s",
17455                     *RExC_parse, *RExC_parse,
17456                     (maybe_class == OOB_NAMEDCLASS)
17457                     ? ((POSIXCC_NOTYET(*RExC_parse))
17458                         ? " (but this one isn't implemented)"
17459                         : " (but this one isn't fully valid)")
17460                     : ""
17461                     );
17462         }
17463     }
17464 
17465     /* If the caller wants us to just parse a single element, accomplish this
17466      * by faking the loop ending condition */
17467     if (stop_at_1 && RExC_end > RExC_parse) {
17468         stop_ptr = RExC_parse + 1;
17469     }
17470 
17471     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17472     if (UCHARAT(RExC_parse) == ']')
17473 	goto charclassloop;
17474 
17475     while (1) {
17476 
17477         if (   posix_warnings
17478             && av_tindex_skip_len_mg(posix_warnings) >= 0
17479             && RExC_parse > not_posix_region_end)
17480         {
17481             /* Warnings about posix class issues are considered tentative until
17482              * we are far enough along in the parse that we can no longer
17483              * change our mind, at which point we output them.  This is done
17484              * each time through the loop so that a later class won't zap them
17485              * before they have been dealt with. */
17486             output_posix_warnings(pRExC_state, posix_warnings);
17487         }
17488 
17489         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17490 
17491         if  (RExC_parse >= stop_ptr) {
17492             break;
17493         }
17494 
17495         if  (UCHARAT(RExC_parse) == ']') {
17496             break;
17497         }
17498 
17499       charclassloop:
17500 
17501 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17502         save_value = value;
17503         save_prevvalue = prevvalue;
17504 
17505 	if (!range) {
17506 	    rangebegin = RExC_parse;
17507 	    element_count++;
17508             non_portable_endpoint = 0;
17509 	}
17510 	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17511 	    value = utf8n_to_uvchr((U8*)RExC_parse,
17512 				   RExC_end - RExC_parse,
17513 				   &numlen, UTF8_ALLOW_DEFAULT);
17514 	    RExC_parse += numlen;
17515 	}
17516 	else
17517 	    value = UCHARAT(RExC_parse++);
17518 
17519         if (value == '[') {
17520             char * posix_class_end;
17521             namedclass = handle_possible_posix(pRExC_state,
17522                                                RExC_parse,
17523                                                &posix_class_end,
17524                                                do_posix_warnings ? &posix_warnings : NULL,
17525                                                FALSE    /* die if error */);
17526             if (namedclass > OOB_NAMEDCLASS) {
17527 
17528                 /* If there was an earlier attempt to parse this particular
17529                  * posix class, and it failed, it was a false alarm, as this
17530                  * successful one proves */
17531                 if (   posix_warnings
17532                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17533                     && not_posix_region_end >= RExC_parse
17534                     && not_posix_region_end <= posix_class_end)
17535                 {
17536                     av_undef(posix_warnings);
17537                 }
17538 
17539                 RExC_parse = posix_class_end;
17540             }
17541             else if (namedclass == OOB_NAMEDCLASS) {
17542                 not_posix_region_end = posix_class_end;
17543             }
17544             else {
17545                 namedclass = OOB_NAMEDCLASS;
17546             }
17547         }
17548         else if (   RExC_parse - 1 > not_posix_region_end
17549                  && MAYBE_POSIXCC(value))
17550         {
17551             (void) handle_possible_posix(
17552                         pRExC_state,
17553                         RExC_parse - 1,  /* -1 because parse has already been
17554                                             advanced */
17555                         &not_posix_region_end,
17556                         do_posix_warnings ? &posix_warnings : NULL,
17557                         TRUE /* checking only */);
17558         }
17559         else if (  strict && ! skip_white
17560                  && (   _generic_isCC(value, _CC_VERTSPACE)
17561                      || is_VERTWS_cp_high(value)))
17562         {
17563             vFAIL("Literal vertical space in [] is illegal except under /x");
17564         }
17565         else if (value == '\\') {
17566             /* Is a backslash; get the code point of the char after it */
17567 
17568             if (RExC_parse >= RExC_end) {
17569                 vFAIL("Unmatched [");
17570             }
17571 
17572 	    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17573 		value = utf8n_to_uvchr((U8*)RExC_parse,
17574 				   RExC_end - RExC_parse,
17575 				   &numlen, UTF8_ALLOW_DEFAULT);
17576 		RExC_parse += numlen;
17577 	    }
17578 	    else
17579 		value = UCHARAT(RExC_parse++);
17580 
17581 	    /* Some compilers cannot handle switching on 64-bit integer
17582 	     * values, therefore value cannot be an UV.  Yes, this will
17583 	     * be a problem later if we want switch on Unicode.
17584 	     * A similar issue a little bit later when switching on
17585 	     * namedclass. --jhi */
17586 
17587             /* If the \ is escaping white space when white space is being
17588              * skipped, it means that that white space is wanted literally, and
17589              * is already in 'value'.  Otherwise, need to translate the escape
17590              * into what it signifies. */
17591             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17592                 const char * message;
17593                 U32 packed_warn;
17594                 U8 grok_c_char;
17595 
17596 	    case 'w':	namedclass = ANYOF_WORDCHAR;	break;
17597 	    case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
17598 	    case 's':	namedclass = ANYOF_SPACE;	break;
17599 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
17600 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
17601 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
17602 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
17603 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
17604 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
17605 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
17606             case 'N':  /* Handle \N{NAME} in class */
17607                 {
17608                     const char * const backslash_N_beg = RExC_parse - 2;
17609                     int cp_count;
17610 
17611                     if (! grok_bslash_N(pRExC_state,
17612                                         NULL,      /* No regnode */
17613                                         &value,    /* Yes single value */
17614                                         &cp_count, /* Multiple code pt count */
17615                                         flagp,
17616                                         strict,
17617                                         depth)
17618                     ) {
17619 
17620                         if (*flagp & NEED_UTF8)
17621                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17622 
17623                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17624 
17625                         if (cp_count < 0) {
17626                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17627                         }
17628                         else if (cp_count == 0) {
17629                             ckWARNreg(RExC_parse,
17630                               "Ignoring zero length \\N{} in character class");
17631                         }
17632                         else { /* cp_count > 1 */
17633                             assert(cp_count > 1);
17634                             if (! RExC_in_multi_char_class) {
17635                                 if ( ! allow_mutiple_chars
17636                                     || invert
17637                                     || range
17638                                     || *RExC_parse == '-')
17639                                 {
17640                                     if (strict) {
17641                                         RExC_parse--;
17642                                         vFAIL("\\N{} here is restricted to one character");
17643                                     }
17644                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17645                                     break; /* <value> contains the first code
17646                                               point. Drop out of the switch to
17647                                               process it */
17648                                 }
17649                                 else {
17650                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17651                                                  RExC_parse - backslash_N_beg);
17652                                     multi_char_matches
17653                                         = add_multi_match(multi_char_matches,
17654                                                           multi_char_N,
17655                                                           cp_count);
17656                                 }
17657                             }
17658                         } /* End of cp_count != 1 */
17659 
17660                         /* This element should not be processed further in this
17661                          * class */
17662                         element_count--;
17663                         value = save_value;
17664                         prevvalue = save_prevvalue;
17665                         continue;   /* Back to top of loop to get next char */
17666                     }
17667 
17668                     /* Here, is a single code point, and <value> contains it */
17669                     unicode_range = TRUE;   /* \N{} are Unicode */
17670                 }
17671                 break;
17672 	    case 'p':
17673 	    case 'P':
17674 		{
17675 		char *e;
17676 
17677                 if (RExC_pm_flags & PMf_WILDCARD) {
17678                     RExC_parse++;
17679                     /* diag_listed_as: Use of %s is not allowed in Unicode
17680                        property wildcard subpatterns in regex; marked by <--
17681                        HERE in m/%s/ */
17682                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17683                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17684                 }
17685 
17686 		/* \p means they want Unicode semantics */
17687 		REQUIRE_UNI_RULES(flagp, 0);
17688 
17689 		if (RExC_parse >= RExC_end)
17690 		    vFAIL2("Empty \\%c", (U8)value);
17691 		if (*RExC_parse == '{') {
17692 		    const U8 c = (U8)value;
17693 		    e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17694                     if (!e) {
17695                         RExC_parse++;
17696                         vFAIL2("Missing right brace on \\%c{}", c);
17697                     }
17698 
17699                     RExC_parse++;
17700 
17701                     /* White space is allowed adjacent to the braces and after
17702                      * any '^', even when not under /x */
17703                     while (isSPACE(*RExC_parse)) {
17704                          RExC_parse++;
17705 		    }
17706 
17707 		    if (UCHARAT(RExC_parse) == '^') {
17708 
17709                         /* toggle.  (The rhs xor gets the single bit that
17710                          * differs between P and p; the other xor inverts just
17711                          * that bit) */
17712                         value ^= 'P' ^ 'p';
17713 
17714                         RExC_parse++;
17715                         while (isSPACE(*RExC_parse)) {
17716                             RExC_parse++;
17717                         }
17718                     }
17719 
17720                     if (e == RExC_parse)
17721                         vFAIL2("Empty \\%c{}", c);
17722 
17723 		    n = e - RExC_parse;
17724 		    while (isSPACE(*(RExC_parse + n - 1)))
17725 		        n--;
17726 
17727 		}   /* The \p isn't immediately followed by a '{' */
17728 		else if (! isALPHA(*RExC_parse)) {
17729                     RExC_parse += (UTF)
17730                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17731                                   : 1;
17732                     vFAIL2("Character following \\%c must be '{' or a "
17733                            "single-character Unicode property name",
17734                            (U8) value);
17735                 }
17736                 else {
17737 		    e = RExC_parse;
17738 		    n = 1;
17739 		}
17740 		{
17741                     char* name = RExC_parse;
17742 
17743                     /* Any message returned about expanding the definition */
17744                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17745 
17746                     /* If set TRUE, the property is user-defined as opposed to
17747                      * official Unicode */
17748                     bool user_defined = FALSE;
17749                     AV * strings = NULL;
17750 
17751                     SV * prop_definition = parse_uniprop_string(
17752                                             name, n, UTF, FOLD,
17753                                             FALSE, /* This is compile-time */
17754 
17755                                             /* We can't defer this defn when
17756                                              * the full result is required in
17757                                              * this call */
17758                                             ! cBOOL(ret_invlist),
17759 
17760                                             &strings,
17761                                             &user_defined,
17762                                             msg,
17763                                             0 /* Base level */
17764                                            );
17765                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17766                         assert(prop_definition == NULL);
17767                         RExC_parse = e + 1;
17768                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17769                                                thing so, or else the display is
17770                                                mojibake */
17771                             RExC_utf8 = TRUE;
17772                         }
17773 			/* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17774                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17775                                     SvCUR(msg), SvPVX(msg)));
17776                     }
17777 
17778                     assert(prop_definition || strings);
17779 
17780                     if (strings) {
17781                         if (ret_invlist) {
17782                             if (! prop_definition) {
17783                                 RExC_parse = e + 1;
17784                                 vFAIL("Unicode string properties are not implemented in (?[...])");
17785                             }
17786                             else {
17787                                 ckWARNreg(e + 1,
17788                                     "Using just the single character results"
17789                                     " returned by \\p{} in (?[...])");
17790                             }
17791                         }
17792                         else if (! RExC_in_multi_char_class) {
17793                             if (invert ^ (value == 'P')) {
17794                                 RExC_parse = e + 1;
17795                                 vFAIL("Inverting a character class which contains"
17796                                     " a multi-character sequence is illegal");
17797                             }
17798 
17799                             /* For each multi-character string ... */
17800                             while (av_tindex(strings) >= 0) {
17801                                 /* ... Each entry is itself an array of code
17802                                 * points. */
17803                                 AV * this_string = (AV *) av_shift( strings);
17804                                 STRLEN cp_count = av_tindex(this_string) + 1;
17805                                 SV * final = newSV(cp_count * 4);
17806                                 SvPVCLEAR(final);
17807 
17808                                 /* Create another string of sequences of \x{...} */
17809                                 while (av_tindex(this_string) >= 0) {
17810                                     SV * character = av_shift(this_string);
17811                                     UV cp = SvUV(character);
17812 
17813                                     if (cp > 255) {
17814                                         REQUIRE_UTF8(flagp);
17815                                     }
17816                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17817                                                                         cp);
17818                                     SvREFCNT_dec_NN(character);
17819                                 }
17820                                 SvREFCNT_dec_NN(this_string);
17821 
17822                                 /* And add that to the list of such things */
17823                                 multi_char_matches
17824                                             = add_multi_match(multi_char_matches,
17825                                                             final,
17826                                                             cp_count);
17827                             }
17828                         }
17829                         SvREFCNT_dec_NN(strings);
17830                     }
17831 
17832                     if (! prop_definition) {    /* If we got only a string,
17833                                                    this iteration didn't really
17834                                                    find a character */
17835                         element_count--;
17836                     }
17837                     else if (! is_invlist(prop_definition)) {
17838 
17839                         /* Here, the definition isn't known, so we have gotten
17840                          * returned a string that will be evaluated if and when
17841                          * encountered at runtime.  We add it to the list of
17842                          * such properties, along with whether it should be
17843                          * complemented or not */
17844                         if (value == 'P') {
17845                             sv_catpvs(listsv, "!");
17846                         }
17847                         else {
17848                             sv_catpvs(listsv, "+");
17849                         }
17850                         sv_catsv(listsv, prop_definition);
17851 
17852                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17853 
17854                         /* We don't know yet what this matches, so have to flag
17855                          * it */
17856                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17857                     }
17858                     else {
17859                         assert (prop_definition && is_invlist(prop_definition));
17860 
17861                         /* Here we do have the complete property definition
17862                          *
17863                          * Temporary workaround for [perl #133136].  For this
17864                          * precise input that is in the .t that is failing,
17865                          * load utf8.pm, which is what the test wants, so that
17866                          * that .t passes */
17867                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17868                                         "foo\\p{Alnum}")
17869                             && ! hv_common(GvHVn(PL_incgv),
17870                                            NULL,
17871                                            "utf8.pm", sizeof("utf8.pm") - 1,
17872                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17873                         {
17874                             require_pv("utf8.pm");
17875                         }
17876 
17877                         if (! user_defined &&
17878                             /* We warn on matching an above-Unicode code point
17879                              * if the match would return true, except don't
17880                              * warn for \p{All}, which has exactly one element
17881                              * = 0 */
17882                             (_invlist_contains_cp(prop_definition, 0x110000)
17883                                 && (! (_invlist_len(prop_definition) == 1
17884                                        && *invlist_array(prop_definition) == 0))))
17885                         {
17886                             warn_super = TRUE;
17887                         }
17888 
17889                         /* Invert if asking for the complement */
17890                         if (value == 'P') {
17891 			    _invlist_union_complement_2nd(properties,
17892                                                           prop_definition,
17893                                                           &properties);
17894                         }
17895                         else {
17896                             _invlist_union(properties, prop_definition, &properties);
17897 			}
17898                     }
17899                 }
17900 
17901 		RExC_parse = e + 1;
17902                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17903                                                 named */
17904 		}
17905 		break;
17906 	    case 'n':	value = '\n';			break;
17907 	    case 'r':	value = '\r';			break;
17908 	    case 't':	value = '\t';			break;
17909 	    case 'f':	value = '\f';			break;
17910 	    case 'b':	value = '\b';			break;
17911 	    case 'e':	value = ESC_NATIVE;             break;
17912 	    case 'a':	value = '\a';                   break;
17913 	    case 'o':
17914 		RExC_parse--;	/* function expects to be pointed at the 'o' */
17915                 if (! grok_bslash_o(&RExC_parse,
17916                                             RExC_end,
17917                                             &value,
17918                                             &message,
17919                                             &packed_warn,
17920                                             strict,
17921                                             cBOOL(range), /* MAX_UV allowed for range
17922                                                       upper limit */
17923                                             UTF))
17924                 {
17925                     vFAIL(message);
17926                 }
17927                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17928                     warn_non_literal_string(RExC_parse, packed_warn, message);
17929                 }
17930 
17931                 if (value < 256) {
17932                     non_portable_endpoint++;
17933                 }
17934 		break;
17935 	    case 'x':
17936 		RExC_parse--;	/* function expects to be pointed at the 'x' */
17937                 if (!  grok_bslash_x(&RExC_parse,
17938                                             RExC_end,
17939                                             &value,
17940                                             &message,
17941                                             &packed_warn,
17942                                             strict,
17943                                             cBOOL(range), /* MAX_UV allowed for range
17944                                                       upper limit */
17945                                             UTF))
17946                 {
17947                     vFAIL(message);
17948                 }
17949                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17950                     warn_non_literal_string(RExC_parse, packed_warn, message);
17951                 }
17952 
17953                 if (value < 256) {
17954                     non_portable_endpoint++;
17955                 }
17956 		break;
17957 	    case 'c':
17958                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17959                                                                 &packed_warn))
17960                 {
17961                     /* going to die anyway; point to exact spot of
17962                         * failure */
17963                     RExC_parse += (UTF)
17964                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17965                                   : 1;
17966                     vFAIL(message);
17967                 }
17968 
17969                 value = grok_c_char;
17970                 RExC_parse++;
17971                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17972                     warn_non_literal_string(RExC_parse, packed_warn, message);
17973                 }
17974 
17975                 non_portable_endpoint++;
17976 		break;
17977 	    case '0': case '1': case '2': case '3': case '4':
17978 	    case '5': case '6': case '7':
17979 		{
17980 		    /* Take 1-3 octal digits */
17981 		    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17982                               | PERL_SCAN_NOTIFY_ILLDIGIT;
17983                     numlen = (strict) ? 4 : 3;
17984                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17985 		    RExC_parse += numlen;
17986                     if (numlen != 3) {
17987                         if (strict) {
17988                             RExC_parse += (UTF)
17989                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17990                                           : 1;
17991                             vFAIL("Need exactly 3 octal digits");
17992                         }
17993                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17994                                  && RExC_parse < RExC_end
17995                                  && isDIGIT(*RExC_parse)
17996                                  && ckWARN(WARN_REGEXP))
17997                         {
17998                             reg_warn_non_literal_string(
17999                                  RExC_parse + 1,
18000                                  form_alien_digit_msg(8, numlen, RExC_parse,
18001                                                         RExC_end, UTF, FALSE));
18002                         }
18003                     }
18004                     if (value < 256) {
18005                         non_portable_endpoint++;
18006                     }
18007 		    break;
18008 		}
18009 	    default:
18010 		/* Allow \_ to not give an error */
18011 		if (isWORDCHAR(value) && value != '_') {
18012                     if (strict) {
18013                         vFAIL2("Unrecognized escape \\%c in character class",
18014                                (int)value);
18015                     }
18016                     else {
18017                         ckWARN2reg(RExC_parse,
18018                             "Unrecognized escape \\%c in character class passed through",
18019                             (int)value);
18020                     }
18021 		}
18022 		break;
18023 	    }   /* End of switch on char following backslash */
18024 	} /* end of handling backslash escape sequences */
18025 
18026         /* Here, we have the current token in 'value' */
18027 
18028 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18029             U8 classnum;
18030 
18031 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
18032 	     * literal, as is the character that began the false range, i.e.
18033 	     * the 'a' in the examples */
18034 	    if (range) {
18035                 const int w = (RExC_parse >= rangebegin)
18036                                 ? RExC_parse - rangebegin
18037                                 : 0;
18038                 if (strict) {
18039                     vFAIL2utf8f(
18040                         "False [] range \"%" UTF8f "\"",
18041                         UTF8fARG(UTF, w, rangebegin));
18042                 }
18043                 else {
18044                     ckWARN2reg(RExC_parse,
18045                         "False [] range \"%" UTF8f "\"",
18046                         UTF8fARG(UTF, w, rangebegin));
18047                     cp_list = add_cp_to_invlist(cp_list, '-');
18048                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18049                                                             prevvalue);
18050                 }
18051 
18052 		range = 0; /* this was not a true range */
18053                 element_count += 2; /* So counts for three values */
18054 	    }
18055 
18056             classnum = namedclass_to_classnum(namedclass);
18057 
18058 	    if (LOC && namedclass < ANYOF_POSIXL_MAX
18059 #ifndef HAS_ISASCII
18060                 && classnum != _CC_ASCII
18061 #endif
18062             ) {
18063                 SV* scratch_list = NULL;
18064 
18065                 /* What the Posix classes (like \w, [:space:]) match isn't
18066                  * generally knowable under locale until actual match time.  A
18067                  * special node is used for these which has extra space for a
18068                  * bitmap, with a bit reserved for each named class that is to
18069                  * be matched against.  (This isn't needed for \p{} and
18070                  * pseudo-classes, as they are not affected by locale, and
18071                  * hence are dealt with separately.)  However, if a named class
18072                  * and its complement are both present, then it matches
18073                  * everything, and there is no runtime dependency.  Odd numbers
18074                  * are the complements of the next lower number, so xor works.
18075                  * (Note that something like [\w\D] should match everything,
18076                  * because \d should be a proper subset of \w.  But rather than
18077                  * trust that the locale is well behaved, we leave this to
18078                  * runtime to sort out) */
18079                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18080                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18081                     POSIXL_ZERO(posixl);
18082                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18083                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18084                     continue;   /* We could ignore the rest of the class, but
18085                                    best to parse it for any errors */
18086                 }
18087                 else { /* Here, isn't the complement of any already parsed
18088                           class */
18089                     POSIXL_SET(posixl, namedclass);
18090                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18091                     anyof_flags |= ANYOF_MATCHES_POSIXL;
18092 
18093                     /* The above-Latin1 characters are not subject to locale
18094                      * rules.  Just add them to the unconditionally-matched
18095                      * list */
18096 
18097                     /* Get the list of the above-Latin1 code points this
18098                      * matches */
18099                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18100                                             PL_XPosix_ptrs[classnum],
18101 
18102                                             /* Odd numbers are complements,
18103                                              * like NDIGIT, NASCII, ... */
18104                                             namedclass % 2 != 0,
18105                                             &scratch_list);
18106                     /* Checking if 'cp_list' is NULL first saves an extra
18107                      * clone.  Its reference count will be decremented at the
18108                      * next union, etc, or if this is the only instance, at the
18109                      * end of the routine */
18110                     if (! cp_list) {
18111                         cp_list = scratch_list;
18112                     }
18113                     else {
18114                         _invlist_union(cp_list, scratch_list, &cp_list);
18115                         SvREFCNT_dec_NN(scratch_list);
18116                     }
18117                     continue;   /* Go get next character */
18118                 }
18119             }
18120             else {
18121 
18122                 /* Here, is not /l, or is a POSIX class for which /l doesn't
18123                  * matter (or is a Unicode property, which is skipped here). */
18124                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
18125                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18126 
18127                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
18128                          * nor /l make a difference in what these match,
18129                          * therefore we just add what they match to cp_list. */
18130                         if (classnum != _CC_VERTSPACE) {
18131                             assert(   namedclass == ANYOF_HORIZWS
18132                                    || namedclass == ANYOF_NHORIZWS);
18133 
18134                             /* It turns out that \h is just a synonym for
18135                              * XPosixBlank */
18136                             classnum = _CC_BLANK;
18137                         }
18138 
18139                         _invlist_union_maybe_complement_2nd(
18140                                 cp_list,
18141                                 PL_XPosix_ptrs[classnum],
18142                                 namedclass % 2 != 0,    /* Complement if odd
18143                                                           (NHORIZWS, NVERTWS)
18144                                                         */
18145                                 &cp_list);
18146                     }
18147                 }
18148                 else if (   AT_LEAST_UNI_SEMANTICS
18149                          || classnum == _CC_ASCII
18150                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
18151                                                    || classnum == _CC_XDIGIT)))
18152                 {
18153                     /* We usually have to worry about /d affecting what POSIX
18154                      * classes match, with special code needed because we won't
18155                      * know until runtime what all matches.  But there is no
18156                      * extra work needed under /u and /a; and [:ascii:] is
18157                      * unaffected by /d; and :digit: and :xdigit: don't have
18158                      * runtime differences under /d.  So we can special case
18159                      * these, and avoid some extra work below, and at runtime.
18160                      * */
18161                     _invlist_union_maybe_complement_2nd(
18162                                                      simple_posixes,
18163                                                       ((AT_LEAST_ASCII_RESTRICTED)
18164                                                        ? PL_Posix_ptrs[classnum]
18165                                                        : PL_XPosix_ptrs[classnum]),
18166                                                      namedclass % 2 != 0,
18167                                                      &simple_posixes);
18168                 }
18169                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
18170                            complement and use nposixes */
18171                     SV** posixes_ptr = namedclass % 2 == 0
18172                                        ? &posixes
18173                                        : &nposixes;
18174                     _invlist_union_maybe_complement_2nd(
18175                                                      *posixes_ptr,
18176                                                      PL_XPosix_ptrs[classnum],
18177                                                      namedclass % 2 != 0,
18178                                                      posixes_ptr);
18179                 }
18180 	    }
18181 	} /* end of namedclass \blah */
18182 
18183         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18184 
18185         /* If 'range' is set, 'value' is the ending of a range--check its
18186          * validity.  (If value isn't a single code point in the case of a
18187          * range, we should have figured that out above in the code that
18188          * catches false ranges).  Later, we will handle each individual code
18189          * point in the range.  If 'range' isn't set, this could be the
18190          * beginning of a range, so check for that by looking ahead to see if
18191          * the next real character to be processed is the range indicator--the
18192          * minus sign */
18193 
18194 	if (range) {
18195 #ifdef EBCDIC
18196             /* For unicode ranges, we have to test that the Unicode as opposed
18197              * to the native values are not decreasing.  (Above 255, there is
18198              * no difference between native and Unicode) */
18199 	    if (unicode_range && prevvalue < 255 && value < 255) {
18200                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18201                     goto backwards_range;
18202                 }
18203             }
18204             else
18205 #endif
18206 	    if (prevvalue > value) /* b-a */ {
18207 		int w;
18208 #ifdef EBCDIC
18209               backwards_range:
18210 #endif
18211                 w = RExC_parse - rangebegin;
18212                 vFAIL2utf8f(
18213                     "Invalid [] range \"%" UTF8f "\"",
18214                     UTF8fARG(UTF, w, rangebegin));
18215                 NOT_REACHED; /* NOTREACHED */
18216 	    }
18217 	}
18218 	else {
18219             prevvalue = value; /* save the beginning of the potential range */
18220             if (! stop_at_1     /* Can't be a range if parsing just one thing */
18221                 && *RExC_parse == '-')
18222             {
18223                 char* next_char_ptr = RExC_parse + 1;
18224 
18225                 /* Get the next real char after the '-' */
18226                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18227 
18228                 /* If the '-' is at the end of the class (just before the ']',
18229                  * it is a literal minus; otherwise it is a range */
18230                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18231                     RExC_parse = next_char_ptr;
18232 
18233                     /* a bad range like \w-, [:word:]- ? */
18234                     if (namedclass > OOB_NAMEDCLASS) {
18235                         if (strict || ckWARN(WARN_REGEXP)) {
18236                             const int w = RExC_parse >= rangebegin
18237                                           ?  RExC_parse - rangebegin
18238                                           : 0;
18239                             if (strict) {
18240                                 vFAIL4("False [] range \"%*.*s\"",
18241                                     w, w, rangebegin);
18242                             }
18243                             else {
18244                                 vWARN4(RExC_parse,
18245                                     "False [] range \"%*.*s\"",
18246                                     w, w, rangebegin);
18247                             }
18248                         }
18249                         cp_list = add_cp_to_invlist(cp_list, '-');
18250                         element_count++;
18251                     } else
18252                         range = 1;	/* yeah, it's a range! */
18253                     continue;	/* but do it the next time */
18254                 }
18255 	    }
18256 	}
18257 
18258         if (namedclass > OOB_NAMEDCLASS) {
18259             continue;
18260         }
18261 
18262         /* Here, we have a single value this time through the loop, and
18263          * <prevvalue> is the beginning of the range, if any; or <value> if
18264          * not. */
18265 
18266 	/* non-Latin1 code point implies unicode semantics. */
18267 	if (value > 255) {
18268             if (value > MAX_LEGAL_CP && (   value != UV_MAX
18269                                          || prevvalue > MAX_LEGAL_CP))
18270             {
18271                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18272             }
18273             REQUIRE_UNI_RULES(flagp, 0);
18274             if (  ! silence_non_portable
18275                 &&  UNICODE_IS_PERL_EXTENDED(value)
18276                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
18277             {
18278                 ckWARN2_non_literal_string(RExC_parse,
18279                                            packWARN(WARN_PORTABLE),
18280                                            PL_extended_cp_format,
18281                                            value);
18282             }
18283 	}
18284 
18285         /* Ready to process either the single value, or the completed range.
18286          * For single-valued non-inverted ranges, we consider the possibility
18287          * of multi-char folds.  (We made a conscious decision to not do this
18288          * for the other cases because it can often lead to non-intuitive
18289          * results.  For example, you have the peculiar case that:
18290          *  "s s" =~ /^[^\xDF]+$/i => Y
18291          *  "ss"  =~ /^[^\xDF]+$/i => N
18292          *
18293          * See [perl #89750] */
18294         if (FOLD && allow_mutiple_chars && value == prevvalue) {
18295             if (    value == LATIN_SMALL_LETTER_SHARP_S
18296                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18297                                                         value)))
18298             {
18299                 /* Here <value> is indeed a multi-char fold.  Get what it is */
18300 
18301                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18302                 STRLEN foldlen;
18303 
18304                 UV folded = _to_uni_fold_flags(
18305                                 value,
18306                                 foldbuf,
18307                                 &foldlen,
18308                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18309                                                    ? FOLD_FLAGS_NOMIX_ASCII
18310                                                    : 0)
18311                                 );
18312 
18313                 /* Here, <folded> should be the first character of the
18314                  * multi-char fold of <value>, with <foldbuf> containing the
18315                  * whole thing.  But, if this fold is not allowed (because of
18316                  * the flags), <fold> will be the same as <value>, and should
18317                  * be processed like any other character, so skip the special
18318                  * handling */
18319                 if (folded != value) {
18320 
18321                     /* Skip if we are recursed, currently parsing the class
18322                      * again.  Otherwise add this character to the list of
18323                      * multi-char folds. */
18324                     if (! RExC_in_multi_char_class) {
18325                         STRLEN cp_count = utf8_length(foldbuf,
18326                                                       foldbuf + foldlen);
18327                         SV* multi_fold = sv_2mortal(newSVpvs(""));
18328 
18329                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18330 
18331                         multi_char_matches
18332                                         = add_multi_match(multi_char_matches,
18333                                                           multi_fold,
18334                                                           cp_count);
18335 
18336                     }
18337 
18338                     /* This element should not be processed further in this
18339                      * class */
18340                     element_count--;
18341                     value = save_value;
18342                     prevvalue = save_prevvalue;
18343                     continue;
18344                 }
18345             }
18346         }
18347 
18348         if (strict && ckWARN(WARN_REGEXP)) {
18349             if (range) {
18350 
18351                 /* If the range starts above 255, everything is portable and
18352                  * likely to be so for any forseeable character set, so don't
18353                  * warn. */
18354                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18355                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18356                 }
18357                 else if (prevvalue != value) {
18358 
18359                     /* Under strict, ranges that stop and/or end in an ASCII
18360                      * printable should have each end point be a portable value
18361                      * for it (preferably like 'A', but we don't warn if it is
18362                      * a (portable) Unicode name or code point), and the range
18363                      * must be all digits or all letters of the same case.
18364                      * Otherwise, the range is non-portable and unclear as to
18365                      * what it contains */
18366                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18367                         && (          non_portable_endpoint
18368                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18369                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18370                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18371                     ))) {
18372                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18373                                           " be some subset of \"0-9\","
18374                                           " \"A-Z\", or \"a-z\"");
18375                     }
18376                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18377                         SSize_t index_start;
18378                         SSize_t index_final;
18379 
18380                         /* But the nature of Unicode and languages mean we
18381                          * can't do the same checks for above-ASCII ranges,
18382                          * except in the case of digit ones.  These should
18383                          * contain only digits from the same group of 10.  The
18384                          * ASCII case is handled just above.  Hence here, the
18385                          * range could be a range of digits.  First some
18386                          * unlikely special cases.  Grandfather in that a range
18387                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18388                          * if its starting value is one of the 10 digits prior
18389                          * to it.  This is because it is an alternate way of
18390                          * writing 19D1, and some people may expect it to be in
18391                          * that group.  But it is bad, because it won't give
18392                          * the expected results.  In Unicode 5.2 it was
18393                          * considered to be in that group (of 11, hence), but
18394                          * this was fixed in the next version */
18395 
18396                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18397                             goto warn_bad_digit_range;
18398                         }
18399                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18400                                           &&     value <= 0x1D7FF))
18401                         {
18402                             /* This is the only other case currently in Unicode
18403                              * where the algorithm below fails.  The code
18404                              * points just above are the end points of a single
18405                              * range containing only decimal digits.  It is 5
18406                              * different series of 0-9.  All other ranges of
18407                              * digits currently in Unicode are just a single
18408                              * series.  (And mktables will notify us if a later
18409                              * Unicode version breaks this.)
18410                              *
18411                              * If the range being checked is at most 9 long,
18412                              * and the digit values represented are in
18413                              * numerical order, they are from the same series.
18414                              * */
18415                             if (         value - prevvalue > 9
18416                                 ||    (((    value - 0x1D7CE) % 10)
18417                                      <= (prevvalue - 0x1D7CE) % 10))
18418                             {
18419                                 goto warn_bad_digit_range;
18420                             }
18421                         }
18422                         else {
18423 
18424                             /* For all other ranges of digits in Unicode, the
18425                              * algorithm is just to check if both end points
18426                              * are in the same series, which is the same range.
18427                              * */
18428                             index_start = _invlist_search(
18429                                                     PL_XPosix_ptrs[_CC_DIGIT],
18430                                                     prevvalue);
18431 
18432                             /* Warn if the range starts and ends with a digit,
18433                              * and they are not in the same group of 10. */
18434                             if (   index_start >= 0
18435                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18436                                 && (index_final =
18437                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18438                                                     value)) != index_start
18439                                 && index_final >= 0
18440                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18441                             {
18442                               warn_bad_digit_range:
18443                                 vWARN(RExC_parse, "Ranges of digits should be"
18444                                                   " from the same group of"
18445                                                   " 10");
18446                             }
18447                         }
18448                     }
18449                 }
18450             }
18451             if ((! range || prevvalue == value) && non_portable_endpoint) {
18452                 if (isPRINT_A(value)) {
18453                     char literal[3];
18454                     unsigned d = 0;
18455                     if (isBACKSLASHED_PUNCT(value)) {
18456                         literal[d++] = '\\';
18457                     }
18458                     literal[d++] = (char) value;
18459                     literal[d++] = '\0';
18460 
18461                     vWARN4(RExC_parse,
18462                            "\"%.*s\" is more clearly written simply as \"%s\"",
18463                            (int) (RExC_parse - rangebegin),
18464                            rangebegin,
18465                            literal
18466                         );
18467                 }
18468                 else if (isMNEMONIC_CNTRL(value)) {
18469                     vWARN4(RExC_parse,
18470                            "\"%.*s\" is more clearly written simply as \"%s\"",
18471                            (int) (RExC_parse - rangebegin),
18472                            rangebegin,
18473                            cntrl_to_mnemonic((U8) value)
18474                         );
18475                 }
18476             }
18477         }
18478 
18479         /* Deal with this element of the class */
18480 
18481 #ifndef EBCDIC
18482         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18483                                                     prevvalue, value);
18484 #else
18485         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18486          * that don't require special handling, we can just add the range like
18487          * we do for ASCII platforms */
18488         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18489             || ! (prevvalue < 256
18490                     && (unicode_range
18491                         || (! non_portable_endpoint
18492                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18493                                 || (isUPPER_A(prevvalue)
18494                                     && isUPPER_A(value)))))))
18495         {
18496             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18497                                                         prevvalue, value);
18498         }
18499         else {
18500             /* Here, requires special handling.  This can be because it is a
18501              * range whose code points are considered to be Unicode, and so
18502              * must be individually translated into native, or because its a
18503              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18504              * EBCDIC, but we have defined them to include only the "expected"
18505              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18506              * the same in native and Unicode, so can be added as a range */
18507             U8 start = NATIVE_TO_LATIN1(prevvalue);
18508             unsigned j;
18509             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18510             for (j = start; j <= end; j++) {
18511                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18512             }
18513             if (value > 255) {
18514                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18515                                                             256, value);
18516             }
18517         }
18518 #endif
18519 
18520 	range = 0; /* this range (if it was one) is done now */
18521     } /* End of loop through all the text within the brackets */
18522 
18523     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18524         output_posix_warnings(pRExC_state, posix_warnings);
18525     }
18526 
18527     /* If anything in the class expands to more than one character, we have to
18528      * deal with them by building up a substitute parse string, and recursively
18529      * calling reg() on it, instead of proceeding */
18530     if (multi_char_matches) {
18531 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18532         I32 cp_count;
18533 	STRLEN len;
18534 	char *save_end = RExC_end;
18535 	char *save_parse = RExC_parse;
18536 	char *save_start = RExC_start;
18537         Size_t constructed_prefix_len = 0; /* This gives the length of the
18538                                               constructed portion of the
18539                                               substitute parse. */
18540         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18541                                        a "|" */
18542         I32 reg_flags;
18543 
18544         assert(! invert);
18545         /* Only one level of recursion allowed */
18546         assert(RExC_copy_start_in_constructed == RExC_precomp);
18547 
18548 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18549            because too confusing */
18550         if (invert) {
18551             sv_catpvs(substitute_parse, "(?:");
18552         }
18553 #endif
18554 
18555         /* Look at the longest strings first */
18556         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18557                         cp_count > 0;
18558                         cp_count--)
18559         {
18560 
18561             if (av_exists(multi_char_matches, cp_count)) {
18562                 AV** this_array_ptr;
18563                 SV* this_sequence;
18564 
18565                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18566                                                  cp_count, FALSE);
18567                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18568                                                                 &PL_sv_undef)
18569                 {
18570                     if (! first_time) {
18571                         sv_catpvs(substitute_parse, "|");
18572                     }
18573                     first_time = FALSE;
18574 
18575                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18576                 }
18577             }
18578         }
18579 
18580         /* If the character class contains anything else besides these
18581          * multi-character strings, have to include it in recursive parsing */
18582         if (element_count) {
18583             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18584 
18585             sv_catpvs(substitute_parse, "|");
18586             if (has_l_bracket) {    /* Add an [ if the original had one */
18587                 sv_catpvs(substitute_parse, "[");
18588             }
18589             constructed_prefix_len = SvCUR(substitute_parse);
18590             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18591 
18592             /* Put in a closing ']' to match any opening one, but not if going
18593              * off the end, as otherwise we are adding something that really
18594              * isn't there */
18595             if (has_l_bracket && RExC_parse < RExC_end) {
18596                 sv_catpvs(substitute_parse, "]");
18597             }
18598         }
18599 
18600         sv_catpvs(substitute_parse, ")");
18601 #if 0
18602         if (invert) {
18603             /* This is a way to get the parse to skip forward a whole named
18604              * sequence instead of matching the 2nd character when it fails the
18605              * first */
18606             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18607         }
18608 #endif
18609 
18610         /* Set up the data structure so that any errors will be properly
18611          * reported.  See the comments at the definition of
18612          * REPORT_LOCATION_ARGS for details */
18613         RExC_copy_start_in_input = (char *) orig_parse;
18614 	RExC_start = RExC_parse = SvPV(substitute_parse, len);
18615         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18616 	RExC_end = RExC_parse + len;
18617         RExC_in_multi_char_class = 1;
18618 
18619 	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18620 
18621         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18622 
18623         /* And restore so can parse the rest of the pattern */
18624         RExC_parse = save_parse;
18625 	RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18626 	RExC_end = save_end;
18627 	RExC_in_multi_char_class = 0;
18628         SvREFCNT_dec_NN(multi_char_matches);
18629         return ret;
18630     }
18631 
18632     /* If folding, we calculate all characters that could fold to or from the
18633      * ones already on the list */
18634     if (cp_foldable_list) {
18635         if (FOLD) {
18636             UV start, end;	/* End points of code point ranges */
18637 
18638             SV* fold_intersection = NULL;
18639             SV** use_list;
18640 
18641             /* Our calculated list will be for Unicode rules.  For locale
18642              * matching, we have to keep a separate list that is consulted at
18643              * runtime only when the locale indicates Unicode rules (and we
18644              * don't include potential matches in the ASCII/Latin1 range, as
18645              * any code point could fold to any other, based on the run-time
18646              * locale).   For non-locale, we just use the general list */
18647             if (LOC) {
18648                 use_list = &only_utf8_locale_list;
18649             }
18650             else {
18651                 use_list = &cp_list;
18652             }
18653 
18654             /* Only the characters in this class that participate in folds need
18655              * be checked.  Get the intersection of this class and all the
18656              * possible characters that are foldable.  This can quickly narrow
18657              * down a large class */
18658             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18659                                   &fold_intersection);
18660 
18661             /* Now look at the foldable characters in this class individually */
18662             invlist_iterinit(fold_intersection);
18663             while (invlist_iternext(fold_intersection, &start, &end)) {
18664                 UV j;
18665                 UV folded;
18666 
18667                 /* Look at every character in the range */
18668                 for (j = start; j <= end; j++) {
18669                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18670                     STRLEN foldlen;
18671                     unsigned int k;
18672                     Size_t folds_count;
18673                     U32 first_fold;
18674                     const U32 * remaining_folds;
18675 
18676                     if (j < 256) {
18677 
18678                         /* Under /l, we don't know what code points below 256
18679                          * fold to, except we do know the MICRO SIGN folds to
18680                          * an above-255 character if the locale is UTF-8, so we
18681                          * add it to the special list (in *use_list)  Otherwise
18682                          * we know now what things can match, though some folds
18683                          * are valid under /d only if the target is UTF-8.
18684                          * Those go in a separate list */
18685                         if (      IS_IN_SOME_FOLD_L1(j)
18686                             && ! (LOC && j != MICRO_SIGN))
18687                         {
18688 
18689                             /* ASCII is always matched; non-ASCII is matched
18690                              * only under Unicode rules (which could happen
18691                              * under /l if the locale is a UTF-8 one */
18692                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18693                                 *use_list = add_cp_to_invlist(*use_list,
18694                                                             PL_fold_latin1[j]);
18695                             }
18696                             else if (j != PL_fold_latin1[j]) {
18697                                 upper_latin1_only_utf8_matches
18698                                         = add_cp_to_invlist(
18699                                                 upper_latin1_only_utf8_matches,
18700                                                 PL_fold_latin1[j]);
18701                             }
18702                         }
18703 
18704                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18705                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18706                         {
18707                             add_above_Latin1_folds(pRExC_state,
18708                                                    (U8) j,
18709                                                    use_list);
18710                         }
18711                         continue;
18712                     }
18713 
18714                     /* Here is an above Latin1 character.  We don't have the
18715                      * rules hard-coded for it.  First, get its fold.  This is
18716                      * the simple fold, as the multi-character folds have been
18717                      * handled earlier and separated out */
18718                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18719                                                         (ASCII_FOLD_RESTRICTED)
18720                                                         ? FOLD_FLAGS_NOMIX_ASCII
18721                                                         : 0);
18722 
18723                     /* Single character fold of above Latin1.  Add everything
18724                      * in its fold closure to the list that this node should
18725                      * match. */
18726                     folds_count = _inverse_folds(folded, &first_fold,
18727                                                     &remaining_folds);
18728                     for (k = 0; k <= folds_count; k++) {
18729                         UV c = (k == 0)     /* First time through use itself */
18730                                 ? folded
18731                                 : (k == 1)  /* 2nd time use, the first fold */
18732                                    ? first_fold
18733 
18734                                      /* Then the remaining ones */
18735                                    : remaining_folds[k-2];
18736 
18737                         /* /aa doesn't allow folds between ASCII and non- */
18738                         if ((   ASCII_FOLD_RESTRICTED
18739                             && (isASCII(c) != isASCII(j))))
18740                         {
18741                             continue;
18742                         }
18743 
18744                         /* Folds under /l which cross the 255/256 boundary are
18745                          * added to a separate list.  (These are valid only
18746                          * when the locale is UTF-8.) */
18747                         if (c < 256 && LOC) {
18748                             *use_list = add_cp_to_invlist(*use_list, c);
18749                             continue;
18750                         }
18751 
18752                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18753                         {
18754                             cp_list = add_cp_to_invlist(cp_list, c);
18755                         }
18756                         else {
18757                             /* Similarly folds involving non-ascii Latin1
18758                              * characters under /d are added to their list */
18759                             upper_latin1_only_utf8_matches
18760                                     = add_cp_to_invlist(
18761                                                 upper_latin1_only_utf8_matches,
18762                                                 c);
18763                         }
18764                     }
18765                 }
18766             }
18767             SvREFCNT_dec_NN(fold_intersection);
18768         }
18769 
18770         /* Now that we have finished adding all the folds, there is no reason
18771          * to keep the foldable list separate */
18772         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18773 	SvREFCNT_dec_NN(cp_foldable_list);
18774     }
18775 
18776     /* And combine the result (if any) with any inversion lists from posix
18777      * classes.  The lists are kept separate up to now because we don't want to
18778      * fold the classes */
18779     if (simple_posixes) {   /* These are the classes known to be unaffected by
18780                                /a, /aa, and /d */
18781         if (cp_list) {
18782             _invlist_union(cp_list, simple_posixes, &cp_list);
18783             SvREFCNT_dec_NN(simple_posixes);
18784         }
18785         else {
18786             cp_list = simple_posixes;
18787         }
18788     }
18789     if (posixes || nposixes) {
18790         if (! DEPENDS_SEMANTICS) {
18791 
18792             /* For everything but /d, we can just add the current 'posixes' and
18793              * 'nposixes' to the main list */
18794             if (posixes) {
18795                 if (cp_list) {
18796                     _invlist_union(cp_list, posixes, &cp_list);
18797                     SvREFCNT_dec_NN(posixes);
18798                 }
18799                 else {
18800                     cp_list = posixes;
18801                 }
18802             }
18803             if (nposixes) {
18804                 if (cp_list) {
18805                     _invlist_union(cp_list, nposixes, &cp_list);
18806                     SvREFCNT_dec_NN(nposixes);
18807                 }
18808                 else {
18809                     cp_list = nposixes;
18810                 }
18811             }
18812         }
18813         else {
18814             /* Under /d, things like \w match upper Latin1 characters only if
18815              * the target string is in UTF-8.  But things like \W match all the
18816              * upper Latin1 characters if the target string is not in UTF-8.
18817              *
18818              * Handle the case with something like \W separately */
18819             if (nposixes) {
18820                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18821 
18822                 /* A complemented posix class matches all upper Latin1
18823                  * characters if not in UTF-8.  And it matches just certain
18824                  * ones when in UTF-8.  That means those certain ones are
18825                  * matched regardless, so can just be added to the
18826                  * unconditional list */
18827                 if (cp_list) {
18828                     _invlist_union(cp_list, nposixes, &cp_list);
18829                     SvREFCNT_dec_NN(nposixes);
18830                     nposixes = NULL;
18831                 }
18832                 else {
18833                     cp_list = nposixes;
18834                 }
18835 
18836                 /* Likewise for 'posixes' */
18837                 _invlist_union(posixes, cp_list, &cp_list);
18838                 SvREFCNT_dec(posixes);
18839 
18840                 /* Likewise for anything else in the range that matched only
18841                  * under UTF-8 */
18842                 if (upper_latin1_only_utf8_matches) {
18843                     _invlist_union(cp_list,
18844                                    upper_latin1_only_utf8_matches,
18845                                    &cp_list);
18846                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18847                     upper_latin1_only_utf8_matches = NULL;
18848                 }
18849 
18850                 /* If we don't match all the upper Latin1 characters regardless
18851                  * of UTF-8ness, we have to set a flag to match the rest when
18852                  * not in UTF-8 */
18853                 _invlist_subtract(only_non_utf8_list, cp_list,
18854                                   &only_non_utf8_list);
18855                 if (_invlist_len(only_non_utf8_list) != 0) {
18856                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18857                 }
18858                 SvREFCNT_dec_NN(only_non_utf8_list);
18859             }
18860             else {
18861                 /* Here there were no complemented posix classes.  That means
18862                  * the upper Latin1 characters in 'posixes' match only when the
18863                  * target string is in UTF-8.  So we have to add them to the
18864                  * list of those types of code points, while adding the
18865                  * remainder to the unconditional list.
18866                  *
18867                  * First calculate what they are */
18868                 SV* nonascii_but_latin1_properties = NULL;
18869                 _invlist_intersection(posixes, PL_UpperLatin1,
18870                                       &nonascii_but_latin1_properties);
18871 
18872                 /* And add them to the final list of such characters. */
18873                 _invlist_union(upper_latin1_only_utf8_matches,
18874                                nonascii_but_latin1_properties,
18875                                &upper_latin1_only_utf8_matches);
18876 
18877                 /* Remove them from what now becomes the unconditional list */
18878                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18879                                   &posixes);
18880 
18881                 /* And add those unconditional ones to the final list */
18882                 if (cp_list) {
18883                     _invlist_union(cp_list, posixes, &cp_list);
18884                     SvREFCNT_dec_NN(posixes);
18885                     posixes = NULL;
18886                 }
18887                 else {
18888                     cp_list = posixes;
18889                 }
18890 
18891                 SvREFCNT_dec(nonascii_but_latin1_properties);
18892 
18893                 /* Get rid of any characters from the conditional list that we
18894                  * now know are matched unconditionally, which may make that
18895                  * list empty */
18896                 _invlist_subtract(upper_latin1_only_utf8_matches,
18897                                   cp_list,
18898                                   &upper_latin1_only_utf8_matches);
18899                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18900                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18901                     upper_latin1_only_utf8_matches = NULL;
18902                 }
18903             }
18904         }
18905     }
18906 
18907     /* And combine the result (if any) with any inversion list from properties.
18908      * The lists are kept separate up to now so that we can distinguish the two
18909      * in regards to matching above-Unicode.  A run-time warning is generated
18910      * if a Unicode property is matched against a non-Unicode code point. But,
18911      * we allow user-defined properties to match anything, without any warning,
18912      * and we also suppress the warning if there is a portion of the character
18913      * class that isn't a Unicode property, and which matches above Unicode, \W
18914      * or [\x{110000}] for example.
18915      * (Note that in this case, unlike the Posix one above, there is no
18916      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18917      * forces Unicode semantics */
18918     if (properties) {
18919         if (cp_list) {
18920 
18921             /* If it matters to the final outcome, see if a non-property
18922              * component of the class matches above Unicode.  If so, the
18923              * warning gets suppressed.  This is true even if just a single
18924              * such code point is specified, as, though not strictly correct if
18925              * another such code point is matched against, the fact that they
18926              * are using above-Unicode code points indicates they should know
18927              * the issues involved */
18928             if (warn_super) {
18929                 warn_super = ! (invert
18930                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18931             }
18932 
18933             _invlist_union(properties, cp_list, &cp_list);
18934             SvREFCNT_dec_NN(properties);
18935         }
18936         else {
18937             cp_list = properties;
18938         }
18939 
18940         if (warn_super) {
18941             anyof_flags
18942              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18943 
18944             /* Because an ANYOF node is the only one that warns, this node
18945              * can't be optimized into something else */
18946             optimizable = FALSE;
18947         }
18948     }
18949 
18950     /* Here, we have calculated what code points should be in the character
18951      * class.
18952      *
18953      * Now we can see about various optimizations.  Fold calculation (which we
18954      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18955      * would invert to include K, which under /i would match k, which it
18956      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18957      * folded until runtime */
18958 
18959     /* If we didn't do folding, it's because some information isn't available
18960      * until runtime; set the run-time fold flag for these  We know to set the
18961      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18962      * at least one 0-255 range code point */
18963     if (LOC && FOLD) {
18964 
18965         /* Some things on the list might be unconditionally included because of
18966          * other components.  Remove them, and clean up the list if it goes to
18967          * 0 elements */
18968         if (only_utf8_locale_list && cp_list) {
18969             _invlist_subtract(only_utf8_locale_list, cp_list,
18970                               &only_utf8_locale_list);
18971 
18972             if (_invlist_len(only_utf8_locale_list) == 0) {
18973                 SvREFCNT_dec_NN(only_utf8_locale_list);
18974                 only_utf8_locale_list = NULL;
18975             }
18976         }
18977         if (    only_utf8_locale_list
18978             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18979                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18980         {
18981             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18982             anyof_flags
18983                  |= ANYOFL_FOLD
18984                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18985         }
18986         else if (cp_list && invlist_lowest(cp_list) < 256) {
18987             /* If nothing is below 256, has no locale dependency; otherwise it
18988              * does */
18989             anyof_flags |= ANYOFL_FOLD;
18990             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18991         }
18992     }
18993     else if (   DEPENDS_SEMANTICS
18994              && (    upper_latin1_only_utf8_matches
18995                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18996     {
18997         RExC_seen_d_op = TRUE;
18998         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18999     }
19000 
19001     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19002      * compile time. */
19003     if (     cp_list
19004         &&   invert
19005         && ! has_runtime_dependency)
19006     {
19007         _invlist_invert(cp_list);
19008 
19009 	/* Clear the invert flag since have just done it here */
19010 	invert = FALSE;
19011     }
19012 
19013     /* All possible optimizations below still have these characteristics.
19014      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19015      * routine) */
19016     *flagp |= HASWIDTH|SIMPLE;
19017 
19018     if (ret_invlist) {
19019         *ret_invlist = cp_list;
19020 
19021         return (cp_list) ? RExC_emit : 0;
19022     }
19023 
19024     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19025         RExC_contains_locale = 1;
19026     }
19027 
19028     /* Some character classes are equivalent to other nodes.  Such nodes take
19029      * up less room, and some nodes require fewer operations to execute, than
19030      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
19031      * improve efficiency. */
19032 
19033     if (optimizable) {
19034         PERL_UINT_FAST8_T i;
19035         UV partial_cp_count = 0;
19036         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19037         UV   end[MAX_FOLD_FROMS+1] = { 0 };
19038         bool single_range = FALSE;
19039 
19040         if (cp_list) { /* Count the code points in enough ranges that we would
19041                           see all the ones possible in any fold in this version
19042                           of Unicode */
19043 
19044             invlist_iterinit(cp_list);
19045             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19046                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19047                     break;
19048                 }
19049                 partial_cp_count += end[i] - start[i] + 1;
19050             }
19051 
19052             if (i == 1) {
19053                 single_range = TRUE;
19054             }
19055             invlist_iterfinish(cp_list);
19056         }
19057 
19058         /* If we know at compile time that this matches every possible code
19059          * point, any run-time dependencies don't matter */
19060         if (start[0] == 0 && end[0] == UV_MAX) {
19061             if (invert) {
19062                 ret = reganode(pRExC_state, OPFAIL, 0);
19063             }
19064             else {
19065                 ret = reg_node(pRExC_state, SANY);
19066                 MARK_NAUGHTY(1);
19067             }
19068             goto not_anyof;
19069         }
19070 
19071         /* Similarly, for /l posix classes, if both a class and its
19072          * complement match, any run-time dependencies don't matter */
19073         if (posixl) {
19074             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19075                                                         namedclass += 2)
19076             {
19077                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
19078                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19079                 {
19080                     if (invert) {
19081                         ret = reganode(pRExC_state, OPFAIL, 0);
19082                     }
19083                     else {
19084                         ret = reg_node(pRExC_state, SANY);
19085                         MARK_NAUGHTY(1);
19086                     }
19087                     goto not_anyof;
19088                 }
19089             }
19090 
19091             /* For well-behaved locales, some classes are subsets of others,
19092              * so complementing the subset and including the non-complemented
19093              * superset should match everything, like [\D[:alnum:]], and
19094              * [[:^alpha:][:alnum:]], but some implementations of locales are
19095              * buggy, and khw thinks its a bad idea to have optimization change
19096              * behavior, even if it avoids an OS bug in a given case */
19097 
19098 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19099 
19100             /* If is a single posix /l class, can optimize to just that op.
19101              * Such a node will not match anything in the Latin1 range, as that
19102              * is not determinable until runtime, but will match whatever the
19103              * class does outside that range.  (Note that some classes won't
19104              * match anything outside the range, like [:ascii:]) */
19105             if (    isSINGLE_BIT_SET(posixl)
19106                 && (partial_cp_count == 0 || start[0] > 255))
19107             {
19108                 U8 classnum;
19109                 SV * class_above_latin1 = NULL;
19110                 bool already_inverted;
19111                 bool are_equivalent;
19112 
19113                 /* Compute which bit is set, which is the same thing as, e.g.,
19114                  * ANYOF_CNTRL.  From
19115                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19116                  * */
19117                 static const int MultiplyDeBruijnBitPosition2[32] =
19118                     {
19119                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19120                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19121                     };
19122 
19123                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19124                                                           * 0x077CB531U) >> 27];
19125                 classnum = namedclass_to_classnum(namedclass);
19126 
19127                 /* The named classes are such that the inverted number is one
19128                  * larger than the non-inverted one */
19129                 already_inverted = namedclass
19130                                  - classnum_to_namedclass(classnum);
19131 
19132                 /* Create an inversion list of the official property, inverted
19133                  * if the constructed node list is inverted, and restricted to
19134                  * only the above latin1 code points, which are the only ones
19135                  * known at compile time */
19136                 _invlist_intersection_maybe_complement_2nd(
19137                                                     PL_AboveLatin1,
19138                                                     PL_XPosix_ptrs[classnum],
19139                                                     already_inverted,
19140                                                     &class_above_latin1);
19141                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19142                                                                         FALSE);
19143                 SvREFCNT_dec_NN(class_above_latin1);
19144 
19145                 if (are_equivalent) {
19146 
19147                     /* Resolve the run-time inversion flag with this possibly
19148                      * inverted class */
19149                     invert = invert ^ already_inverted;
19150 
19151                     ret = reg_node(pRExC_state,
19152                                    POSIXL + invert * (NPOSIXL - POSIXL));
19153                     FLAGS(REGNODE_p(ret)) = classnum;
19154                     goto not_anyof;
19155                 }
19156             }
19157         }
19158 
19159         /* khw can't think of any other possible transformation involving
19160          * these. */
19161         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19162             goto is_anyof;
19163         }
19164 
19165         if (! has_runtime_dependency) {
19166 
19167             /* If the list is empty, nothing matches.  This happens, for
19168              * example, when a Unicode property that doesn't match anything is
19169              * the only element in the character class (perluniprops.pod notes
19170              * such properties). */
19171             if (partial_cp_count == 0) {
19172                 if (invert) {
19173                     ret = reg_node(pRExC_state, SANY);
19174                 }
19175                 else {
19176                     ret = reganode(pRExC_state, OPFAIL, 0);
19177                 }
19178 
19179                 goto not_anyof;
19180             }
19181 
19182             /* If matches everything but \n */
19183             if (   start[0] == 0 && end[0] == '\n' - 1
19184                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19185             {
19186                 assert (! invert);
19187                 ret = reg_node(pRExC_state, REG_ANY);
19188                 MARK_NAUGHTY(1);
19189                 goto not_anyof;
19190             }
19191         }
19192 
19193         /* Next see if can optimize classes that contain just a few code points
19194          * into an EXACTish node.  The reason to do this is to let the
19195          * optimizer join this node with adjacent EXACTish ones, and ANYOF
19196          * nodes require conversion to code point from UTF-8.
19197          *
19198          * An EXACTFish node can be generated even if not under /i, and vice
19199          * versa.  But care must be taken.  An EXACTFish node has to be such
19200          * that it only matches precisely the code points in the class, but we
19201          * want to generate the least restrictive one that does that, to
19202          * increase the odds of being able to join with an adjacent node.  For
19203          * example, if the class contains [kK], we have to make it an EXACTFAA
19204          * node to prevent the KELVIN SIGN from matching.  Whether we are under
19205          * /i or not is irrelevant in this case.  Less obvious is the pattern
19206          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
19207          * supposed to match the single character U+0149 LATIN SMALL LETTER N
19208          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
19209          * that includes \X{02BC}, there is a multi-char fold that does, and so
19210          * the node generated for it must be an EXACTFish one.  On the other
19211          * hand qr/:/i should generate a plain EXACT node since the colon
19212          * participates in no fold whatsoever, and having it EXACT tells the
19213          * optimizer the target string cannot match unless it has a colon in
19214          * it.
19215          */
19216         if (   ! posixl
19217             && ! invert
19218 
19219                 /* Only try if there are no more code points in the class than
19220                  * in the max possible fold */
19221             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19222         {
19223             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19224             {
19225                 /* We can always make a single code point class into an
19226                  * EXACTish node. */
19227 
19228                 if (LOC) {
19229 
19230                     /* Here is /l:  Use EXACTL, except if there is a fold not
19231                      * known until runtime so shows as only a single code point
19232                      * here.  For code points above 255, we know which can
19233                      * cause problems by having a potential fold to the Latin1
19234                      * range. */
19235                     if (  ! FOLD
19236                         || (     start[0] > 255
19237                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19238                     {
19239                         op = EXACTL;
19240                     }
19241                     else {
19242                         op = EXACTFL;
19243                     }
19244                 }
19245                 else if (! FOLD) { /* Not /l and not /i */
19246                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19247                 }
19248                 else if (start[0] < 256) { /* /i, not /l, and the code point is
19249                                               small */
19250 
19251                     /* Under /i, it gets a little tricky.  A code point that
19252                      * doesn't participate in a fold should be an EXACT node.
19253                      * We know this one isn't the result of a simple fold, or
19254                      * there'd be more than one code point in the list, but it
19255                      * could be part of a multi- character fold.  In that case
19256                      * we better not create an EXACT node, as we would wrongly
19257                      * be telling the optimizer that this code point must be in
19258                      * the target string, and that is wrong.  This is because
19259                      * if the sequence around this code point forms a
19260                      * multi-char fold, what needs to be in the string could be
19261                      * the code point that folds to the sequence.
19262                      *
19263                      * This handles the case of below-255 code points, as we
19264                      * have an easy look up for those.  The next clause handles
19265                      * the above-256 one */
19266                     op = IS_IN_SOME_FOLD_L1(start[0])
19267                          ? EXACTFU
19268                          : EXACT;
19269                 }
19270                 else {  /* /i, larger code point.  Since we are under /i, and
19271                            have just this code point, we know that it can't
19272                            fold to something else, so PL_InMultiCharFold
19273                            applies to it */
19274                     op = _invlist_contains_cp(PL_InMultiCharFold,
19275                                               start[0])
19276                          ? EXACTFU_REQ8
19277                          : EXACT_REQ8;
19278                 }
19279 
19280                 value = start[0];
19281             }
19282             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19283                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
19284             {
19285                 /* Here, the only runtime dependency, if any, is from /d, and
19286                  * the class matches more than one code point, and the lowest
19287                  * code point participates in some fold.  It might be that the
19288                  * other code points are /i equivalent to this one, and hence
19289                  * they would representable by an EXACTFish node.  Above, we
19290                  * eliminated classes that contain too many code points to be
19291                  * EXACTFish, with the test for MAX_FOLD_FROMS
19292                  *
19293                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
19294                  * We do this because we have EXACTFAA at our disposal for the
19295                  * ASCII range */
19296                 if (partial_cp_count == 2 && isASCII(start[0])) {
19297 
19298                     /* The only ASCII characters that participate in folds are
19299                      * alphabetics */
19300                     assert(isALPHA(start[0]));
19301                     if (   end[0] == start[0]   /* First range is a single
19302                                                    character, so 2nd exists */
19303                         && isALPHA_FOLD_EQ(start[0], start[1]))
19304                     {
19305 
19306                         /* Here, is part of an ASCII fold pair */
19307 
19308                         if (   ASCII_FOLD_RESTRICTED
19309                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19310                         {
19311                             /* If the second clause just above was true, it
19312                              * means we can't be under /i, or else the list
19313                              * would have included more than this fold pair.
19314                              * Therefore we have to exclude the possibility of
19315                              * whatever else it is that folds to these, by
19316                              * using EXACTFAA */
19317                             op = EXACTFAA;
19318                         }
19319                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19320 
19321                             /* Here, there's no simple fold that start[0] is part
19322                              * of, but there is a multi-character one.  If we
19323                              * are not under /i, we want to exclude that
19324                              * possibility; if under /i, we want to include it
19325                              * */
19326                             op = (FOLD) ? EXACTFU : EXACTFAA;
19327                         }
19328                         else {
19329 
19330                             /* Here, the only possible fold start[0] particpates in
19331                              * is with start[1].  /i or not isn't relevant */
19332                             op = EXACTFU;
19333                         }
19334 
19335                         value = toFOLD(start[0]);
19336                     }
19337                 }
19338                 else if (  ! upper_latin1_only_utf8_matches
19339                          || (   _invlist_len(upper_latin1_only_utf8_matches)
19340                                                                           == 2
19341                              && PL_fold_latin1[
19342                                invlist_highest(upper_latin1_only_utf8_matches)]
19343                              == start[0]))
19344                 {
19345                     /* Here, the smallest character is non-ascii or there are
19346                      * more than 2 code points matched by this node.  Also, we
19347                      * either don't have /d UTF-8 dependent matches, or if we
19348                      * do, they look like they could be a single character that
19349                      * is the fold of the lowest one in the always-match list.
19350                      * This test quickly excludes most of the false positives
19351                      * when there are /d UTF-8 depdendent matches.  These are
19352                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19353                      * SMALL LETTER A WITH GRAVE iff the target string is
19354                      * UTF-8.  (We don't have to worry above about exceeding
19355                      * the array bounds of PL_fold_latin1[] because any code
19356                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
19357                      *
19358                      * EXACTFAA would apply only to pairs (hence exactly 2 code
19359                      * points) in the ASCII range, so we can't use it here to
19360                      * artificially restrict the fold domain, so we check if
19361                      * the class does or does not match some EXACTFish node.
19362                      * Further, if we aren't under /i, and the folded-to
19363                      * character is part of a multi-character fold, we can't do
19364                      * this optimization, as the sequence around it could be
19365                      * that multi-character fold, and we don't here know the
19366                      * context, so we have to assume it is that multi-char
19367                      * fold, to prevent potential bugs.
19368                      *
19369                      * To do the general case, we first find the fold of the
19370                      * lowest code point (which may be higher than the lowest
19371                      * one), then find everything that folds to it.  (The data
19372                      * structure we have only maps from the folded code points,
19373                      * so we have to do the earlier step.) */
19374 
19375                     Size_t foldlen;
19376                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19377                     UV folded = _to_uni_fold_flags(start[0],
19378                                                         foldbuf, &foldlen, 0);
19379                     U32 first_fold;
19380                     const U32 * remaining_folds;
19381                     Size_t folds_to_this_cp_count = _inverse_folds(
19382                                                             folded,
19383                                                             &first_fold,
19384                                                             &remaining_folds);
19385                     Size_t folds_count = folds_to_this_cp_count + 1;
19386                     SV * fold_list = _new_invlist(folds_count);
19387                     unsigned int i;
19388 
19389                     /* If there are UTF-8 dependent matches, create a temporary
19390                      * list of what this node matches, including them. */
19391                     SV * all_cp_list = NULL;
19392                     SV ** use_this_list = &cp_list;
19393 
19394                     if (upper_latin1_only_utf8_matches) {
19395                         all_cp_list = _new_invlist(0);
19396                         use_this_list = &all_cp_list;
19397                         _invlist_union(cp_list,
19398                                        upper_latin1_only_utf8_matches,
19399                                        use_this_list);
19400                     }
19401 
19402                     /* Having gotten everything that participates in the fold
19403                      * containing the lowest code point, we turn that into an
19404                      * inversion list, making sure everything is included. */
19405                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19406                     fold_list = add_cp_to_invlist(fold_list, folded);
19407                     if (folds_to_this_cp_count > 0) {
19408                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19409                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19410                             fold_list = add_cp_to_invlist(fold_list,
19411                                                         remaining_folds[i]);
19412                         }
19413                     }
19414 
19415                     /* If the fold list is identical to what's in this ANYOF
19416                      * node, the node can be represented by an EXACTFish one
19417                      * instead */
19418                     if (_invlistEQ(*use_this_list, fold_list,
19419                                    0 /* Don't complement */ )
19420                     ) {
19421 
19422                         /* But, we have to be careful, as mentioned above.
19423                          * Just the right sequence of characters could match
19424                          * this if it is part of a multi-character fold.  That
19425                          * IS what we want if we are under /i.  But it ISN'T
19426                          * what we want if not under /i, as it could match when
19427                          * it shouldn't.  So, when we aren't under /i and this
19428                          * character participates in a multi-char fold, we
19429                          * don't optimize into an EXACTFish node.  So, for each
19430                          * case below we have to check if we are folding
19431                          * and if not, if it is not part of a multi-char fold.
19432                          * */
19433                         if (start[0] > 255) {    /* Highish code point */
19434                             if (FOLD || ! _invlist_contains_cp(
19435                                             PL_InMultiCharFold, folded))
19436                             {
19437                                 op = (LOC)
19438                                      ? EXACTFLU8
19439                                      : (ASCII_FOLD_RESTRICTED)
19440                                        ? EXACTFAA
19441                                        : EXACTFU_REQ8;
19442                                 value = folded;
19443                             }
19444                         }   /* Below, the lowest code point < 256 */
19445                         else if (    FOLD
19446                                  &&  folded == 's'
19447                                  &&  DEPENDS_SEMANTICS)
19448                         {   /* An EXACTF node containing a single character
19449                                 's', can be an EXACTFU if it doesn't get
19450                                 joined with an adjacent 's' */
19451                             op = EXACTFU_S_EDGE;
19452                             value = folded;
19453                         }
19454                         else if (    FOLD
19455                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19456                         {
19457                             if (upper_latin1_only_utf8_matches) {
19458                                 op = EXACTF;
19459 
19460                                 /* We can't use the fold, as that only matches
19461                                  * under UTF-8 */
19462                                 value = start[0];
19463                             }
19464                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19465                                      && ! UTF)
19466                             {   /* EXACTFUP is a special node for this
19467                                    character */
19468                                 op = (ASCII_FOLD_RESTRICTED)
19469                                      ? EXACTFAA
19470                                      : EXACTFUP;
19471                                 value = MICRO_SIGN;
19472                             }
19473                             else if (     ASCII_FOLD_RESTRICTED
19474                                      && ! isASCII(start[0]))
19475                             {   /* For ASCII under /iaa, we can use EXACTFU
19476                                    below */
19477                                 op = EXACTFAA;
19478                                 value = folded;
19479                             }
19480                             else {
19481                                 op = EXACTFU;
19482                                 value = folded;
19483                             }
19484                         }
19485                     }
19486 
19487                     SvREFCNT_dec_NN(fold_list);
19488                     SvREFCNT_dec(all_cp_list);
19489                 }
19490             }
19491 
19492             if (op != END) {
19493                 U8 len;
19494 
19495                 /* Here, we have calculated what EXACTish node to use.  Have to
19496                  * convert to UTF-8 if not already there */
19497                 if (value > 255) {
19498                     if (! UTF) {
19499                         SvREFCNT_dec(cp_list);;
19500                         REQUIRE_UTF8(flagp);
19501                     }
19502 
19503                     /* This is a kludge to the special casing issues with this
19504                      * ligature under /aa.  FB05 should fold to FB06, but the
19505                      * call above to _to_uni_fold_flags() didn't find this, as
19506                      * it didn't use the /aa restriction in order to not miss
19507                      * other folds that would be affected.  This is the only
19508                      * instance likely to ever be a problem in all of Unicode.
19509                      * So special case it. */
19510                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19511                         && ASCII_FOLD_RESTRICTED)
19512                     {
19513                         value = LATIN_SMALL_LIGATURE_ST;
19514                     }
19515                 }
19516 
19517                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19518 
19519                 ret = regnode_guts(pRExC_state, op, len, "exact");
19520                 FILL_NODE(ret, op);
19521                 RExC_emit += 1 + STR_SZ(len);
19522                 setSTR_LEN(REGNODE_p(ret), len);
19523                 if (len == 1) {
19524                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19525                 }
19526                 else {
19527                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19528                 }
19529                 goto not_anyof;
19530             }
19531         }
19532 
19533         if (! has_runtime_dependency) {
19534 
19535             /* See if this can be turned into an ANYOFM node.  Think about the
19536              * bit patterns in two different bytes.  In some positions, the
19537              * bits in each will be 1; and in other positions both will be 0;
19538              * and in some positions the bit will be 1 in one byte, and 0 in
19539              * the other.  Let 'n' be the number of positions where the bits
19540              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19541              * a position where the two bytes differ.  Now take the set of all
19542              * bytes that when ANDed with the mask yield the same result.  That
19543              * set has 2**n elements, and is representable by just two 8 bit
19544              * numbers: the result and the mask.  Importantly, matching the set
19545              * can be vectorized by creating a word full of the result bytes,
19546              * and a word full of the mask bytes, yielding a significant speed
19547              * up.  Here, see if this node matches such a set.  As a concrete
19548              * example consider [01], and the byte representing '0' which is
19549              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19550              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19551              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19552              * which is a common usage, is optimizable into ANYOFM, and can
19553              * benefit from the speed up.  We can only do this on UTF-8
19554              * invariant bytes, because they have the same bit patterns under
19555              * UTF-8 as not. */
19556             PERL_UINT_FAST8_T inverted = 0;
19557 #ifdef EBCDIC
19558             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19559 #else
19560             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19561 #endif
19562             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19563              * If that works we will instead later generate an NANYOFM, and
19564              * invert back when through */
19565             if (invlist_highest(cp_list) > max_permissible) {
19566                 _invlist_invert(cp_list);
19567                 inverted = 1;
19568             }
19569 
19570             if (invlist_highest(cp_list) <= max_permissible) {
19571                 UV this_start, this_end;
19572                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19573                 U8 bits_differing = 0;
19574                 Size_t full_cp_count = 0;
19575                 bool first_time = TRUE;
19576 
19577                 /* Go through the bytes and find the bit positions that differ
19578                  * */
19579                 invlist_iterinit(cp_list);
19580                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19581                     unsigned int i = this_start;
19582 
19583                     if (first_time) {
19584                         if (! UVCHR_IS_INVARIANT(i)) {
19585                             goto done_anyofm;
19586                         }
19587 
19588                         first_time = FALSE;
19589                         lowest_cp = this_start;
19590 
19591                         /* We have set up the code point to compare with.
19592                          * Don't compare it with itself */
19593                         i++;
19594                     }
19595 
19596                     /* Find the bit positions that differ from the lowest code
19597                      * point in the node.  Keep track of all such positions by
19598                      * OR'ing */
19599                     for (; i <= this_end; i++) {
19600                         if (! UVCHR_IS_INVARIANT(i)) {
19601                             goto done_anyofm;
19602                         }
19603 
19604                         bits_differing  |= i ^ lowest_cp;
19605                     }
19606 
19607                     full_cp_count += this_end - this_start + 1;
19608                 }
19609 
19610                 /* At the end of the loop, we count how many bits differ from
19611                  * the bits in lowest code point, call the count 'd'.  If the
19612                  * set we found contains 2**d elements, it is the closure of
19613                  * all code points that differ only in those bit positions.  To
19614                  * convince yourself of that, first note that the number in the
19615                  * closure must be a power of 2, which we test for.  The only
19616                  * way we could have that count and it be some differing set,
19617                  * is if we got some code points that don't differ from the
19618                  * lowest code point in any position, but do differ from each
19619                  * other in some other position.  That means one code point has
19620                  * a 1 in that position, and another has a 0.  But that would
19621                  * mean that one of them differs from the lowest code point in
19622                  * that position, which possibility we've already excluded.  */
19623                 if (  (inverted || full_cp_count > 1)
19624                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19625                 {
19626                     U8 ANYOFM_mask;
19627 
19628                     op = ANYOFM + inverted;;
19629 
19630                     /* We need to make the bits that differ be 0's */
19631                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19632 
19633                     /* The argument is the lowest code point */
19634                     ret = reganode(pRExC_state, op, lowest_cp);
19635                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19636                 }
19637 
19638               done_anyofm:
19639                 invlist_iterfinish(cp_list);
19640             }
19641 
19642             if (inverted) {
19643                 _invlist_invert(cp_list);
19644             }
19645 
19646             if (op != END) {
19647                 goto not_anyof;
19648             }
19649 
19650             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19651              * all were invariants, it wasn't inverted, and there is a single
19652              * range.  This would be faster than some of the posix nodes we
19653              * create below like /\d/a, but would be twice the size.  Without
19654              * having actually measured the gain, khw doesn't think the
19655              * tradeoff is really worth it */
19656         }
19657 
19658         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19659             PERL_UINT_FAST8_T type;
19660             SV * intersection = NULL;
19661             SV* d_invlist = NULL;
19662 
19663             /* See if this matches any of the POSIX classes.  The POSIXA and
19664              * POSIXD ones are about the same speed as ANYOF ops, but take less
19665              * room; the ones that have above-Latin1 code point matches are
19666              * somewhat faster than ANYOF.  */
19667 
19668             for (type = POSIXA; type >= POSIXD; type--) {
19669                 int posix_class;
19670 
19671                 if (type == POSIXL) {   /* But not /l posix classes */
19672                     continue;
19673                 }
19674 
19675                 for (posix_class = 0;
19676                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19677                      posix_class++)
19678                 {
19679                     SV** our_code_points = &cp_list;
19680                     SV** official_code_points;
19681                     int try_inverted;
19682 
19683                     if (type == POSIXA) {
19684                         official_code_points = &PL_Posix_ptrs[posix_class];
19685                     }
19686                     else {
19687                         official_code_points = &PL_XPosix_ptrs[posix_class];
19688                     }
19689 
19690                     /* Skip non-existent classes of this type.  e.g. \v only
19691                      * has an entry in PL_XPosix_ptrs */
19692                     if (! *official_code_points) {
19693                         continue;
19694                     }
19695 
19696                     /* Try both the regular class, and its inversion */
19697                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19698                         bool this_inverted = invert ^ try_inverted;
19699 
19700                         if (type != POSIXD) {
19701 
19702                             /* This class that isn't /d can't match if we have
19703                              * /d dependencies */
19704                             if (has_runtime_dependency
19705                                                     & HAS_D_RUNTIME_DEPENDENCY)
19706                             {
19707                                 continue;
19708                             }
19709                         }
19710                         else /* is /d */ if (! this_inverted) {
19711 
19712                             /* /d classes don't match anything non-ASCII below
19713                              * 256 unconditionally (which cp_list contains) */
19714                             _invlist_intersection(cp_list, PL_UpperLatin1,
19715                                                            &intersection);
19716                             if (_invlist_len(intersection) != 0) {
19717                                 continue;
19718                             }
19719 
19720                             SvREFCNT_dec(d_invlist);
19721                             d_invlist = invlist_clone(cp_list, NULL);
19722 
19723                             /* But under UTF-8 it turns into using /u rules.
19724                              * Add the things it matches under these conditions
19725                              * so that we check below that these are identical
19726                              * to what the tested class should match */
19727                             if (upper_latin1_only_utf8_matches) {
19728                                 _invlist_union(
19729                                             d_invlist,
19730                                             upper_latin1_only_utf8_matches,
19731                                             &d_invlist);
19732                             }
19733                             our_code_points = &d_invlist;
19734                         }
19735                         else {  /* POSIXD, inverted.  If this doesn't have this
19736                                    flag set, it isn't /d. */
19737                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19738                             {
19739                                 continue;
19740                             }
19741                             our_code_points = &cp_list;
19742                         }
19743 
19744                         /* Here, have weeded out some things.  We want to see
19745                          * if the list of characters this node contains
19746                          * ('*our_code_points') precisely matches those of the
19747                          * class we are currently checking against
19748                          * ('*official_code_points'). */
19749                         if (_invlistEQ(*our_code_points,
19750                                        *official_code_points,
19751                                        try_inverted))
19752                         {
19753                             /* Here, they precisely match.  Optimize this ANYOF
19754                              * node into its equivalent POSIX one of the
19755                              * correct type, possibly inverted */
19756                             ret = reg_node(pRExC_state, (try_inverted)
19757                                                         ? type + NPOSIXA
19758                                                                 - POSIXA
19759                                                         : type);
19760                             FLAGS(REGNODE_p(ret)) = posix_class;
19761                             SvREFCNT_dec(d_invlist);
19762                             SvREFCNT_dec(intersection);
19763                             goto not_anyof;
19764                         }
19765                     }
19766                 }
19767             }
19768             SvREFCNT_dec(d_invlist);
19769             SvREFCNT_dec(intersection);
19770         }
19771 
19772         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19773          * both in size and speed.  Currently, a 20 bit range base (smallest
19774          * code point in the range), and a 12 bit maximum delta are packed into
19775          * a 32 bit word.  This allows for using it on all of the Unicode code
19776          * points except for the highest plane, which is only for private use
19777          * code points.  khw doubts that a bigger delta is likely in real world
19778          * applications */
19779         if (     single_range
19780             && ! has_runtime_dependency
19781             &&   anyof_flags == 0
19782             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19783             &&   end[0] - start[0]
19784                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19785                                    * CHARBITS - ANYOFR_BASE_BITS))))
19786 
19787         {
19788             U8 low_utf8[UTF8_MAXBYTES+1];
19789             U8 high_utf8[UTF8_MAXBYTES+1];
19790 
19791             ret = reganode(pRExC_state, ANYOFR,
19792                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19793 
19794             /* Place the lowest UTF-8 start byte in the flags field, so as to
19795              * allow efficient ruling out at run time of many possible inputs.
19796              * */
19797             (void) uvchr_to_utf8(low_utf8, start[0]);
19798             (void) uvchr_to_utf8(high_utf8, end[0]);
19799 
19800             /* If all code points share the same first byte, this can be an
19801              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19802              * quickly rule out many inputs at run-time without having to
19803              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19804              * not doing that transformation would not rule out nearly so many
19805              * things */
19806             if (low_utf8[0] == high_utf8[0]) {
19807                 OP(REGNODE_p(ret)) = ANYOFRb;
19808                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19809             }
19810             else {
19811                 ANYOF_FLAGS(REGNODE_p(ret))
19812                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19813             }
19814 
19815             goto not_anyof;
19816         }
19817 
19818         /* If didn't find an optimization and there is no need for a bitmap,
19819          * optimize to indicate that */
19820         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19821             && ! LOC
19822             && ! upper_latin1_only_utf8_matches
19823             &&   anyof_flags == 0)
19824         {
19825             U8 low_utf8[UTF8_MAXBYTES+1];
19826             UV highest_cp = invlist_highest(cp_list);
19827 
19828             /* Currently the maximum allowed code point by the system is
19829              * IV_MAX.  Higher ones are reserved for future internal use.  This
19830              * particular regnode can be used for higher ones, but we can't
19831              * calculate the code point of those.  IV_MAX suffices though, as
19832              * it will be a large first byte */
19833             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19834                            - low_utf8;
19835 
19836             /* We store the lowest possible first byte of the UTF-8
19837              * representation, using the flags field.  This allows for quick
19838              * ruling out of some inputs without having to convert from UTF-8
19839              * to code point.  For EBCDIC, we use I8, as not doing that
19840              * transformation would not rule out nearly so many things */
19841             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19842 
19843             op = ANYOFH;
19844 
19845             /* If the first UTF-8 start byte for the highest code point in the
19846              * range is suitably small, we may be able to get an upper bound as
19847              * well */
19848             if (highest_cp <= IV_MAX) {
19849                 U8 high_utf8[UTF8_MAXBYTES+1];
19850                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19851                                 - high_utf8;
19852 
19853                 /* If the lowest and highest are the same, we can get an exact
19854                  * first byte instead of a just minimum or even a sequence of
19855                  * exact leading bytes.  We signal these with different
19856                  * regnodes */
19857                 if (low_utf8[0] == high_utf8[0]) {
19858                     Size_t len = find_first_differing_byte_pos(low_utf8,
19859                                                                high_utf8,
19860                                                        MIN(low_len, high_len));
19861 
19862                     if (len == 1) {
19863 
19864                         /* No need to convert to I8 for EBCDIC as this is an
19865                          * exact match */
19866                         anyof_flags = low_utf8[0];
19867                         op = ANYOFHb;
19868                     }
19869                     else {
19870                         op = ANYOFHs;
19871                         ret = regnode_guts(pRExC_state, op,
19872                                            regarglen[op] + STR_SZ(len),
19873                                            "anyofhs");
19874                         FILL_NODE(ret, op);
19875                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19876                                                                         = len;
19877                         Copy(low_utf8,  /* Add the common bytes */
19878                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19879                            len, U8);
19880                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19881                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19882                                                   NULL, only_utf8_locale_list);
19883                         goto not_anyof;
19884                     }
19885                 }
19886                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19887                 {
19888 
19889                     /* Here, the high byte is not the same as the low, but is
19890                      * small enough that its reasonable to have a loose upper
19891                      * bound, which is packed in with the strict lower bound.
19892                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19893                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19894                      * is the same thing as UTF-8 */
19895 
19896                     U8 bits = 0;
19897                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19898                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19899                                   - anyof_flags;
19900 
19901                     if (range_diff <= max_range_diff / 8) {
19902                         bits = 3;
19903                     }
19904                     else if (range_diff <= max_range_diff / 4) {
19905                         bits = 2;
19906                     }
19907                     else if (range_diff <= max_range_diff / 2) {
19908                         bits = 1;
19909                     }
19910                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19911                     op = ANYOFHr;
19912                 }
19913             }
19914 
19915             goto done_finding_op;
19916         }
19917     }   /* End of seeing if can optimize it into a different node */
19918 
19919   is_anyof: /* It's going to be an ANYOF node. */
19920     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19921          ? ANYOFD
19922          : ((posixl)
19923             ? ANYOFPOSIXL
19924             : ((LOC)
19925                ? ANYOFL
19926                : ANYOF));
19927 
19928   done_finding_op:
19929 
19930     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19931     FILL_NODE(ret, op);        /* We set the argument later */
19932     RExC_emit += 1 + regarglen[op];
19933     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19934 
19935     /* Here, <cp_list> contains all the code points we can determine at
19936      * compile time that match under all conditions.  Go through it, and
19937      * for things that belong in the bitmap, put them there, and delete from
19938      * <cp_list>.  While we are at it, see if everything above 255 is in the
19939      * list, and if so, set a flag to speed up execution */
19940 
19941     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19942 
19943     if (posixl) {
19944         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19945     }
19946 
19947     if (invert) {
19948         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19949     }
19950 
19951     /* Here, the bitmap has been populated with all the Latin1 code points that
19952      * always match.  Can now add to the overall list those that match only
19953      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19954      * */
19955     if (upper_latin1_only_utf8_matches) {
19956 	if (cp_list) {
19957 	    _invlist_union(cp_list,
19958                            upper_latin1_only_utf8_matches,
19959                            &cp_list);
19960 	    SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19961 	}
19962 	else {
19963 	    cp_list = upper_latin1_only_utf8_matches;
19964 	}
19965         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19966     }
19967 
19968     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19969                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19970                    ? listsv
19971                    : NULL,
19972                   only_utf8_locale_list);
19973     SvREFCNT_dec(cp_list);;
19974     SvREFCNT_dec(only_utf8_locale_list);
19975     return ret;
19976 
19977   not_anyof:
19978 
19979     /* Here, the node is getting optimized into something that's not an ANYOF
19980      * one.  Finish up. */
19981 
19982     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19983                                            RExC_parse - orig_parse);;
19984     SvREFCNT_dec(cp_list);;
19985     SvREFCNT_dec(only_utf8_locale_list);
19986     return ret;
19987 }
19988 
19989 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19990 
19991 STATIC void
S_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)19992 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19993                 regnode* const node,
19994                 SV* const cp_list,
19995                 SV* const runtime_defns,
19996                 SV* const only_utf8_locale_list)
19997 {
19998     /* Sets the arg field of an ANYOF-type node 'node', using information about
19999      * the node passed-in.  If there is nothing outside the node's bitmap, the
20000      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
20001      * the count returned by add_data(), having allocated and stored an array,
20002      * av, as follows:
20003      *
20004      *  av[0] stores the inversion list defining this class as far as known at
20005      *        this time, or PL_sv_undef if nothing definite is now known.
20006      *  av[1] stores the inversion list of code points that match only if the
20007      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
20008      *        av[2], or no entry otherwise.
20009      *  av[2] stores the list of user-defined properties whose subroutine
20010      *        definitions aren't known at this time, or no entry if none. */
20011 
20012     UV n;
20013 
20014     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20015 
20016     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
20017         assert(! (ANYOF_FLAGS(node)
20018                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
20019 	ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
20020     }
20021     else {
20022 	AV * const av = newAV();
20023 	SV *rv;
20024 
20025         if (cp_list) {
20026             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20027         }
20028 
20029         if (only_utf8_locale_list) {
20030             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20031                                      SvREFCNT_inc_NN(only_utf8_locale_list));
20032         }
20033 
20034         if (runtime_defns) {
20035             av_store(av, DEFERRED_USER_DEFINED_INDEX,
20036                          SvREFCNT_inc_NN(runtime_defns));
20037         }
20038 
20039 	rv = newRV_noinc(MUTABLE_SV(av));
20040 	n = add_data(pRExC_state, STR_WITH_LEN("s"));
20041 	RExC_rxi->data->data[n] = (void*)rv;
20042 	ARG_SET(node, n);
20043     }
20044 }
20045 
20046 SV *
20047 
20048 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_nonbitmap_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)20049 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20050 #else
20051 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20052 #endif
20053 
20054 {
20055     /* For internal core use only.
20056      * Returns the inversion list for the input 'node' in the regex 'prog'.
20057      * If <doinit> is 'true', will attempt to create the inversion list if not
20058      *    already done.
20059      * If <listsvp> is non-null, will return the printable contents of the
20060      *    property definition.  This can be used to get debugging information
20061      *    even before the inversion list exists, by calling this function with
20062      *    'doinit' set to false, in which case the components that will be used
20063      *    to eventually create the inversion list are returned  (in a printable
20064      *    form).
20065      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20066      *    store an inversion list of code points that should match only if the
20067      *    execution-time locale is a UTF-8 one.
20068      * If <output_invlist> is not NULL, it is where this routine is to store an
20069      *    inversion list of the code points that would be instead returned in
20070      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
20071      *    when this parameter is used, is just the non-code point data that
20072      *    will go into creating the inversion list.  This currently should be just
20073      *    user-defined properties whose definitions were not known at compile
20074      *    time.  Using this parameter allows for easier manipulation of the
20075      *    inversion list's data by the caller.  It is illegal to call this
20076      *    function with this parameter set, but not <listsvp>
20077      *
20078      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
20079      * that, in spite of this function's name, the inversion list it returns
20080      * may include the bitmap data as well */
20081 
20082     SV *si  = NULL;         /* Input initialization string */
20083     SV* invlist = NULL;
20084 
20085     RXi_GET_DECL(prog, progi);
20086     const struct reg_data * const data = prog ? progi->data : NULL;
20087 
20088 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20089     PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20090 #else
20091     PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20092 #endif
20093     assert(! output_invlist || listsvp);
20094 
20095     if (data && data->count) {
20096 	const U32 n = ARG(node);
20097 
20098 	if (data->what[n] == 's') {
20099 	    SV * const rv = MUTABLE_SV(data->data[n]);
20100 	    AV * const av = MUTABLE_AV(SvRV(rv));
20101 	    SV **const ary = AvARRAY(av);
20102 
20103             invlist = ary[INVLIST_INDEX];
20104 
20105             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20106                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20107             }
20108 
20109             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20110                 si = ary[DEFERRED_USER_DEFINED_INDEX];
20111             }
20112 
20113 	    if (doinit && (si || invlist)) {
20114                 if (si) {
20115                     bool user_defined;
20116                     SV * msg = newSVpvs_flags("", SVs_TEMP);
20117 
20118                     SV * prop_definition = handle_user_defined_property(
20119                             "", 0, FALSE,   /* There is no \p{}, \P{} */
20120                             SvPVX_const(si)[1] - '0',   /* /i or not has been
20121                                                            stored here for just
20122                                                            this occasion */
20123                             TRUE,           /* run time */
20124                             FALSE,          /* This call must find the defn */
20125                             si,             /* The property definition  */
20126                             &user_defined,
20127                             msg,
20128                             0               /* base level call */
20129                            );
20130 
20131                     if (SvCUR(msg)) {
20132                         assert(prop_definition == NULL);
20133 
20134                         Perl_croak(aTHX_ "%" UTF8f,
20135                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20136                     }
20137 
20138                     if (invlist) {
20139                         _invlist_union(invlist, prop_definition, &invlist);
20140                         SvREFCNT_dec_NN(prop_definition);
20141                     }
20142                     else {
20143                         invlist = prop_definition;
20144                     }
20145 
20146                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20147                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20148 
20149                     ary[INVLIST_INDEX] = invlist;
20150                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20151                                  ? ONLY_LOCALE_MATCHES_INDEX
20152                                  : INVLIST_INDEX);
20153                     si = NULL;
20154                 }
20155 	    }
20156 	}
20157     }
20158 
20159     /* If requested, return a printable version of what this ANYOF node matches
20160      * */
20161     if (listsvp) {
20162 	SV* matches_string = NULL;
20163 
20164         /* This function can be called at compile-time, before everything gets
20165          * resolved, in which case we return the currently best available
20166          * information, which is the string that will eventually be used to do
20167          * that resolving, 'si' */
20168 	if (si) {
20169             /* Here, we only have 'si' (and possibly some passed-in data in
20170              * 'invlist', which is handled below)  If the caller only wants
20171              * 'si', use that.  */
20172             if (! output_invlist) {
20173                 matches_string = newSVsv(si);
20174             }
20175             else {
20176                 /* But if the caller wants an inversion list of the node, we
20177                  * need to parse 'si' and place as much as possible in the
20178                  * desired output inversion list, making 'matches_string' only
20179                  * contain the currently unresolvable things */
20180                 const char *si_string = SvPVX(si);
20181                 STRLEN remaining = SvCUR(si);
20182                 UV prev_cp = 0;
20183                 U8 count = 0;
20184 
20185                 /* Ignore everything before and including the first new-line */
20186                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20187                 assert (si_string != NULL);
20188                 si_string++;
20189                 remaining = SvPVX(si) + SvCUR(si) - si_string;
20190 
20191                 while (remaining > 0) {
20192 
20193                     /* The data consists of just strings defining user-defined
20194                      * property names, but in prior incarnations, and perhaps
20195                      * somehow from pluggable regex engines, it could still
20196                      * hold hex code point definitions, all of which should be
20197                      * legal (or it wouldn't have gotten this far).  Each
20198                      * component of a range would be separated by a tab, and
20199                      * each range by a new-line.  If these are found, instead
20200                      * add them to the inversion list */
20201                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
20202                                      |PERL_SCAN_SILENT_NON_PORTABLE;
20203                     STRLEN len = remaining;
20204                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20205 
20206                     /* If the hex decode routine found something, it should go
20207                      * up to the next \n */
20208                     if (   *(si_string + len) == '\n') {
20209                         if (count) {    /* 2nd code point on line */
20210                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20211                         }
20212                         else {
20213                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20214                         }
20215                         count = 0;
20216                         goto prepare_for_next_iteration;
20217                     }
20218 
20219                     /* If the hex decode was instead for the lower range limit,
20220                      * save it, and go parse the upper range limit */
20221                     if (*(si_string + len) == '\t') {
20222                         assert(count == 0);
20223 
20224                         prev_cp = cp;
20225                         count = 1;
20226                       prepare_for_next_iteration:
20227                         si_string += len + 1;
20228                         remaining -= len + 1;
20229                         continue;
20230                     }
20231 
20232                     /* Here, didn't find a legal hex number.  Just add the text
20233                      * from here up to the next \n, omitting any trailing
20234                      * markers. */
20235 
20236                     remaining -= len;
20237                     len = strcspn(si_string,
20238                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20239                     remaining -= len;
20240                     if (matches_string) {
20241                         sv_catpvn(matches_string, si_string, len);
20242                     }
20243                     else {
20244                         matches_string = newSVpvn(si_string, len);
20245                     }
20246                     sv_catpvs(matches_string, " ");
20247 
20248                     si_string += len;
20249                     if (   remaining
20250                         && UCHARAT(si_string)
20251                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20252                     {
20253                         si_string++;
20254                         remaining--;
20255                     }
20256                     if (remaining && UCHARAT(si_string) == '\n') {
20257                         si_string++;
20258                         remaining--;
20259                     }
20260                 } /* end of loop through the text */
20261 
20262                 assert(matches_string);
20263                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
20264                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20265                 }
20266             } /* end of has an 'si' */
20267 	}
20268 
20269         /* Add the stuff that's already known */
20270         if (invlist) {
20271 
20272             /* Again, if the caller doesn't want the output inversion list, put
20273              * everything in 'matches-string' */
20274             if (! output_invlist) {
20275                 if ( ! matches_string) {
20276                     matches_string = newSVpvs("\n");
20277                 }
20278                 sv_catsv(matches_string, invlist_contents(invlist,
20279                                                   TRUE /* traditional style */
20280                                                   ));
20281             }
20282             else if (! *output_invlist) {
20283                 *output_invlist = invlist_clone(invlist, NULL);
20284             }
20285             else {
20286                 _invlist_union(*output_invlist, invlist, output_invlist);
20287             }
20288         }
20289 
20290 	*listsvp = matches_string;
20291     }
20292 
20293     return invlist;
20294 }
20295 
20296 /* reg_skipcomment()
20297 
20298    Absorbs an /x style # comment from the input stream,
20299    returning a pointer to the first character beyond the comment, or if the
20300    comment terminates the pattern without anything following it, this returns
20301    one past the final character of the pattern (in other words, RExC_end) and
20302    sets the REG_RUN_ON_COMMENT_SEEN flag.
20303 
20304    Note it's the callers responsibility to ensure that we are
20305    actually in /x mode
20306 
20307 */
20308 
20309 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)20310 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20311 {
20312     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20313 
20314     assert(*p == '#');
20315 
20316     while (p < RExC_end) {
20317         if (*(++p) == '\n') {
20318             return p+1;
20319         }
20320     }
20321 
20322     /* we ran off the end of the pattern without ending the comment, so we have
20323      * to add an \n when wrapping */
20324     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20325     return p;
20326 }
20327 
20328 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)20329 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20330                                 char ** p,
20331                                 const bool force_to_xmod
20332                          )
20333 {
20334     /* If the text at the current parse position '*p' is a '(?#...)' comment,
20335      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20336      * is /x whitespace, advance '*p' so that on exit it points to the first
20337      * byte past all such white space and comments */
20338 
20339     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20340 
20341     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20342 
20343     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20344 
20345     for (;;) {
20346 	if (RExC_end - (*p) >= 3
20347 	    && *(*p)     == '('
20348 	    && *(*p + 1) == '?'
20349 	    && *(*p + 2) == '#')
20350 	{
20351 	    while (*(*p) != ')') {
20352 		if ((*p) == RExC_end)
20353 		    FAIL("Sequence (?#... not terminated");
20354 		(*p)++;
20355 	    }
20356 	    (*p)++;
20357 	    continue;
20358 	}
20359 
20360 	if (use_xmod) {
20361             const char * save_p = *p;
20362             while ((*p) < RExC_end) {
20363                 STRLEN len;
20364                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20365                     (*p) += len;
20366                 }
20367                 else if (*(*p) == '#') {
20368                     (*p) = reg_skipcomment(pRExC_state, (*p));
20369                 }
20370                 else {
20371                     break;
20372                 }
20373             }
20374             if (*p != save_p) {
20375                 continue;
20376             }
20377 	}
20378 
20379         break;
20380     }
20381 
20382     return;
20383 }
20384 
20385 /* nextchar()
20386 
20387    Advances the parse position by one byte, unless that byte is the beginning
20388    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20389    those two cases, the parse position is advanced beyond all such comments and
20390    white space.
20391 
20392    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20393 */
20394 
20395 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)20396 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20397 {
20398     PERL_ARGS_ASSERT_NEXTCHAR;
20399 
20400     if (RExC_parse < RExC_end) {
20401         assert(   ! UTF
20402                || UTF8_IS_INVARIANT(*RExC_parse)
20403                || UTF8_IS_START(*RExC_parse));
20404 
20405         RExC_parse += (UTF)
20406                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20407                       : 1;
20408 
20409         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20410                                 FALSE /* Don't force /x */ );
20411     }
20412 }
20413 
20414 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)20415 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20416 {
20417     /* 'size' is the delta number of smallest regnode equivalents to add or
20418      * subtract from the current memory allocated to the regex engine being
20419      * constructed. */
20420 
20421     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20422 
20423     RExC_size += size;
20424 
20425     Renewc(RExC_rxi,
20426            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20427                                                 /* +1 for REG_MAGIC */
20428            char,
20429            regexp_internal);
20430     if ( RExC_rxi == NULL )
20431 	FAIL("Regexp out of space");
20432     RXi_SET(RExC_rx, RExC_rxi);
20433 
20434     RExC_emit_start = RExC_rxi->program;
20435     if (size > 0) {
20436         Zero(REGNODE_p(RExC_emit), size, regnode);
20437     }
20438 
20439 #ifdef RE_TRACK_PATTERN_OFFSETS
20440     Renew(RExC_offsets, 2*RExC_size+1, U32);
20441     if (size > 0) {
20442         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20443     }
20444     RExC_offsets[0] = RExC_size;
20445 #endif
20446 }
20447 
20448 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size,const char * const name)20449 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20450 {
20451     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20452      * equivalents space.  It aligns and increments RExC_size
20453      *
20454      * It returns the regnode's offset into the regex engine program */
20455 
20456     const regnode_offset ret = RExC_emit;
20457 
20458     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20459 
20460     PERL_ARGS_ASSERT_REGNODE_GUTS;
20461 
20462     SIZE_ALIGN(RExC_size);
20463     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20464     NODE_ALIGN_FILL(REGNODE_p(ret));
20465 #ifndef RE_TRACK_PATTERN_OFFSETS
20466     PERL_UNUSED_ARG(name);
20467     PERL_UNUSED_ARG(op);
20468 #else
20469     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20470 
20471     if (RExC_offsets) {         /* MJD */
20472 	MJD_OFFSET_DEBUG(
20473               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20474               name, __LINE__,
20475               PL_reg_name[op],
20476               (UV)(RExC_emit) > RExC_offsets[0]
20477 		? "Overwriting end of array!\n" : "OK",
20478               (UV)(RExC_emit),
20479               (UV)(RExC_parse - RExC_start),
20480               (UV)RExC_offsets[0]));
20481 	Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20482     }
20483 #endif
20484     return(ret);
20485 }
20486 
20487 /*
20488 - reg_node - emit a node
20489 */
20490 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)20491 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20492 {
20493     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20494     regnode_offset ptr = ret;
20495 
20496     PERL_ARGS_ASSERT_REG_NODE;
20497 
20498     assert(regarglen[op] == 0);
20499 
20500     FILL_ADVANCE_NODE(ptr, op);
20501     RExC_emit = ptr;
20502     return(ret);
20503 }
20504 
20505 /*
20506 - reganode - emit a node with an argument
20507 */
20508 STATIC regnode_offset /* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)20509 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20510 {
20511     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20512     regnode_offset ptr = ret;
20513 
20514     PERL_ARGS_ASSERT_REGANODE;
20515 
20516     /* ANYOF are special cased to allow non-length 1 args */
20517     assert(regarglen[op] == 1);
20518 
20519     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20520     RExC_emit = ptr;
20521     return(ret);
20522 }
20523 
20524 /*
20525 - regpnode - emit a temporary node with a SV* argument
20526 */
20527 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)20528 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20529 {
20530     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20531     regnode_offset ptr = ret;
20532 
20533     PERL_ARGS_ASSERT_REGPNODE;
20534 
20535     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20536     RExC_emit = ptr;
20537     return(ret);
20538 }
20539 
20540 STATIC regnode_offset
S_reg2Lanode(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)20541 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20542 {
20543     /* emit a node with U32 and I32 arguments */
20544 
20545     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20546     regnode_offset ptr = ret;
20547 
20548     PERL_ARGS_ASSERT_REG2LANODE;
20549 
20550     assert(regarglen[op] == 2);
20551 
20552     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20553     RExC_emit = ptr;
20554     return(ret);
20555 }
20556 
20557 /*
20558 - reginsert - insert an operator in front of already-emitted operand
20559 *
20560 * That means that on exit 'operand' is the offset of the newly inserted
20561 * operator, and the original operand has been relocated.
20562 *
20563 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20564 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20565 *
20566 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20567 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20568 *
20569 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20570 */
20571 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)20572 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20573                   const regnode_offset operand, const U32 depth)
20574 {
20575     regnode *src;
20576     regnode *dst;
20577     regnode *place;
20578     const int offset = regarglen[(U8)op];
20579     const int size = NODE_STEP_REGNODE + offset;
20580     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20581 
20582     PERL_ARGS_ASSERT_REGINSERT;
20583     PERL_UNUSED_CONTEXT;
20584     PERL_UNUSED_ARG(depth);
20585 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20586     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20587     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20588                                     studying. If this is wrong then we need to adjust RExC_recurse
20589                                     below like we do with RExC_open_parens/RExC_close_parens. */
20590     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20591     src = REGNODE_p(RExC_emit);
20592     RExC_emit += size;
20593     dst = REGNODE_p(RExC_emit);
20594 
20595     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20596      * and [perl #133871] shows this can lead to problems, so skip this
20597      * realignment of parens until a later pass when they are reliable */
20598     if (! IN_PARENS_PASS && RExC_open_parens) {
20599         int paren;
20600         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20601         /* remember that RExC_npar is rex->nparens + 1,
20602          * iow it is 1 more than the number of parens seen in
20603          * the pattern so far. */
20604         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20605             /* note, RExC_open_parens[0] is the start of the
20606              * regex, it can't move. RExC_close_parens[0] is the end
20607              * of the regex, it *can* move. */
20608             if ( paren && RExC_open_parens[paren] >= operand ) {
20609                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20610                 RExC_open_parens[paren] += size;
20611             } else {
20612                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20613             }
20614             if ( RExC_close_parens[paren] >= operand ) {
20615                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20616                 RExC_close_parens[paren] += size;
20617             } else {
20618                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20619             }
20620         }
20621     }
20622     if (RExC_end_op)
20623         RExC_end_op += size;
20624 
20625     while (src > REGNODE_p(operand)) {
20626 	StructCopy(--src, --dst, regnode);
20627 #ifdef RE_TRACK_PATTERN_OFFSETS
20628         if (RExC_offsets) {     /* MJD 20010112 */
20629 	    MJD_OFFSET_DEBUG(
20630                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20631                   "reginsert",
20632 		  __LINE__,
20633 		  PL_reg_name[op],
20634                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20635 		    ? "Overwriting end of array!\n" : "OK",
20636                   (UV)REGNODE_OFFSET(src),
20637                   (UV)REGNODE_OFFSET(dst),
20638                   (UV)RExC_offsets[0]));
20639 	    Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20640 	    Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20641         }
20642 #endif
20643     }
20644 
20645     place = REGNODE_p(operand);	/* Op node, where operand used to be. */
20646 #ifdef RE_TRACK_PATTERN_OFFSETS
20647     if (RExC_offsets) {         /* MJD */
20648 	MJD_OFFSET_DEBUG(
20649               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20650               "reginsert",
20651 	      __LINE__,
20652 	      PL_reg_name[op],
20653               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20654               ? "Overwriting end of array!\n" : "OK",
20655               (UV)REGNODE_OFFSET(place),
20656               (UV)(RExC_parse - RExC_start),
20657               (UV)RExC_offsets[0]));
20658 	Set_Node_Offset(place, RExC_parse);
20659 	Set_Node_Length(place, 1);
20660     }
20661 #endif
20662     src = NEXTOPER(place);
20663     FLAGS(place) = 0;
20664     FILL_NODE(operand, op);
20665 
20666     /* Zero out any arguments in the new node */
20667     Zero(src, offset, regnode);
20668 }
20669 
20670 /*
20671 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20672             that value won't fit in the space available, instead returns FALSE.
20673             (Except asserts if we can't fit in the largest space the regex
20674             engine is designed for.)
20675 - SEE ALSO: regtail_study
20676 */
20677 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)20678 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20679                 const regnode_offset p,
20680                 const regnode_offset val,
20681                 const U32 depth)
20682 {
20683     regnode_offset scan;
20684     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20685 
20686     PERL_ARGS_ASSERT_REGTAIL;
20687 #ifndef DEBUGGING
20688     PERL_UNUSED_ARG(depth);
20689 #endif
20690 
20691     /* Find last node. */
20692     scan = (regnode_offset) p;
20693     for (;;) {
20694 	regnode * const temp = regnext(REGNODE_p(scan));
20695         DEBUG_PARSE_r({
20696             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20697             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20698             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
20699                 SvPV_nolen_const(RExC_mysv), scan,
20700                     (temp == NULL ? "->" : ""),
20701                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20702             );
20703         });
20704         if (temp == NULL)
20705             break;
20706         scan = REGNODE_OFFSET(temp);
20707     }
20708 
20709     assert(val >= scan);
20710     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20711         assert((UV) (val - scan) <= U32_MAX);
20712         ARG_SET(REGNODE_p(scan), val - scan);
20713     }
20714     else {
20715         if (val - scan > U16_MAX) {
20716             /* Populate this with something that won't loop and will likely
20717              * lead to a crash if the caller ignores the failure return, and
20718              * execution continues */
20719             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20720             return FALSE;
20721         }
20722         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20723     }
20724 
20725     return TRUE;
20726 }
20727 
20728 #ifdef DEBUGGING
20729 /*
20730 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20731 - Look for optimizable sequences at the same time.
20732 - currently only looks for EXACT chains.
20733 
20734 This is experimental code. The idea is to use this routine to perform
20735 in place optimizations on branches and groups as they are constructed,
20736 with the long term intention of removing optimization from study_chunk so
20737 that it is purely analytical.
20738 
20739 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20740 to control which is which.
20741 
20742 This used to return a value that was ignored.  It was a problem that it is
20743 #ifdef'd to be another function that didn't return a value.  khw has changed it
20744 so both currently return a pass/fail return.
20745 
20746 */
20747 /* TODO: All four parms should be const */
20748 
20749 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)20750 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20751                       const regnode_offset val, U32 depth)
20752 {
20753     regnode_offset scan;
20754     U8 exact = PSEUDO;
20755 #ifdef EXPERIMENTAL_INPLACESCAN
20756     I32 min = 0;
20757 #endif
20758     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20759 
20760     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20761 
20762 
20763     /* Find last node. */
20764 
20765     scan = p;
20766     for (;;) {
20767         regnode * const temp = regnext(REGNODE_p(scan));
20768 #ifdef EXPERIMENTAL_INPLACESCAN
20769         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20770 	    bool unfolded_multi_char;	/* Unexamined in this routine */
20771             if (join_exact(pRExC_state, scan, &min,
20772                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20773                 return TRUE; /* Was return EXACT */
20774 	}
20775 #endif
20776         if ( exact ) {
20777             switch (OP(REGNODE_p(scan))) {
20778                 case LEXACT:
20779                 case EXACT:
20780                 case LEXACT_REQ8:
20781                 case EXACT_REQ8:
20782                 case EXACTL:
20783                 case EXACTF:
20784                 case EXACTFU_S_EDGE:
20785                 case EXACTFAA_NO_TRIE:
20786                 case EXACTFAA:
20787                 case EXACTFU:
20788                 case EXACTFU_REQ8:
20789                 case EXACTFLU8:
20790                 case EXACTFUP:
20791                 case EXACTFL:
20792                         if( exact == PSEUDO )
20793                             exact= OP(REGNODE_p(scan));
20794                         else if ( exact != OP(REGNODE_p(scan)) )
20795                             exact= 0;
20796                 case NOTHING:
20797                     break;
20798                 default:
20799                     exact= 0;
20800             }
20801         }
20802         DEBUG_PARSE_r({
20803             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20804             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20805             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
20806                 SvPV_nolen_const(RExC_mysv),
20807                 scan,
20808                 PL_reg_name[exact]);
20809         });
20810 	if (temp == NULL)
20811 	    break;
20812 	scan = REGNODE_OFFSET(temp);
20813     }
20814     DEBUG_PARSE_r({
20815         DEBUG_PARSE_MSG("");
20816         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20817         Perl_re_printf( aTHX_
20818                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20819 		      SvPV_nolen_const(RExC_mysv),
20820 		      (IV)val,
20821 		      (IV)(val - scan)
20822         );
20823     });
20824     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20825         assert((UV) (val - scan) <= U32_MAX);
20826 	ARG_SET(REGNODE_p(scan), val - scan);
20827     }
20828     else {
20829         if (val - scan > U16_MAX) {
20830             /* Populate this with something that won't loop and will likely
20831              * lead to a crash if the caller ignores the failure return, and
20832              * execution continues */
20833             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20834             return FALSE;
20835         }
20836 	NEXT_OFF(REGNODE_p(scan)) = val - scan;
20837     }
20838 
20839     return TRUE; /* Was 'return exact' */
20840 }
20841 #endif
20842 
20843 STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n)20844 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20845 
20846     /* Returns an inversion list of all the code points matched by the
20847      * ANYOFM/NANYOFM node 'n' */
20848 
20849     SV * cp_list = _new_invlist(-1);
20850     const U8 lowest = (U8) ARG(n);
20851     unsigned int i;
20852     U8 count = 0;
20853     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20854 
20855     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20856 
20857     /* Starting with the lowest code point, any code point that ANDed with the
20858      * mask yields the lowest code point is in the set */
20859     for (i = lowest; i <= 0xFF; i++) {
20860         if ((i & FLAGS(n)) == ARG(n)) {
20861             cp_list = add_cp_to_invlist(cp_list, i);
20862             count++;
20863 
20864             /* We know how many code points (a power of two) that are in the
20865              * set.  No use looking once we've got that number */
20866             if (count >= needed) break;
20867         }
20868     }
20869 
20870     if (OP(n) == NANYOFM) {
20871         _invlist_invert(cp_list);
20872     }
20873     return cp_list;
20874 }
20875 
20876 /*
20877  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20878  */
20879 #ifdef DEBUGGING
20880 
20881 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)20882 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20883 {
20884     int bit;
20885     int set=0;
20886 
20887     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20888 
20889     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20890         if (flags & (1<<bit)) {
20891             if (!set++ && lead)
20892                 Perl_re_printf( aTHX_  "%s", lead);
20893             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20894         }
20895     }
20896     if (lead)  {
20897         if (set)
20898             Perl_re_printf( aTHX_  "\n");
20899         else
20900             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20901     }
20902 }
20903 
20904 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)20905 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20906 {
20907     int bit;
20908     int set=0;
20909     regex_charset cs;
20910 
20911     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20912 
20913     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20914         if (flags & (1<<bit)) {
20915 	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
20916 		continue;
20917 	    }
20918             if (!set++ && lead)
20919                 Perl_re_printf( aTHX_  "%s", lead);
20920             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20921         }
20922     }
20923     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20924             if (!set++ && lead) {
20925                 Perl_re_printf( aTHX_  "%s", lead);
20926             }
20927             switch (cs) {
20928                 case REGEX_UNICODE_CHARSET:
20929                     Perl_re_printf( aTHX_  "UNICODE");
20930                     break;
20931                 case REGEX_LOCALE_CHARSET:
20932                     Perl_re_printf( aTHX_  "LOCALE");
20933                     break;
20934                 case REGEX_ASCII_RESTRICTED_CHARSET:
20935                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20936                     break;
20937                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20938                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20939                     break;
20940                 default:
20941                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20942                     break;
20943             }
20944     }
20945     if (lead)  {
20946         if (set)
20947             Perl_re_printf( aTHX_  "\n");
20948         else
20949             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20950     }
20951 }
20952 #endif
20953 
20954 void
Perl_regdump(pTHX_ const regexp * r)20955 Perl_regdump(pTHX_ const regexp *r)
20956 {
20957 #ifdef DEBUGGING
20958     int i;
20959     SV * const sv = sv_newmortal();
20960     SV *dsv= sv_newmortal();
20961     RXi_GET_DECL(r, ri);
20962     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20963 
20964     PERL_ARGS_ASSERT_REGDUMP;
20965 
20966     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20967 
20968     /* Header fields of interest. */
20969     for (i = 0; i < 2; i++) {
20970         if (r->substrs->data[i].substr) {
20971             RE_PV_QUOTED_DECL(s, 0, dsv,
20972                             SvPVX_const(r->substrs->data[i].substr),
20973                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20974                             PL_dump_re_max_len);
20975             Perl_re_printf( aTHX_
20976                           "%s %s%s at %" IVdf "..%" UVuf " ",
20977                           i ? "floating" : "anchored",
20978                           s,
20979                           RE_SV_TAIL(r->substrs->data[i].substr),
20980                           (IV)r->substrs->data[i].min_offset,
20981                           (UV)r->substrs->data[i].max_offset);
20982         }
20983         else if (r->substrs->data[i].utf8_substr) {
20984             RE_PV_QUOTED_DECL(s, 1, dsv,
20985                             SvPVX_const(r->substrs->data[i].utf8_substr),
20986                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20987                             30);
20988             Perl_re_printf( aTHX_
20989                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20990                           i ? "floating" : "anchored",
20991                           s,
20992                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20993                           (IV)r->substrs->data[i].min_offset,
20994                           (UV)r->substrs->data[i].max_offset);
20995         }
20996     }
20997 
20998     if (r->check_substr || r->check_utf8)
20999         Perl_re_printf( aTHX_
21000 		      (const char *)
21001 		      (   r->check_substr == r->substrs->data[1].substr
21002 		       && r->check_utf8   == r->substrs->data[1].utf8_substr
21003 		       ? "(checking floating" : "(checking anchored"));
21004     if (r->intflags & PREGf_NOSCAN)
21005         Perl_re_printf( aTHX_  " noscan");
21006     if (r->extflags & RXf_CHECK_ALL)
21007         Perl_re_printf( aTHX_  " isall");
21008     if (r->check_substr || r->check_utf8)
21009         Perl_re_printf( aTHX_  ") ");
21010 
21011     if (ri->regstclass) {
21012         regprop(r, sv, ri->regstclass, NULL, NULL);
21013         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
21014     }
21015     if (r->intflags & PREGf_ANCH) {
21016         Perl_re_printf( aTHX_  "anchored");
21017         if (r->intflags & PREGf_ANCH_MBOL)
21018             Perl_re_printf( aTHX_  "(MBOL)");
21019         if (r->intflags & PREGf_ANCH_SBOL)
21020             Perl_re_printf( aTHX_  "(SBOL)");
21021         if (r->intflags & PREGf_ANCH_GPOS)
21022             Perl_re_printf( aTHX_  "(GPOS)");
21023         Perl_re_printf( aTHX_ " ");
21024     }
21025     if (r->intflags & PREGf_GPOS_SEEN)
21026         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
21027     if (r->intflags & PREGf_SKIP)
21028         Perl_re_printf( aTHX_  "plus ");
21029     if (r->intflags & PREGf_IMPLICIT)
21030         Perl_re_printf( aTHX_  "implicit ");
21031     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
21032     if (r->extflags & RXf_EVAL_SEEN)
21033         Perl_re_printf( aTHX_  "with eval ");
21034     Perl_re_printf( aTHX_  "\n");
21035     DEBUG_FLAGS_r({
21036         regdump_extflags("r->extflags: ", r->extflags);
21037         regdump_intflags("r->intflags: ", r->intflags);
21038     });
21039 #else
21040     PERL_ARGS_ASSERT_REGDUMP;
21041     PERL_UNUSED_CONTEXT;
21042     PERL_UNUSED_ARG(r);
21043 #endif	/* DEBUGGING */
21044 }
21045 
21046 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21047 #ifdef DEBUGGING
21048 
21049 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
21050      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
21051      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
21052      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
21053      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
21054      || _CC_VERTSPACE != 15
21055 #   error Need to adjust order of anyofs[]
21056 #  endif
21057 static const char * const anyofs[] = {
21058     "\\w",
21059     "\\W",
21060     "\\d",
21061     "\\D",
21062     "[:alpha:]",
21063     "[:^alpha:]",
21064     "[:lower:]",
21065     "[:^lower:]",
21066     "[:upper:]",
21067     "[:^upper:]",
21068     "[:punct:]",
21069     "[:^punct:]",
21070     "[:print:]",
21071     "[:^print:]",
21072     "[:alnum:]",
21073     "[:^alnum:]",
21074     "[:graph:]",
21075     "[:^graph:]",
21076     "[:cased:]",
21077     "[:^cased:]",
21078     "\\s",
21079     "\\S",
21080     "[:blank:]",
21081     "[:^blank:]",
21082     "[:xdigit:]",
21083     "[:^xdigit:]",
21084     "[:cntrl:]",
21085     "[:^cntrl:]",
21086     "[:ascii:]",
21087     "[:^ascii:]",
21088     "\\v",
21089     "\\V"
21090 };
21091 #endif
21092 
21093 /*
21094 - regprop - printable representation of opcode, with run time support
21095 */
21096 
21097 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)21098 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21099 {
21100 #ifdef DEBUGGING
21101     dVAR;
21102     int k;
21103     RXi_GET_DECL(prog, progi);
21104     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21105 
21106     PERL_ARGS_ASSERT_REGPROP;
21107 
21108     SvPVCLEAR(sv);
21109 
21110     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
21111         if (pRExC_state) {  /* This gives more info, if we have it */
21112             FAIL3("panic: corrupted regexp opcode %d > %d",
21113                   (int)OP(o), (int)REGNODE_MAX);
21114         }
21115         else {
21116             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21117                              (int)OP(o), (int)REGNODE_MAX);
21118         }
21119     }
21120     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21121 
21122     k = PL_regkind[OP(o)];
21123 
21124     if (k == EXACT) {
21125 	sv_catpvs(sv, " ");
21126 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21127 	 * is a crude hack but it may be the best for now since
21128 	 * we have no flag "this EXACTish node was UTF-8"
21129 	 * --jhi */
21130 	pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21131                   PL_colors[0], PL_colors[1],
21132 		  PERL_PV_ESCAPE_UNI_DETECT |
21133 		  PERL_PV_ESCAPE_NONASCII   |
21134 		  PERL_PV_PRETTY_ELLIPSES   |
21135 		  PERL_PV_PRETTY_LTGT       |
21136 		  PERL_PV_PRETTY_NOCLEAR
21137 		  );
21138     } else if (k == TRIE) {
21139 	/* print the details of the trie in dumpuntil instead, as
21140 	 * progi->data isn't available here */
21141         const char op = OP(o);
21142         const U32 n = ARG(o);
21143         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21144                (reg_ac_data *)progi->data->data[n] :
21145                NULL;
21146         const reg_trie_data * const trie
21147 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21148 
21149         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21150         DEBUG_TRIE_COMPILE_r({
21151           if (trie->jump)
21152             sv_catpvs(sv, "(JUMP)");
21153           Perl_sv_catpvf(aTHX_ sv,
21154             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21155             (UV)trie->startstate,
21156             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21157             (UV)trie->wordcount,
21158             (UV)trie->minlen,
21159             (UV)trie->maxlen,
21160             (UV)TRIE_CHARCOUNT(trie),
21161             (UV)trie->uniquecharcount
21162           );
21163         });
21164         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21165             sv_catpvs(sv, "[");
21166             (void) put_charclass_bitmap_innards(sv,
21167                                                 ((IS_ANYOF_TRIE(op))
21168                                                  ? ANYOF_BITMAP(o)
21169                                                  : TRIE_BITMAP(trie)),
21170                                                 NULL,
21171                                                 NULL,
21172                                                 NULL,
21173                                                 0,
21174                                                 FALSE
21175                                                );
21176             sv_catpvs(sv, "]");
21177         }
21178     } else if (k == CURLY) {
21179         U32 lo = ARG1(o), hi = ARG2(o);
21180 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21181 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21182         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21183         if (hi == REG_INFTY)
21184             sv_catpvs(sv, "INFTY");
21185         else
21186             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21187         sv_catpvs(sv, "}");
21188     }
21189     else if (k == WHILEM && o->flags)			/* Ordinal/of */
21190 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21191     else if (k == REF || k == OPEN || k == CLOSE
21192              || k == GROUPP || OP(o)==ACCEPT)
21193     {
21194         AV *name_list= NULL;
21195         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21196         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
21197 	if ( RXp_PAREN_NAMES(prog) ) {
21198             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21199         } else if ( pRExC_state ) {
21200             name_list= RExC_paren_name_list;
21201         }
21202         if (name_list) {
21203             if ( k != REF || (OP(o) < REFN)) {
21204                 SV **name= av_fetch(name_list, parno, 0 );
21205 	        if (name)
21206 	            Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21207             }
21208             else {
21209                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21210                 I32 *nums=(I32*)SvPVX(sv_dat);
21211                 SV **name= av_fetch(name_list, nums[0], 0 );
21212                 I32 n;
21213                 if (name) {
21214                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
21215                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21216 			   	    (n ? "," : ""), (IV)nums[n]);
21217                     }
21218                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21219                 }
21220             }
21221         }
21222         if ( k == REF && reginfo) {
21223             U32 n = ARG(o);  /* which paren pair */
21224             I32 ln = prog->offs[n].start;
21225             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21226                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21227             else if (ln == prog->offs[n].end)
21228                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21229             else {
21230                 const char *s = reginfo->strbeg + ln;
21231                 Perl_sv_catpvf(aTHX_ sv, ": ");
21232                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21233                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21234             }
21235         }
21236     } else if (k == GOSUB) {
21237         AV *name_list= NULL;
21238         if ( RXp_PAREN_NAMES(prog) ) {
21239             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21240         } else if ( pRExC_state ) {
21241             name_list= RExC_paren_name_list;
21242         }
21243 
21244         /* Paren and offset */
21245         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21246                 (int)((o + (int)ARG2L(o)) - progi->program) );
21247         if (name_list) {
21248             SV **name= av_fetch(name_list, ARG(o), 0 );
21249             if (name)
21250                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21251         }
21252     }
21253     else if (k == LOGICAL)
21254         /* 2: embedded, otherwise 1 */
21255 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21256     else if (k == ANYOF || k == ANYOFR) {
21257         U8 flags;
21258         char * bitmap;
21259         U32 arg;
21260         bool do_sep = FALSE;    /* Do we need to separate various components of
21261                                    the output? */
21262         /* Set if there is still an unresolved user-defined property */
21263         SV *unresolved                = NULL;
21264 
21265         /* Things that are ignored except when the runtime locale is UTF-8 */
21266         SV *only_utf8_locale_invlist = NULL;
21267 
21268         /* Code points that don't fit in the bitmap */
21269         SV *nonbitmap_invlist = NULL;
21270 
21271         /* And things that aren't in the bitmap, but are small enough to be */
21272         SV* bitmap_range_not_in_bitmap = NULL;
21273 
21274         bool inverted;
21275 
21276         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21277             flags = 0;
21278             bitmap = NULL;
21279             arg = 0;
21280         }
21281         else {
21282             flags = ANYOF_FLAGS(o);
21283             bitmap = ANYOF_BITMAP(o);
21284             arg = ARG(o);
21285         }
21286 
21287 	if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21288             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21289                 sv_catpvs(sv, "{utf8-locale-reqd}");
21290             }
21291             if (flags & ANYOFL_FOLD) {
21292                 sv_catpvs(sv, "{i}");
21293             }
21294         }
21295 
21296         inverted = flags & ANYOF_INVERT;
21297 
21298         /* If there is stuff outside the bitmap, get it */
21299         if (arg != ANYOF_ONLY_HAS_BITMAP) {
21300             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21301                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21302                                             ANYOFRbase(o),
21303                                             ANYOFRbase(o) + ANYOFRdelta(o));
21304             }
21305             else {
21306 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21307                 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21308                                                 &unresolved,
21309                                                 &only_utf8_locale_invlist,
21310                                                 &nonbitmap_invlist);
21311 #else
21312                 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21313                                                 &unresolved,
21314                                                 &only_utf8_locale_invlist,
21315                                                 &nonbitmap_invlist);
21316 #endif
21317             }
21318 
21319             /* The non-bitmap data may contain stuff that could fit in the
21320              * bitmap.  This could come from a user-defined property being
21321              * finally resolved when this call was done; or much more likely
21322              * because there are matches that require UTF-8 to be valid, and so
21323              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
21324             _invlist_intersection(nonbitmap_invlist,
21325                                   PL_InBitmap,
21326                                   &bitmap_range_not_in_bitmap);
21327             /* Leave just the things that don't fit into the bitmap */
21328             _invlist_subtract(nonbitmap_invlist,
21329                               PL_InBitmap,
21330                               &nonbitmap_invlist);
21331         }
21332 
21333         /* Obey this flag to add all above-the-bitmap code points */
21334         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21335             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21336                                                       NUM_ANYOF_CODE_POINTS,
21337                                                       UV_MAX);
21338         }
21339 
21340         /* Ready to start outputting.  First, the initial left bracket */
21341 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21342 
21343         /* ANYOFH by definition doesn't have anything that will fit inside the
21344          * bitmap;  ANYOFR may or may not. */
21345         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21346             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21347                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21348         {
21349             /* Then all the things that could fit in the bitmap */
21350             do_sep = put_charclass_bitmap_innards(sv,
21351                                                   bitmap,
21352                                                   bitmap_range_not_in_bitmap,
21353                                                   only_utf8_locale_invlist,
21354                                                   o,
21355                                                   flags,
21356 
21357                                                   /* Can't try inverting for a
21358                                                    * better display if there
21359                                                    * are things that haven't
21360                                                    * been resolved */
21361                                                   unresolved != NULL
21362                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
21363             SvREFCNT_dec(bitmap_range_not_in_bitmap);
21364 
21365             /* If there are user-defined properties which haven't been defined
21366              * yet, output them.  If the result is not to be inverted, it is
21367              * clearest to output them in a separate [] from the bitmap range
21368              * stuff.  If the result is to be complemented, we have to show
21369              * everything in one [], as the inversion applies to the whole
21370              * thing.  Use {braces} to separate them from anything in the
21371              * bitmap and anything above the bitmap. */
21372             if (unresolved) {
21373                 if (inverted) {
21374                     if (! do_sep) { /* If didn't output anything in the bitmap
21375                                      */
21376                         sv_catpvs(sv, "^");
21377                     }
21378                     sv_catpvs(sv, "{");
21379                 }
21380                 else if (do_sep) {
21381                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21382                                                       PL_colors[0]);
21383                 }
21384                 sv_catsv(sv, unresolved);
21385                 if (inverted) {
21386                     sv_catpvs(sv, "}");
21387                 }
21388                 do_sep = ! inverted;
21389             }
21390         }
21391 
21392         /* And, finally, add the above-the-bitmap stuff */
21393         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21394             SV* contents;
21395 
21396             /* See if truncation size is overridden */
21397             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21398                                     ? PL_dump_re_max_len
21399                                     : 256;
21400 
21401             /* This is output in a separate [] */
21402             if (do_sep) {
21403                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21404             }
21405 
21406             /* And, for easy of understanding, it is shown in the
21407              * uncomplemented form if possible.  The one exception being if
21408              * there are unresolved items, where the inversion has to be
21409              * delayed until runtime */
21410             if (inverted && ! unresolved) {
21411                 _invlist_invert(nonbitmap_invlist);
21412                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21413             }
21414 
21415             contents = invlist_contents(nonbitmap_invlist,
21416                                         FALSE /* output suitable for catsv */
21417                                        );
21418 
21419             /* If the output is shorter than the permissible maximum, just do it. */
21420             if (SvCUR(contents) <= dump_len) {
21421                 sv_catsv(sv, contents);
21422             }
21423             else {
21424                 const char * contents_string = SvPVX(contents);
21425                 STRLEN i = dump_len;
21426 
21427                 /* Otherwise, start at the permissible max and work back to the
21428                  * first break possibility */
21429                 while (i > 0 && contents_string[i] != ' ') {
21430                     i--;
21431                 }
21432                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21433                                        find a legal break */
21434                     i = dump_len;
21435                 }
21436 
21437                 sv_catpvn(sv, contents_string, i);
21438                 sv_catpvs(sv, "...");
21439             }
21440 
21441             SvREFCNT_dec_NN(contents);
21442             SvREFCNT_dec_NN(nonbitmap_invlist);
21443         }
21444 
21445         /* And finally the matching, closing ']' */
21446 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21447 
21448         if (OP(o) == ANYOFHs) {
21449             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21450         }
21451         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21452             U8 lowest = (OP(o) != ANYOFHr)
21453                          ? FLAGS(o)
21454                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21455             U8 highest = (OP(o) == ANYOFHr)
21456                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21457                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21458                            ? 0xFF
21459                            : lowest;
21460 #ifndef EBCDIC
21461             if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21462 #endif
21463             {
21464                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21465                 if (lowest != highest) {
21466                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21467                 }
21468                 Perl_sv_catpvf(aTHX_ sv, ")");
21469             }
21470         }
21471 
21472         SvREFCNT_dec(unresolved);
21473     }
21474     else if (k == ANYOFM) {
21475         SV * cp_list = get_ANYOFM_contents(o);
21476 
21477 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21478         if (OP(o) == NANYOFM) {
21479             _invlist_invert(cp_list);
21480         }
21481 
21482         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21483 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21484 
21485         SvREFCNT_dec(cp_list);
21486     }
21487     else if (k == POSIXD || k == NPOSIXD) {
21488         U8 index = FLAGS(o) * 2;
21489         if (index < C_ARRAY_LENGTH(anyofs)) {
21490             if (*anyofs[index] != '[')  {
21491                 sv_catpvs(sv, "[");
21492             }
21493             sv_catpv(sv, anyofs[index]);
21494             if (*anyofs[index] != '[')  {
21495                 sv_catpvs(sv, "]");
21496             }
21497         }
21498         else {
21499             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21500         }
21501     }
21502     else if (k == BOUND || k == NBOUND) {
21503         /* Must be synced with order of 'bound_type' in regcomp.h */
21504         const char * const bounds[] = {
21505             "",      /* Traditional */
21506             "{gcb}",
21507             "{lb}",
21508             "{sb}",
21509             "{wb}"
21510         };
21511         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21512         sv_catpv(sv, bounds[FLAGS(o)]);
21513     }
21514     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21515 	Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21516         if (o->next_off) {
21517             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21518         }
21519 	Perl_sv_catpvf(aTHX_ sv, "]");
21520     }
21521     else if (OP(o) == SBOL)
21522         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21523 
21524     /* add on the verb argument if there is one */
21525     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21526         if ( ARG(o) )
21527             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21528                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21529         else
21530             sv_catpvs(sv, ":NULL");
21531     }
21532 #else
21533     PERL_UNUSED_CONTEXT;
21534     PERL_UNUSED_ARG(sv);
21535     PERL_UNUSED_ARG(o);
21536     PERL_UNUSED_ARG(prog);
21537     PERL_UNUSED_ARG(reginfo);
21538     PERL_UNUSED_ARG(pRExC_state);
21539 #endif	/* DEBUGGING */
21540 }
21541 
21542 
21543 
21544 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)21545 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21546 {				/* Assume that RE_INTUIT is set */
21547     /* Returns an SV containing a string that must appear in the target for it
21548      * to match, or NULL if nothing is known that must match.
21549      *
21550      * CAUTION: the SV can be freed during execution of the regex engine */
21551 
21552     struct regexp *const prog = ReANY(r);
21553     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21554 
21555     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21556     PERL_UNUSED_CONTEXT;
21557 
21558     DEBUG_COMPILE_r(
21559 	{
21560             if (prog->maxlen > 0) {
21561                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21562 		      ? prog->check_utf8 : prog->check_substr);
21563 
21564                 if (!PL_colorset) reginitcolors();
21565                 Perl_re_printf( aTHX_
21566 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21567 		      PL_colors[4],
21568 		      RX_UTF8(r) ? "utf8 " : "",
21569 		      PL_colors[5], PL_colors[0],
21570 		      s,
21571 		      PL_colors[1],
21572 		      (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21573             }
21574 	} );
21575 
21576     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21577     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21578 }
21579 
21580 /*
21581    pregfree()
21582 
21583    handles refcounting and freeing the perl core regexp structure. When
21584    it is necessary to actually free the structure the first thing it
21585    does is call the 'free' method of the regexp_engine associated to
21586    the regexp, allowing the handling of the void *pprivate; member
21587    first. (This routine is not overridable by extensions, which is why
21588    the extensions free is called first.)
21589 
21590    See regdupe and regdupe_internal if you change anything here.
21591 */
21592 #ifndef PERL_IN_XSUB_RE
21593 void
Perl_pregfree(pTHX_ REGEXP * r)21594 Perl_pregfree(pTHX_ REGEXP *r)
21595 {
21596     SvREFCNT_dec(r);
21597 }
21598 
21599 void
Perl_pregfree2(pTHX_ REGEXP * rx)21600 Perl_pregfree2(pTHX_ REGEXP *rx)
21601 {
21602     struct regexp *const r = ReANY(rx);
21603     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21604 
21605     PERL_ARGS_ASSERT_PREGFREE2;
21606 
21607     if (! r)
21608         return;
21609 
21610     if (r->mother_re) {
21611         ReREFCNT_dec(r->mother_re);
21612     } else {
21613         CALLREGFREE_PVT(rx); /* free the private data */
21614         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21615     }
21616     if (r->substrs) {
21617         int i;
21618         for (i = 0; i < 2; i++) {
21619             SvREFCNT_dec(r->substrs->data[i].substr);
21620             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21621         }
21622 	Safefree(r->substrs);
21623     }
21624     RX_MATCH_COPY_FREE(rx);
21625 #ifdef PERL_ANY_COW
21626     SvREFCNT_dec(r->saved_copy);
21627 #endif
21628     Safefree(r->offs);
21629     SvREFCNT_dec(r->qr_anoncv);
21630     if (r->recurse_locinput)
21631         Safefree(r->recurse_locinput);
21632 }
21633 
21634 
21635 /*  reg_temp_copy()
21636 
21637     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21638     except that dsv will be created if NULL.
21639 
21640     This function is used in two main ways. First to implement
21641         $r = qr/....; $s = $$r;
21642 
21643     Secondly, it is used as a hacky workaround to the structural issue of
21644     match results
21645     being stored in the regexp structure which is in turn stored in
21646     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21647     could be PL_curpm in multiple contexts, and could require multiple
21648     result sets being associated with the pattern simultaneously, such
21649     as when doing a recursive match with (??{$qr})
21650 
21651     The solution is to make a lightweight copy of the regexp structure
21652     when a qr// is returned from the code executed by (??{$qr}) this
21653     lightweight copy doesn't actually own any of its data except for
21654     the starp/end and the actual regexp structure itself.
21655 
21656 */
21657 
21658 
21659 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)21660 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21661 {
21662     struct regexp *drx;
21663     struct regexp *const srx = ReANY(ssv);
21664     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21665 
21666     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21667 
21668     if (!dsv)
21669 	dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21670     else {
21671         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21672 
21673         /* our only valid caller, sv_setsv_flags(), should have done
21674          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21675         assert(!SvOOK(dsv));
21676         assert(!SvIsCOW(dsv));
21677         assert(!SvROK(dsv));
21678 
21679         if (SvPVX_const(dsv)) {
21680             if (SvLEN(dsv))
21681                 Safefree(SvPVX(dsv));
21682             SvPVX(dsv) = NULL;
21683         }
21684         SvLEN_set(dsv, 0);
21685         SvCUR_set(dsv, 0);
21686 	SvOK_off((SV *)dsv);
21687 
21688 	if (islv) {
21689 	    /* For PVLVs, the head (sv_any) points to an XPVLV, while
21690              * the LV's xpvlenu_rx will point to a regexp body, which
21691              * we allocate here */
21692 	    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21693 	    assert(!SvPVX(dsv));
21694             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21695 	    temp->sv_any = NULL;
21696 	    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21697 	    SvREFCNT_dec_NN(temp);
21698 	    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21699 	       ing below will not set it. */
21700 	    SvCUR_set(dsv, SvCUR(ssv));
21701 	}
21702     }
21703     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21704        sv_force_normal(sv) is called.  */
21705     SvFAKE_on(dsv);
21706     drx = ReANY(dsv);
21707 
21708     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21709     SvPV_set(dsv, RX_WRAPPED(ssv));
21710     /* We share the same string buffer as the original regexp, on which we
21711        hold a reference count, incremented when mother_re is set below.
21712        The string pointer is copied here, being part of the regexp struct.
21713      */
21714     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21715 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21716     if (!islv)
21717         SvLEN_set(dsv, 0);
21718     if (srx->offs) {
21719         const I32 npar = srx->nparens+1;
21720         Newx(drx->offs, npar, regexp_paren_pair);
21721         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21722     }
21723     if (srx->substrs) {
21724         int i;
21725         Newx(drx->substrs, 1, struct reg_substr_data);
21726 	StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21727 
21728         for (i = 0; i < 2; i++) {
21729             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21730             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21731         }
21732 
21733 	/* check_substr and check_utf8, if non-NULL, point to either their
21734 	   anchored or float namesakes, and don't hold a second reference.  */
21735     }
21736     RX_MATCH_COPIED_off(dsv);
21737 #ifdef PERL_ANY_COW
21738     drx->saved_copy = NULL;
21739 #endif
21740     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21741     SvREFCNT_inc_void(drx->qr_anoncv);
21742     if (srx->recurse_locinput)
21743         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21744 
21745     return dsv;
21746 }
21747 #endif
21748 
21749 
21750 /* regfree_internal()
21751 
21752    Free the private data in a regexp. This is overloadable by
21753    extensions. Perl takes care of the regexp structure in pregfree(),
21754    this covers the *pprivate pointer which technically perl doesn't
21755    know about, however of course we have to handle the
21756    regexp_internal structure when no extension is in use.
21757 
21758    Note this is called before freeing anything in the regexp
21759    structure.
21760  */
21761 
21762 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)21763 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21764 {
21765     struct regexp *const r = ReANY(rx);
21766     RXi_GET_DECL(r, ri);
21767     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21768 
21769     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21770 
21771     if (! ri) {
21772         return;
21773     }
21774 
21775     DEBUG_COMPILE_r({
21776 	if (!PL_colorset)
21777 	    reginitcolors();
21778 	{
21779 	    SV *dsv= sv_newmortal();
21780             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21781                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21782             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21783                 PL_colors[4], PL_colors[5], s);
21784         }
21785     });
21786 
21787 #ifdef RE_TRACK_PATTERN_OFFSETS
21788     if (ri->u.offsets)
21789         Safefree(ri->u.offsets);             /* 20010421 MJD */
21790 #endif
21791     if (ri->code_blocks)
21792         S_free_codeblocks(aTHX_ ri->code_blocks);
21793 
21794     if (ri->data) {
21795 	int n = ri->data->count;
21796 
21797 	while (--n >= 0) {
21798           /* If you add a ->what type here, update the comment in regcomp.h */
21799 	    switch (ri->data->what[n]) {
21800 	    case 'a':
21801 	    case 'r':
21802 	    case 's':
21803 	    case 'S':
21804 	    case 'u':
21805 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21806 		break;
21807 	    case 'f':
21808 		Safefree(ri->data->data[n]);
21809 		break;
21810 	    case 'l':
21811 	    case 'L':
21812 	        break;
21813             case 'T':
21814                 { /* Aho Corasick add-on structure for a trie node.
21815                      Used in stclass optimization only */
21816                     U32 refcount;
21817                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21818 #ifdef USE_ITHREADS
21819                     dVAR;
21820 #endif
21821                     OP_REFCNT_LOCK;
21822                     refcount = --aho->refcount;
21823                     OP_REFCNT_UNLOCK;
21824                     if ( !refcount ) {
21825                         PerlMemShared_free(aho->states);
21826                         PerlMemShared_free(aho->fail);
21827 			 /* do this last!!!! */
21828                         PerlMemShared_free(ri->data->data[n]);
21829                         /* we should only ever get called once, so
21830                          * assert as much, and also guard the free
21831                          * which /might/ happen twice. At the least
21832                          * it will make code anlyzers happy and it
21833                          * doesn't cost much. - Yves */
21834                         assert(ri->regstclass);
21835                         if (ri->regstclass) {
21836                             PerlMemShared_free(ri->regstclass);
21837                             ri->regstclass = 0;
21838                         }
21839                     }
21840                 }
21841                 break;
21842 	    case 't':
21843 	        {
21844 	            /* trie structure. */
21845 	            U32 refcount;
21846 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21847 #ifdef USE_ITHREADS
21848                     dVAR;
21849 #endif
21850                     OP_REFCNT_LOCK;
21851                     refcount = --trie->refcount;
21852                     OP_REFCNT_UNLOCK;
21853                     if ( !refcount ) {
21854                         PerlMemShared_free(trie->charmap);
21855                         PerlMemShared_free(trie->states);
21856                         PerlMemShared_free(trie->trans);
21857                         if (trie->bitmap)
21858                             PerlMemShared_free(trie->bitmap);
21859                         if (trie->jump)
21860                             PerlMemShared_free(trie->jump);
21861 			PerlMemShared_free(trie->wordinfo);
21862                         /* do this last!!!! */
21863                         PerlMemShared_free(ri->data->data[n]);
21864 		    }
21865 		}
21866 		break;
21867 	    default:
21868 		Perl_croak(aTHX_ "panic: regfree data code '%c'",
21869                                                     ri->data->what[n]);
21870 	    }
21871 	}
21872 	Safefree(ri->data->what);
21873 	Safefree(ri->data);
21874     }
21875 
21876     Safefree(ri);
21877 }
21878 
21879 #define av_dup_inc(s, t)	MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21880 #define hv_dup_inc(s, t)	MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21881 #define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
21882 
21883 /*
21884    re_dup_guts - duplicate a regexp.
21885 
21886    This routine is expected to clone a given regexp structure. It is only
21887    compiled under USE_ITHREADS.
21888 
21889    After all of the core data stored in struct regexp is duplicated
21890    the regexp_engine.dupe method is used to copy any private data
21891    stored in the *pprivate pointer. This allows extensions to handle
21892    any duplication it needs to do.
21893 
21894    See pregfree() and regfree_internal() if you change anything here.
21895 */
21896 #if defined(USE_ITHREADS)
21897 #ifndef PERL_IN_XSUB_RE
21898 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)21899 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21900 {
21901     dVAR;
21902     I32 npar;
21903     const struct regexp *r = ReANY(sstr);
21904     struct regexp *ret = ReANY(dstr);
21905 
21906     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21907 
21908     npar = r->nparens+1;
21909     Newx(ret->offs, npar, regexp_paren_pair);
21910     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21911 
21912     if (ret->substrs) {
21913 	/* Do it this way to avoid reading from *r after the StructCopy().
21914 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
21915 	   cache, it doesn't matter.  */
21916         int i;
21917 	const bool anchored = r->check_substr
21918 	    ? r->check_substr == r->substrs->data[0].substr
21919 	    : r->check_utf8   == r->substrs->data[0].utf8_substr;
21920         Newx(ret->substrs, 1, struct reg_substr_data);
21921 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21922 
21923         for (i = 0; i < 2; i++) {
21924             ret->substrs->data[i].substr =
21925                         sv_dup_inc(ret->substrs->data[i].substr, param);
21926             ret->substrs->data[i].utf8_substr =
21927                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21928         }
21929 
21930 	/* check_substr and check_utf8, if non-NULL, point to either their
21931 	   anchored or float namesakes, and don't hold a second reference.  */
21932 
21933 	if (ret->check_substr) {
21934 	    if (anchored) {
21935 		assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21936 
21937 		ret->check_substr = ret->substrs->data[0].substr;
21938 		ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21939 	    } else {
21940 		assert(r->check_substr == r->substrs->data[1].substr);
21941 		assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21942 
21943 		ret->check_substr = ret->substrs->data[1].substr;
21944 		ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21945 	    }
21946 	} else if (ret->check_utf8) {
21947 	    if (anchored) {
21948 		ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21949 	    } else {
21950 		ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21951 	    }
21952 	}
21953     }
21954 
21955     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21956     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21957     if (r->recurse_locinput)
21958         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21959 
21960     if (ret->pprivate)
21961 	RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21962 
21963     if (RX_MATCH_COPIED(dstr))
21964 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21965     else
21966 	ret->subbeg = NULL;
21967 #ifdef PERL_ANY_COW
21968     ret->saved_copy = NULL;
21969 #endif
21970 
21971     /* Whether mother_re be set or no, we need to copy the string.  We
21972        cannot refrain from copying it when the storage points directly to
21973        our mother regexp, because that's
21974 	       1: a buffer in a different thread
21975 	       2: something we no longer hold a reference on
21976 	       so we need to copy it locally.  */
21977     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21978     /* set malloced length to a non-zero value so it will be freed
21979      * (otherwise in combination with SVf_FAKE it looks like an alien
21980      * buffer). It doesn't have to be the actual malloced size, since it
21981      * should never be grown */
21982     SvLEN_set(dstr, SvCUR(sstr)+1);
21983     ret->mother_re   = NULL;
21984 }
21985 #endif /* PERL_IN_XSUB_RE */
21986 
21987 /*
21988    regdupe_internal()
21989 
21990    This is the internal complement to regdupe() which is used to copy
21991    the structure pointed to by the *pprivate pointer in the regexp.
21992    This is the core version of the extension overridable cloning hook.
21993    The regexp structure being duplicated will be copied by perl prior
21994    to this and will be provided as the regexp *r argument, however
21995    with the /old/ structures pprivate pointer value. Thus this routine
21996    may override any copying normally done by perl.
21997 
21998    It returns a pointer to the new regexp_internal structure.
21999 */
22000 
22001 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)22002 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
22003 {
22004     dVAR;
22005     struct regexp *const r = ReANY(rx);
22006     regexp_internal *reti;
22007     int len;
22008     RXi_GET_DECL(r, ri);
22009 
22010     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
22011 
22012     len = ProgLen(ri);
22013 
22014     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22015           char, regexp_internal);
22016     Copy(ri->program, reti->program, len+1, regnode);
22017 
22018 
22019     if (ri->code_blocks) {
22020 	int n;
22021 	Newx(reti->code_blocks, 1, struct reg_code_blocks);
22022 	Newx(reti->code_blocks->cb, ri->code_blocks->count,
22023                     struct reg_code_block);
22024 	Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22025              ri->code_blocks->count, struct reg_code_block);
22026 	for (n = 0; n < ri->code_blocks->count; n++)
22027 	     reti->code_blocks->cb[n].src_regex = (REGEXP*)
22028 		    sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22029         reti->code_blocks->count = ri->code_blocks->count;
22030         reti->code_blocks->refcnt = 1;
22031     }
22032     else
22033 	reti->code_blocks = NULL;
22034 
22035     reti->regstclass = NULL;
22036 
22037     if (ri->data) {
22038 	struct reg_data *d;
22039         const int count = ri->data->count;
22040 	int i;
22041 
22042 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22043 		char, struct reg_data);
22044 	Newx(d->what, count, U8);
22045 
22046 	d->count = count;
22047 	for (i = 0; i < count; i++) {
22048 	    d->what[i] = ri->data->what[i];
22049 	    switch (d->what[i]) {
22050 	        /* see also regcomp.h and regfree_internal() */
22051             case 'a': /* actually an AV, but the dup function is identical.
22052                          values seem to be "plain sv's" generally. */
22053             case 'r': /* a compiled regex (but still just another SV) */
22054             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22055                          this use case should go away, the code could have used
22056                          'a' instead - see S_set_ANYOF_arg() for array contents. */
22057             case 'S': /* actually an SV, but the dup function is identical.  */
22058             case 'u': /* actually an HV, but the dup function is identical.
22059                          values are "plain sv's" */
22060 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22061 		break;
22062 	    case 'f':
22063                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22064                  * patterns which could start with several different things. Pre-TRIE
22065                  * this was more important than it is now, however this still helps
22066                  * in some places, for instance /x?a+/ might produce a SSC equivalent
22067                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22068                  * in regexec.c
22069                  */
22070 		/* This is cheating. */
22071 		Newx(d->data[i], 1, regnode_ssc);
22072 		StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22073 		reti->regstclass = (regnode*)d->data[i];
22074 		break;
22075 	    case 'T':
22076                 /* AHO-CORASICK fail table */
22077                 /* Trie stclasses are readonly and can thus be shared
22078 		 * without duplication. We free the stclass in pregfree
22079 		 * when the corresponding reg_ac_data struct is freed.
22080 		 */
22081 		reti->regstclass= ri->regstclass;
22082 		/* FALLTHROUGH */
22083 	    case 't':
22084                 /* TRIE transition table */
22085 		OP_REFCNT_LOCK;
22086 		((reg_trie_data*)ri->data->data[i])->refcount++;
22087 		OP_REFCNT_UNLOCK;
22088 		/* FALLTHROUGH */
22089             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22090             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22091                          is not from another regexp */
22092 		d->data[i] = ri->data->data[i];
22093 		break;
22094             default:
22095                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22096                                                            ri->data->what[i]);
22097 	    }
22098 	}
22099 
22100 	reti->data = d;
22101     }
22102     else
22103 	reti->data = NULL;
22104 
22105     reti->name_list_idx = ri->name_list_idx;
22106 
22107 #ifdef RE_TRACK_PATTERN_OFFSETS
22108     if (ri->u.offsets) {
22109         Newx(reti->u.offsets, 2*len+1, U32);
22110         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22111     }
22112 #else
22113     SetProgLen(reti, len);
22114 #endif
22115 
22116     return (void*)reti;
22117 }
22118 
22119 #endif    /* USE_ITHREADS */
22120 
22121 #ifndef PERL_IN_XSUB_RE
22122 
22123 /*
22124  - regnext - dig the "next" pointer out of a node
22125  */
22126 regnode *
Perl_regnext(pTHX_ regnode * p)22127 Perl_regnext(pTHX_ regnode *p)
22128 {
22129     I32 offset;
22130 
22131     if (!p)
22132 	return(NULL);
22133 
22134     if (OP(p) > REGNODE_MAX) {		/* regnode.type is unsigned */
22135 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22136                                                 (int)OP(p), (int)REGNODE_MAX);
22137     }
22138 
22139     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22140     if (offset == 0)
22141 	return(NULL);
22142 
22143     return(p+offset);
22144 }
22145 
22146 #endif
22147 
22148 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)22149 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22150 {
22151     va_list args;
22152     STRLEN len = strlen(pat);
22153     char buf[512];
22154     SV *msv;
22155     const char *message;
22156 
22157     PERL_ARGS_ASSERT_RE_CROAK;
22158 
22159     if (len > 510)
22160 	len = 510;
22161     Copy(pat, buf, len , char);
22162     buf[len] = '\n';
22163     buf[len + 1] = '\0';
22164     va_start(args, pat);
22165     msv = vmess(buf, &args);
22166     va_end(args);
22167     message = SvPV_const(msv, len);
22168     if (len > 512)
22169 	len = 512;
22170     Copy(message, buf, len , char);
22171     /* len-1 to avoid \n */
22172     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22173 }
22174 
22175 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
22176 
22177 #ifndef PERL_IN_XSUB_RE
22178 void
Perl_save_re_context(pTHX)22179 Perl_save_re_context(pTHX)
22180 {
22181     I32 nparens = -1;
22182     I32 i;
22183 
22184     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22185 
22186     if (PL_curpm) {
22187 	const REGEXP * const rx = PM_GETRE(PL_curpm);
22188 	if (rx)
22189             nparens = RX_NPARENS(rx);
22190     }
22191 
22192     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22193      * that PL_curpm will be null, but that utf8.pm and the modules it
22194      * loads will only use $1..$3.
22195      * The t/porting/re_context.t test file checks this assumption.
22196      */
22197     if (nparens == -1)
22198         nparens = 3;
22199 
22200     for (i = 1; i <= nparens; i++) {
22201         char digits[TYPE_CHARS(long)];
22202         const STRLEN len = my_snprintf(digits, sizeof(digits),
22203                                        "%lu", (long)i);
22204         GV *const *const gvp
22205             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22206 
22207         if (gvp) {
22208             GV * const gv = *gvp;
22209             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22210                 save_scalar(gv);
22211         }
22212     }
22213 }
22214 #endif
22215 
22216 #ifdef DEBUGGING
22217 
22218 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)22219 S_put_code_point(pTHX_ SV *sv, UV c)
22220 {
22221     PERL_ARGS_ASSERT_PUT_CODE_POINT;
22222 
22223     if (c > 255) {
22224         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22225     }
22226     else if (isPRINT(c)) {
22227 	const char string = (char) c;
22228 
22229         /* We use {phrase} as metanotation in the class, so also escape literal
22230          * braces */
22231 	if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22232 	    sv_catpvs(sv, "\\");
22233 	sv_catpvn(sv, &string, 1);
22234     }
22235     else if (isMNEMONIC_CNTRL(c)) {
22236         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22237     }
22238     else {
22239         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22240     }
22241 }
22242 
22243 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22244 
22245 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)22246 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22247 {
22248     /* Appends to 'sv' a displayable version of the range of code points from
22249      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
22250      * that have them, when they occur at the beginning or end of the range.
22251      * It uses hex to output the remaining code points, unless 'allow_literals'
22252      * is true, in which case the printable ASCII ones are output as-is (though
22253      * some of these will be escaped by put_code_point()).
22254      *
22255      * NOTE:  This is designed only for printing ranges of code points that fit
22256      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
22257      */
22258 
22259     const unsigned int min_range_count = 3;
22260 
22261     assert(start <= end);
22262 
22263     PERL_ARGS_ASSERT_PUT_RANGE;
22264 
22265     while (start <= end) {
22266         UV this_end;
22267         const char * format;
22268 
22269         if (end - start < min_range_count) {
22270 
22271             /* Output chars individually when they occur in short ranges */
22272             for (; start <= end; start++) {
22273                 put_code_point(sv, start);
22274             }
22275             break;
22276         }
22277 
22278         /* If permitted by the input options, and there is a possibility that
22279          * this range contains a printable literal, look to see if there is
22280          * one. */
22281         if (allow_literals && start <= MAX_PRINT_A) {
22282 
22283             /* If the character at the beginning of the range isn't an ASCII
22284              * printable, effectively split the range into two parts:
22285              *  1) the portion before the first such printable,
22286              *  2) the rest
22287              * and output them separately. */
22288             if (! isPRINT_A(start)) {
22289                 UV temp_end = start + 1;
22290 
22291                 /* There is no point looking beyond the final possible
22292                  * printable, in MAX_PRINT_A */
22293                 UV max = MIN(end, MAX_PRINT_A);
22294 
22295                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22296                     temp_end++;
22297                 }
22298 
22299                 /* Here, temp_end points to one beyond the first printable if
22300                  * found, or to one beyond 'max' if not.  If none found, make
22301                  * sure that we use the entire range */
22302                 if (temp_end > MAX_PRINT_A) {
22303                     temp_end = end + 1;
22304                 }
22305 
22306                 /* Output the first part of the split range: the part that
22307                  * doesn't have printables, with the parameter set to not look
22308                  * for literals (otherwise we would infinitely recurse) */
22309                 put_range(sv, start, temp_end - 1, FALSE);
22310 
22311                 /* The 2nd part of the range (if any) starts here. */
22312                 start = temp_end;
22313 
22314                 /* We do a continue, instead of dropping down, because even if
22315                  * the 2nd part is non-empty, it could be so short that we want
22316                  * to output it as individual characters, as tested for at the
22317                  * top of this loop.  */
22318                 continue;
22319             }
22320 
22321             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
22322              * output a sub-range of just the digits or letters, then process
22323              * the remaining portion as usual. */
22324             if (isALPHANUMERIC_A(start)) {
22325                 UV mask = (isDIGIT_A(start))
22326                            ? _CC_DIGIT
22327                              : isUPPER_A(start)
22328                                ? _CC_UPPER
22329                                : _CC_LOWER;
22330                 UV temp_end = start + 1;
22331 
22332                 /* Find the end of the sub-range that includes just the
22333                  * characters in the same class as the first character in it */
22334                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22335                     temp_end++;
22336                 }
22337                 temp_end--;
22338 
22339                 /* For short ranges, don't duplicate the code above to output
22340                  * them; just call recursively */
22341                 if (temp_end - start < min_range_count) {
22342                     put_range(sv, start, temp_end, FALSE);
22343                 }
22344                 else {  /* Output as a range */
22345                     put_code_point(sv, start);
22346                     sv_catpvs(sv, "-");
22347                     put_code_point(sv, temp_end);
22348                 }
22349                 start = temp_end + 1;
22350                 continue;
22351             }
22352 
22353             /* We output any other printables as individual characters */
22354             if (isPUNCT_A(start) || isSPACE_A(start)) {
22355                 while (start <= end && (isPUNCT_A(start)
22356                                         || isSPACE_A(start)))
22357                 {
22358                     put_code_point(sv, start);
22359                     start++;
22360                 }
22361                 continue;
22362             }
22363         } /* End of looking for literals */
22364 
22365         /* Here is not to output as a literal.  Some control characters have
22366          * mnemonic names.  Split off any of those at the beginning and end of
22367          * the range to print mnemonically.  It isn't possible for many of
22368          * these to be in a row, so this won't overwhelm with output */
22369         if (   start <= end
22370             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22371         {
22372             while (isMNEMONIC_CNTRL(start) && start <= end) {
22373                 put_code_point(sv, start);
22374                 start++;
22375             }
22376 
22377             /* If this didn't take care of the whole range ... */
22378             if (start <= end) {
22379 
22380                 /* Look backwards from the end to find the final non-mnemonic
22381                  * */
22382                 UV temp_end = end;
22383                 while (isMNEMONIC_CNTRL(temp_end)) {
22384                     temp_end--;
22385                 }
22386 
22387                 /* And separately output the interior range that doesn't start
22388                  * or end with mnemonics */
22389                 put_range(sv, start, temp_end, FALSE);
22390 
22391                 /* Then output the mnemonic trailing controls */
22392                 start = temp_end + 1;
22393                 while (start <= end) {
22394                     put_code_point(sv, start);
22395                     start++;
22396                 }
22397                 break;
22398             }
22399         }
22400 
22401         /* As a final resort, output the range or subrange as hex. */
22402 
22403         if (start >= NUM_ANYOF_CODE_POINTS) {
22404             this_end = end;
22405         }
22406         else {  /* Have to split range at the bitmap boundary */
22407             this_end = (end < NUM_ANYOF_CODE_POINTS)
22408                         ? end
22409                         : NUM_ANYOF_CODE_POINTS - 1;
22410         }
22411 #if NUM_ANYOF_CODE_POINTS > 256
22412         format = (this_end < 256)
22413                  ? "\\x%02" UVXf "-\\x%02" UVXf
22414                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22415 #else
22416         format = "\\x%02" UVXf "-\\x%02" UVXf;
22417 #endif
22418         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22419         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22420         GCC_DIAG_RESTORE_STMT;
22421         break;
22422     }
22423 }
22424 
22425 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)22426 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22427 {
22428     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22429      * 'invlist' */
22430 
22431     UV start, end;
22432     bool allow_literals = TRUE;
22433 
22434     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22435 
22436     /* Generally, it is more readable if printable characters are output as
22437      * literals, but if a range (nearly) spans all of them, it's best to output
22438      * it as a single range.  This code will use a single range if all but 2
22439      * ASCII printables are in it */
22440     invlist_iterinit(invlist);
22441     while (invlist_iternext(invlist, &start, &end)) {
22442 
22443         /* If the range starts beyond the final printable, it doesn't have any
22444          * in it */
22445         if (start > MAX_PRINT_A) {
22446             break;
22447         }
22448 
22449         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22450          * all but two, the range must start and end no later than 2 from
22451          * either end */
22452         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22453             if (end > MAX_PRINT_A) {
22454                 end = MAX_PRINT_A;
22455             }
22456             if (start < ' ') {
22457                 start = ' ';
22458             }
22459             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22460                 allow_literals = FALSE;
22461             }
22462             break;
22463         }
22464     }
22465     invlist_iterfinish(invlist);
22466 
22467     /* Here we have figured things out.  Output each range */
22468     invlist_iterinit(invlist);
22469     while (invlist_iternext(invlist, &start, &end)) {
22470         if (start >= NUM_ANYOF_CODE_POINTS) {
22471             break;
22472         }
22473         put_range(sv, start, end, allow_literals);
22474     }
22475     invlist_iterfinish(invlist);
22476 
22477     return;
22478 }
22479 
22480 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)22481 S_put_charclass_bitmap_innards_common(pTHX_
22482         SV* invlist,            /* The bitmap */
22483         SV* posixes,            /* Under /l, things like [:word:], \S */
22484         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22485         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22486         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22487         const bool invert       /* Is the result to be inverted? */
22488 )
22489 {
22490     /* Create and return an SV containing a displayable version of the bitmap
22491      * and associated information determined by the input parameters.  If the
22492      * output would have been only the inversion indicator '^', NULL is instead
22493      * returned. */
22494 
22495     dVAR;
22496     SV * output;
22497 
22498     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22499 
22500     if (invert) {
22501         output = newSVpvs("^");
22502     }
22503     else {
22504         output = newSVpvs("");
22505     }
22506 
22507     /* First, the code points in the bitmap that are unconditionally there */
22508     put_charclass_bitmap_innards_invlist(output, invlist);
22509 
22510     /* Traditionally, these have been placed after the main code points */
22511     if (posixes) {
22512         sv_catsv(output, posixes);
22513     }
22514 
22515     if (only_utf8 && _invlist_len(only_utf8)) {
22516         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22517         put_charclass_bitmap_innards_invlist(output, only_utf8);
22518     }
22519 
22520     if (not_utf8 && _invlist_len(not_utf8)) {
22521         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22522         put_charclass_bitmap_innards_invlist(output, not_utf8);
22523     }
22524 
22525     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22526         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22527         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22528 
22529         /* This is the only list in this routine that can legally contain code
22530          * points outside the bitmap range.  The call just above to
22531          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22532          * output them here.  There's about a half-dozen possible, and none in
22533          * contiguous ranges longer than 2 */
22534         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22535             UV start, end;
22536             SV* above_bitmap = NULL;
22537 
22538             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22539 
22540             invlist_iterinit(above_bitmap);
22541             while (invlist_iternext(above_bitmap, &start, &end)) {
22542                 UV i;
22543 
22544                 for (i = start; i <= end; i++) {
22545                     put_code_point(output, i);
22546                 }
22547             }
22548             invlist_iterfinish(above_bitmap);
22549             SvREFCNT_dec_NN(above_bitmap);
22550         }
22551     }
22552 
22553     if (invert && SvCUR(output) == 1) {
22554         return NULL;
22555     }
22556 
22557     return output;
22558 }
22559 
22560 STATIC bool
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const U8 flags,const bool force_as_is_display)22561 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22562                                      char *bitmap,
22563                                      SV *nonbitmap_invlist,
22564                                      SV *only_utf8_locale_invlist,
22565                                      const regnode * const node,
22566                                      const U8 flags,
22567                                      const bool force_as_is_display)
22568 {
22569     /* Appends to 'sv' a displayable version of the innards of the bracketed
22570      * character class defined by the other arguments:
22571      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22572      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22573      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22574      *      none.  The reasons for this could be that they require some
22575      *      condition such as the target string being or not being in UTF-8
22576      *      (under /d), or because they came from a user-defined property that
22577      *      was not resolved at the time of the regex compilation (under /u)
22578      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22579      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22580      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22581      *      above two parameters are not null, and is passed so that this
22582      *      routine can tease apart the various reasons for them.
22583      *  'flags' is the flags field of 'node'
22584      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22585      *      to invert things to see if that leads to a cleaner display.  If
22586      *      FALSE, this routine is free to use its judgment about doing this.
22587      *
22588      * It returns TRUE if there was actually something output.  (It may be that
22589      * the bitmap, etc is empty.)
22590      *
22591      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22592      * bitmap, with the succeeding parameters set to NULL, and the final one to
22593      * FALSE.
22594      */
22595 
22596     /* In general, it tries to display the 'cleanest' representation of the
22597      * innards, choosing whether to display them inverted or not, regardless of
22598      * whether the class itself is to be inverted.  However,  there are some
22599      * cases where it can't try inverting, as what actually matches isn't known
22600      * until runtime, and hence the inversion isn't either. */
22601 
22602     dVAR;
22603     bool inverting_allowed = ! force_as_is_display;
22604 
22605     int i;
22606     STRLEN orig_sv_cur = SvCUR(sv);
22607 
22608     SV* invlist;            /* Inversion list we accumulate of code points that
22609                                are unconditionally matched */
22610     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22611                                UTF-8 */
22612     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22613                              */
22614     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22615     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22616                                        is UTF-8 */
22617 
22618     SV* as_is_display;      /* The output string when we take the inputs
22619                                literally */
22620     SV* inverted_display;   /* The output string when we invert the inputs */
22621 
22622     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22623                                                    to match? */
22624     /* We are biased in favor of displaying things without them being inverted,
22625      * as that is generally easier to understand */
22626     const int bias = 5;
22627 
22628     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22629 
22630     /* Start off with whatever code points are passed in.  (We clone, so we
22631      * don't change the caller's list) */
22632     if (nonbitmap_invlist) {
22633         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22634         invlist = invlist_clone(nonbitmap_invlist, NULL);
22635     }
22636     else {  /* Worst case size is every other code point is matched */
22637         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22638     }
22639 
22640     if (flags) {
22641         if (OP(node) == ANYOFD) {
22642 
22643             /* This flag indicates that the code points below 0x100 in the
22644              * nonbitmap list are precisely the ones that match only when the
22645              * target is UTF-8 (they should all be non-ASCII). */
22646             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22647             {
22648                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22649                 _invlist_subtract(invlist, only_utf8, &invlist);
22650             }
22651 
22652             /* And this flag for matching all non-ASCII 0xFF and below */
22653             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22654             {
22655                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22656             }
22657         }
22658         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22659 
22660             /* If either of these flags are set, what matches isn't
22661              * determinable except during execution, so don't know enough here
22662              * to invert */
22663             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22664                 inverting_allowed = FALSE;
22665             }
22666 
22667             /* What the posix classes match also varies at runtime, so these
22668              * will be output symbolically. */
22669             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22670                 int i;
22671 
22672                 posixes = newSVpvs("");
22673                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22674                     if (ANYOF_POSIXL_TEST(node, i)) {
22675                         sv_catpv(posixes, anyofs[i]);
22676                     }
22677                 }
22678             }
22679         }
22680     }
22681 
22682     /* Accumulate the bit map into the unconditional match list */
22683     if (bitmap) {
22684         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22685             if (BITMAP_TEST(bitmap, i)) {
22686                 int start = i++;
22687                 for (;
22688                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22689                      i++)
22690                 { /* empty */ }
22691                 invlist = _add_range_to_invlist(invlist, start, i-1);
22692             }
22693         }
22694     }
22695 
22696     /* Make sure that the conditional match lists don't have anything in them
22697      * that match unconditionally; otherwise the output is quite confusing.
22698      * This could happen if the code that populates these misses some
22699      * duplication. */
22700     if (only_utf8) {
22701         _invlist_subtract(only_utf8, invlist, &only_utf8);
22702     }
22703     if (not_utf8) {
22704         _invlist_subtract(not_utf8, invlist, &not_utf8);
22705     }
22706 
22707     if (only_utf8_locale_invlist) {
22708 
22709         /* Since this list is passed in, we have to make a copy before
22710          * modifying it */
22711         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22712 
22713         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22714 
22715         /* And, it can get really weird for us to try outputting an inverted
22716          * form of this list when it has things above the bitmap, so don't even
22717          * try */
22718         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22719             inverting_allowed = FALSE;
22720         }
22721     }
22722 
22723     /* Calculate what the output would be if we take the input as-is */
22724     as_is_display = put_charclass_bitmap_innards_common(invlist,
22725                                                     posixes,
22726                                                     only_utf8,
22727                                                     not_utf8,
22728                                                     only_utf8_locale,
22729                                                     invert);
22730 
22731     /* If have to take the output as-is, just do that */
22732     if (! inverting_allowed) {
22733         if (as_is_display) {
22734             sv_catsv(sv, as_is_display);
22735             SvREFCNT_dec_NN(as_is_display);
22736         }
22737     }
22738     else { /* But otherwise, create the output again on the inverted input, and
22739               use whichever version is shorter */
22740 
22741         int inverted_bias, as_is_bias;
22742 
22743         /* We will apply our bias to whichever of the results doesn't have
22744          * the '^' */
22745         if (invert) {
22746             invert = FALSE;
22747             as_is_bias = bias;
22748             inverted_bias = 0;
22749         }
22750         else {
22751             invert = TRUE;
22752             as_is_bias = 0;
22753             inverted_bias = bias;
22754         }
22755 
22756         /* Now invert each of the lists that contribute to the output,
22757          * excluding from the result things outside the possible range */
22758 
22759         /* For the unconditional inversion list, we have to add in all the
22760          * conditional code points, so that when inverted, they will be gone
22761          * from it */
22762         _invlist_union(only_utf8, invlist, &invlist);
22763         _invlist_union(not_utf8, invlist, &invlist);
22764         _invlist_union(only_utf8_locale, invlist, &invlist);
22765         _invlist_invert(invlist);
22766         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22767 
22768         if (only_utf8) {
22769             _invlist_invert(only_utf8);
22770             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22771         }
22772         else if (not_utf8) {
22773 
22774             /* If a code point matches iff the target string is not in UTF-8,
22775              * then complementing the result has it not match iff not in UTF-8,
22776              * which is the same thing as matching iff it is UTF-8. */
22777             only_utf8 = not_utf8;
22778             not_utf8 = NULL;
22779         }
22780 
22781         if (only_utf8_locale) {
22782             _invlist_invert(only_utf8_locale);
22783             _invlist_intersection(only_utf8_locale,
22784                                   PL_InBitmap,
22785                                   &only_utf8_locale);
22786         }
22787 
22788         inverted_display = put_charclass_bitmap_innards_common(
22789                                             invlist,
22790                                             posixes,
22791                                             only_utf8,
22792                                             not_utf8,
22793                                             only_utf8_locale, invert);
22794 
22795         /* Use the shortest representation, taking into account our bias
22796          * against showing it inverted */
22797         if (   inverted_display
22798             && (   ! as_is_display
22799                 || (  SvCUR(inverted_display) + inverted_bias
22800                     < SvCUR(as_is_display)    + as_is_bias)))
22801         {
22802 	    sv_catsv(sv, inverted_display);
22803         }
22804         else if (as_is_display) {
22805 	    sv_catsv(sv, as_is_display);
22806         }
22807 
22808         SvREFCNT_dec(as_is_display);
22809         SvREFCNT_dec(inverted_display);
22810     }
22811 
22812     SvREFCNT_dec_NN(invlist);
22813     SvREFCNT_dec(only_utf8);
22814     SvREFCNT_dec(not_utf8);
22815     SvREFCNT_dec(posixes);
22816     SvREFCNT_dec(only_utf8_locale);
22817 
22818     return SvCUR(sv) > orig_sv_cur;
22819 }
22820 
22821 #define CLEAR_OPTSTART                                                       \
22822     if (optstart) STMT_START {                                               \
22823         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22824                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22825         optstart=NULL;                                                       \
22826     } STMT_END
22827 
22828 #define DUMPUNTIL(b,e)                                                       \
22829                     CLEAR_OPTSTART;                                          \
22830                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22831 
22832 STATIC const regnode *
S_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)22833 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22834 	    const regnode *last, const regnode *plast,
22835 	    SV* sv, I32 indent, U32 depth)
22836 {
22837     U8 op = PSEUDO;	/* Arbitrary non-END op. */
22838     const regnode *next;
22839     const regnode *optstart= NULL;
22840 
22841     RXi_GET_DECL(r, ri);
22842     DECLARE_AND_GET_RE_DEBUG_FLAGS;
22843 
22844     PERL_ARGS_ASSERT_DUMPUNTIL;
22845 
22846 #ifdef DEBUG_DUMPUNTIL
22847     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22848         last ? last-start : 0, plast ? plast-start : 0);
22849 #endif
22850 
22851     if (plast && plast < last)
22852         last= plast;
22853 
22854     while (PL_regkind[op] != END && (!last || node < last)) {
22855         assert(node);
22856 	/* While that wasn't END last time... */
22857 	NODE_ALIGN(node);
22858 	op = OP(node);
22859 	if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22860 	    indent--;
22861 	next = regnext((regnode *)node);
22862 
22863 	/* Where, what. */
22864 	if (OP(node) == OPTIMIZED) {
22865 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22866 	        optstart = node;
22867 	    else
22868 		goto after_print;
22869 	} else
22870 	    CLEAR_OPTSTART;
22871 
22872         regprop(r, sv, node, NULL, NULL);
22873         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22874 		      (int)(2*indent + 1), "", SvPVX_const(sv));
22875 
22876         if (OP(node) != OPTIMIZED) {
22877             if (next == NULL)		/* Next ptr. */
22878                 Perl_re_printf( aTHX_  " (0)");
22879             else if (PL_regkind[(U8)op] == BRANCH
22880                      && PL_regkind[OP(next)] != BRANCH )
22881                 Perl_re_printf( aTHX_  " (FAIL)");
22882             else
22883                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22884             Perl_re_printf( aTHX_ "\n");
22885         }
22886 
22887       after_print:
22888 	if (PL_regkind[(U8)op] == BRANCHJ) {
22889 	    assert(next);
22890 	    {
22891                 const regnode *nnode = (OP(next) == LONGJMP
22892                                        ? regnext((regnode *)next)
22893                                        : next);
22894                 if (last && nnode > last)
22895                     nnode = last;
22896                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22897 	    }
22898 	}
22899 	else if (PL_regkind[(U8)op] == BRANCH) {
22900 	    assert(next);
22901 	    DUMPUNTIL(NEXTOPER(node), next);
22902 	}
22903 	else if ( PL_regkind[(U8)op]  == TRIE ) {
22904 	    const regnode *this_trie = node;
22905 	    const char op = OP(node);
22906             const U32 n = ARG(node);
22907 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
22908                (reg_ac_data *)ri->data->data[n] :
22909                NULL;
22910 	    const reg_trie_data * const trie =
22911 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22912 #ifdef DEBUGGING
22913 	    AV *const trie_words
22914                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22915 #endif
22916 	    const regnode *nextbranch= NULL;
22917 	    I32 word_idx;
22918             SvPVCLEAR(sv);
22919 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22920 		SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22921 
22922                 Perl_re_indentf( aTHX_  "%s ",
22923                     indent+3,
22924                     elem_ptr
22925                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22926                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22927                                 PL_colors[0], PL_colors[1],
22928                                 (SvUTF8(*elem_ptr)
22929                                  ? PERL_PV_ESCAPE_UNI
22930                                  : 0)
22931                                 | PERL_PV_PRETTY_ELLIPSES
22932                                 | PERL_PV_PRETTY_LTGT
22933                             )
22934                     : "???"
22935                 );
22936                 if (trie->jump) {
22937                     U16 dist= trie->jump[word_idx+1];
22938                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22939                                (UV)((dist ? this_trie + dist : next) - start));
22940                     if (dist) {
22941                         if (!nextbranch)
22942                             nextbranch= this_trie + trie->jump[0];
22943 			DUMPUNTIL(this_trie + dist, nextbranch);
22944                     }
22945                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22946                         nextbranch= regnext((regnode *)nextbranch);
22947                 } else {
22948                     Perl_re_printf( aTHX_  "\n");
22949 		}
22950 	    }
22951 	    if (last && next > last)
22952 	        node= last;
22953 	    else
22954 	        node= next;
22955 	}
22956 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22957 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22958                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22959 	}
22960 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22961 	    assert(next);
22962 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22963 	}
22964 	else if ( op == PLUS || op == STAR) {
22965 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22966 	}
22967 	else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22968             /* Literal string, where present. */
22969 	    node += NODE_SZ_STR(node) - 1;
22970 	    node = NEXTOPER(node);
22971 	}
22972 	else {
22973 	    node = NEXTOPER(node);
22974 	    node += regarglen[(U8)op];
22975 	}
22976 	if (op == CURLYX || op == OPEN || op == SROPEN)
22977 	    indent++;
22978     }
22979     CLEAR_OPTSTART;
22980 #ifdef DEBUG_DUMPUNTIL
22981     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22982 #endif
22983     return node;
22984 }
22985 
22986 #endif	/* DEBUGGING */
22987 
22988 #ifndef PERL_IN_XSUB_RE
22989 
22990 #  include "uni_keywords.h"
22991 
22992 void
Perl_init_uniprops(pTHX)22993 Perl_init_uniprops(pTHX)
22994 {
22995     dVAR;
22996 
22997 #  ifdef DEBUGGING
22998     char * dump_len_string;
22999 
23000     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
23001     if (   ! dump_len_string
23002         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
23003     {
23004         PL_dump_re_max_len = 60;    /* A reasonable default */
23005     }
23006 #  endif
23007 
23008     PL_user_def_props = newHV();
23009 
23010 #  ifdef USE_ITHREADS
23011 
23012     HvSHAREKEYS_off(PL_user_def_props);
23013     PL_user_def_props_aTHX = aTHX;
23014 
23015 #  endif
23016 
23017     /* Set up the inversion list interpreter-level variables */
23018 
23019     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23020     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23021     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23022     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23023     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23024     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23025     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23026     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23027     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23028     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23029     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23030     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23031     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23032     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23033     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23034     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23035 
23036     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23037     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23038     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23039     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23040     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23041     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23042     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23043     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23044     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23045     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23046     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23047     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23048     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23049     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23050     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23051     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23052 
23053     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23054     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23055     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23056     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23057     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23058 
23059     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23060     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23061     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23062     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23063 
23064     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23065 
23066     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23067     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23068 
23069     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23070     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23071 
23072     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23073     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23074                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23075     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23076                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23077     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23078     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23079     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23080     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23081     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23082     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23083     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23084     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23085     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23086 
23087 #  ifdef UNI_XIDC
23088     /* The below are used only by deprecated functions.  They could be removed */
23089     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23090     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23091     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23092 #  endif
23093 }
23094 
23095 /* These four functions are compiled only in regcomp.c, where they have access
23096  * to the data they return.  They are a way for re_comp.c to get access to that
23097  * data without having to compile the whole data structures. */
23098 
23099 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)23100 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23101 {
23102     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23103 
23104     return match_uniprop((U8 *) key, key_len);
23105 }
23106 
23107 SV *
Perl_get_prop_definition(pTHX_ const int table_index)23108 Perl_get_prop_definition(pTHX_ const int table_index)
23109 {
23110     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23111 
23112     /* Create and return the inversion list */
23113     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23114 }
23115 
23116 const char * const *
Perl_get_prop_values(const int table_index)23117 Perl_get_prop_values(const int table_index)
23118 {
23119     PERL_ARGS_ASSERT_GET_PROP_VALUES;
23120 
23121     return UNI_prop_value_ptrs[table_index];
23122 }
23123 
23124 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)23125 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23126 {
23127     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23128 
23129     return deprecated_property_msgs[warning_offset];
23130 }
23131 
23132 #  if 0
23133 
23134 This code was mainly added for backcompat to give a warning for non-portable
23135 code points in user-defined properties.  But experiments showed that the
23136 warning in earlier perls were only omitted on overflow, which should be an
23137 error, so there really isnt a backcompat issue, and actually adding the
23138 warning when none was present before might cause breakage, for little gain.  So
23139 khw left this code in, but not enabled.  Tests were never added.
23140 
23141 embed.fnc entry:
23142 Ei	|const char *|get_extended_utf8_msg|const UV cp
23143 
23144 PERL_STATIC_INLINE const char *
23145 S_get_extended_utf8_msg(pTHX_ const UV cp)
23146 {
23147     U8 dummy[UTF8_MAXBYTES + 1];
23148     HV *msgs;
23149     SV **msg;
23150 
23151     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23152                              &msgs);
23153 
23154     msg = hv_fetchs(msgs, "text", 0);
23155     assert(msg);
23156 
23157     (void) sv_2mortal((SV *) msgs);
23158 
23159     return SvPVX(*msg);
23160 }
23161 
23162 #  endif
23163 #endif /* end of ! PERL_IN_XSUB_RE */
23164 
23165 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)23166 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23167                          const bool ignore_case)
23168 {
23169     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23170      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
23171      * because nothing outside of ASCII will match.  Use /m because the input
23172      * string may be a bunch of lines strung together.
23173      *
23174      * Also sets up the debugging info */
23175 
23176     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23177     U32 rx_flags;
23178     SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23179     REGEXP * subpattern_re;
23180     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23181 
23182     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23183 
23184     if (ignore_case) {
23185         flags |= PMf_FOLD;
23186     }
23187     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23188 
23189     /* Like in op.c, we copy the compile time pm flags to the rx ones */
23190     rx_flags = flags & RXf_PMf_COMPILETIME;
23191 
23192 #ifndef PERL_IN_XSUB_RE
23193     /* Use the core engine if this file is regcomp.c.  That means no
23194      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23195     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23196                                              &PL_core_reg_engine,
23197                                              NULL, NULL,
23198                                              rx_flags, flags);
23199 #else
23200     if (isDEBUG_WILDCARD) {
23201         /* Use the special debugging engine if this file is re_comp.c and wants
23202          * to output the wildcard matching.  This uses whatever
23203          * 'use re "Debug ..." is in effect */
23204         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23205                                                  &my_reg_engine,
23206                                                  NULL, NULL,
23207                                                  rx_flags, flags);
23208     }
23209     else {
23210         /* Use the special wildcard engine if this file is re_comp.c and
23211          * doesn't want to output the wildcard matching.  This uses whatever
23212          * 'use re "Debug ..." is in effect for compilation, but this engine
23213          * structure has been set up so that it uses the core engine for
23214          * execution, so no execution debugging as a result of re.pm will be
23215          * displayed. */
23216         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23217                                                  &wild_reg_engine,
23218                                                  NULL, NULL,
23219                                                  rx_flags, flags);
23220         /* XXX The above has the effect that any user-supplied regex engine
23221          * won't be called for matching wildcards.  That might be good, or bad.
23222          * It could be changed in several ways.  The reason it is done the
23223          * current way is to avoid having to save and restore
23224          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
23225          * could be used.  Another suggestion is to keep the authoritative
23226          * value of the debug flags in a thread-local variable and add set/get
23227          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23228          * Still another is to pass a flag, say in the engine's intflags that
23229          * would be checked each time before doing the debug output */
23230     }
23231 #endif
23232 
23233     assert(subpattern_re);  /* Should have died if didn't compile successfully */
23234     return subpattern_re;
23235 }
23236 
23237 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)23238 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23239 	 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23240 {
23241     I32 result;
23242     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23243 
23244     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23245 
23246     ENTER;
23247 
23248     /* The compilation has set things up so that if the program doesn't want to
23249      * see the wildcard matching procedure, it will get the core execution
23250      * engine, which is subject only to -Dr.  So we have to turn that off
23251      * around this procedure */
23252     if (! isDEBUG_WILDCARD) {
23253         /* Note! Casts away 'volatile' */
23254         SAVEI32(PL_debug);
23255         PL_debug &= ~ DEBUG_r_FLAG;
23256     }
23257 
23258     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23259                          NULL, nosave);
23260     LEAVE;
23261 
23262     return result;
23263 }
23264 
23265 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)23266 S_handle_user_defined_property(pTHX_
23267 
23268     /* Parses the contents of a user-defined property definition; returning the
23269      * expanded definition if possible.  If so, the return is an inversion
23270      * list.
23271      *
23272      * If there are subroutines that are part of the expansion and which aren't
23273      * known at the time of the call to this function, this returns what
23274      * parse_uniprop_string() returned for the first one encountered.
23275      *
23276      * If an error was found, NULL is returned, and 'msg' gets a suitable
23277      * message appended to it.  (Appending allows the back trace of how we got
23278      * to the faulty definition to be displayed through nested calls of
23279      * user-defined subs.)
23280      *
23281      * The caller IS responsible for freeing any returned SV.
23282      *
23283      * The syntax of the contents is pretty much described in perlunicode.pod,
23284      * but we also allow comments on each line */
23285 
23286     const char * name,          /* Name of property */
23287     const STRLEN name_len,      /* The name's length in bytes */
23288     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23289     const bool to_fold,         /* ? Is this under /i */
23290     const bool runtime,         /* ? Are we in compile- or run-time */
23291     const bool deferrable,      /* Is it ok for this property's full definition
23292                                    to be deferred until later? */
23293     SV* contents,               /* The property's definition */
23294     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
23295                                    getting called unless this is thought to be
23296                                    a user-defined property */
23297     SV * msg,                   /* Any error or warning msg(s) are appended to
23298                                    this */
23299     const STRLEN level)         /* Recursion level of this call */
23300 {
23301     STRLEN len;
23302     const char * string         = SvPV_const(contents, len);
23303     const char * const e        = string + len;
23304     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23305     const STRLEN msgs_length_on_entry = SvCUR(msg);
23306 
23307     const char * s0 = string;   /* Points to first byte in the current line
23308                                    being parsed in 'string' */
23309     const char overflow_msg[] = "Code point too large in \"";
23310     SV* running_definition = NULL;
23311 
23312     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23313 
23314     *user_defined_ptr = TRUE;
23315 
23316     /* Look at each line */
23317     while (s0 < e) {
23318         const char * s;     /* Current byte */
23319         char op = '+';      /* Default operation is 'union' */
23320         IV   min = 0;       /* range begin code point */
23321         IV   max = -1;      /* and range end */
23322         SV* this_definition;
23323 
23324         /* Skip comment lines */
23325         if (*s0 == '#') {
23326             s0 = strchr(s0, '\n');
23327             if (s0 == NULL) {
23328                 break;
23329             }
23330             s0++;
23331             continue;
23332         }
23333 
23334         /* For backcompat, allow an empty first line */
23335         if (*s0 == '\n') {
23336             s0++;
23337             continue;
23338         }
23339 
23340         /* First character in the line may optionally be the operation */
23341         if (   *s0 == '+'
23342             || *s0 == '!'
23343             || *s0 == '-'
23344             || *s0 == '&')
23345         {
23346             op = *s0++;
23347         }
23348 
23349         /* If the line is one or two hex digits separated by blank space, its
23350          * a range; otherwise it is either another user-defined property or an
23351          * error */
23352 
23353         s = s0;
23354 
23355         if (! isXDIGIT(*s)) {
23356             goto check_if_property;
23357         }
23358 
23359         do { /* Each new hex digit will add 4 bits. */
23360             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23361                 s = strchr(s, '\n');
23362                 if (s == NULL) {
23363                     s = e;
23364                 }
23365                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23366                 sv_catpv(msg, overflow_msg);
23367                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23368                                      UTF8fARG(is_contents_utf8, s - s0, s0));
23369                 sv_catpvs(msg, "\"");
23370                 goto return_failure;
23371             }
23372 
23373             /* Accumulate this digit into the value */
23374             min = (min << 4) + READ_XDIGIT(s);
23375         } while (isXDIGIT(*s));
23376 
23377         while (isBLANK(*s)) { s++; }
23378 
23379         /* We allow comments at the end of the line */
23380         if (*s == '#') {
23381             s = strchr(s, '\n');
23382             if (s == NULL) {
23383                 s = e;
23384             }
23385             s++;
23386         }
23387         else if (s < e && *s != '\n') {
23388             if (! isXDIGIT(*s)) {
23389                 goto check_if_property;
23390             }
23391 
23392             /* Look for the high point of the range */
23393             max = 0;
23394             do {
23395                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23396                     s = strchr(s, '\n');
23397                     if (s == NULL) {
23398                         s = e;
23399                     }
23400                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23401                     sv_catpv(msg, overflow_msg);
23402                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23403                                       UTF8fARG(is_contents_utf8, s - s0, s0));
23404                     sv_catpvs(msg, "\"");
23405                     goto return_failure;
23406                 }
23407 
23408                 max = (max << 4) + READ_XDIGIT(s);
23409             } while (isXDIGIT(*s));
23410 
23411             while (isBLANK(*s)) { s++; }
23412 
23413             if (*s == '#') {
23414                 s = strchr(s, '\n');
23415                 if (s == NULL) {
23416                     s = e;
23417                 }
23418             }
23419             else if (s < e && *s != '\n') {
23420                 goto check_if_property;
23421             }
23422         }
23423 
23424         if (max == -1) {    /* The line only had one entry */
23425             max = min;
23426         }
23427         else if (max < min) {
23428             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23429             sv_catpvs(msg, "Illegal range in \"");
23430             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23431                                 UTF8fARG(is_contents_utf8, s - s0, s0));
23432             sv_catpvs(msg, "\"");
23433             goto return_failure;
23434         }
23435 
23436 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
23437 
23438         if (   UNICODE_IS_PERL_EXTENDED(min)
23439             || UNICODE_IS_PERL_EXTENDED(max))
23440         {
23441             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23442 
23443             /* If both code points are non-portable, warn only on the lower
23444              * one. */
23445             sv_catpv(msg, get_extended_utf8_msg(
23446                                             (UNICODE_IS_PERL_EXTENDED(min))
23447                                             ? min : max));
23448             sv_catpvs(msg, " in \"");
23449             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23450                                  UTF8fARG(is_contents_utf8, s - s0, s0));
23451             sv_catpvs(msg, "\"");
23452         }
23453 
23454 #  endif
23455 
23456         /* Here, this line contains a legal range */
23457         this_definition = sv_2mortal(_new_invlist(2));
23458         this_definition = _add_range_to_invlist(this_definition, min, max);
23459         goto calculate;
23460 
23461       check_if_property:
23462 
23463         /* Here it isn't a legal range line.  See if it is a legal property
23464          * line.  First find the end of the meat of the line */
23465         s = strpbrk(s, "#\n");
23466         if (s == NULL) {
23467             s = e;
23468         }
23469 
23470         /* Ignore trailing blanks in keeping with the requirements of
23471          * parse_uniprop_string() */
23472         s--;
23473         while (s > s0 && isBLANK_A(*s)) {
23474             s--;
23475         }
23476         s++;
23477 
23478         this_definition = parse_uniprop_string(s0, s - s0,
23479                                                is_utf8, to_fold, runtime,
23480                                                deferrable,
23481                                                NULL,
23482                                                user_defined_ptr, msg,
23483                                                (name_len == 0)
23484                                                 ? level /* Don't increase level
23485                                                            if input is empty */
23486                                                 : level + 1
23487                                               );
23488         if (this_definition == NULL) {
23489             goto return_failure;    /* 'msg' should have had the reason
23490                                        appended to it by the above call */
23491         }
23492 
23493         if (! is_invlist(this_definition)) {    /* Unknown at this time */
23494             return newSVsv(this_definition);
23495         }
23496 
23497         if (*s != '\n') {
23498             s = strchr(s, '\n');
23499             if (s == NULL) {
23500                 s = e;
23501             }
23502         }
23503 
23504       calculate:
23505 
23506         switch (op) {
23507             case '+':
23508                 _invlist_union(running_definition, this_definition,
23509                                                         &running_definition);
23510                 break;
23511             case '-':
23512                 _invlist_subtract(running_definition, this_definition,
23513                                                         &running_definition);
23514                 break;
23515             case '&':
23516                 _invlist_intersection(running_definition, this_definition,
23517                                                         &running_definition);
23518                 break;
23519             case '!':
23520                 _invlist_union_complement_2nd(running_definition,
23521                                         this_definition, &running_definition);
23522                 break;
23523             default:
23524                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23525                                  __FILE__, __LINE__, op);
23526                 break;
23527         }
23528 
23529         /* Position past the '\n' */
23530         s0 = s + 1;
23531     }   /* End of loop through the lines of 'contents' */
23532 
23533     /* Here, we processed all the lines in 'contents' without error.  If we
23534      * didn't add any warnings, simply return success */
23535     if (msgs_length_on_entry == SvCUR(msg)) {
23536 
23537         /* If the expansion was empty, the answer isn't nothing: its an empty
23538          * inversion list */
23539         if (running_definition == NULL) {
23540             running_definition = _new_invlist(1);
23541         }
23542 
23543         return running_definition;
23544     }
23545 
23546     /* Otherwise, add some explanatory text, but we will return success */
23547     goto return_msg;
23548 
23549   return_failure:
23550     running_definition = NULL;
23551 
23552   return_msg:
23553 
23554     if (name_len > 0) {
23555         sv_catpvs(msg, " in expansion of ");
23556         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23557     }
23558 
23559     return running_definition;
23560 }
23561 
23562 /* As explained below, certain operations need to take place in the first
23563  * thread created.  These macros switch contexts */
23564 #  ifdef USE_ITHREADS
23565 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
23566                                         PerlInterpreter * save_aTHX = aTHX;
23567 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
23568                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23569 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23570 #    define CUR_CONTEXT      aTHX
23571 #    define ORIGINAL_CONTEXT save_aTHX
23572 #  else
23573 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
23574 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23575 #    define RESTORE_CONTEXT                   NOOP
23576 #    define CUR_CONTEXT                       NULL
23577 #    define ORIGINAL_CONTEXT                  NULL
23578 #  endif
23579 
23580 STATIC void
S_delete_recursion_entry(pTHX_ void * key)23581 S_delete_recursion_entry(pTHX_ void *key)
23582 {
23583     /* Deletes the entry used to detect recursion when expanding user-defined
23584      * properties.  This is a function so it can be set up to be called even if
23585      * the program unexpectedly quits */
23586 
23587     dVAR;
23588     SV ** current_entry;
23589     const STRLEN key_len = strlen((const char *) key);
23590     DECLARATION_FOR_GLOBAL_CONTEXT;
23591 
23592     SWITCH_TO_GLOBAL_CONTEXT;
23593 
23594     /* If the entry is one of these types, it is a permanent entry, and not the
23595      * one used to detect recursions.  This function should delete only the
23596      * recursion entry */
23597     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23598     if (     current_entry
23599         && ! is_invlist(*current_entry)
23600         && ! SvPOK(*current_entry))
23601     {
23602         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23603                                                                     G_DISCARD);
23604     }
23605 
23606     RESTORE_CONTEXT;
23607 }
23608 
23609 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)23610 S_get_fq_name(pTHX_
23611               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23612               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23613               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23614               const bool has_colon_colon
23615              )
23616 {
23617     /* Returns a mortal SV containing the fully qualified version of the input
23618      * name */
23619 
23620     SV * fq_name;
23621 
23622     fq_name = newSVpvs_flags("", SVs_TEMP);
23623 
23624     /* Use the current package if it wasn't included in our input */
23625     if (! has_colon_colon) {
23626         const HV * pkg = (IN_PERL_COMPILETIME)
23627                          ? PL_curstash
23628                          : CopSTASH(PL_curcop);
23629         const char* pkgname = HvNAME(pkg);
23630 
23631         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23632                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23633         sv_catpvs(fq_name, "::");
23634     }
23635 
23636     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23637                          UTF8fARG(is_utf8, name_len, name));
23638     return fq_name;
23639 }
23640 
23641 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)23642 S_parse_uniprop_string(pTHX_
23643 
23644     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23645      * now.  If so, the return is an inversion list.
23646      *
23647      * If the property is user-defined, it is a subroutine, which in turn
23648      * may call other subroutines.  This function will call the whole nest of
23649      * them to get the definition they return; if some aren't known at the time
23650      * of the call to this function, the fully qualified name of the highest
23651      * level sub is returned.  It is an error to call this function at runtime
23652      * without every sub defined.
23653      *
23654      * If an error was found, NULL is returned, and 'msg' gets a suitable
23655      * message appended to it.  (Appending allows the back trace of how we got
23656      * to the faulty definition to be displayed through nested calls of
23657      * user-defined subs.)
23658      *
23659      * The caller should NOT try to free any returned inversion list.
23660      *
23661      * Other parameters will be set on return as described below */
23662 
23663     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23664     Size_t name_len,            /* Its length in bytes, not including any
23665                                    trailing space */
23666     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23667     const bool to_fold,         /* ? Is this under /i */
23668     const bool runtime,         /* TRUE if this is being called at run time */
23669     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23670                                    known at this call */
23671     AV ** strings,              /* To return string property values, like named
23672                                    sequences */
23673     bool *user_defined_ptr,     /* Upon return from this function it will be
23674                                    set to TRUE if any component is a
23675                                    user-defined property */
23676     SV * msg,                   /* Any error or warning msg(s) are appended to
23677                                    this */
23678     const STRLEN level)         /* Recursion level of this call */
23679 {
23680     dVAR;
23681     char* lookup_name;          /* normalized name for lookup in our tables */
23682     unsigned lookup_len;        /* Its length */
23683     enum { Not_Strict = 0,      /* Some properties have stricter name */
23684            Strict,              /* normalization rules, which we decide */
23685            As_Is                /* upon based on parsing */
23686          } stricter = Not_Strict;
23687 
23688     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23689      * (though it requires extra effort to download them from Unicode and
23690      * compile perl to know about them) */
23691     bool is_nv_type = FALSE;
23692 
23693     unsigned int i, j = 0;
23694     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23695     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23696     int table_index = 0;    /* The entry number for this property in the table
23697                                of all Unicode property names */
23698     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23699     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23700                                    the normalized name in certain situations */
23701     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23702                                    part of a package name */
23703     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
23704     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23705                                              property rather than a Unicode
23706                                              one. */
23707     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23708                                      if an error.  If it is an inversion list,
23709                                      it is the definition.  Otherwise it is a
23710                                      string containing the fully qualified sub
23711                                      name of 'name' */
23712     SV * fq_name = NULL;        /* For user-defined properties, the fully
23713                                    qualified name */
23714     bool invert_return = FALSE; /* ? Do we need to complement the result before
23715                                      returning it */
23716     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23717                                        explicit utf8:: package that we strip
23718                                        off  */
23719     /* The expansion of properties that could be either user-defined or
23720      * official unicode ones is deferred until runtime, including a marker for
23721      * those that might be in the latter category.  This boolean indicates if
23722      * we've seen that marker.  If not, what we're parsing can't be such an
23723      * official Unicode property whose expansion was deferred */
23724     bool could_be_deferred_official = FALSE;
23725 
23726     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23727 
23728     /* The input will be normalized into 'lookup_name' */
23729     Newx(lookup_name, name_len, char);
23730     SAVEFREEPV(lookup_name);
23731 
23732     /* Parse the input. */
23733     for (i = 0; i < name_len; i++) {
23734         char cur = name[i];
23735 
23736         /* Most of the characters in the input will be of this ilk, being parts
23737          * of a name */
23738         if (isIDCONT_A(cur)) {
23739 
23740             /* Case differences are ignored.  Our lookup routine assumes
23741              * everything is lowercase, so normalize to that */
23742             if (isUPPER_A(cur)) {
23743                 lookup_name[j++] = toLOWER_A(cur);
23744                 continue;
23745             }
23746 
23747             if (cur == '_') { /* Don't include these in the normalized name */
23748                 continue;
23749             }
23750 
23751             lookup_name[j++] = cur;
23752 
23753             /* The first character in a user-defined name must be of this type.
23754              * */
23755             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23756                 could_be_user_defined = FALSE;
23757             }
23758 
23759             continue;
23760         }
23761 
23762         /* Here, the character is not something typically in a name,  But these
23763          * two types of characters (and the '_' above) can be freely ignored in
23764          * most situations.  Later it may turn out we shouldn't have ignored
23765          * them, and we have to reparse, but we don't have enough information
23766          * yet to make that decision */
23767         if (cur == '-' || isSPACE_A(cur)) {
23768             could_be_user_defined = FALSE;
23769             continue;
23770         }
23771 
23772         /* An equals sign or single colon mark the end of the first part of
23773          * the property name */
23774         if (    cur == '='
23775             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23776         {
23777             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23778             equals_pos = j; /* Note where it occurred in the input */
23779             could_be_user_defined = FALSE;
23780             break;
23781         }
23782 
23783         /* If this looks like it is a marker we inserted at compile time,
23784          * set a flag and otherwise ignore it.  If it isn't in the final
23785          * position, keep it as it would have been user input. */
23786         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23787             && ! deferrable
23788             &&   could_be_user_defined
23789             &&   i == name_len - 1)
23790         {
23791             name_len--;
23792             could_be_deferred_official = TRUE;
23793             continue;
23794         }
23795 
23796         /* Otherwise, this character is part of the name. */
23797         lookup_name[j++] = cur;
23798 
23799         /* Here it isn't a single colon, so if it is a colon, it must be a
23800          * double colon */
23801         if (cur == ':') {
23802 
23803             /* A double colon should be a package qualifier.  We note its
23804              * position and continue.  Note that one could have
23805              *      pkg1::pkg2::...::foo
23806              * so that the position at the end of the loop will be just after
23807              * the final qualifier */
23808 
23809             i++;
23810             non_pkg_begin = i + 1;
23811             lookup_name[j++] = ':';
23812             lun_non_pkg_begin = j;
23813         }
23814         else { /* Only word chars (and '::') can be in a user-defined name */
23815             could_be_user_defined = FALSE;
23816         }
23817     } /* End of parsing through the lhs of the property name (or all of it if
23818          no rhs) */
23819 
23820 #  define STRLENs(s)  (sizeof("" s "") - 1)
23821 
23822     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23823      * be for a user-defined property, or it could be a Unicode property, as
23824      * all of them are considered to be for that package.  For the purposes of
23825      * parsing the rest of the property, strip it off */
23826     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23827         lookup_name +=  STRLENs("utf8::");
23828         j -=  STRLENs("utf8::");
23829         equals_pos -=  STRLENs("utf8::");
23830         stripped_utf8_pkg = TRUE;
23831     }
23832 
23833     /* Here, we are either done with the whole property name, if it was simple;
23834      * or are positioned just after the '=' if it is compound. */
23835 
23836     if (equals_pos >= 0) {
23837         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23838 
23839         /* Space immediately after the '=' is ignored */
23840         i++;
23841         for (; i < name_len; i++) {
23842             if (! isSPACE_A(name[i])) {
23843                 break;
23844             }
23845         }
23846 
23847         /* Most punctuation after the equals indicates a subpattern, like
23848          * \p{foo=/bar/} */
23849         if (   isPUNCT_A(name[i])
23850             &&  name[i] != '-'
23851             &&  name[i] != '+'
23852             &&  name[i] != '_'
23853             &&  name[i] != '{'
23854                 /* A backslash means the real delimitter is the next character,
23855                  * but it must be punctuation */
23856             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23857         {
23858             bool special_property = memEQs(lookup_name, j - 1, "name")
23859                                  || memEQs(lookup_name, j - 1, "na");
23860             if (! special_property) {
23861                 /* Find the property.  The table includes the equals sign, so
23862                  * we use 'j' as-is */
23863                 table_index = do_uniprop_match(lookup_name, j);
23864             }
23865             if (special_property || table_index) {
23866                 REGEXP * subpattern_re;
23867                 char open = name[i++];
23868                 char close;
23869                 const char * pos_in_brackets;
23870                 const char * const * prop_values;
23871                 bool escaped = 0;
23872 
23873                 /* Backslash => delimitter is the character following.  We
23874                  * already checked that it is punctuation */
23875                 if (open == '\\') {
23876                     open = name[i++];
23877                     escaped = 1;
23878                 }
23879 
23880                 /* This data structure is constructed so that the matching
23881                  * closing bracket is 3 past its matching opening.  The second
23882                  * set of closing is so that if the opening is something like
23883                  * ']', the closing will be that as well.  Something similar is
23884                  * done in toke.c */
23885                 pos_in_brackets = memCHRs("([<)]>)]>", open);
23886                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23887 
23888                 if (    i >= name_len
23889                     ||  name[name_len-1] != close
23890                     || (escaped && name[name_len-2] != '\\')
23891                         /* Also make sure that there are enough characters.
23892                          * e.g., '\\\' would show up incorrectly as legal even
23893                          * though it is too short */
23894                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
23895                 {
23896                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23897                     goto append_name_to_msg;
23898                 }
23899 
23900                 Perl_ck_warner_d(aTHX_
23901                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23902                     "The Unicode property wildcards feature is experimental");
23903 
23904                 if (special_property) {
23905                     const char * error_msg;
23906                     const char * revised_name = name + i;
23907                     Size_t revised_name_len = name_len - (i + 1 + escaped);
23908 
23909                     /* Currently, the only 'special_property' is name, which we
23910                      * lookup in _charnames.pm */
23911 
23912                     if (! load_charnames(newSVpvs("placeholder"),
23913                                          revised_name, revised_name_len,
23914                                          &error_msg))
23915                     {
23916                         sv_catpv(msg, error_msg);
23917                         goto append_name_to_msg;
23918                     }
23919 
23920                     /* Farm this out to a function just to make the current
23921                      * function less unwieldy */
23922                     if (handle_names_wildcard(revised_name, revised_name_len,
23923                                               &prop_definition,
23924                                               strings))
23925                     {
23926                         return prop_definition;
23927                     }
23928 
23929                     goto failed;
23930                 }
23931 
23932                 prop_values = get_prop_values(table_index);
23933 
23934                 /* Now create and compile the wildcard subpattern.  Use /i
23935                  * because the property values are supposed to match with case
23936                  * ignored. */
23937                 subpattern_re = compile_wildcard(name + i,
23938                                                  name_len - i - 1 - escaped,
23939                                                  TRUE /* /i */
23940                                                 );
23941 
23942                 /* For each legal property value, see if the supplied pattern
23943                  * matches it. */
23944                 while (*prop_values) {
23945                     const char * const entry = *prop_values;
23946                     const Size_t len = strlen(entry);
23947                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23948 
23949                     if (execute_wildcard(subpattern_re,
23950                                  (char *) entry,
23951                                  (char *) entry + len,
23952                                  (char *) entry, 0,
23953                                  entry_sv,
23954                                  0))
23955                     { /* Here, matched.  Add to the returned list */
23956                         Size_t total_len = j + len;
23957                         SV * sub_invlist = NULL;
23958                         char * this_string;
23959 
23960                         /* We know this is a legal \p{property=value}.  Call
23961                          * the function to return the list of code points that
23962                          * match it */
23963                         Newxz(this_string, total_len + 1, char);
23964                         Copy(lookup_name, this_string, j, char);
23965                         my_strlcat(this_string, entry, total_len + 1);
23966                         SAVEFREEPV(this_string);
23967                         sub_invlist = parse_uniprop_string(this_string,
23968                                                            total_len,
23969                                                            is_utf8,
23970                                                            to_fold,
23971                                                            runtime,
23972                                                            deferrable,
23973                                                            NULL,
23974                                                            user_defined_ptr,
23975                                                            msg,
23976                                                            level + 1);
23977                         _invlist_union(prop_definition, sub_invlist,
23978                                        &prop_definition);
23979                     }
23980 
23981                     prop_values++;  /* Next iteration, look at next propvalue */
23982                 } /* End of looking through property values; (the data
23983                      structure is terminated by a NULL ptr) */
23984 
23985                 SvREFCNT_dec_NN(subpattern_re);
23986 
23987                 if (prop_definition) {
23988                     return prop_definition;
23989                 }
23990 
23991                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23992                 goto append_name_to_msg;
23993             }
23994 
23995             /* Here's how khw thinks we should proceed to handle the properties
23996              * not yet done:    Bidi Mirroring Glyph        can map to ""
23997                                 Bidi Paired Bracket         can map to ""
23998                                 Case Folding  (both full and simple)
23999                                             Shouldn't /i be good enough for Full
24000                                 Decomposition Mapping
24001                                 Equivalent Unified Ideograph    can map to ""
24002                                 Lowercase Mapping  (both full and simple)
24003                                 NFKC Case Fold                  can map to ""
24004                                 Titlecase Mapping  (both full and simple)
24005                                 Uppercase Mapping  (both full and simple)
24006              * Handle these the same way Name is done, using say, _wild.pm, but
24007              * having both loose and full, like in charclass_invlists.h.
24008              * Perhaps move block and script to that as they are somewhat large
24009              * in charclass_invlists.h.
24010              * For properties where the default is the code point itself, such
24011              * as any of the case changing mappings, the string would otherwise
24012              * consist of all Unicode code points in UTF-8 strung together.
24013              * This would be impractical.  So instead, examine their compiled
24014              * pattern, looking at the ssc.  If none, reject the pattern as an
24015              * error.  Otherwise run the pattern against every code point in
24016              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
24017              * And it might be good to create an API to return the ssc.
24018              * Or handle them like the algorithmic names are done
24019              */
24020         } /* End of is a wildcard subppattern */
24021 
24022         /* \p{name=...} is handled specially.  Instead of using the normal
24023          * mechanism involving charclass_invlists.h, it uses _charnames.pm
24024          * which has the necessary (huge) data accessible to it, and which
24025          * doesn't get loaded unless necessary.  The legal syntax for names is
24026          * somewhat different than other properties due both to the vagaries of
24027          * a few outlier official names, and the fact that only a few ASCII
24028          * characters are permitted in them */
24029         if (   memEQs(lookup_name, j - 1, "name")
24030             || memEQs(lookup_name, j - 1, "na"))
24031         {
24032             dSP;
24033             HV * table;
24034             SV * character;
24035             const char * error_msg;
24036             CV* lookup_loose;
24037             SV * character_name;
24038             STRLEN character_len;
24039             UV cp;
24040 
24041             stricter = As_Is;
24042 
24043             /* Since the RHS (after skipping initial space) is passed unchanged
24044              * to charnames, and there are different criteria for what are
24045              * legal characters in the name, just parse it here.  A character
24046              * name must begin with an ASCII alphabetic */
24047             if (! isALPHA(name[i])) {
24048                 goto failed;
24049             }
24050             lookup_name[j++] = name[i];
24051 
24052             for (++i; i < name_len; i++) {
24053                 /* Official names can only be in the ASCII range, and only
24054                  * certain characters */
24055                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24056                     goto failed;
24057                 }
24058                 lookup_name[j++] = name[i];
24059             }
24060 
24061             /* Finished parsing, save the name into an SV */
24062             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24063 
24064             /* Make sure _charnames is loaded.  (The parameters give context
24065              * for any errors generated */
24066             table = load_charnames(character_name, name, name_len, &error_msg);
24067             if (table == NULL) {
24068                 sv_catpv(msg, error_msg);
24069                 goto append_name_to_msg;
24070             }
24071 
24072             lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
24073             if (! lookup_loose) {
24074                 Perl_croak(aTHX_
24075                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
24076             }
24077 
24078             PUSHSTACKi(PERLSI_REGCOMP);
24079             ENTER ;
24080             SAVETMPS;
24081             save_re_context();
24082 
24083             PUSHMARK(SP) ;
24084             XPUSHs(character_name);
24085             PUTBACK;
24086             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24087 
24088             SPAGAIN ;
24089 
24090             character = POPs;
24091             SvREFCNT_inc_simple_void_NN(character);
24092 
24093             PUTBACK ;
24094             FREETMPS ;
24095             LEAVE ;
24096             POPSTACK;
24097 
24098             if (! SvOK(character)) {
24099                 goto failed;
24100             }
24101 
24102             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24103             if (character_len == SvCUR(character)) {
24104                 prop_definition = add_cp_to_invlist(NULL, cp);
24105             }
24106             else {
24107                 AV * this_string;
24108 
24109                 /* First of the remaining characters in the string. */
24110                 char * remaining = SvPVX(character) + character_len;
24111 
24112                 if (strings == NULL) {
24113                     goto failed;    /* XXX Perhaps a specific msg instead, like
24114                                        'not available here' */
24115                 }
24116 
24117                 if (*strings == NULL) {
24118                     *strings = newAV();
24119                 }
24120 
24121                 this_string = newAV();
24122                 av_push(this_string, newSVuv(cp));
24123 
24124                 do {
24125                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24126                     av_push(this_string, newSVuv(cp));
24127                     remaining += character_len;
24128                 } while (remaining < SvEND(character));
24129 
24130                 av_push(*strings, (SV *) this_string);
24131             }
24132 
24133             return prop_definition;
24134         }
24135 
24136         /* Certain properties whose values are numeric need special handling.
24137          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
24138          * purposes of checking if this is one of those properties */
24139         if (memBEGINPs(lookup_name, j, "is")) {
24140             lookup_offset = 2;
24141         }
24142 
24143         /* Then check if it is one of these specially-handled properties.  The
24144          * possibilities are hard-coded because easier this way, and the list
24145          * is unlikely to change.
24146          *
24147          * All numeric value type properties are of this ilk, and are also
24148          * special in a different way later on.  So find those first.  There
24149          * are several numeric value type properties in the Unihan DB (which is
24150          * unlikely to be compiled with perl, but we handle it here in case it
24151          * does get compiled).  They all end with 'numeric'.  The interiors
24152          * aren't checked for the precise property.  This would stop working if
24153          * a cjk property were to be created that ended with 'numeric' and
24154          * wasn't a numeric type */
24155         is_nv_type = memEQs(lookup_name + lookup_offset,
24156                        j - 1 - lookup_offset, "numericvalue")
24157                   || memEQs(lookup_name + lookup_offset,
24158                       j - 1 - lookup_offset, "nv")
24159                   || (   memENDPs(lookup_name + lookup_offset,
24160                             j - 1 - lookup_offset, "numeric")
24161                       && (   memBEGINPs(lookup_name + lookup_offset,
24162                                       j - 1 - lookup_offset, "cjk")
24163                           || memBEGINPs(lookup_name + lookup_offset,
24164                                       j - 1 - lookup_offset, "k")));
24165         if (   is_nv_type
24166             || memEQs(lookup_name + lookup_offset,
24167                       j - 1 - lookup_offset, "canonicalcombiningclass")
24168             || memEQs(lookup_name + lookup_offset,
24169                       j - 1 - lookup_offset, "ccc")
24170             || memEQs(lookup_name + lookup_offset,
24171                       j - 1 - lookup_offset, "age")
24172             || memEQs(lookup_name + lookup_offset,
24173                       j - 1 - lookup_offset, "in")
24174             || memEQs(lookup_name + lookup_offset,
24175                       j - 1 - lookup_offset, "presentin"))
24176         {
24177             unsigned int k;
24178 
24179             /* Since the stuff after the '=' is a number, we can't throw away
24180              * '-' willy-nilly, as those could be a minus sign.  Other stricter
24181              * rules also apply.  However, these properties all can have the
24182              * rhs not be a number, in which case they contain at least one
24183              * alphabetic.  In those cases, the stricter rules don't apply.
24184              * But the numeric type properties can have the alphas [Ee] to
24185              * signify an exponent, and it is still a number with stricter
24186              * rules.  So look for an alpha that signifies not-strict */
24187             stricter = Strict;
24188             for (k = i; k < name_len; k++) {
24189                 if (   isALPHA_A(name[k])
24190                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24191                 {
24192                     stricter = Not_Strict;
24193                     break;
24194                 }
24195             }
24196         }
24197 
24198         if (stricter) {
24199 
24200             /* A number may have a leading '+' or '-'.  The latter is retained
24201              * */
24202             if (name[i] == '+') {
24203                 i++;
24204             }
24205             else if (name[i] == '-') {
24206                 lookup_name[j++] = '-';
24207                 i++;
24208             }
24209 
24210             /* Skip leading zeros including single underscores separating the
24211              * zeros, or between the final leading zero and the first other
24212              * digit */
24213             for (; i < name_len - 1; i++) {
24214                 if (    name[i] != '0'
24215                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24216                 {
24217                     break;
24218                 }
24219             }
24220         }
24221     }
24222     else {  /* No '=' */
24223 
24224        /* Only a few properties without an '=' should be parsed with stricter
24225         * rules.  The list is unlikely to change. */
24226         if (   memBEGINPs(lookup_name, j, "perl")
24227             && memNEs(lookup_name + 4, j - 4, "space")
24228             && memNEs(lookup_name + 4, j - 4, "word"))
24229         {
24230             stricter = Strict;
24231 
24232             /* We set the inputs back to 0 and the code below will reparse,
24233              * using strict */
24234             i = j = 0;
24235         }
24236     }
24237 
24238     /* Here, we have either finished the property, or are positioned to parse
24239      * the remainder, and we know if stricter rules apply.  Finish out, if not
24240      * already done */
24241     for (; i < name_len; i++) {
24242         char cur = name[i];
24243 
24244         /* In all instances, case differences are ignored, and we normalize to
24245          * lowercase */
24246         if (isUPPER_A(cur)) {
24247             lookup_name[j++] = toLOWER(cur);
24248             continue;
24249         }
24250 
24251         /* An underscore is skipped, but not under strict rules unless it
24252          * separates two digits */
24253         if (cur == '_') {
24254             if (    stricter
24255                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
24256                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24257             {
24258                 lookup_name[j++] = '_';
24259             }
24260             continue;
24261         }
24262 
24263         /* Hyphens are skipped except under strict */
24264         if (cur == '-' && ! stricter) {
24265             continue;
24266         }
24267 
24268         /* XXX Bug in documentation.  It says white space skipped adjacent to
24269          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
24270          * in a number */
24271         if (isSPACE_A(cur) && ! stricter) {
24272             continue;
24273         }
24274 
24275         lookup_name[j++] = cur;
24276 
24277         /* Unless this is a non-trailing slash, we are done with it */
24278         if (i >= name_len - 1 || cur != '/') {
24279             continue;
24280         }
24281 
24282         slash_pos = j;
24283 
24284         /* A slash in the 'numeric value' property indicates that what follows
24285          * is a denominator.  It can have a leading '+' and '0's that should be
24286          * skipped.  But we have never allowed a negative denominator, so treat
24287          * a minus like every other character.  (No need to rule out a second
24288          * '/', as that won't match anything anyway */
24289         if (is_nv_type) {
24290             i++;
24291             if (i < name_len && name[i] == '+') {
24292                 i++;
24293             }
24294 
24295             /* Skip leading zeros including underscores separating digits */
24296             for (; i < name_len - 1; i++) {
24297                 if (   name[i] != '0'
24298                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24299                 {
24300                     break;
24301                 }
24302             }
24303 
24304             /* Store the first real character in the denominator */
24305             if (i < name_len) {
24306                 lookup_name[j++] = name[i];
24307             }
24308         }
24309     }
24310 
24311     /* Here are completely done parsing the input 'name', and 'lookup_name'
24312      * contains a copy, normalized.
24313      *
24314      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24315      * different from without the underscores.  */
24316     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
24317            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24318         && UNLIKELY(name[name_len-1] == '_'))
24319     {
24320         lookup_name[j++] = '&';
24321     }
24322 
24323     /* If the original input began with 'In' or 'Is', it could be a subroutine
24324      * call to a user-defined property instead of a Unicode property name. */
24325     if (    name_len - non_pkg_begin > 2
24326         &&  name[non_pkg_begin+0] == 'I'
24327         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24328     {
24329         /* Names that start with In have different characterstics than those
24330          * that start with Is */
24331         if (name[non_pkg_begin+1] == 's') {
24332             starts_with_Is = TRUE;
24333         }
24334     }
24335     else {
24336         could_be_user_defined = FALSE;
24337     }
24338 
24339     if (could_be_user_defined) {
24340         CV* user_sub;
24341 
24342         /* If the user defined property returns the empty string, it could
24343          * easily be because the pattern is being compiled before the data it
24344          * actually needs to compile is available.  This could be argued to be
24345          * a bug in the perl code, but this is a change of behavior for Perl,
24346          * so we handle it.  This means that intentionally returning nothing
24347          * will not be resolved until runtime */
24348         bool empty_return = FALSE;
24349 
24350         /* Here, the name could be for a user defined property, which are
24351          * implemented as subs. */
24352         user_sub = get_cvn_flags(name, name_len, 0);
24353         if (! user_sub) {
24354 
24355             /* Here, the property name could be a user-defined one, but there
24356              * is no subroutine to handle it (as of now).   Defer handling it
24357              * until runtime.  Otherwise, a block defined by Unicode in a later
24358              * release would get the synonym InFoo added for it, and existing
24359              * code that used that name would suddenly break if it referred to
24360              * the property before the sub was declared.  See [perl #134146] */
24361             if (deferrable) {
24362                 goto definition_deferred;
24363             }
24364 
24365             /* Here, we are at runtime, and didn't find the user property.  It
24366              * could be an official property, but only if no package was
24367              * specified, or just the utf8:: package. */
24368             if (could_be_deferred_official) {
24369                 lookup_name += lun_non_pkg_begin;
24370                 j -= lun_non_pkg_begin;
24371             }
24372             else if (! stripped_utf8_pkg) {
24373                 goto unknown_user_defined;
24374             }
24375 
24376             /* Drop down to look up in the official properties */
24377         }
24378         else {
24379             const char insecure[] = "Insecure user-defined property";
24380 
24381             /* Here, there is a sub by the correct name.  Normally we call it
24382              * to get the property definition */
24383             dSP;
24384             SV * user_sub_sv = MUTABLE_SV(user_sub);
24385             SV * error;     /* Any error returned by calling 'user_sub' */
24386             SV * key;       /* The key into the hash of user defined sub names
24387                              */
24388             SV * placeholder;
24389             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
24390 
24391             /* How many times to retry when another thread is in the middle of
24392              * expanding the same definition we want */
24393             PERL_INT_FAST8_T retry_countdown = 10;
24394 
24395             DECLARATION_FOR_GLOBAL_CONTEXT;
24396 
24397             /* If we get here, we know this property is user-defined */
24398             *user_defined_ptr = TRUE;
24399 
24400             /* We refuse to call a potentially tainted subroutine; returning an
24401              * error instead */
24402             if (TAINT_get) {
24403                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24404                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24405                 goto append_name_to_msg;
24406             }
24407 
24408             /* In principal, we only call each subroutine property definition
24409              * once during the life of the program.  This guarantees that the
24410              * property definition never changes.  The results of the single
24411              * sub call are stored in a hash, which is used instead for future
24412              * references to this property.  The property definition is thus
24413              * immutable.  But, to allow the user to have a /i-dependent
24414              * definition, we call the sub once for non-/i, and once for /i,
24415              * should the need arise, passing the /i status as a parameter.
24416              *
24417              * We start by constructing the hash key name, consisting of the
24418              * fully qualified subroutine name, preceded by the /i status, so
24419              * that there is a key for /i and a different key for non-/i */
24420             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24421             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24422                                           non_pkg_begin != 0);
24423             sv_catsv(key, fq_name);
24424             sv_2mortal(key);
24425 
24426             /* We only call the sub once throughout the life of the program
24427              * (with the /i, non-/i exception noted above).  That means the
24428              * hash must be global and accessible to all threads.  It is
24429              * created at program start-up, before any threads are created, so
24430              * is accessible to all children.  But this creates some
24431              * complications.
24432              *
24433              * 1) The keys can't be shared, or else problems arise; sharing is
24434              *    turned off at hash creation time
24435              * 2) All SVs in it are there for the remainder of the life of the
24436              *    program, and must be created in the same interpreter context
24437              *    as the hash, or else they will be freed from the wrong pool
24438              *    at global destruction time.  This is handled by switching to
24439              *    the hash's context to create each SV going into it, and then
24440              *    immediately switching back
24441              * 3) All accesses to the hash must be controlled by a mutex, to
24442              *    prevent two threads from getting an unstable state should
24443              *    they simultaneously be accessing it.  The code below is
24444              *    crafted so that the mutex is locked whenever there is an
24445              *    access and unlocked only when the next stable state is
24446              *    achieved.
24447              *
24448              * The hash stores either the definition of the property if it was
24449              * valid, or, if invalid, the error message that was raised.  We
24450              * use the type of SV to distinguish.
24451              *
24452              * There's also the need to guard against the definition expansion
24453              * from infinitely recursing.  This is handled by storing the aTHX
24454              * of the expanding thread during the expansion.  Again the SV type
24455              * is used to distinguish this from the other two cases.  If we
24456              * come to here and the hash entry for this property is our aTHX,
24457              * it means we have recursed, and the code assumes that we would
24458              * infinitely recurse, so instead stops and raises an error.
24459              * (Any recursion has always been treated as infinite recursion in
24460              * this feature.)
24461              *
24462              * If instead, the entry is for a different aTHX, it means that
24463              * that thread has gotten here first, and hasn't finished expanding
24464              * the definition yet.  We just have to wait until it is done.  We
24465              * sleep and retry a few times, returning an error if the other
24466              * thread doesn't complete. */
24467 
24468           re_fetch:
24469             USER_PROP_MUTEX_LOCK;
24470 
24471             /* If we have an entry for this key, the subroutine has already
24472              * been called once with this /i status. */
24473             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24474                                                    SvPVX(key), SvCUR(key), 0);
24475             if (saved_user_prop_ptr) {
24476 
24477                 /* If the saved result is an inversion list, it is the valid
24478                  * definition of this property */
24479                 if (is_invlist(*saved_user_prop_ptr)) {
24480                     prop_definition = *saved_user_prop_ptr;
24481 
24482                     /* The SV in the hash won't be removed until global
24483                      * destruction, so it is stable and we can unlock */
24484                     USER_PROP_MUTEX_UNLOCK;
24485 
24486                     /* The caller shouldn't try to free this SV */
24487                     return prop_definition;
24488                 }
24489 
24490                 /* Otherwise, if it is a string, it is the error message
24491                  * that was returned when we first tried to evaluate this
24492                  * property.  Fail, and append the message */
24493                 if (SvPOK(*saved_user_prop_ptr)) {
24494                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24495                     sv_catsv(msg, *saved_user_prop_ptr);
24496 
24497                     /* The SV in the hash won't be removed until global
24498                      * destruction, so it is stable and we can unlock */
24499                     USER_PROP_MUTEX_UNLOCK;
24500 
24501                     return NULL;
24502                 }
24503 
24504                 assert(SvIOK(*saved_user_prop_ptr));
24505 
24506                 /* Here, we have an unstable entry in the hash.  Either another
24507                  * thread is in the middle of expanding the property's
24508                  * definition, or we are ourselves recursing.  We use the aTHX
24509                  * in it to distinguish */
24510                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24511 
24512                     /* Here, it's another thread doing the expanding.  We've
24513                      * looked as much as we are going to at the contents of the
24514                      * hash entry.  It's safe to unlock. */
24515                     USER_PROP_MUTEX_UNLOCK;
24516 
24517                     /* Retry a few times */
24518                     if (retry_countdown-- > 0) {
24519                         PerlProc_sleep(1);
24520                         goto re_fetch;
24521                     }
24522 
24523                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24524                     sv_catpvs(msg, "Timeout waiting for another thread to "
24525                                    "define");
24526                     goto append_name_to_msg;
24527                 }
24528 
24529                 /* Here, we are recursing; don't dig any deeper */
24530                 USER_PROP_MUTEX_UNLOCK;
24531 
24532                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24533                 sv_catpvs(msg,
24534                           "Infinite recursion in user-defined property");
24535                 goto append_name_to_msg;
24536             }
24537 
24538             /* Here, this thread has exclusive control, and there is no entry
24539              * for this property in the hash.  So we have the go ahead to
24540              * expand the definition ourselves. */
24541 
24542             PUSHSTACKi(PERLSI_REGCOMP);
24543             ENTER;
24544 
24545             /* Create a temporary placeholder in the hash to detect recursion
24546              * */
24547             SWITCH_TO_GLOBAL_CONTEXT;
24548             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24549             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24550             RESTORE_CONTEXT;
24551 
24552             /* Now that we have a placeholder, we can let other threads
24553              * continue */
24554             USER_PROP_MUTEX_UNLOCK;
24555 
24556             /* Make sure the placeholder always gets destroyed */
24557             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24558 
24559             PUSHMARK(SP);
24560             SAVETMPS;
24561 
24562             /* Call the user's function, with the /i status as a parameter.
24563              * Note that we have gone to a lot of trouble to keep this call
24564              * from being within the locked mutex region. */
24565             XPUSHs(boolSV(to_fold));
24566             PUTBACK;
24567 
24568             /* The following block was taken from swash_init().  Presumably
24569              * they apply to here as well, though we no longer use a swash --
24570              * khw */
24571             SAVEHINTS();
24572             save_re_context();
24573             /* We might get here via a subroutine signature which uses a utf8
24574              * parameter name, at which point PL_subname will have been set
24575              * but not yet used. */
24576             save_item(PL_subname);
24577 
24578             /* G_SCALAR guarantees a single return value */
24579             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24580 
24581             SPAGAIN;
24582 
24583             error = ERRSV;
24584             if (TAINT_get || SvTRUE(error)) {
24585                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24586                 if (SvTRUE(error)) {
24587                     sv_catpvs(msg, "Error \"");
24588                     sv_catsv(msg, error);
24589                     sv_catpvs(msg, "\"");
24590                 }
24591                 if (TAINT_get) {
24592                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
24593                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24594                 }
24595 
24596                 if (name_len > 0) {
24597                     sv_catpvs(msg, " in expansion of ");
24598                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24599                                                                   name_len,
24600                                                                   name));
24601                 }
24602 
24603                 (void) POPs;
24604                 prop_definition = NULL;
24605             }
24606             else {
24607                 SV * contents = POPs;
24608 
24609                 /* The contents is supposed to be the expansion of the property
24610                  * definition.  If the definition is deferrable, and we got an
24611                  * empty string back, set a flag to later defer it (after clean
24612                  * up below). */
24613                 if (      deferrable
24614                     && (! SvPOK(contents) || SvCUR(contents) == 0))
24615                 {
24616                         empty_return = TRUE;
24617                 }
24618                 else { /* Otherwise, call a function to check for valid syntax,
24619                           and handle it */
24620 
24621                     prop_definition = handle_user_defined_property(
24622                                                     name, name_len,
24623                                                     is_utf8, to_fold, runtime,
24624                                                     deferrable,
24625                                                     contents, user_defined_ptr,
24626                                                     msg,
24627                                                     level);
24628                 }
24629             }
24630 
24631             /* Here, we have the results of the expansion.  Delete the
24632              * placeholder, and if the definition is now known, replace it with
24633              * that definition.  We need exclusive access to the hash, and we
24634              * can't let anyone else in, between when we delete the placeholder
24635              * and add the permanent entry */
24636             USER_PROP_MUTEX_LOCK;
24637 
24638             S_delete_recursion_entry(aTHX_ SvPVX(key));
24639 
24640             if (    ! empty_return
24641                 && (! prop_definition || is_invlist(prop_definition)))
24642             {
24643                 /* If we got success we use the inversion list defining the
24644                  * property; otherwise use the error message */
24645                 SWITCH_TO_GLOBAL_CONTEXT;
24646                 (void) hv_store_ent(PL_user_def_props,
24647                                     key,
24648                                     ((prop_definition)
24649                                      ? newSVsv(prop_definition)
24650                                      : newSVsv(msg)),
24651                                     0);
24652                 RESTORE_CONTEXT;
24653             }
24654 
24655             /* All done, and the hash now has a permanent entry for this
24656              * property.  Give up exclusive control */
24657             USER_PROP_MUTEX_UNLOCK;
24658 
24659             FREETMPS;
24660             LEAVE;
24661             POPSTACK;
24662 
24663             if (empty_return) {
24664                 goto definition_deferred;
24665             }
24666 
24667             if (prop_definition) {
24668 
24669                 /* If the definition is for something not known at this time,
24670                  * we toss it, and go return the main property name, as that's
24671                  * the one the user will be aware of */
24672                 if (! is_invlist(prop_definition)) {
24673                     SvREFCNT_dec_NN(prop_definition);
24674                     goto definition_deferred;
24675                 }
24676 
24677                 sv_2mortal(prop_definition);
24678             }
24679 
24680             /* And return */
24681             return prop_definition;
24682 
24683         }   /* End of calling the subroutine for the user-defined property */
24684     }       /* End of it could be a user-defined property */
24685 
24686     /* Here it wasn't a user-defined property that is known at this time.  See
24687      * if it is a Unicode property */
24688 
24689     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24690 
24691     /* Get the index into our pointer table of the inversion list corresponding
24692      * to the property */
24693     table_index = do_uniprop_match(lookup_name, lookup_len);
24694 
24695     /* If it didn't find the property ... */
24696     if (table_index == 0) {
24697 
24698         /* Try again stripping off any initial 'Is'.  This is because we
24699          * promise that an initial Is is optional.  The same isn't true of
24700          * names that start with 'In'.  Those can match only blocks, and the
24701          * lookup table already has those accounted for. */
24702         if (starts_with_Is) {
24703             lookup_name += 2;
24704             lookup_len -= 2;
24705             equals_pos -= 2;
24706             slash_pos -= 2;
24707 
24708             table_index = do_uniprop_match(lookup_name, lookup_len);
24709         }
24710 
24711         if (table_index == 0) {
24712             char * canonical;
24713 
24714             /* Here, we didn't find it.  If not a numeric type property, and
24715              * can't be a user-defined one, it isn't a legal property */
24716             if (! is_nv_type) {
24717                 if (! could_be_user_defined) {
24718                     goto failed;
24719                 }
24720 
24721                 /* Here, the property name is legal as a user-defined one.   At
24722                  * compile time, it might just be that the subroutine for that
24723                  * property hasn't been encountered yet, but at runtime, it's
24724                  * an error to try to use an undefined one */
24725                 if (! deferrable) {
24726                     goto unknown_user_defined;;
24727                 }
24728 
24729                 goto definition_deferred;
24730             } /* End of isn't a numeric type property */
24731 
24732             /* The numeric type properties need more work to decide.  What we
24733              * do is make sure we have the number in canonical form and look
24734              * that up. */
24735 
24736             if (slash_pos < 0) {    /* No slash */
24737 
24738                 /* When it isn't a rational, take the input, convert it to a
24739                  * NV, then create a canonical string representation of that
24740                  * NV. */
24741 
24742                 NV value;
24743                 SSize_t value_len = lookup_len - equals_pos;
24744 
24745                 /* Get the value */
24746                 if (   value_len <= 0
24747                     || my_atof3(lookup_name + equals_pos, &value,
24748                                 value_len)
24749                           != lookup_name + lookup_len)
24750                 {
24751                     goto failed;
24752                 }
24753 
24754                 /* If the value is an integer, the canonical value is integral
24755                  * */
24756                 if (Perl_ceil(value) == value) {
24757                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24758                                             equals_pos, lookup_name, value);
24759                 }
24760                 else {  /* Otherwise, it is %e with a known precision */
24761                     char * exp_ptr;
24762 
24763                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24764                                                 equals_pos, lookup_name,
24765                                                 PL_E_FORMAT_PRECISION, value);
24766 
24767                     /* The exponent generated is expecting two digits, whereas
24768                      * %e on some systems will generate three.  Remove leading
24769                      * zeros in excess of 2 from the exponent.  We start
24770                      * looking for them after the '=' */
24771                     exp_ptr = strchr(canonical + equals_pos, 'e');
24772                     if (exp_ptr) {
24773                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24774                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24775 
24776                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24777 
24778                         if (excess_exponent_len > 0) {
24779                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24780                             SSize_t excess_leading_zeros
24781                                     = MIN(leading_zeros, excess_exponent_len);
24782                             if (excess_leading_zeros > 0) {
24783                                 Move(cur_ptr + excess_leading_zeros,
24784                                      cur_ptr,
24785                                      strlen(cur_ptr) - excess_leading_zeros
24786                                        + 1,  /* Copy the NUL as well */
24787                                      char);
24788                             }
24789                         }
24790                     }
24791                 }
24792             }
24793             else {  /* Has a slash.  Create a rational in canonical form  */
24794                 UV numerator, denominator, gcd, trial;
24795                 const char * end_ptr;
24796                 const char * sign = "";
24797 
24798                 /* We can't just find the numerator, denominator, and do the
24799                  * division, then use the method above, because that is
24800                  * inexact.  And the input could be a rational that is within
24801                  * epsilon (given our precision) of a valid rational, and would
24802                  * then incorrectly compare valid.
24803                  *
24804                  * We're only interested in the part after the '=' */
24805                 const char * this_lookup_name = lookup_name + equals_pos;
24806                 lookup_len -= equals_pos;
24807                 slash_pos -= equals_pos;
24808 
24809                 /* Handle any leading minus */
24810                 if (this_lookup_name[0] == '-') {
24811                     sign = "-";
24812                     this_lookup_name++;
24813                     lookup_len--;
24814                     slash_pos--;
24815                 }
24816 
24817                 /* Convert the numerator to numeric */
24818                 end_ptr = this_lookup_name + slash_pos;
24819                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24820                     goto failed;
24821                 }
24822 
24823                 /* It better have included all characters before the slash */
24824                 if (*end_ptr != '/') {
24825                     goto failed;
24826                 }
24827 
24828                 /* Set to look at just the denominator */
24829                 this_lookup_name += slash_pos;
24830                 lookup_len -= slash_pos;
24831                 end_ptr = this_lookup_name + lookup_len;
24832 
24833                 /* Convert the denominator to numeric */
24834                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24835                     goto failed;
24836                 }
24837 
24838                 /* It better be the rest of the characters, and don't divide by
24839                  * 0 */
24840                 if (   end_ptr != this_lookup_name + lookup_len
24841                     || denominator == 0)
24842                 {
24843                     goto failed;
24844                 }
24845 
24846                 /* Get the greatest common denominator using
24847                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24848                 gcd = numerator;
24849                 trial = denominator;
24850                 while (trial != 0) {
24851                     UV temp = trial;
24852                     trial = gcd % trial;
24853                     gcd = temp;
24854                 }
24855 
24856                 /* If already in lowest possible terms, we have already tried
24857                  * looking this up */
24858                 if (gcd == 1) {
24859                     goto failed;
24860                 }
24861 
24862                 /* Reduce the rational, which should put it in canonical form
24863                  * */
24864                 numerator /= gcd;
24865                 denominator /= gcd;
24866 
24867                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24868                         equals_pos, lookup_name, sign, numerator, denominator);
24869             }
24870 
24871             /* Here, we have the number in canonical form.  Try that */
24872             table_index = do_uniprop_match(canonical, strlen(canonical));
24873             if (table_index == 0) {
24874                 goto failed;
24875             }
24876         }   /* End of still didn't find the property in our table */
24877     }       /* End of       didn't find the property in our table */
24878 
24879     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24880      * A negative return signifies that the real index is the absolute value,
24881      * but the result needs to be inverted */
24882     if (table_index < 0) {
24883         invert_return = TRUE;
24884         table_index = -table_index;
24885     }
24886 
24887     /* Out-of band indices indicate a deprecated property.  The proper index is
24888      * modulo it with the table size.  And dividing by the table size yields
24889      * an offset into a table constructed by regen/mk_invlists.pl to contain
24890      * the corresponding warning message */
24891     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24892         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24893         table_index %= MAX_UNI_KEYWORD_INDEX;
24894         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24895                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24896                 (int) name_len, name,
24897                 get_deprecated_property_msg(warning_offset));
24898     }
24899 
24900     /* In a few properties, a different property is used under /i.  These are
24901      * unlikely to change, so are hard-coded here. */
24902     if (to_fold) {
24903         if (   table_index == UNI_XPOSIXUPPER
24904             || table_index == UNI_XPOSIXLOWER
24905             || table_index == UNI_TITLE)
24906         {
24907             table_index = UNI_CASED;
24908         }
24909         else if (   table_index == UNI_UPPERCASELETTER
24910                  || table_index == UNI_LOWERCASELETTER
24911 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24912                  || table_index == UNI_TITLECASELETTER
24913 #  endif
24914         ) {
24915             table_index = UNI_CASEDLETTER;
24916         }
24917         else if (  table_index == UNI_POSIXUPPER
24918                 || table_index == UNI_POSIXLOWER)
24919         {
24920             table_index = UNI_POSIXALPHA;
24921         }
24922     }
24923 
24924     /* Create and return the inversion list */
24925     prop_definition = get_prop_definition(table_index);
24926     sv_2mortal(prop_definition);
24927 
24928     /* See if there is a private use override to add to this definition */
24929     {
24930         COPHH * hinthash = (IN_PERL_COMPILETIME)
24931                            ? CopHINTHASH_get(&PL_compiling)
24932                            : CopHINTHASH_get(PL_curcop);
24933 	SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24934 
24935         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24936 
24937             /* See if there is an element in the hints hash for this table */
24938             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24939             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24940 
24941             if (pos) {
24942                 bool dummy;
24943                 SV * pu_definition;
24944                 SV * pu_invlist;
24945                 SV * expanded_prop_definition =
24946                             sv_2mortal(invlist_clone(prop_definition, NULL));
24947 
24948                 /* If so, it's definition is the string from here to the next
24949                  * \a character.  And its format is the same as a user-defined
24950                  * property */
24951                 pos += SvCUR(pu_lookup);
24952                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24953                 pu_invlist = handle_user_defined_property(lookup_name,
24954                                                           lookup_len,
24955                                                           0, /* Not UTF-8 */
24956                                                           0, /* Not folded */
24957                                                           runtime,
24958                                                           deferrable,
24959                                                           pu_definition,
24960                                                           &dummy,
24961                                                           msg,
24962                                                           level);
24963                 if (TAINT_get) {
24964                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24965                     sv_catpvs(msg, "Insecure private-use override");
24966                     goto append_name_to_msg;
24967                 }
24968 
24969                 /* For now, as a safety measure, make sure that it doesn't
24970                  * override non-private use code points */
24971                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24972 
24973                 /* Add it to the list to be returned */
24974                 _invlist_union(prop_definition, pu_invlist,
24975                                &expanded_prop_definition);
24976                 prop_definition = expanded_prop_definition;
24977                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24978             }
24979         }
24980     }
24981 
24982     if (invert_return) {
24983         _invlist_invert(prop_definition);
24984     }
24985     return prop_definition;
24986 
24987   unknown_user_defined:
24988     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24989     sv_catpvs(msg, "Unknown user-defined property name");
24990     goto append_name_to_msg;
24991 
24992   failed:
24993     if (non_pkg_begin != 0) {
24994         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24995         sv_catpvs(msg, "Illegal user-defined property name");
24996     }
24997     else {
24998         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24999         sv_catpvs(msg, "Can't find Unicode property definition");
25000     }
25001     /* FALLTHROUGH */
25002 
25003   append_name_to_msg:
25004     {
25005         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
25006         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
25007 
25008         sv_catpv(msg, prefix);
25009         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
25010         sv_catpv(msg, suffix);
25011     }
25012 
25013     return NULL;
25014 
25015   definition_deferred:
25016 
25017     {
25018         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
25019 
25020         /* Here it could yet to be defined, so defer evaluation of this until
25021          * its needed at runtime.  We need the fully qualified property name to
25022          * avoid ambiguity */
25023         if (! fq_name) {
25024             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25025                                                                 is_qualified);
25026         }
25027 
25028         /* If it didn't come with a package, or the package is utf8::, this
25029          * actually could be an official Unicode property whose inclusion we
25030          * are deferring until runtime to make sure that it isn't overridden by
25031          * a user-defined property of the same name (which we haven't
25032          * encountered yet).  Add a marker to indicate this possibility, for
25033          * use at such time when we first need the definition during pattern
25034          * matching execution */
25035         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25036             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25037         }
25038 
25039         /* We also need a trailing newline */
25040         sv_catpvs(fq_name, "\n");
25041 
25042         *user_defined_ptr = TRUE;
25043         return fq_name;
25044     }
25045 }
25046 
25047 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)25048 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25049                               const STRLEN wname_len, /* Its length */
25050                               SV ** prop_definition,
25051                               AV ** strings)
25052 {
25053     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25054      * any matches, adding them to prop_definition */
25055 
25056     dSP;
25057 
25058     CV * get_names_info;        /* entry to charnames.pm to get info we need */
25059     SV * names_string;          /* Contains all character names, except algo */
25060     SV * algorithmic_names;     /* Contains info about algorithmically
25061                                    generated character names */
25062     REGEXP * subpattern_re;     /* The user's pattern to match with */
25063     struct regexp * prog;       /* The compiled pattern */
25064     char * all_names_start;     /* lib/unicore/Name.pl string of every
25065                                    (non-algorithmic) character name */
25066     char * cur_pos;             /* We match, effectively using /gc; this is
25067                                    where we are now */
25068     bool found_matches = FALSE; /* Did any name match so far? */
25069     SV * empty;                 /* For matching zero length names */
25070     SV * must_sv;               /* Contains the substring, if any, that must be
25071                                    in a name for the subpattern to match */
25072     const char * must;          /* The PV of 'must' */
25073     STRLEN must_len;            /* And its length */
25074     SV * syllable_name = NULL;  /* For Hangul syllables */
25075     const char hangul_prefix[] = "HANGUL SYLLABLE ";
25076     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25077 
25078     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25079      * syllable name, and these are immutable and guaranteed by the Unicode
25080      * standard to never be extended */
25081     const STRLEN syl_max_len = hangul_prefix_len + 7;
25082 
25083     IV i;
25084 
25085     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25086 
25087     /* Make sure _charnames is loaded.  (The parameters give context
25088      * for any errors generated */
25089     get_names_info = get_cv("_charnames::_get_names_info", 0);
25090     if (! get_names_info) {
25091         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25092     }
25093 
25094     /* Get the charnames data */
25095     PUSHSTACKi(PERLSI_REGCOMP);
25096     ENTER ;
25097     SAVETMPS;
25098     save_re_context();
25099 
25100     PUSHMARK(SP) ;
25101     PUTBACK;
25102 
25103     /* Special _charnames entry point that returns the info this routine
25104      * requires */
25105     call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25106 
25107     SPAGAIN ;
25108 
25109     /* Data structure for names which end in their very own code points */
25110     algorithmic_names = POPs;
25111     SvREFCNT_inc_simple_void_NN(algorithmic_names);
25112 
25113     /* The lib/unicore/Name.pl string */
25114     names_string = POPs;
25115     SvREFCNT_inc_simple_void_NN(names_string);
25116 
25117     PUTBACK ;
25118     FREETMPS ;
25119     LEAVE ;
25120     POPSTACK;
25121 
25122     if (   ! SvROK(names_string)
25123         || ! SvROK(algorithmic_names))
25124     {   /* Perhaps should panic instead XXX */
25125         SvREFCNT_dec(names_string);
25126         SvREFCNT_dec(algorithmic_names);
25127         return FALSE;
25128     }
25129 
25130     names_string = sv_2mortal(SvRV(names_string));
25131     all_names_start = SvPVX(names_string);
25132     cur_pos = all_names_start;
25133 
25134     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25135 
25136     /* Compile the subpattern consisting of the name being looked for */
25137     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25138 
25139     must_sv = re_intuit_string(subpattern_re);
25140     if (must_sv) {
25141         /* regexec.c can free the re_intuit_string() return. GH #17734 */
25142         must_sv = sv_2mortal(newSVsv(must_sv));
25143         must = SvPV(must_sv, must_len);
25144     }
25145     else {
25146         must = "";
25147         must_len = 0;
25148     }
25149 
25150     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
25151      * This works because the NUL causes the function to return early, thus
25152      * showing that there are characters in it other than the acceptable ones,
25153      * which is our desired result.) */
25154 
25155     prog = ReANY(subpattern_re);
25156 
25157     /* If only nothing is matched, skip to where empty names are looked for */
25158     if (prog->maxlen == 0) {
25159         goto check_empty;
25160     }
25161 
25162     /* And match against the string of all names /gc.  Don't even try if it
25163      * must match a character not found in any name. */
25164     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25165     {
25166         while (execute_wildcard(subpattern_re,
25167                                 cur_pos,
25168                                 SvEND(names_string),
25169                                 all_names_start, 0,
25170                                 names_string,
25171                                 0))
25172         { /* Here, matched. */
25173 
25174             /* Note the string entries look like
25175              *      00001\nSTART OF HEADING\n\n
25176              * so we could match anywhere in that string.  We have to rule out
25177              * matching a code point line */
25178             char * this_name_start = all_names_start
25179                                                 + RX_OFFS(subpattern_re)->start;
25180             char * this_name_end   = all_names_start
25181                                                 + RX_OFFS(subpattern_re)->end;
25182             char * cp_start;
25183             char * cp_end;
25184             UV cp = 0;      /* Silences some compilers */
25185             AV * this_string = NULL;
25186             bool is_multi = FALSE;
25187 
25188             /* If matched nothing, advance to next possible match */
25189             if (this_name_start == this_name_end) {
25190                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25191                                           SvEND(names_string) - this_name_end);
25192                 if (cur_pos == NULL) {
25193                     break;
25194                 }
25195             }
25196             else {
25197                 /* Position the next match to start beyond the current returned
25198                  * entry */
25199                 cur_pos = (char *) memchr(this_name_end, '\n',
25200                                           SvEND(names_string) - this_name_end);
25201             }
25202 
25203             /* Back up to the \n just before the beginning of the character. */
25204             cp_end = (char *) my_memrchr(all_names_start,
25205                                          '\n',
25206                                          this_name_start - all_names_start);
25207 
25208             /* If we didn't find a \n, it means it matched somewhere in the
25209              * initial '00000' in the string, so isn't a real match */
25210             if (cp_end == NULL) {
25211                 continue;
25212             }
25213 
25214             this_name_start = cp_end + 1;   /* The name starts just after */
25215             cp_end--;                       /* the \n, and the code point */
25216                                             /* ends just before it */
25217 
25218             /* All code points are 5 digits long */
25219             cp_start = cp_end - 4;
25220 
25221             /* This shouldn't happen, as we found a \n, and the first \n is
25222              * further along than what we subtracted */
25223             assert(cp_start >= all_names_start);
25224 
25225             if (cp_start == all_names_start) {
25226                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25227                 continue;
25228             }
25229 
25230             /* If the character is a blank, we either have a named sequence, or
25231              * something is wrong */
25232             if (*(cp_start - 1) == ' ') {
25233                 cp_start = (char *) my_memrchr(all_names_start,
25234                                                '\n',
25235                                                cp_start - all_names_start);
25236                 cp_start++;
25237             }
25238 
25239             assert(cp_start != NULL && cp_start >= all_names_start + 2);
25240 
25241             /* Except for the first line in the string, the sequence before the
25242              * code point is \n\n.  If that isn't the case here, we didn't
25243              * match the name of a character.  (We could have matched a named
25244              * sequence, not currently handled */
25245             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25246                 continue;
25247             }
25248 
25249             /* We matched!  Add this to the list */
25250             found_matches = TRUE;
25251 
25252             /* Loop through all the code points in the sequence */
25253             while (cp_start < cp_end) {
25254 
25255                 /* Calculate this code point from its 5 digits */
25256                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25257                    + (XDIGIT_VALUE(cp_start[1]) << 12)
25258                    + (XDIGIT_VALUE(cp_start[2]) << 8)
25259                    + (XDIGIT_VALUE(cp_start[3]) << 4)
25260                    +  XDIGIT_VALUE(cp_start[4]);
25261 
25262                 cp_start += 6;  /* Go past any blank */
25263 
25264                 if (cp_start < cp_end || is_multi) {
25265                     if (this_string == NULL) {
25266                         this_string = newAV();
25267                     }
25268 
25269                     is_multi = TRUE;
25270                     av_push(this_string, newSVuv(cp));
25271                 }
25272             }
25273 
25274             if (is_multi) { /* Was more than one code point */
25275                 if (*strings == NULL) {
25276                     *strings = newAV();
25277                 }
25278 
25279                 av_push(*strings, (SV *) this_string);
25280             }
25281             else {  /* Only a single code point */
25282                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25283             }
25284         } /* End of loop through the non-algorithmic names string */
25285     }
25286 
25287     /* There are also character names not in 'names_string'.  These are
25288      * algorithmically generatable.  Try this pattern on each possible one.
25289      * (khw originally planned to leave this out given the large number of
25290      * matches attempted; but the speed turned out to be quite acceptable
25291      *
25292      * There are plenty of opportunities to optimize to skip many of the tests.
25293      * beyond the rudimentary ones already here */
25294 
25295     /* First see if the subpattern matches any of the algorithmic generatable
25296      * Hangul syllable names.
25297      *
25298      * We know none of these syllable names will match if the input pattern
25299      * requires more bytes than any syllable has, or if the input pattern only
25300      * matches an empty name, or if the pattern has something it must match and
25301      * one of the characters in that isn't in any Hangul syllable. */
25302     if (    prog->minlen <= (SSize_t) syl_max_len
25303         &&  prog->maxlen > 0
25304         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25305     {
25306         /* These constants, names, values, and algorithm are adapted from the
25307          * Unicode standard, version 5.1, section 3.12, and should never
25308          * change. */
25309         const char * JamoL[] = {
25310             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25311             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25312         };
25313         const int LCount = C_ARRAY_LENGTH(JamoL);
25314 
25315         const char * JamoV[] = {
25316             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25317             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25318             "I"
25319         };
25320         const int VCount = C_ARRAY_LENGTH(JamoV);
25321 
25322         const char * JamoT[] = {
25323             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25324             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25325             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25326         };
25327         const int TCount = C_ARRAY_LENGTH(JamoT);
25328 
25329         int L, V, T;
25330 
25331         /* This is the initial Hangul syllable code point; each time through the
25332          * inner loop, it maps to the next higher code point.  For more info,
25333          * see the Hangul syllable section of the Unicode standard. */
25334         int cp = 0xAC00;
25335 
25336         syllable_name = sv_2mortal(newSV(syl_max_len));
25337         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25338 
25339         for (L = 0; L < LCount; L++) {
25340             for (V = 0; V < VCount; V++) {
25341                 for (T = 0; T < TCount; T++) {
25342 
25343                     /* Truncate back to the prefix, which is unvarying */
25344                     SvCUR_set(syllable_name, hangul_prefix_len);
25345 
25346                     sv_catpv(syllable_name, JamoL[L]);
25347                     sv_catpv(syllable_name, JamoV[V]);
25348                     sv_catpv(syllable_name, JamoT[T]);
25349 
25350                     if (execute_wildcard(subpattern_re,
25351                                 SvPVX(syllable_name),
25352                                 SvEND(syllable_name),
25353                                 SvPVX(syllable_name), 0,
25354                                 syllable_name,
25355                                 0))
25356                     {
25357                         *prop_definition = add_cp_to_invlist(*prop_definition,
25358                                                              cp);
25359                         found_matches = TRUE;
25360                     }
25361 
25362                     cp++;
25363                 }
25364             }
25365         }
25366     }
25367 
25368     /* The rest of the algorithmically generatable names are of the form
25369      * "PREFIX-code_point".  The prefixes and the code point limits of each
25370      * were returned to us in the array 'algorithmic_names' from data in
25371      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
25372     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25373         IV j;
25374 
25375         /* Each element of the array is a hash, giving the details for the
25376          * series of names it covers.  There is the base name of the characters
25377          * in the series, and the low and high code points in the series.  And,
25378          * for optimization purposes a string containing all the legal
25379          * characters that could possibly be in a name in this series. */
25380         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25381         SV * prefix = * hv_fetchs(this_series, "name", 0);
25382         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25383         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25384         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25385 
25386         /* Pre-allocate an SV with enough space */
25387         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25388                                                         SvPVX(prefix)));
25389         if (high >= 0x10000) {
25390             sv_catpvs(algo_name, "0");
25391         }
25392 
25393         /* This series can be skipped entirely if the pattern requires
25394          * something longer than any name in the series, or can only match an
25395          * empty name, or contains a character not found in any name in the
25396          * series */
25397         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
25398             &&  prog->maxlen > 0
25399             && (strspn(must, legal) == must_len))
25400         {
25401             for (j = low; j <= high; j++) { /* For each code point in the series */
25402 
25403                 /* Get its name, and see if it matches the subpattern */
25404                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25405                                      (unsigned) j);
25406 
25407                 if (execute_wildcard(subpattern_re,
25408                                     SvPVX(algo_name),
25409                                     SvEND(algo_name),
25410                                     SvPVX(algo_name), 0,
25411                                     algo_name,
25412                                     0))
25413                 {
25414                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
25415                     found_matches = TRUE;
25416                 }
25417             }
25418         }
25419     }
25420 
25421   check_empty:
25422     /* Finally, see if the subpattern matches an empty string */
25423     empty = newSVpvs("");
25424     if (execute_wildcard(subpattern_re,
25425                          SvPVX(empty),
25426                          SvEND(empty),
25427                          SvPVX(empty), 0,
25428                          empty,
25429                          0))
25430     {
25431         /* Many code points have empty names.  Currently these are the \p{GC=C}
25432          * ones, minus CC and CF */
25433 
25434         SV * empty_names_ref = get_prop_definition(UNI_C);
25435         SV * empty_names = invlist_clone(empty_names_ref, NULL);
25436 
25437         SV * subtract = get_prop_definition(UNI_CC);
25438 
25439         _invlist_subtract(empty_names, subtract, &empty_names);
25440         SvREFCNT_dec_NN(empty_names_ref);
25441         SvREFCNT_dec_NN(subtract);
25442 
25443         subtract = get_prop_definition(UNI_CF);
25444         _invlist_subtract(empty_names, subtract, &empty_names);
25445         SvREFCNT_dec_NN(subtract);
25446 
25447         _invlist_union(*prop_definition, empty_names, prop_definition);
25448         found_matches = TRUE;
25449         SvREFCNT_dec_NN(empty_names);
25450     }
25451     SvREFCNT_dec_NN(empty);
25452 
25453 #if 0
25454     /* If we ever were to accept aliases for, say private use names, we would
25455      * need to do something fancier to find empty names.  The code below works
25456      * (at the time it was written), and is slower than the above */
25457     const char empties_pat[] = "^.";
25458     if (strNE(name, empties_pat)) {
25459         SV * empty = newSVpvs("");
25460         if (execute_wildcard(subpattern_re,
25461                     SvPVX(empty),
25462                     SvEND(empty),
25463                     SvPVX(empty), 0,
25464                     empty,
25465                     0))
25466         {
25467             SV * empties = NULL;
25468 
25469             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25470 
25471             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25472             SvREFCNT_dec_NN(empties);
25473 
25474             found_matches = TRUE;
25475         }
25476         SvREFCNT_dec_NN(empty);
25477     }
25478 #endif
25479 
25480     SvREFCNT_dec_NN(subpattern_re);
25481     return found_matches;
25482 }
25483 
25484 /*
25485  * ex: set ts=8 sts=4 sw=4 et:
25486  */
25487