xref: /openbsd/gnu/usr.bin/perl/regcomp.c (revision 9b7c3dbb)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGCOMP_C
75 #include "perl.h"
76 
77 #ifndef PERL_IN_XSUB_RE
78 #  include "INTERN.h"
79 #endif
80 
81 #define REG_COMP_C
82 #ifdef PERL_IN_XSUB_RE
83 #  include "re_comp.h"
84 EXTERN_C const struct regexp_engine my_reg_engine;
85 #else
86 #  include "regcomp.h"
87 #endif
88 
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
93 
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 
99 #ifndef STATIC
100 #define	STATIC	static
101 #endif
102 
103 
104 struct RExC_state_t {
105     U32		flags;			/* RXf_* are we folding, multilining? */
106     U32		pm_flags;		/* PMf_* stuff from the calling PMOP */
107     char	*precomp;		/* uncompiled string. */
108     REGEXP	*rx_sv;			/* The SV that is the regexp. */
109     regexp	*rx;                    /* perl core regexp structure */
110     regexp_internal	*rxi;           /* internal data for regexp object
111                                            pprivate field */
112     char	*start;			/* Start of input for compile */
113     char	*end;			/* End of input for compile */
114     char	*parse;			/* Input-scan pointer. */
115     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
116     regnode	*emit_start;		/* Start of emitted-code area */
117     regnode	*emit_bound;		/* First regnode outside of the
118                                            allocated space */
119     regnode	*emit;			/* Code-emit pointer; if = &emit_dummy,
120                                            implies compiling, so don't emit */
121     regnode_ssc	emit_dummy;		/* placeholder for emit to point to;
122                                            large enough for the largest
123                                            non-EXACTish node, so can use it as
124                                            scratch in pass1 */
125     I32		naughty;		/* How bad is this pattern? */
126     I32		sawback;		/* Did we see \1, ...? */
127     U32		seen;
128     SSize_t	size;			/* Code size. */
129     I32                npar;            /* Capture buffer count, (OPEN) plus
130                                            one. ("par" 0 is the whole
131                                            pattern)*/
132     I32		nestroot;		/* root parens we are in - used by
133                                            accept */
134     I32		extralen;
135     I32		seen_zerolen;
136     regnode	**open_parens;		/* pointers to open parens */
137     regnode	**close_parens;		/* pointers to close parens */
138     regnode	*opend;			/* END node in program */
139     I32		utf8;		/* whether the pattern is utf8 or not */
140     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
141 				/* XXX use this for future optimisation of case
142 				 * where pattern must be upgraded to utf8. */
143     I32		uni_semantics;	/* If a d charset modifier should use unicode
144 				   rules, even if the pattern is not in
145 				   utf8 */
146     HV		*paren_names;		/* Paren names */
147 
148     regnode	**recurse;		/* Recurse regops */
149     I32		recurse_count;		/* Number of recurse regops */
150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
151                                            through */
152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
153     I32		in_lookbehind;
154     I32		contains_locale;
155     I32		contains_i;
156     I32		override_recoding;
157     I32		in_multi_char_class;
158     struct reg_code_block *code_blocks;	/* positions of literal (?{})
159 					    within pattern */
160     int		num_code_blocks;	/* size of code_blocks[] */
161     int		code_index;		/* next code_blocks[] slot */
162     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
163 #ifdef ADD_TO_REGEXEC
164     char 	*starttry;		/* -Dr: where regtry was called. */
165 #define RExC_starttry	(pRExC_state->starttry)
166 #endif
167     SV		*runtime_code_qr;	/* qr with the runtime code blocks */
168 #ifdef DEBUGGING
169     const char  *lastparse;
170     I32         lastnum;
171     AV          *paren_name_list;       /* idx -> name */
172 #define RExC_lastparse	(pRExC_state->lastparse)
173 #define RExC_lastnum	(pRExC_state->lastnum)
174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
175 #endif
176 };
177 
178 #define RExC_flags	(pRExC_state->flags)
179 #define RExC_pm_flags	(pRExC_state->pm_flags)
180 #define RExC_precomp	(pRExC_state->precomp)
181 #define RExC_rx_sv	(pRExC_state->rx_sv)
182 #define RExC_rx		(pRExC_state->rx)
183 #define RExC_rxi	(pRExC_state->rxi)
184 #define RExC_start	(pRExC_state->start)
185 #define RExC_end	(pRExC_state->end)
186 #define RExC_parse	(pRExC_state->parse)
187 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
188 #ifdef RE_TRACK_PATTERN_OFFSETS
189 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the
190                                                          others */
191 #endif
192 #define RExC_emit	(pRExC_state->emit)
193 #define RExC_emit_dummy	(pRExC_state->emit_dummy)
194 #define RExC_emit_start	(pRExC_state->emit_start)
195 #define RExC_emit_bound	(pRExC_state->emit_bound)
196 #define RExC_naughty	(pRExC_state->naughty)
197 #define RExC_sawback	(pRExC_state->sawback)
198 #define RExC_seen	(pRExC_state->seen)
199 #define RExC_size	(pRExC_state->size)
200 #define RExC_maxlen        (pRExC_state->maxlen)
201 #define RExC_npar	(pRExC_state->npar)
202 #define RExC_nestroot   (pRExC_state->nestroot)
203 #define RExC_extralen	(pRExC_state->extralen)
204 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
205 #define RExC_utf8	(pRExC_state->utf8)
206 #define RExC_uni_semantics	(pRExC_state->uni_semantics)
207 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
208 #define RExC_open_parens	(pRExC_state->open_parens)
209 #define RExC_close_parens	(pRExC_state->close_parens)
210 #define RExC_opend	(pRExC_state->opend)
211 #define RExC_paren_names	(pRExC_state->paren_names)
212 #define RExC_recurse	(pRExC_state->recurse)
213 #define RExC_recurse_count	(pRExC_state->recurse_count)
214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
215 #define RExC_study_chunk_recursed_bytes  \
216                                    (pRExC_state->study_chunk_recursed_bytes)
217 #define RExC_in_lookbehind	(pRExC_state->in_lookbehind)
218 #define RExC_contains_locale	(pRExC_state->contains_locale)
219 #define RExC_contains_i (pRExC_state->contains_i)
220 #define RExC_override_recoding (pRExC_state->override_recoding)
221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
222 
223 
224 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
225 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
226 	((*s) == '{' && regcurly(s, FALSE)))
227 
228 /*
229  * Flags to be passed up and down.
230  */
231 #define	WORST		0	/* Worst case. */
232 #define	HASWIDTH	0x01	/* Known to match non-null strings. */
233 
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235  * character.  (There needs to be a case: in the switch statement in regexec.c
236  * for any node marked SIMPLE.)  Note that this is not the same thing as
237  * REGNODE_SIMPLE */
238 #define	SIMPLE		0x02
239 #define	SPSTART		0x04	/* Starts with * or + */
240 #define POSTPONED	0x08    /* (?1),(?&name), (??{...}) or similar */
241 #define TRYAGAIN	0x10	/* Weeded out a declaration. */
242 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
243 
244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
245 
246 /* whether trie related optimizations are enabled */
247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
248 #define TRIE_STUDY_OPT
249 #define FULL_TRIE_STUDY
250 #define TRIE_STCLASS
251 #endif
252 
253 
254 
255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
256 #define PBITVAL(paren) (1 << ((paren) & 7))
257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
260 
261 #define REQUIRE_UTF8	STMT_START {                                       \
262                                      if (!UTF) {                           \
263                                          *flagp = RESTART_UTF8;            \
264                                          return NULL;                      \
265                                      }                                     \
266                         } STMT_END
267 
268 /* This converts the named class defined in regcomp.h to its equivalent class
269  * number defined in handy.h. */
270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
272 
273 #define _invlist_union_complement_2nd(a, b, output) \
274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
275 #define _invlist_intersection_complement_2nd(a, b, output) \
276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
277 
278 /* About scan_data_t.
279 
280   During optimisation we recurse through the regexp program performing
281   various inplace (keyhole style) optimisations. In addition study_chunk
282   and scan_commit populate this data structure with information about
283   what strings MUST appear in the pattern. We look for the longest
284   string that must appear at a fixed location, and we look for the
285   longest string that may appear at a floating location. So for instance
286   in the pattern:
287 
288     /FOO[xX]A.*B[xX]BAR/
289 
290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
291   strings (because they follow a .* construct). study_chunk will identify
292   both FOO and BAR as being the longest fixed and floating strings respectively.
293 
294   The strings can be composites, for instance
295 
296      /(f)(o)(o)/
297 
298   will result in a composite fixed substring 'foo'.
299 
300   For each string some basic information is maintained:
301 
302   - offset or min_offset
303     This is the position the string must appear at, or not before.
304     It also implicitly (when combined with minlenp) tells us how many
305     characters must match before the string we are searching for.
306     Likewise when combined with minlenp and the length of the string it
307     tells us how many characters must appear after the string we have
308     found.
309 
310   - max_offset
311     Only used for floating strings. This is the rightmost point that
312     the string can appear at. If set to SSize_t_MAX it indicates that the
313     string can occur infinitely far to the right.
314 
315   - minlenp
316     A pointer to the minimum number of characters of the pattern that the
317     string was found inside. This is important as in the case of positive
318     lookahead or positive lookbehind we can have multiple patterns
319     involved. Consider
320 
321     /(?=FOO).*F/
322 
323     The minimum length of the pattern overall is 3, the minimum length
324     of the lookahead part is 3, but the minimum length of the part that
325     will actually match is 1. So 'FOO's minimum length is 3, but the
326     minimum length for the F is 1. This is important as the minimum length
327     is used to determine offsets in front of and behind the string being
328     looked for.  Since strings can be composites this is the length of the
329     pattern at the time it was committed with a scan_commit. Note that
330     the length is calculated by study_chunk, so that the minimum lengths
331     are not known until the full pattern has been compiled, thus the
332     pointer to the value.
333 
334   - lookbehind
335 
336     In the case of lookbehind the string being searched for can be
337     offset past the start point of the final matching string.
338     If this value was just blithely removed from the min_offset it would
339     invalidate some of the calculations for how many chars must match
340     before or after (as they are derived from min_offset and minlen and
341     the length of the string being searched for).
342     When the final pattern is compiled and the data is moved from the
343     scan_data_t structure into the regexp structure the information
344     about lookbehind is factored in, with the information that would
345     have been lost precalculated in the end_shift field for the
346     associated string.
347 
348   The fields pos_min and pos_delta are used to store the minimum offset
349   and the delta to the maximum offset at the current point in the pattern.
350 
351 */
352 
353 typedef struct scan_data_t {
354     /*I32 len_min;      unused */
355     /*I32 len_delta;    unused */
356     SSize_t pos_min;
357     SSize_t pos_delta;
358     SV *last_found;
359     SSize_t last_end;	    /* min value, <0 unless valid. */
360     SSize_t last_start_min;
361     SSize_t last_start_max;
362     SV **longest;	    /* Either &l_fixed, or &l_float. */
363     SV *longest_fixed;      /* longest fixed string found in pattern */
364     SSize_t offset_fixed;   /* offset where it starts */
365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
366     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
367     SV *longest_float;      /* longest floating string found in pattern */
368     SSize_t offset_float_min; /* earliest point in string it can appear */
369     SSize_t offset_float_max; /* latest point in string it can appear */
370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
372     I32 flags;
373     I32 whilem_c;
374     SSize_t *last_closep;
375     regnode_ssc *start_class;
376 } scan_data_t;
377 
378 /* The below is perhaps overboard, but this allows us to save a test at the
379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
380  * and 'a' differ by a single bit; the same with the upper and lower case of
381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
383  * then inverts it to form a mask, with just a single 0, in the bit position
384  * where the upper- and lowercase differ.  XXX There are about 40 other
385  * instances in the Perl core where this micro-optimization could be used.
386  * Should decide if maintenance cost is worse, before changing those
387  *
388  * Returns a boolean as to whether or not 'v' is either a lowercase or
389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
390  * compile-time constant, the generated code is better than some optimizing
391  * compilers figure out, amounting to a mask and test.  The results are
392  * meaningless if 'c' is not one of [A-Za-z] */
393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
395 
396 /*
397  * Forward declarations for pregcomp()'s friends.
398  */
399 
400 static const scan_data_t zero_scan_data =
401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
402 
403 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
404 #define SF_BEFORE_SEOL		0x0001
405 #define SF_BEFORE_MEOL		0x0002
406 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
407 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
408 
409 #define SF_FIX_SHIFT_EOL	(+2)
410 #define SF_FL_SHIFT_EOL		(+4)
411 
412 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
413 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
414 
415 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
416 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
417 #define SF_IS_INF		0x0040
418 #define SF_HAS_PAR		0x0080
419 #define SF_IN_PAR		0x0100
420 #define SF_HAS_EVAL		0x0200
421 #define SCF_DO_SUBSTR		0x0400
422 #define SCF_DO_STCLASS_AND	0x0800
423 #define SCF_DO_STCLASS_OR	0x1000
424 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
425 #define SCF_WHILEM_VISITED_POS	0x2000
426 
427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
428 #define SCF_SEEN_ACCEPT         0x8000
429 #define SCF_TRIE_DOING_RESTUDY 0x10000
430 
431 #define UTF cBOOL(RExC_utf8)
432 
433 /* The enums for all these are ordered so things work out correctly */
434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
436                                                      == REGEX_DEPENDS_CHARSET)
437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
439                                                      >= REGEX_UNICODE_CHARSET)
440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
446 
447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
448 
449 /* For programs that want to be strictly Unicode compatible by dying if any
450  * attempt is made to match a non-Unicode code point against a Unicode
451  * property.  */
452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
453 
454 #define OOB_NAMEDCLASS		-1
455 
456 /* There is no code point that is out-of-bounds, so this is problematic.  But
457  * its only current use is to initialize a variable that is always set before
458  * looked at. */
459 #define OOB_UNICODE		0xDEADBEEF
460 
461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
463 
464 
465 /* length of regex to show in messages that don't mark a position within */
466 #define RegexLengthToShowInErrorMessages 127
467 
468 /*
469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
471  * op/pragma/warn/regcomp.
472  */
473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
475 
476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
478 
479 #define REPORT_LOCATION_ARGS(offset)            \
480                 UTF8fARG(UTF, offset, RExC_precomp), \
481                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
482 
483 /*
484  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
486  * "...".
487  */
488 #define _FAIL(code) STMT_START {					\
489     const char *ellipses = "";						\
490     IV len = RExC_end - RExC_precomp;					\
491 									\
492     if (!SIZE_ONLY)							\
493 	SAVEFREESV(RExC_rx_sv);						\
494     if (len > RegexLengthToShowInErrorMessages) {			\
495 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
496 	len = RegexLengthToShowInErrorMessages - 10;			\
497 	ellipses = "...";						\
498     }									\
499     code;                                                               \
500 } STMT_END
501 
502 #define	FAIL(msg) _FAIL(			    \
503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",	    \
504 	    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
505 
506 #define	FAIL2(msg,arg) _FAIL(			    \
507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",	    \
508 	    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
509 
510 /*
511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
512  */
513 #define	Simple_vFAIL(m) STMT_START {					\
514     const IV offset = RExC_parse - RExC_precomp;			\
515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
516 	    m, REPORT_LOCATION_ARGS(offset));	\
517 } STMT_END
518 
519 /*
520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
521  */
522 #define	vFAIL(m) STMT_START {				\
523     if (!SIZE_ONLY)					\
524 	SAVEFREESV(RExC_rx_sv);				\
525     Simple_vFAIL(m);					\
526 } STMT_END
527 
528 /*
529  * Like Simple_vFAIL(), but accepts two arguments.
530  */
531 #define	Simple_vFAIL2(m,a1) STMT_START {			\
532     const IV offset = RExC_parse - RExC_precomp;			\
533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,			\
534                       REPORT_LOCATION_ARGS(offset));	\
535 } STMT_END
536 
537 /*
538  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
539  */
540 #define	vFAIL2(m,a1) STMT_START {			\
541     if (!SIZE_ONLY)					\
542 	SAVEFREESV(RExC_rx_sv);				\
543     Simple_vFAIL2(m, a1);				\
544 } STMT_END
545 
546 
547 /*
548  * Like Simple_vFAIL(), but accepts three arguments.
549  */
550 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
551     const IV offset = RExC_parse - RExC_precomp;		\
552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,		\
553 	    REPORT_LOCATION_ARGS(offset));	\
554 } STMT_END
555 
556 /*
557  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
558  */
559 #define	vFAIL3(m,a1,a2) STMT_START {			\
560     if (!SIZE_ONLY)					\
561 	SAVEFREESV(RExC_rx_sv);				\
562     Simple_vFAIL3(m, a1, a2);				\
563 } STMT_END
564 
565 /*
566  * Like Simple_vFAIL(), but accepts four arguments.
567  */
568 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
569     const IV offset = RExC_parse - RExC_precomp;		\
570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,		\
571 	    REPORT_LOCATION_ARGS(offset));	\
572 } STMT_END
573 
574 #define	vFAIL4(m,a1,a2,a3) STMT_START {			\
575     if (!SIZE_ONLY)					\
576 	SAVEFREESV(RExC_rx_sv);				\
577     Simple_vFAIL4(m, a1, a2, a3);			\
578 } STMT_END
579 
580 /* A specialized version of vFAIL2 that works with UTF8f */
581 #define vFAIL2utf8f(m, a1) STMT_START { \
582     const IV offset = RExC_parse - RExC_precomp;   \
583     if (!SIZE_ONLY)                                \
584         SAVEFREESV(RExC_rx_sv);                    \
585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
586             REPORT_LOCATION_ARGS(offset));         \
587 } STMT_END
588 
589 
590 /* m is not necessarily a "literal string", in this macro */
591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
592     const IV offset = loc - RExC_precomp;                               \
593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
594             m, REPORT_LOCATION_ARGS(offset));       \
595 } STMT_END
596 
597 #define	ckWARNreg(loc,m) STMT_START {					\
598     const IV offset = loc - RExC_precomp;				\
599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
600 	    REPORT_LOCATION_ARGS(offset));		\
601 } STMT_END
602 
603 #define	vWARN_dep(loc, m) STMT_START {				        \
604     const IV offset = loc - RExC_precomp;				\
605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,	\
606 	    REPORT_LOCATION_ARGS(offset));	        \
607 } STMT_END
608 
609 #define	ckWARNdep(loc,m) STMT_START {				        \
610     const IV offset = loc - RExC_precomp;				\
611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),	                \
612 	    m REPORT_LOCATION,						\
613 	    REPORT_LOCATION_ARGS(offset));		\
614 } STMT_END
615 
616 #define	ckWARNregdep(loc,m) STMT_START {				\
617     const IV offset = loc - RExC_precomp;				\
618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),	\
619 	    m REPORT_LOCATION,						\
620 	    REPORT_LOCATION_ARGS(offset));		\
621 } STMT_END
622 
623 #define	ckWARN2reg_d(loc,m, a1) STMT_START {				\
624     const IV offset = loc - RExC_precomp;				\
625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),			\
626 	    m REPORT_LOCATION,						\
627 	    a1, REPORT_LOCATION_ARGS(offset));	\
628 } STMT_END
629 
630 #define	ckWARN2reg(loc, m, a1) STMT_START {				\
631     const IV offset = loc - RExC_precomp;				\
632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
633 	    a1, REPORT_LOCATION_ARGS(offset));	\
634 } STMT_END
635 
636 #define	vWARN3(loc, m, a1, a2) STMT_START {				\
637     const IV offset = loc - RExC_precomp;				\
638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
639 	    a1, a2, REPORT_LOCATION_ARGS(offset));	\
640 } STMT_END
641 
642 #define	ckWARN3reg(loc, m, a1, a2) STMT_START {				\
643     const IV offset = loc - RExC_precomp;				\
644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
645 	    a1, a2, REPORT_LOCATION_ARGS(offset));	\
646 } STMT_END
647 
648 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
649     const IV offset = loc - RExC_precomp;				\
650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
651 	    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
652 } STMT_END
653 
654 #define	ckWARN4reg(loc, m, a1, a2, a3) STMT_START {			\
655     const IV offset = loc - RExC_precomp;				\
656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
657 	    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
658 } STMT_END
659 
660 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
661     const IV offset = loc - RExC_precomp;				\
662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
663 	    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
664 } STMT_END
665 
666 
667 /* Allow for side effects in s */
668 #define REGC(c,s) STMT_START {			\
669     if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
670 } STMT_END
671 
672 /* Macros for recording node offsets.   20001227 mjd@plover.com
673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
675  * Element 0 holds the number n.
676  * Position is 1 indexed.
677  */
678 #ifndef RE_TRACK_PATTERN_OFFSETS
679 #define Set_Node_Offset_To_R(node,byte)
680 #define Set_Node_Offset(node,byte)
681 #define Set_Cur_Node_Offset
682 #define Set_Node_Length_To_R(node,len)
683 #define Set_Node_Length(node,len)
684 #define Set_Node_Cur_Length(node,start)
685 #define Node_Offset(n)
686 #define Node_Length(n)
687 #define Set_Node_Offset_Length(node,offset,len)
688 #define ProgLen(ri) ri->u.proglen
689 #define SetProgLen(ri,x) ri->u.proglen = x
690 #else
691 #define ProgLen(ri) ri->u.offsets[0]
692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
693 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
694     if (! SIZE_ONLY) {							\
695 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
696 		    __LINE__, (int)(node), (int)(byte)));		\
697 	if((node) < 0) {						\
698 	    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
699                                          (int)(node));                  \
700 	} else {							\
701 	    RExC_offsets[2*(node)-1] = (byte);				\
702 	}								\
703     }									\
704 } STMT_END
705 
706 #define Set_Node_Offset(node,byte) \
707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
709 
710 #define Set_Node_Length_To_R(node,len) STMT_START {			\
711     if (! SIZE_ONLY) {							\
712 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
713 		__LINE__, (int)(node), (int)(len)));			\
714 	if((node) < 0) {						\
715 	    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
716                                          (int)(node));                  \
717 	} else {							\
718 	    RExC_offsets[2*(node)] = (len);				\
719 	}								\
720     }									\
721 } STMT_END
722 
723 #define Set_Node_Length(node,len) \
724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
725 #define Set_Node_Cur_Length(node, start)                \
726     Set_Node_Length(node, RExC_parse - start)
727 
728 /* Get offsets and lengths */
729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
731 
732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));	\
734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));	\
735 } STMT_END
736 #endif
737 
738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
739 #define EXPERIMENTAL_INPLACESCAN
740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
741 
742 #define DEBUG_RExC_seen() \
743         DEBUG_OPTIMISE_MORE_r({                                             \
744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
745                                                                             \
746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
748                                                                             \
749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
751                                                                             \
752             if (RExC_seen & REG_GPOS_SEEN)                                  \
753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
754                                                                             \
755             if (RExC_seen & REG_CANY_SEEN)                                  \
756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
757                                                                             \
758             if (RExC_seen & REG_RECURSE_SEEN)                               \
759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
760                                                                             \
761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
763                                                                             \
764             if (RExC_seen & REG_VERBARG_SEEN)                               \
765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
766                                                                             \
767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
769                                                                             \
770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
772                                                                             \
773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
775                                                                             \
776             if (RExC_seen & REG_GOSTART_SEEN)                               \
777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
778                                                                             \
779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
781                                                                             \
782             PerlIO_printf(Perl_debug_log,"\n");                             \
783         });
784 
785 #define DEBUG_STUDYDATA(str,data,depth)                              \
786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
787     PerlIO_printf(Perl_debug_log,                                    \
788         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
789         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
790         (int)(depth)*2, "",                                          \
791         (IV)((data)->pos_min),                                       \
792         (IV)((data)->pos_delta),                                     \
793         (UV)((data)->flags),                                         \
794         (IV)((data)->whilem_c),                                      \
795         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
796         is_inf ? "INF " : ""                                         \
797     );                                                               \
798     if ((data)->last_found)                                          \
799         PerlIO_printf(Perl_debug_log,                                \
800             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
801             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
802             SvPVX_const((data)->last_found),                         \
803             (IV)((data)->last_end),                                  \
804             (IV)((data)->last_start_min),                            \
805             (IV)((data)->last_start_max),                            \
806             ((data)->longest &&                                      \
807              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
808             SvPVX_const((data)->longest_fixed),                      \
809             (IV)((data)->offset_fixed),                              \
810             ((data)->longest &&                                      \
811              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
812             SvPVX_const((data)->longest_float),                      \
813             (IV)((data)->offset_float_min),                          \
814             (IV)((data)->offset_float_max)                           \
815         );                                                           \
816     PerlIO_printf(Perl_debug_log,"\n");                              \
817 });
818 
819 /* Mark that we cannot extend a found fixed substring at this point.
820    Update the longest found anchored substring and the longest found
821    floating substrings if needed. */
822 
823 STATIC void
824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
825                     SSize_t *minlenp, int is_inf)
826 {
827     const STRLEN l = CHR_SVLEN(data->last_found);
828     const STRLEN old_l = CHR_SVLEN(*data->longest);
829     GET_RE_DEBUG_FLAGS_DECL;
830 
831     PERL_ARGS_ASSERT_SCAN_COMMIT;
832 
833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
834 	SvSetMagicSV(*data->longest, data->last_found);
835 	if (*data->longest == data->longest_fixed) {
836 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
837 	    if (data->flags & SF_BEFORE_EOL)
838 		data->flags
839 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
840 	    else
841 		data->flags &= ~SF_FIX_BEFORE_EOL;
842 	    data->minlen_fixed=minlenp;
843 	    data->lookbehind_fixed=0;
844 	}
845 	else { /* *data->longest == data->longest_float */
846 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
847 	    data->offset_float_max = (l
848 				      ? data->last_start_max
849 				      : (data->pos_delta == SSize_t_MAX
850 					 ? SSize_t_MAX
851 					 : data->pos_min + data->pos_delta));
852 	    if (is_inf
853 		 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
854 		data->offset_float_max = SSize_t_MAX;
855 	    if (data->flags & SF_BEFORE_EOL)
856 		data->flags
857 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
858 	    else
859 		data->flags &= ~SF_FL_BEFORE_EOL;
860             data->minlen_float=minlenp;
861             data->lookbehind_float=0;
862 	}
863     }
864     SvCUR_set(data->last_found, 0);
865     {
866 	SV * const sv = data->last_found;
867 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
868 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
869 	    if (mg)
870 		mg->mg_len = 0;
871 	}
872     }
873     data->last_end = -1;
874     data->flags &= ~SF_BEFORE_EOL;
875     DEBUG_STUDYDATA("commit: ",data,0);
876 }
877 
878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
879  * list that describes which code points it matches */
880 
881 STATIC void
882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
883 {
884     /* Set the SSC 'ssc' to match an empty string or any code point */
885 
886     PERL_ARGS_ASSERT_SSC_ANYTHING;
887 
888     assert(is_ANYOF_SYNTHETIC(ssc));
889 
890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
893 }
894 
895 STATIC int
896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
897 {
898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
901      * in any way, so there's no point in using it */
902 
903     UV start, end;
904     bool ret;
905 
906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
907 
908     assert(is_ANYOF_SYNTHETIC(ssc));
909 
910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
911         return FALSE;
912     }
913 
914     /* See if the list consists solely of the range 0 - Infinity */
915     invlist_iterinit(ssc->invlist);
916     ret = invlist_iternext(ssc->invlist, &start, &end)
917           && start == 0
918           && end == UV_MAX;
919 
920     invlist_iterfinish(ssc->invlist);
921 
922     if (ret) {
923         return TRUE;
924     }
925 
926     /* If e.g., both \w and \W are set, matches everything */
927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
928         int i;
929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
931                 return TRUE;
932             }
933         }
934     }
935 
936     return FALSE;
937 }
938 
939 STATIC void
940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
941 {
942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
943      * string, any code point, or any posix class under locale */
944 
945     PERL_ARGS_ASSERT_SSC_INIT;
946 
947     Zero(ssc, 1, regnode_ssc);
948     set_ANYOF_SYNTHETIC(ssc);
949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
950     ssc_anything(ssc);
951 
952     /* If any portion of the regex is to operate under locale rules,
953      * initialization includes it.  The reason this isn't done for all regexes
954      * is that the optimizer was written under the assumption that locale was
955      * all-or-nothing.  Given the complexity and lack of documentation in the
956      * optimizer, and that there are inadequate test cases for locale, many
957      * parts of it may not work properly, it is safest to avoid locale unless
958      * necessary. */
959     if (RExC_contains_locale) {
960 	ANYOF_POSIXL_SETALL(ssc);
961     }
962     else {
963 	ANYOF_POSIXL_ZERO(ssc);
964     }
965 }
966 
967 STATIC int
968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
969                               const regnode_ssc *ssc)
970 {
971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
972      * to the list of code points matched, and locale posix classes; hence does
973      * not check its flags) */
974 
975     UV start, end;
976     bool ret;
977 
978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
979 
980     assert(is_ANYOF_SYNTHETIC(ssc));
981 
982     invlist_iterinit(ssc->invlist);
983     ret = invlist_iternext(ssc->invlist, &start, &end)
984           && start == 0
985           && end == UV_MAX;
986 
987     invlist_iterfinish(ssc->invlist);
988 
989     if (! ret) {
990         return FALSE;
991     }
992 
993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
994         return FALSE;
995     }
996 
997     return TRUE;
998 }
999 
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility. */
1009 
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015 
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017 
1018     /* Look at the data structure created by S_set_ANYOF_arg() */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024 
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029 
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035 
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040 
1041         /* Get the code points valid only under UTF-8 locales */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048 
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063 
1064     /* Add in the points from the bit map */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071 
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077 
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082 
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087 
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092 
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102 
1103     return invlist;
1104 }
1105 
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero		ssc_init
1108 
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111 
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115 
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118                 const regnode_charclass *and_with)
1119 {
1120     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121      * another SSC or a regular ANYOF class.  Can create false positives. */
1122 
1123     SV* anded_cp_list;
1124     U8  anded_flags;
1125 
1126     PERL_ARGS_ASSERT_SSC_AND;
1127 
1128     assert(is_ANYOF_SYNTHETIC(ssc));
1129 
1130     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131      * the code point inversion list and just the relevant flags */
1132     if (is_ANYOF_SYNTHETIC(and_with)) {
1133         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134         anded_flags = ANYOF_FLAGS(and_with);
1135 
1136         /* XXX This is a kludge around what appears to be deficiencies in the
1137          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138          * there are paths through the optimizer where it doesn't get weeded
1139          * out when it should.  And if we don't make some extra provision for
1140          * it like the code just below, it doesn't get added when it should.
1141          * This solution is to add it only when AND'ing, which is here, and
1142          * only when what is being AND'ed is the pristine, original node
1143          * matching anything.  Thus it is like adding it to ssc_anything() but
1144          * only when the result is to be AND'ed.  Probably the same solution
1145          * could be adopted for the same problem we have with /l matching,
1146          * which is solved differently in S_ssc_init(), and that would lead to
1147          * fewer false positives than that solution has.  But if this solution
1148          * creates bugs, the consequences are only that a warning isn't raised
1149          * that should be; while the consequences for having /l bugs is
1150          * incorrect matches */
1151         if (ssc_is_anything((regnode_ssc *)and_with)) {
1152             anded_flags |= ANYOF_WARN_SUPER;
1153         }
1154     }
1155     else {
1156         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158     }
1159 
1160     ANYOF_FLAGS(ssc) &= anded_flags;
1161 
1162     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164      * 'and_with' may be inverted.  When not inverted, we have the situation of
1165      * computing:
1166      *  (C1 | P1) & (C2 | P2)
1167      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1170      *                    <=  ((C1 & C2) | P1 | P2)
1171      * Alternatively, the last few steps could be:
1172      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174      *                    <=  (C1 | C2 | (P1 & P2))
1175      * We favor the second approach if either P1 or P2 is non-empty.  This is
1176      * because these components are a barrier to doing optimizations, as what
1177      * they match cannot be known until the moment of matching as they are
1178      * dependent on the current locale, 'AND"ing them likely will reduce or
1179      * eliminate them.
1180      * But we can do better if we know that C1,P1 are in their initial state (a
1181      * frequent occurrence), each matching everything:
1182      *  (<everything>) & (C2 | P2) =  C2 | P2
1183      * Similarly, if C2,P2 are in their initial state (again a frequent
1184      * occurrence), the result is a no-op
1185      *  (C1 | P1) & (<everything>) =  C1 | P1
1186      *
1187      * Inverted, we have
1188      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191      * */
1192 
1193     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194         && ! is_ANYOF_SYNTHETIC(and_with))
1195     {
1196         unsigned int i;
1197 
1198         ssc_intersection(ssc,
1199                          anded_cp_list,
1200                          FALSE /* Has already been inverted */
1201                          );
1202 
1203         /* If either P1 or P2 is empty, the intersection will be also; can skip
1204          * the loop */
1205         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206             ANYOF_POSIXL_ZERO(ssc);
1207         }
1208         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209 
1210             /* Note that the Posix class component P from 'and_with' actually
1211              * looks like:
1212              *      P = Pa | Pb | ... | Pn
1213              * where each component is one posix class, such as in [\w\s].
1214              * Thus
1215              *      ~P = ~(Pa | Pb | ... | Pn)
1216              *         = ~Pa & ~Pb & ... & ~Pn
1217              *        <= ~Pa | ~Pb | ... | ~Pn
1218              * The last is something we can easily calculate, but unfortunately
1219              * is likely to have many false positives.  We could do better
1220              * in some (but certainly not all) instances if two classes in
1221              * P have known relationships.  For example
1222              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223              * So
1224              *      :lower: & :print: = :lower:
1225              * And similarly for classes that must be disjoint.  For example,
1226              * since \s and \w can have no elements in common based on rules in
1227              * the POSIX standard,
1228              *      \w & ^\S = nothing
1229              * Unfortunately, some vendor locales do not meet the Posix
1230              * standard, in particular almost everything by Microsoft.
1231              * The loop below just changes e.g., \w into \W and vice versa */
1232 
1233             regnode_charclass_posixl temp;
1234             int add = 1;    /* To calculate the index of the complement */
1235 
1236             ANYOF_POSIXL_ZERO(&temp);
1237             for (i = 0; i < ANYOF_MAX; i++) {
1238                 assert(i % 2 != 0
1239                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241 
1242                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243                     ANYOF_POSIXL_SET(&temp, i + add);
1244                 }
1245                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246             }
1247             ANYOF_POSIXL_AND(&temp, ssc);
1248 
1249         } /* else ssc already has no posixes */
1250     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251          in its initial state */
1252     else if (! is_ANYOF_SYNTHETIC(and_with)
1253              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254     {
1255         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256          * copy it over 'ssc' */
1257         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258             if (is_ANYOF_SYNTHETIC(and_with)) {
1259                 StructCopy(and_with, ssc, regnode_ssc);
1260             }
1261             else {
1262                 ssc->invlist = anded_cp_list;
1263                 ANYOF_POSIXL_ZERO(ssc);
1264                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266                 }
1267             }
1268         }
1269         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271         {
1272             /* One or the other of P1, P2 is non-empty. */
1273             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275             }
1276             ssc_union(ssc, anded_cp_list, FALSE);
1277         }
1278         else { /* P1 = P2 = empty */
1279             ssc_intersection(ssc, anded_cp_list, FALSE);
1280         }
1281     }
1282 }
1283 
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291 
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294 
1295     PERL_ARGS_ASSERT_SSC_OR;
1296 
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298 
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309 
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311 
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329 
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349 
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355 
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360 
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362 
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368 
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375 
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377 
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383 
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388 
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390 
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393 
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398 
1399     SV* cp_list = _new_invlist(2);
1400 
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402 
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404 
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411 
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416 
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418 
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420 
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424 
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431 
1432     SV* invlist = invlist_clone(ssc->invlist);
1433 
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435 
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437 
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442 
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444 
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447 
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450 
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454 
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457 
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1462                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1463                                : 0 )
1464 
1465 
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471 
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476 
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479 
1480 */
1481 
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486 
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489 	    AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496 
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498 
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502 
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504 	SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509 	                    PL_colors[0], PL_colors[1],
1510 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511 	                    PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518 
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522 
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524 	const U32 base = trie->states[ state ].trans.base;
1525 
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528 
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535 
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537 
1538         if ( base ) {
1539             U32 ofs = 0;
1540 
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546 
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548 
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564 
1565             PerlIO_printf( Perl_debug_log, "]");
1566 
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573 	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574 	    (int)word, (int)(trie->wordinfo[word].prev),
1575 	    (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588 			 U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594 
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596 
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601 
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604 
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615 	    SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617 	    if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637 
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647 			  U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654 
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656 
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661 
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663 
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670 	                    PL_colors[0], PL_colors[1],
1671 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672 	                    PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677 
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679 
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683 
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685 
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687 
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691 
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709 
1710 #endif
1711 
1712 
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716 	       May be the same as startbranch
1717   last       : Thing following the last branch.
1718 	       May be the same as tail.
1719   tail       : item following the branch sequence
1720   count      : words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723 
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725 
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730 
1731   /he|she|his|hers/
1732 
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737 
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745 
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747 
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1755 
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758 
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760 
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765 
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767 
1768 which prints out 'word' three times, but
1769 
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771 
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1773 
1774 Example of what happens on a structural level:
1775 
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777 
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788 
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791 
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795 	  <ac>
1796 	  <ad>
1797 	  <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801 
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1803 
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811 
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814 
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817 	<foo>
1818 	<bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822 
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824 
1825 is the recommended Unicode-aware way of saying
1826 
1827     *(d++) = uv;
1828 */
1829 
1830 #define TRIE_STORE_REVCHAR(val)                                            \
1831     STMT_START {                                                           \
1832 	if (UTF) {							   \
1833             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1834 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
1835             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
1837 	    SvPOK_on(zlopp);						   \
1838 	    SvUTF8_on(zlopp);						   \
1839 	    av_push(revcharmap, zlopp);					   \
1840 	} else {							   \
1841             char ooooff = (char)val;                                           \
1842 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
1843 	}								   \
1844         } STMT_END
1845 
1846 /* This gets the next character from the input, folding it if not already
1847  * folded. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849     wordlen++;                                                                \
1850     if ( UTF ) {                                                              \
1851         /* if it is UTF then it is either already folded, or does not need    \
1852          * folding */                                                         \
1853         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854     }                                                                         \
1855     else if (folder == PL_fold_latin1) {                                      \
1856         /* This folder implies Unicode rules, which in the range expressible  \
1857          *  by not UTF is the lower case, with the two exceptions, one of     \
1858          *  which should have been taken care of before calling this */       \
1859         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860         uvc = toLOWER_L1(*uc);                                                \
1861         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862         len = 1;                                                              \
1863     } else {                                                                  \
1864         /* raw data, will be folded later if needed */                        \
1865         uvc = (U32)*uc;                                                       \
1866         len = 1;                                                              \
1867     }                                                                         \
1868 } STMT_END
1869 
1870 
1871 
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874 	U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876     }                                                           \
1877     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879     TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881 
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883     Newxz( trie->states[ state ].trans.list,               \
1884 	4, reg_trie_trans_le );                                 \
1885      TRIE_LIST_CUR( state ) = 1;                                \
1886      TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888 
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890     U16 dupe= trie->states[ state ].wordnum;                    \
1891     regnode * const noper_next = regnext( noper );              \
1892                                                                 \
1893     DEBUG_r({                                                   \
1894         /* store the word for dumping */                        \
1895         SV* tmp;                                                \
1896         if (OP(noper) != NOTHING)                               \
1897             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
1898         else                                                    \
1899             tmp = newSVpvn_utf8( "", 0, UTF );			\
1900         av_push( trie_words, tmp );                             \
1901     });                                                         \
1902                                                                 \
1903     curword++;                                                  \
1904     trie->wordinfo[curword].prev   = 0;                         \
1905     trie->wordinfo[curword].len    = wordlen;                   \
1906     trie->wordinfo[curword].accept = state;                     \
1907                                                                 \
1908     if ( noper_next < tail ) {                                  \
1909         if (!trie->jump)                                        \
1910             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911                                                  sizeof(U16) ); \
1912         trie->jump[curword] = (U16)(noper_next - convert);      \
1913         if (!jumper)                                            \
1914             jumper = noper_next;                                \
1915         if (!nextbranch)                                        \
1916             nextbranch= regnext(cur);                           \
1917     }                                                           \
1918                                                                 \
1919     if ( dupe ) {                                               \
1920         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921         /* chain, so that when the bits of chain are later    */\
1922         /* linked together, the dups appear in the chain      */\
1923 	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924 	trie->wordinfo[dupe].prev = curword;                    \
1925     } else {                                                    \
1926         /* we haven't inserted this word yet.                */ \
1927         trie->states[ state ].wordnum = curword;                \
1928     }                                                           \
1929 } STMT_END
1930 
1931 
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
1933      ( ( base + charid >=  ucharcount					\
1934          && base + charid < ubound					\
1935          && state == trie->trans[ base - ucharcount + charid ].check	\
1936          && trie->trans[ base - ucharcount + charid ].next )		\
1937            ? trie->trans[ base - ucharcount + charid ].next		\
1938            : ( state==1 ? special : 0 )					\
1939       )
1940 
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944 
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966 
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979 
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984 
1985     switch (flags) {
1986         case EXACT: break;
1987 	case EXACTFA:
1988         case EXACTFU_SS:
1989 	case EXACTFU: folder = PL_fold_latin1; break;
1990 	case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993 
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004 
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008 
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     if (!SvIOK(re_trie_maxbuff)) {
2011         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012     }
2013     DEBUG_TRIE_COMPILE_r({
2014         PerlIO_printf( Perl_debug_log,
2015           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016           (int)depth * 2 + 2, "",
2017           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019     });
2020 
2021    /* Find the node we are going to overwrite */
2022     if ( first == startbranch && OP( last ) != BRANCH ) {
2023         /* whole branch chain */
2024         convert = first;
2025     } else {
2026         /* branch sub-chain */
2027         convert = NEXTOPER( first );
2028     }
2029 
2030     /*  -- First loop and Setup --
2031 
2032        We first traverse the branches and scan each word to determine if it
2033        contains widechars, and how many unique chars there are, this is
2034        important as we have to build a table with at least as many columns as we
2035        have unique chars.
2036 
2037        We use an array of integers to represent the character codes 0..255
2038        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039        the native representation of the character value as the key and IV's for
2040        the coded index.
2041 
2042        *TODO* If we keep track of how many times each character is used we can
2043        remap the columns so that the table compression later on is more
2044        efficient in terms of memory by ensuring the most common value is in the
2045        middle and the least common are on the outside.  IMO this would be better
2046        than a most to least common mapping as theres a decent chance the most
2047        common letter will share a node with the least common, meaning the node
2048        will not be compressible. With a middle is most common approach the worst
2049        case is when we have the least common nodes twice.
2050 
2051      */
2052 
2053     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054         regnode *noper = NEXTOPER( cur );
2055         const U8 *uc = (U8*)STRING( noper );
2056         const U8 *e  = uc + STR_LEN( noper );
2057         int foldlen = 0;
2058         U32 wordlen      = 0;         /* required init */
2059         STRLEN minchars = 0;
2060         STRLEN maxchars = 0;
2061         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062                                                bitmap?*/
2063 
2064         if (OP(noper) == NOTHING) {
2065             regnode *noper_next= regnext(noper);
2066             if (noper_next != tail && OP(noper_next) == flags) {
2067                 noper = noper_next;
2068                 uc= (U8*)STRING(noper);
2069                 e= uc + STR_LEN(noper);
2070 		trie->minlen= STR_LEN(noper);
2071             } else {
2072 		trie->minlen= 0;
2073 		continue;
2074 	    }
2075         }
2076 
2077         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079                                           regardless of encoding */
2080             if (OP( noper ) == EXACTFU_SS) {
2081                 /* false positives are ok, so just set this */
2082                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083             }
2084         }
2085         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086                                            branch */
2087             TRIE_CHARCOUNT(trie)++;
2088             TRIE_READ_CHAR;
2089 
2090             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091              * is in effect.  Under /i, this character can match itself, or
2092              * anything that folds to it.  If not under /i, it can match just
2093              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094              * all fold to k, and all are single characters.   But some folds
2095              * expand to more than one character, so for example LATIN SMALL
2096              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097              * the string beginning at 'uc' is 'ffi', it could be matched by
2098              * three characters, or just by the one ligature character. (It
2099              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102              * match.)  The trie needs to know the minimum and maximum number
2103              * of characters that could match so that it can use size alone to
2104              * quickly reject many match attempts.  The max is simple: it is
2105              * the number of folded characters in this branch (since a fold is
2106              * never shorter than what folds to it. */
2107 
2108             maxchars++;
2109 
2110             /* And the min is equal to the max if not under /i (indicated by
2111              * 'folder' being NULL), or there are no multi-character folds.  If
2112              * there is a multi-character fold, the min is incremented just
2113              * once, for the character that folds to the sequence.  Each
2114              * character in the sequence needs to be added to the list below of
2115              * characters in the trie, but we count only the first towards the
2116              * min number of characters needed.  This is done through the
2117              * variable 'foldlen', which is returned by the macros that look
2118              * for these sequences as the number of bytes the sequence
2119              * occupies.  Each time through the loop, we decrement 'foldlen' by
2120              * how many bytes the current char occupies.  Only when it reaches
2121              * 0 do we increment 'minchars' or look for another multi-character
2122              * sequence. */
2123             if (folder == NULL) {
2124                 minchars++;
2125             }
2126             else if (foldlen > 0) {
2127                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128             }
2129             else {
2130                 minchars++;
2131 
2132                 /* See if *uc is the beginning of a multi-character fold.  If
2133                  * so, we decrement the length remaining to look at, to account
2134                  * for the current character this iteration.  (We can use 'uc'
2135                  * instead of the fold returned by TRIE_READ_CHAR because for
2136                  * non-UTF, the latin1_safe macro is smart enough to account
2137                  * for all the unfolded characters, and because for UTF, the
2138                  * string will already have been folded earlier in the
2139                  * compilation process */
2140                 if (UTF) {
2141                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142                         foldlen -= UTF8SKIP(uc);
2143                     }
2144                 }
2145                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146                     foldlen--;
2147                 }
2148             }
2149 
2150             /* The current character (and any potential folds) should be added
2151              * to the possible matching characters for this position in this
2152              * branch */
2153             if ( uvc < 256 ) {
2154                 if ( folder ) {
2155                     U8 folded= folder[ (U8) uvc ];
2156                     if ( !trie->charmap[ folded ] ) {
2157                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158                         TRIE_STORE_REVCHAR( folded );
2159                     }
2160                 }
2161                 if ( !trie->charmap[ uvc ] ) {
2162                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163                     TRIE_STORE_REVCHAR( uvc );
2164                 }
2165                 if ( set_bit ) {
2166 		    /* store the codepoint in the bitmap, and its folded
2167 		     * equivalent. */
2168                     TRIE_BITMAP_SET(trie, uvc);
2169 
2170 		    /* store the folded codepoint */
2171                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172 
2173 		    if ( !UTF ) {
2174 			/* store first byte of utf8 representation of
2175 			   variant codepoints */
2176 			if (! UVCHR_IS_INVARIANT(uvc)) {
2177 			    TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178 			}
2179 		    }
2180                     set_bit = 0; /* We've done our bit :-) */
2181                 }
2182             } else {
2183 
2184                 /* XXX We could come up with the list of code points that fold
2185                  * to this using PL_utf8_foldclosures, except not for
2186                  * multi-char folds, as there may be multiple combinations
2187                  * there that could work, which needs to wait until runtime to
2188                  * resolve (The comment about LIGATURE FFI above is such an
2189                  * example */
2190 
2191                 SV** svpp;
2192                 if ( !widecharmap )
2193                     widecharmap = newHV();
2194 
2195                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196 
2197                 if ( !svpp )
2198                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199 
2200                 if ( !SvTRUE( *svpp ) ) {
2201                     sv_setiv( *svpp, ++trie->uniquecharcount );
2202                     TRIE_STORE_REVCHAR(uvc);
2203                 }
2204             }
2205         } /* end loop through characters in this branch of the trie */
2206 
2207         /* We take the min and max for this branch and combine to find the min
2208          * and max for all branches processed so far */
2209         if( cur == first ) {
2210             trie->minlen = minchars;
2211             trie->maxlen = maxchars;
2212         } else if (minchars < trie->minlen) {
2213             trie->minlen = minchars;
2214         } else if (maxchars > trie->maxlen) {
2215             trie->maxlen = maxchars;
2216         }
2217     } /* end first pass */
2218     DEBUG_TRIE_COMPILE_r(
2219         PerlIO_printf( Perl_debug_log,
2220                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221                 (int)depth * 2 + 2,"",
2222                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224 		(int)trie->minlen, (int)trie->maxlen )
2225     );
2226 
2227     /*
2228         We now know what we are dealing with in terms of unique chars and
2229         string sizes so we can calculate how much memory a naive
2230         representation using a flat table  will take. If it's over a reasonable
2231         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232         conservative but potentially much slower representation using an array
2233         of lists.
2234 
2235         At the end we convert both representations into the same compressed
2236         form that will be used in regexec.c for matching with. The latter
2237         is a form that cannot be used to construct with but has memory
2238         properties similar to the list form and access properties similar
2239         to the table form making it both suitable for fast searches and
2240         small enough that its feasable to store for the duration of a program.
2241 
2242         See the comment in the code where the compressed table is produced
2243         inplace from the flat tabe representation for an explanation of how
2244         the compression works.
2245 
2246     */
2247 
2248 
2249     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250     prev_states[1] = 0;
2251 
2252     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253                                                     > SvIV(re_trie_maxbuff) )
2254     {
2255         /*
2256             Second Pass -- Array Of Lists Representation
2257 
2258             Each state will be represented by a list of charid:state records
2259             (reg_trie_trans_le) the first such element holds the CUR and LEN
2260             points of the allocated array. (See defines above).
2261 
2262             We build the initial structure using the lists, and then convert
2263             it into the compressed table form which allows faster lookups
2264             (but cant be modified once converted).
2265         */
2266 
2267         STRLEN transcount = 1;
2268 
2269         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270             "%*sCompiling trie using list compiler\n",
2271             (int)depth * 2 + 2, ""));
2272 
2273 	trie->states = (reg_trie_state *)
2274 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275 				  sizeof(reg_trie_state) );
2276         TRIE_LIST_NEW(1);
2277         next_alloc = 2;
2278 
2279         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280 
2281             regnode *noper   = NEXTOPER( cur );
2282 	    U8 *uc           = (U8*)STRING( noper );
2283             const U8 *e      = uc + STR_LEN( noper );
2284 	    U32 state        = 1;         /* required init */
2285 	    U16 charid       = 0;         /* sanity init */
2286             U32 wordlen      = 0;         /* required init */
2287 
2288             if (OP(noper) == NOTHING) {
2289                 regnode *noper_next= regnext(noper);
2290                 if (noper_next != tail && OP(noper_next) == flags) {
2291                     noper = noper_next;
2292                     uc= (U8*)STRING(noper);
2293                     e= uc + STR_LEN(noper);
2294                 }
2295             }
2296 
2297             if (OP(noper) != NOTHING) {
2298                 for ( ; uc < e ; uc += len ) {
2299 
2300                     TRIE_READ_CHAR;
2301 
2302                     if ( uvc < 256 ) {
2303                         charid = trie->charmap[ uvc ];
2304 		    } else {
2305                         SV** const svpp = hv_fetch( widecharmap,
2306                                                     (char*)&uvc,
2307                                                     sizeof( UV ),
2308                                                     0);
2309                         if ( !svpp ) {
2310                             charid = 0;
2311                         } else {
2312                             charid=(U16)SvIV( *svpp );
2313                         }
2314 		    }
2315                     /* charid is now 0 if we dont know the char read, or
2316                      * nonzero if we do */
2317                     if ( charid ) {
2318 
2319                         U16 check;
2320                         U32 newstate = 0;
2321 
2322                         charid--;
2323                         if ( !trie->states[ state ].trans.list ) {
2324                             TRIE_LIST_NEW( state );
2325 			}
2326                         for ( check = 1;
2327                               check <= TRIE_LIST_USED( state );
2328                               check++ )
2329                         {
2330                             if ( TRIE_LIST_ITEM( state, check ).forid
2331                                                                     == charid )
2332                             {
2333                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334                                 break;
2335                             }
2336                         }
2337                         if ( ! newstate ) {
2338                             newstate = next_alloc++;
2339 			    prev_states[newstate] = state;
2340                             TRIE_LIST_PUSH( state, charid, newstate );
2341                             transcount++;
2342                         }
2343                         state = newstate;
2344                     } else {
2345                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346 		    }
2347 		}
2348 	    }
2349             TRIE_HANDLE_WORD(state);
2350 
2351         } /* end second pass */
2352 
2353         /* next alloc is the NEXT state to be allocated */
2354         trie->statecount = next_alloc;
2355         trie->states = (reg_trie_state *)
2356 	    PerlMemShared_realloc( trie->states,
2357 				   next_alloc
2358 				   * sizeof(reg_trie_state) );
2359 
2360         /* and now dump it out before we compress it */
2361         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362 							 revcharmap, next_alloc,
2363 							 depth+1)
2364         );
2365 
2366         trie->trans = (reg_trie_trans *)
2367 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368         {
2369             U32 state;
2370             U32 tp = 0;
2371             U32 zp = 0;
2372 
2373 
2374             for( state=1 ; state < next_alloc ; state ++ ) {
2375                 U32 base=0;
2376 
2377                 /*
2378                 DEBUG_TRIE_COMPILE_MORE_r(
2379                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380                 );
2381                 */
2382 
2383                 if (trie->states[state].trans.list) {
2384                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385                     U16 maxid=minid;
2386 		    U16 idx;
2387 
2388                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390 			if ( forid < minid ) {
2391 			    minid=forid;
2392 			} else if ( forid > maxid ) {
2393 			    maxid=forid;
2394 			}
2395                     }
2396                     if ( transcount < tp + maxid - minid + 1) {
2397                         transcount *= 2;
2398 			trie->trans = (reg_trie_trans *)
2399 			    PerlMemShared_realloc( trie->trans,
2400 						     transcount
2401 						     * sizeof(reg_trie_trans) );
2402                         Zero( trie->trans + (transcount / 2),
2403                               transcount / 2,
2404                               reg_trie_trans );
2405                     }
2406                     base = trie->uniquecharcount + tp - minid;
2407                     if ( maxid == minid ) {
2408                         U32 set = 0;
2409                         for ( ; zp < tp ; zp++ ) {
2410                             if ( ! trie->trans[ zp ].next ) {
2411                                 base = trie->uniquecharcount + zp - minid;
2412                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                                                                    1).newstate;
2414                                 trie->trans[ zp ].check = state;
2415                                 set = 1;
2416                                 break;
2417                             }
2418                         }
2419                         if ( !set ) {
2420                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                                                                    1).newstate;
2422                             trie->trans[ tp ].check = state;
2423                             tp++;
2424                             zp = tp;
2425                         }
2426                     } else {
2427                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428                             const U32 tid = base
2429                                            - trie->uniquecharcount
2430                                            + TRIE_LIST_ITEM( state, idx ).forid;
2431                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                                                                 idx ).newstate;
2433                             trie->trans[ tid ].check = state;
2434                         }
2435                         tp += ( maxid - minid + 1 );
2436                     }
2437                     Safefree(trie->states[ state ].trans.list);
2438                 }
2439                 /*
2440                 DEBUG_TRIE_COMPILE_MORE_r(
2441                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442                 );
2443                 */
2444                 trie->states[ state ].trans.base=base;
2445             }
2446             trie->lasttrans = tp + 1;
2447         }
2448     } else {
2449         /*
2450            Second Pass -- Flat Table Representation.
2451 
2452            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453            each.  We know that we will need Charcount+1 trans at most to store
2454            the data (one row per char at worst case) So we preallocate both
2455            structures assuming worst case.
2456 
2457            We then construct the trie using only the .next slots of the entry
2458            structs.
2459 
2460            We use the .check field of the first entry of the node temporarily
2461            to make compression both faster and easier by keeping track of how
2462            many non zero fields are in the node.
2463 
2464            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465            transition.
2466 
2467            There are two terms at use here: state as a TRIE_NODEIDX() which is
2468            a number representing the first entry of the node, and state as a
2469            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471            if there are 2 entrys per node. eg:
2472 
2473              A B       A B
2474           1. 2 4    1. 3 7
2475           2. 0 3    3. 0 5
2476           3. 0 0    5. 0 0
2477           4. 0 0    7. 0 0
2478 
2479            The table is internally in the right hand, idx form. However as we
2480            also have to deal with the states array which is indexed by nodenum
2481            we have to use TRIE_NODENUM() to convert.
2482 
2483         */
2484         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485             "%*sCompiling trie using table compiler\n",
2486             (int)depth * 2 + 2, ""));
2487 
2488 	trie->trans = (reg_trie_trans *)
2489 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490 				  * trie->uniquecharcount + 1,
2491 				  sizeof(reg_trie_trans) );
2492         trie->states = (reg_trie_state *)
2493 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494 				  sizeof(reg_trie_state) );
2495         next_alloc = trie->uniquecharcount + 1;
2496 
2497 
2498         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499 
2500             regnode *noper   = NEXTOPER( cur );
2501 	    const U8 *uc     = (U8*)STRING( noper );
2502             const U8 *e      = uc + STR_LEN( noper );
2503 
2504             U32 state        = 1;         /* required init */
2505 
2506             U16 charid       = 0;         /* sanity init */
2507             U32 accept_state = 0;         /* sanity init */
2508 
2509             U32 wordlen      = 0;         /* required init */
2510 
2511             if (OP(noper) == NOTHING) {
2512                 regnode *noper_next= regnext(noper);
2513                 if (noper_next != tail && OP(noper_next) == flags) {
2514                     noper = noper_next;
2515                     uc= (U8*)STRING(noper);
2516                     e= uc + STR_LEN(noper);
2517                 }
2518             }
2519 
2520             if ( OP(noper) != NOTHING ) {
2521                 for ( ; uc < e ; uc += len ) {
2522 
2523                     TRIE_READ_CHAR;
2524 
2525                     if ( uvc < 256 ) {
2526                         charid = trie->charmap[ uvc ];
2527                     } else {
2528                         SV* const * const svpp = hv_fetch( widecharmap,
2529                                                            (char*)&uvc,
2530                                                            sizeof( UV ),
2531                                                            0);
2532                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2533                     }
2534                     if ( charid ) {
2535                         charid--;
2536                         if ( !trie->trans[ state + charid ].next ) {
2537                             trie->trans[ state + charid ].next = next_alloc;
2538                             trie->trans[ state ].check++;
2539 			    prev_states[TRIE_NODENUM(next_alloc)]
2540 				    = TRIE_NODENUM(state);
2541                             next_alloc += trie->uniquecharcount;
2542                         }
2543                         state = trie->trans[ state + charid ].next;
2544                     } else {
2545                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546                     }
2547                     /* charid is now 0 if we dont know the char read, or
2548                      * nonzero if we do */
2549                 }
2550             }
2551             accept_state = TRIE_NODENUM( state );
2552             TRIE_HANDLE_WORD(accept_state);
2553 
2554         } /* end second pass */
2555 
2556         /* and now dump it out before we compress it */
2557         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558 							  revcharmap,
2559 							  next_alloc, depth+1));
2560 
2561         {
2562         /*
2563            * Inplace compress the table.*
2564 
2565            For sparse data sets the table constructed by the trie algorithm will
2566            be mostly 0/FAIL transitions or to put it another way mostly empty.
2567            (Note that leaf nodes will not contain any transitions.)
2568 
2569            This algorithm compresses the tables by eliminating most such
2570            transitions, at the cost of a modest bit of extra work during lookup:
2571 
2572            - Each states[] entry contains a .base field which indicates the
2573            index in the state[] array wheres its transition data is stored.
2574 
2575            - If .base is 0 there are no valid transitions from that node.
2576 
2577            - If .base is nonzero then charid is added to it to find an entry in
2578            the trans array.
2579 
2580            -If trans[states[state].base+charid].check!=state then the
2581            transition is taken to be a 0/Fail transition. Thus if there are fail
2582            transitions at the front of the node then the .base offset will point
2583            somewhere inside the previous nodes data (or maybe even into a node
2584            even earlier), but the .check field determines if the transition is
2585            valid.
2586 
2587            XXX - wrong maybe?
2588            The following process inplace converts the table to the compressed
2589            table: We first do not compress the root node 1,and mark all its
2590            .check pointers as 1 and set its .base pointer as 1 as well. This
2591            allows us to do a DFA construction from the compressed table later,
2592            and ensures that any .base pointers we calculate later are greater
2593            than 0.
2594 
2595            - We set 'pos' to indicate the first entry of the second node.
2596 
2597            - We then iterate over the columns of the node, finding the first and
2598            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599            and set the .check pointers accordingly, and advance pos
2600            appropriately and repreat for the next node. Note that when we copy
2601            the next pointers we have to convert them from the original
2602            NODEIDX form to NODENUM form as the former is not valid post
2603            compression.
2604 
2605            - If a node has no transitions used we mark its base as 0 and do not
2606            advance the pos pointer.
2607 
2608            - If a node only has one transition we use a second pointer into the
2609            structure to fill in allocated fail transitions from other states.
2610            This pointer is independent of the main pointer and scans forward
2611            looking for null transitions that are allocated to a state. When it
2612            finds one it writes the single transition into the "hole".  If the
2613            pointer doesnt find one the single transition is appended as normal.
2614 
2615            - Once compressed we can Renew/realloc the structures to release the
2616            excess space.
2617 
2618            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619            specifically Fig 3.47 and the associated pseudocode.
2620 
2621            demq
2622         */
2623         const U32 laststate = TRIE_NODENUM( next_alloc );
2624 	U32 state, charid;
2625         U32 pos = 0, zp=0;
2626         trie->statecount = laststate;
2627 
2628         for ( state = 1 ; state < laststate ; state++ ) {
2629             U8 flag = 0;
2630 	    const U32 stateidx = TRIE_NODEIDX( state );
2631 	    const U32 o_used = trie->trans[ stateidx ].check;
2632 	    U32 used = trie->trans[ stateidx ].check;
2633             trie->trans[ stateidx ].check = 0;
2634 
2635             for ( charid = 0;
2636                   used && charid < trie->uniquecharcount;
2637                   charid++ )
2638             {
2639                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640                     if ( trie->trans[ stateidx + charid ].next ) {
2641                         if (o_used == 1) {
2642                             for ( ; zp < pos ; zp++ ) {
2643                                 if ( ! trie->trans[ zp ].next ) {
2644                                     break;
2645                                 }
2646                             }
2647                             trie->states[ state ].trans.base
2648                                                     = zp
2649                                                       + trie->uniquecharcount
2650                                                       - charid ;
2651                             trie->trans[ zp ].next
2652                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                                                              + charid ].next );
2654                             trie->trans[ zp ].check = state;
2655                             if ( ++zp > pos ) pos = zp;
2656                             break;
2657                         }
2658                         used--;
2659                     }
2660                     if ( !flag ) {
2661                         flag = 1;
2662                         trie->states[ state ].trans.base
2663                                        = pos + trie->uniquecharcount - charid ;
2664                     }
2665                     trie->trans[ pos ].next
2666                         = SAFE_TRIE_NODENUM(
2667                                        trie->trans[ stateidx + charid ].next );
2668                     trie->trans[ pos ].check = state;
2669                     pos++;
2670                 }
2671             }
2672         }
2673         trie->lasttrans = pos + 1;
2674         trie->states = (reg_trie_state *)
2675 	    PerlMemShared_realloc( trie->states, laststate
2676 				   * sizeof(reg_trie_state) );
2677         DEBUG_TRIE_COMPILE_MORE_r(
2678             PerlIO_printf( Perl_debug_log,
2679                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680                 (int)depth * 2 + 2,"",
2681                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682                        + 1 ),
2683                 (IV)next_alloc,
2684                 (IV)pos,
2685                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686             );
2687 
2688         } /* end table compress */
2689     }
2690     DEBUG_TRIE_COMPILE_MORE_r(
2691             PerlIO_printf(Perl_debug_log,
2692                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693                 (int)depth * 2 + 2, "",
2694                 (UV)trie->statecount,
2695                 (UV)trie->lasttrans)
2696     );
2697     /* resize the trans array to remove unused space */
2698     trie->trans = (reg_trie_trans *)
2699 	PerlMemShared_realloc( trie->trans, trie->lasttrans
2700 			       * sizeof(reg_trie_trans) );
2701 
2702     {   /* Modify the program and insert the new TRIE node */
2703         U8 nodetype =(U8)(flags & 0xFF);
2704         char *str=NULL;
2705 
2706 #ifdef DEBUGGING
2707         regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709 
2710         U32 mjd_offset = 0;
2711         U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714         /*
2715            This means we convert either the first branch or the first Exact,
2716            depending on whether the thing following (in 'last') is a branch
2717            or not and whther first is the startbranch (ie is it a sub part of
2718            the alternation or is it the whole thing.)
2719            Assuming its a sub part we convert the EXACT otherwise we convert
2720            the whole branch sequence, including the first.
2721          */
2722         /* Find the node we are going to overwrite */
2723         if ( first != startbranch || OP( last ) == BRANCH ) {
2724             /* branch sub-chain */
2725             NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727             DEBUG_r({
2728                 mjd_offset= Node_Offset((convert));
2729                 mjd_nodelen= Node_Length((convert));
2730             });
2731 #endif
2732             /* whole branch chain */
2733         }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735         else {
2736             DEBUG_r({
2737                 const  regnode *nop = NEXTOPER( convert );
2738                 mjd_offset= Node_Offset((nop));
2739                 mjd_nodelen= Node_Length((nop));
2740             });
2741         }
2742         DEBUG_OPTIMISE_r(
2743             PerlIO_printf(Perl_debug_log,
2744                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745                 (int)depth * 2 + 2, "",
2746                 (UV)mjd_offset, (UV)mjd_nodelen)
2747         );
2748 #endif
2749         /* But first we check to see if there is a common prefix we can
2750            split out as an EXACT and put in front of the TRIE node.  */
2751         trie->startstate= 1;
2752         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753             U32 state;
2754             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755                 U32 ofs = 0;
2756                 I32 idx = -1;
2757                 U32 count = 0;
2758                 const U32 base = trie->states[ state ].trans.base;
2759 
2760                 if ( trie->states[state].wordnum )
2761                         count = 1;
2762 
2763                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2765                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767                     {
2768                         if ( ++count > 1 ) {
2769                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2770 			    const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771                             if ( state == 1 ) break;
2772                             if ( count == 2 ) {
2773                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774                                 DEBUG_OPTIMISE_r(
2775                                     PerlIO_printf(Perl_debug_log,
2776 					"%*sNew Start State=%"UVuf" Class: [",
2777                                         (int)depth * 2 + 2, "",
2778                                         (UV)state));
2779 				if (idx >= 0) {
2780 				    SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782 
2783                                     TRIE_BITMAP_SET(trie,*ch);
2784                                     if ( folder )
2785                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786                                     DEBUG_OPTIMISE_r(
2787                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788                                     );
2789 				}
2790 			    }
2791 			    TRIE_BITMAP_SET(trie,*ch);
2792 			    if ( folder )
2793 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794 			    DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795 			}
2796                         idx = ofs;
2797 		    }
2798                 }
2799                 if ( count == 1 ) {
2800                     SV **tmp = av_fetch( revcharmap, idx, 0);
2801                     STRLEN len;
2802                     char *ch = SvPV( *tmp, len );
2803                     DEBUG_OPTIMISE_r({
2804                         SV *sv=sv_newmortal();
2805                         PerlIO_printf( Perl_debug_log,
2806 			    "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807                             (int)depth * 2 + 2, "",
2808                             (UV)state, (UV)idx,
2809                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810 	                        PL_colors[0], PL_colors[1],
2811 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812 	                        PERL_PV_ESCAPE_FIRSTCHAR
2813                             )
2814                         );
2815                     });
2816                     if ( state==1 ) {
2817                         OP( convert ) = nodetype;
2818                         str=STRING(convert);
2819                         STR_LEN(convert)=0;
2820                     }
2821                     STR_LEN(convert) += len;
2822                     while (len--)
2823                         *str++ = *ch++;
2824 		} else {
2825 #ifdef DEBUGGING
2826 		    if (state>1)
2827 			DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829 		    break;
2830 		}
2831 	    }
2832 	    trie->prefixlen = (state-1);
2833             if (str) {
2834                 regnode *n = convert+NODE_SZ_STR(convert);
2835                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836                 trie->startstate = state;
2837                 trie->minlen -= (state - 1);
2838                 trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840                /* At least the UNICOS C compiler choked on this
2841                 * being argument to DEBUG_r(), so let's just have
2842                 * it right here. */
2843                if (
2844 #ifdef PERL_EXT_RE_BUILD
2845                    1
2846 #else
2847                    DEBUG_r_TEST
2848 #endif
2849                    ) {
2850                    regnode *fix = convert;
2851                    U32 word = trie->wordcount;
2852                    mjd_nodelen++;
2853                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854                    while( ++fix < n ) {
2855                        Set_Node_Offset_Length(fix, 0, 0);
2856                    }
2857                    while (word--) {
2858                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2859                        if (tmp) {
2860                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862                            else
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864                        }
2865                    }
2866                }
2867 #endif
2868                 if (trie->maxlen) {
2869                     convert = n;
2870 		} else {
2871                     NEXT_OFF(convert) = (U16)(tail - convert);
2872                     DEBUG_r(optimize= n);
2873                 }
2874             }
2875         }
2876         if (!jumper)
2877             jumper = last;
2878         if ( trie->maxlen ) {
2879 	    NEXT_OFF( convert ) = (U16)(tail - convert);
2880 	    ARG_SET( convert, data_slot );
2881 	    /* Store the offset to the first unabsorbed branch in
2882 	       jump[0], which is otherwise unused by the jump logic.
2883 	       We use this when dumping a trie and during optimisation. */
2884 	    if (trie->jump)
2885 	        trie->jump[0] = (U16)(nextbranch - convert);
2886 
2887             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888 	     *   and there is a bitmap
2889 	     *   and the first "jump target" node we found leaves enough room
2890 	     * then convert the TRIE node into a TRIEC node, with the bitmap
2891 	     * embedded inline in the opcode - this is hypothetically faster.
2892 	     */
2893             if ( !trie->states[trie->startstate].wordnum
2894 		 && trie->bitmap
2895 		 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896             {
2897                 OP( convert ) = TRIEC;
2898                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899                 PerlMemShared_free(trie->bitmap);
2900                 trie->bitmap= NULL;
2901             } else
2902                 OP( convert ) = TRIE;
2903 
2904             /* store the type in the flags */
2905             convert->flags = nodetype;
2906             DEBUG_r({
2907             optimize = convert
2908                       + NODE_STEP_REGNODE
2909                       + regarglen[ OP( convert ) ];
2910             });
2911             /* XXX We really should free up the resource in trie now,
2912                    as we won't use them - (which resources?) dmq */
2913         }
2914         /* needed for dumping*/
2915         DEBUG_r(if (optimize) {
2916             regnode *opt = convert;
2917 
2918             while ( ++opt < optimize) {
2919                 Set_Node_Offset_Length(opt,0,0);
2920             }
2921             /*
2922                 Try to clean up some of the debris left after the
2923                 optimisation.
2924              */
2925             while( optimize < jumper ) {
2926                 mjd_nodelen += Node_Length((optimize));
2927                 OP( optimize ) = OPTIMIZED;
2928                 Set_Node_Offset_Length(optimize,0,0);
2929                 optimize++;
2930             }
2931             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932         });
2933     } /* end node insert */
2934 
2935     /*  Finish populating the prev field of the wordinfo array.  Walk back
2936      *  from each accept state until we find another accept state, and if
2937      *  so, point the first word's .prev field at the second word. If the
2938      *  second already has a .prev field set, stop now. This will be the
2939      *  case either if we've already processed that word's accept state,
2940      *  or that state had multiple words, and the overspill words were
2941      *  already linked up earlier.
2942      */
2943     {
2944 	U16 word;
2945 	U32 state;
2946 	U16 prev;
2947 
2948 	for (word=1; word <= trie->wordcount; word++) {
2949 	    prev = 0;
2950 	    if (trie->wordinfo[word].prev)
2951 		continue;
2952 	    state = trie->wordinfo[word].accept;
2953 	    while (state) {
2954 		state = prev_states[state];
2955 		if (!state)
2956 		    break;
2957 		prev = trie->states[state].wordnum;
2958 		if (prev)
2959 		    break;
2960 	    }
2961 	    trie->wordinfo[word].prev = prev;
2962 	}
2963 	Safefree(prev_states);
2964     }
2965 
2966 
2967     /* and now dump out the compressed format */
2968     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2969 
2970     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2971 #ifdef DEBUGGING
2972     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2973     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2974 #else
2975     SvREFCNT_dec_NN(revcharmap);
2976 #endif
2977     return trie->jump
2978            ? MADE_JUMP_TRIE
2979            : trie->startstate>1
2980              ? MADE_EXACT_TRIE
2981              : MADE_TRIE;
2982 }
2983 
2984 STATIC void
2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2986 {
2987 /* The Trie is constructed and compressed now so we can build a fail array if
2988  * it's needed
2989 
2990    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2991    3.32 in the
2992    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2993    Ullman 1985/88
2994    ISBN 0-201-10088-6
2995 
2996    We find the fail state for each state in the trie, this state is the longest
2997    proper suffix of the current state's 'word' that is also a proper prefix of
2998    another word in our trie. State 1 represents the word '' and is thus the
2999    default fail state. This allows the DFA not to have to restart after its
3000    tried and failed a word at a given point, it simply continues as though it
3001    had been matching the other word in the first place.
3002    Consider
3003       'abcdgu'=~/abcdefg|cdgu/
3004    When we get to 'd' we are still matching the first word, we would encounter
3005    'g' which would fail, which would bring us to the state representing 'd' in
3006    the second word where we would try 'g' and succeed, proceeding to match
3007    'cdgu'.
3008  */
3009  /* add a fail transition */
3010     const U32 trie_offset = ARG(source);
3011     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3012     U32 *q;
3013     const U32 ucharcount = trie->uniquecharcount;
3014     const U32 numstates = trie->statecount;
3015     const U32 ubound = trie->lasttrans + ucharcount;
3016     U32 q_read = 0;
3017     U32 q_write = 0;
3018     U32 charid;
3019     U32 base = trie->states[ 1 ].trans.base;
3020     U32 *fail;
3021     reg_ac_data *aho;
3022     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3023     GET_RE_DEBUG_FLAGS_DECL;
3024 
3025     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3026 #ifndef DEBUGGING
3027     PERL_UNUSED_ARG(depth);
3028 #endif
3029 
3030 
3031     ARG_SET( stclass, data_slot );
3032     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3033     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3034     aho->trie=trie_offset;
3035     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3036     Copy( trie->states, aho->states, numstates, reg_trie_state );
3037     Newxz( q, numstates, U32);
3038     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3039     aho->refcount = 1;
3040     fail = aho->fail;
3041     /* initialize fail[0..1] to be 1 so that we always have
3042        a valid final fail state */
3043     fail[ 0 ] = fail[ 1 ] = 1;
3044 
3045     for ( charid = 0; charid < ucharcount ; charid++ ) {
3046 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3047 	if ( newstate ) {
3048             q[ q_write ] = newstate;
3049             /* set to point at the root */
3050             fail[ q[ q_write++ ] ]=1;
3051         }
3052     }
3053     while ( q_read < q_write) {
3054 	const U32 cur = q[ q_read++ % numstates ];
3055         base = trie->states[ cur ].trans.base;
3056 
3057         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3058 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3059 	    if (ch_state) {
3060                 U32 fail_state = cur;
3061                 U32 fail_base;
3062                 do {
3063                     fail_state = fail[ fail_state ];
3064                     fail_base = aho->states[ fail_state ].trans.base;
3065                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3066 
3067                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3068                 fail[ ch_state ] = fail_state;
3069                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3070                 {
3071                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3072                 }
3073                 q[ q_write++ % numstates] = ch_state;
3074             }
3075         }
3076     }
3077     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3078        when we fail in state 1, this allows us to use the
3079        charclass scan to find a valid start char. This is based on the principle
3080        that theres a good chance the string being searched contains lots of stuff
3081        that cant be a start char.
3082      */
3083     fail[ 0 ] = fail[ 1 ] = 0;
3084     DEBUG_TRIE_COMPILE_r({
3085         PerlIO_printf(Perl_debug_log,
3086 		      "%*sStclass Failtable (%"UVuf" states): 0",
3087 		      (int)(depth * 2), "", (UV)numstates
3088         );
3089         for( q_read=1; q_read<numstates; q_read++ ) {
3090             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3091         }
3092         PerlIO_printf(Perl_debug_log, "\n");
3093     });
3094     Safefree(q);
3095     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3096 }
3097 
3098 
3099 #define DEBUG_PEEP(str,scan,depth) \
3100     DEBUG_OPTIMISE_r({if (scan){ \
3101        SV * const mysv=sv_newmortal(); \
3102        regnode *Next = regnext(scan); \
3103        regprop(RExC_rx, mysv, scan, NULL); \
3104        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3105        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3106        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3107    }});
3108 
3109 
3110 /* The below joins as many adjacent EXACTish nodes as possible into a single
3111  * one.  The regop may be changed if the node(s) contain certain sequences that
3112  * require special handling.  The joining is only done if:
3113  * 1) there is room in the current conglomerated node to entirely contain the
3114  *    next one.
3115  * 2) they are the exact same node type
3116  *
3117  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3118  * these get optimized out
3119  *
3120  * If a node is to match under /i (folded), the number of characters it matches
3121  * can be different than its character length if it contains a multi-character
3122  * fold.  *min_subtract is set to the total delta number of characters of the
3123  * input nodes.
3124  *
3125  * And *unfolded_multi_char is set to indicate whether or not the node contains
3126  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3127  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128  * SMALL LETTER SHARP S, as only if the target string being matched against
3129  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3131  * whose components are all above the Latin1 range are not run-time locale
3132  * dependent, and have already been folded by the time this function is
3133  * called.)
3134  *
3135  * This is as good a place as any to discuss the design of handling these
3136  * multi-character fold sequences.  It's been wrong in Perl for a very long
3137  * time.  There are three code points in Unicode whose multi-character folds
3138  * were long ago discovered to mess things up.  The previous designs for
3139  * dealing with these involved assigning a special node for them.  This
3140  * approach doesn't always work, as evidenced by this example:
3141  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3142  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3143  * would match just the \xDF, it won't be able to handle the case where a
3144  * successful match would have to cross the node's boundary.  The new approach
3145  * that hopefully generally solves the problem generates an EXACTFU_SS node
3146  * that is "sss" in this case.
3147  *
3148  * It turns out that there are problems with all multi-character folds, and not
3149  * just these three.  Now the code is general, for all such cases.  The
3150  * approach taken is:
3151  * 1)   This routine examines each EXACTFish node that could contain multi-
3152  *      character folded sequences.  Since a single character can fold into
3153  *      such a sequence, the minimum match length for this node is less than
3154  *      the number of characters in the node.  This routine returns in
3155  *      *min_subtract how many characters to subtract from the the actual
3156  *      length of the string to get a real minimum match length; it is 0 if
3157  *      there are no multi-char foldeds.  This delta is used by the caller to
3158  *      adjust the min length of the match, and the delta between min and max,
3159  *      so that the optimizer doesn't reject these possibilities based on size
3160  *      constraints.
3161  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3162  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3163  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3164  *      there is a possible fold length change.  That means that a regular
3165  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166  *      with length changes, and so can be processed faster.  regexec.c takes
3167  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3168  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169  *      known until runtime).  This saves effort in regex matching.  However,
3170  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3171  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3173  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3174  *      possibilities for the non-UTF8 patterns are quite simple, except for
3175  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3176  *      members of a fold-pair, and arrays are set up for all of them so that
3177  *      the other member of the pair can be found quickly.  Code elsewhere in
3178  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3180  *      described in the next item.
3181  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3182  *      validity of the fold won't be known until runtime, and so must remain
3183  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3184  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3185  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3186  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187  *      The reason this is a problem is that the optimizer part of regexec.c
3188  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189  *      that a character in the pattern corresponds to at most a single
3190  *      character in the target string.  (And I do mean character, and not byte
3191  *      here, unlike other parts of the documentation that have never been
3192  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3193  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3195  *      nodes, violate the assumption, and they are the only instances where it
3196  *      is violated.  I'm reluctant to try to change the assumption, as the
3197  *      code involved is impenetrable to me (khw), so instead the code here
3198  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3199  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200  *      boolean indicating whether or not the node contains such a fold.  When
3201  *      it is true, the caller sets a flag that later causes the optimizer in
3202  *      this file to not set values for the floating and fixed string lengths,
3203  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3204  *      assumption.  Thus, there is no optimization based on string lengths for
3205  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3207  *      assumption is wrong only in these cases is that all other non-UTF-8
3208  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3210  *      EXACTF nodes because we don't know at compile time if it actually
3211  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3212  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3213  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3214  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3215  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216  *      string would require the pattern to be forced into UTF-8, the overhead
3217  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3218  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219  *      locale.)
3220  *
3221  *      Similarly, the code that generates tries doesn't currently handle
3222  *      not-already-folded multi-char folds, and it looks like a pain to change
3223  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3224  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3225  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3226  *      using /iaa matching will be doing so almost entirely with ASCII
3227  *      strings, so this should rarely be encountered in practice */
3228 
3229 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3230     if (PL_regkind[OP(scan)] == EXACT) \
3231         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3232 
3233 STATIC U32
3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3235                    UV *min_subtract, bool *unfolded_multi_char,
3236                    U32 flags,regnode *val, U32 depth)
3237 {
3238     /* Merge several consecutive EXACTish nodes into one. */
3239     regnode *n = regnext(scan);
3240     U32 stringok = 1;
3241     regnode *next = scan + NODE_SZ_STR(scan);
3242     U32 merged = 0;
3243     U32 stopnow = 0;
3244 #ifdef DEBUGGING
3245     regnode *stop = scan;
3246     GET_RE_DEBUG_FLAGS_DECL;
3247 #else
3248     PERL_UNUSED_ARG(depth);
3249 #endif
3250 
3251     PERL_ARGS_ASSERT_JOIN_EXACT;
3252 #ifndef EXPERIMENTAL_INPLACESCAN
3253     PERL_UNUSED_ARG(flags);
3254     PERL_UNUSED_ARG(val);
3255 #endif
3256     DEBUG_PEEP("join",scan,depth);
3257 
3258     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3259      * EXACT ones that are mergeable to the current one. */
3260     while (n
3261            && (PL_regkind[OP(n)] == NOTHING
3262                || (stringok && OP(n) == OP(scan)))
3263            && NEXT_OFF(n)
3264            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265     {
3266 
3267         if (OP(n) == TAIL || n > next)
3268             stringok = 0;
3269         if (PL_regkind[OP(n)] == NOTHING) {
3270             DEBUG_PEEP("skip:",n,depth);
3271             NEXT_OFF(scan) += NEXT_OFF(n);
3272             next = n + NODE_STEP_REGNODE;
3273 #ifdef DEBUGGING
3274             if (stringok)
3275                 stop = n;
3276 #endif
3277             n = regnext(n);
3278         }
3279         else if (stringok) {
3280             const unsigned int oldl = STR_LEN(scan);
3281             regnode * const nnext = regnext(n);
3282 
3283             /* XXX I (khw) kind of doubt that this works on platforms (should
3284              * Perl ever run on one) where U8_MAX is above 255 because of lots
3285              * of other assumptions */
3286             /* Don't join if the sum can't fit into a single node */
3287             if (oldl + STR_LEN(n) > U8_MAX)
3288                 break;
3289 
3290             DEBUG_PEEP("merg",n,depth);
3291             merged++;
3292 
3293             NEXT_OFF(scan) += NEXT_OFF(n);
3294             STR_LEN(scan) += STR_LEN(n);
3295             next = n + NODE_SZ_STR(n);
3296             /* Now we can overwrite *n : */
3297             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298 #ifdef DEBUGGING
3299             stop = next - 1;
3300 #endif
3301             n = nnext;
3302             if (stopnow) break;
3303         }
3304 
3305 #ifdef EXPERIMENTAL_INPLACESCAN
3306 	if (flags && !NEXT_OFF(n)) {
3307 	    DEBUG_PEEP("atch", val, depth);
3308 	    if (reg_off_by_arg[OP(n)]) {
3309 		ARG_SET(n, val - n);
3310 	    }
3311 	    else {
3312 		NEXT_OFF(n) = val - n;
3313 	    }
3314 	    stopnow = 1;
3315 	}
3316 #endif
3317     }
3318 
3319     *min_subtract = 0;
3320     *unfolded_multi_char = FALSE;
3321 
3322     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3323      * can now analyze for sequences of problematic code points.  (Prior to
3324      * this final joining, sequences could have been split over boundaries, and
3325      * hence missed).  The sequences only happen in folding, hence for any
3326      * non-EXACT EXACTish node */
3327     if (OP(scan) != EXACT) {
3328         U8* s0 = (U8*) STRING(scan);
3329         U8* s = s0;
3330         U8* s_end = s0 + STR_LEN(scan);
3331 
3332         int total_count_delta = 0;  /* Total delta number of characters that
3333                                        multi-char folds expand to */
3334 
3335 	/* One pass is made over the node's string looking for all the
3336 	 * possibilities.  To avoid some tests in the loop, there are two main
3337 	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338 	 * non-UTF-8 */
3339 	if (UTF) {
3340             U8* folded = NULL;
3341 
3342             if (OP(scan) == EXACTFL) {
3343                 U8 *d;
3344 
3345                 /* An EXACTFL node would already have been changed to another
3346                  * node type unless there is at least one character in it that
3347                  * is problematic; likely a character whose fold definition
3348                  * won't be known until runtime, and so has yet to be folded.
3349                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3350                  * to handle the UTF-8 case, we need to create a temporary
3351                  * folded copy using UTF-8 locale rules in order to analyze it.
3352                  * This is because our macros that look to see if a sequence is
3353                  * a multi-char fold assume everything is folded (otherwise the
3354                  * tests in those macros would be too complicated and slow).
3355                  * Note that here, the non-problematic folds will have already
3356                  * been done, so we can just copy such characters.  We actually
3357                  * don't completely fold the EXACTFL string.  We skip the
3358                  * unfolded multi-char folds, as that would just create work
3359                  * below to figure out the size they already are */
3360 
3361                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362                 d = folded;
3363                 while (s < s_end) {
3364                     STRLEN s_len = UTF8SKIP(s);
3365                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366                         Copy(s, d, s_len, U8);
3367                         d += s_len;
3368                     }
3369                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3370                         *unfolded_multi_char = TRUE;
3371                         Copy(s, d, s_len, U8);
3372                         d += s_len;
3373                     }
3374                     else if (isASCII(*s)) {
3375                         *(d++) = toFOLD(*s);
3376                     }
3377                     else {
3378                         STRLEN len;
3379                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380                         d += len;
3381                     }
3382                     s += s_len;
3383                 }
3384 
3385                 /* Point the remainder of the routine to look at our temporary
3386                  * folded copy */
3387                 s = folded;
3388                 s_end = d;
3389             } /* End of creating folded copy of EXACTFL string */
3390 
3391             /* Examine the string for a multi-character fold sequence.  UTF-8
3392              * patterns have all characters pre-folded by the time this code is
3393              * executed */
3394             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395                                      length sequence we are looking for is 2 */
3396 	    {
3397                 int count = 0;  /* How many characters in a multi-char fold */
3398                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3399                 if (! len) {    /* Not a multi-char fold: get next char */
3400                     s += UTF8SKIP(s);
3401                     continue;
3402                 }
3403 
3404                 /* Nodes with 'ss' require special handling, except for
3405                  * EXACTFA-ish for which there is no multi-char fold to this */
3406                 if (len == 2 && *s == 's' && *(s+1) == 's'
3407                     && OP(scan) != EXACTFA
3408                     && OP(scan) != EXACTFA_NO_TRIE)
3409                 {
3410                     count = 2;
3411                     if (OP(scan) != EXACTFL) {
3412                         OP(scan) = EXACTFU_SS;
3413                     }
3414                     s += 2;
3415                 }
3416                 else { /* Here is a generic multi-char fold. */
3417                     U8* multi_end  = s + len;
3418 
3419                     /* Count how many characters in it.  In the case of /aa, no
3420                      * folds which contain ASCII code points are allowed, so
3421                      * check for those, and skip if found. */
3422                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3423                         count = utf8_length(s, multi_end);
3424                         s = multi_end;
3425                     }
3426                     else {
3427                         while (s < multi_end) {
3428                             if (isASCII(*s)) {
3429                                 s++;
3430                                 goto next_iteration;
3431                             }
3432                             else {
3433                                 s += UTF8SKIP(s);
3434                             }
3435                             count++;
3436                         }
3437                     }
3438                 }
3439 
3440                 /* The delta is how long the sequence is minus 1 (1 is how long
3441                  * the character that folds to the sequence is) */
3442                 total_count_delta += count - 1;
3443               next_iteration: ;
3444 	    }
3445 
3446             /* We created a temporary folded copy of the string in EXACTFL
3447              * nodes.  Therefore we need to be sure it doesn't go below zero,
3448              * as the real string could be shorter */
3449             if (OP(scan) == EXACTFL) {
3450                 int total_chars = utf8_length((U8*) STRING(scan),
3451                                            (U8*) STRING(scan) + STR_LEN(scan));
3452                 if (total_count_delta > total_chars) {
3453                     total_count_delta = total_chars;
3454                 }
3455             }
3456 
3457             *min_subtract += total_count_delta;
3458             Safefree(folded);
3459 	}
3460 	else if (OP(scan) == EXACTFA) {
3461 
3462             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3463              * fold to the ASCII range (and there are no existing ones in the
3464              * upper latin1 range).  But, as outlined in the comments preceding
3465              * this function, we need to flag any occurrences of the sharp s.
3466              * This character forbids trie formation (because of added
3467              * complexity) */
3468 	    while (s < s_end) {
3469                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3470                     OP(scan) = EXACTFA_NO_TRIE;
3471                     *unfolded_multi_char = TRUE;
3472                     break;
3473                 }
3474                 s++;
3475                 continue;
3476             }
3477         }
3478 	else {
3479 
3480             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3481              * folds that are all Latin1.  As explained in the comments
3482              * preceding this function, we look also for the sharp s in EXACTF
3483              * and EXACTFL nodes; it can be in the final position.  Otherwise
3484              * we can stop looking 1 byte earlier because have to find at least
3485              * two characters for a multi-fold */
3486 	    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3487                               ? s_end
3488                               : s_end -1;
3489 
3490 	    while (s < upper) {
3491                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3492                 if (! len) {    /* Not a multi-char fold. */
3493                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3494                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3495                     {
3496                         *unfolded_multi_char = TRUE;
3497                     }
3498                     s++;
3499                     continue;
3500                 }
3501 
3502                 if (len == 2
3503                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3504                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3505                 {
3506 
3507                     /* EXACTF nodes need to know that the minimum length
3508                      * changed so that a sharp s in the string can match this
3509                      * ss in the pattern, but they remain EXACTF nodes, as they
3510                      * won't match this unless the target string is is UTF-8,
3511                      * which we don't know until runtime.  EXACTFL nodes can't
3512                      * transform into EXACTFU nodes */
3513                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3514                         OP(scan) = EXACTFU_SS;
3515                     }
3516 		}
3517 
3518                 *min_subtract += len - 1;
3519                 s += len;
3520 	    }
3521 	}
3522     }
3523 
3524 #ifdef DEBUGGING
3525     /* Allow dumping but overwriting the collection of skipped
3526      * ops and/or strings with fake optimized ops */
3527     n = scan + NODE_SZ_STR(scan);
3528     while (n <= stop) {
3529 	OP(n) = OPTIMIZED;
3530 	FLAGS(n) = 0;
3531 	NEXT_OFF(n) = 0;
3532         n++;
3533     }
3534 #endif
3535     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3536     return stopnow;
3537 }
3538 
3539 /* REx optimizer.  Converts nodes into quicker variants "in place".
3540    Finds fixed substrings.  */
3541 
3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3543    to the position after last scanned or to NULL. */
3544 
3545 #define INIT_AND_WITHP \
3546     assert(!and_withp); \
3547     Newx(and_withp,1, regnode_ssc); \
3548     SAVEFREEPV(and_withp)
3549 
3550 /* this is a chain of data about sub patterns we are processing that
3551    need to be handled separately/specially in study_chunk. Its so
3552    we can simulate recursion without losing state.  */
3553 struct scan_frame;
3554 typedef struct scan_frame {
3555     regnode *last;  /* last node to process in this frame */
3556     regnode *next;  /* next node to process when last is reached */
3557     struct scan_frame *prev; /*previous frame*/
3558     U32 prev_recursed_depth;
3559     I32 stop; /* what stopparen do we use */
3560 } scan_frame;
3561 
3562 
3563 STATIC SSize_t
3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3565                         SSize_t *minlenp, SSize_t *deltap,
3566 			regnode *last,
3567 			scan_data_t *data,
3568 			I32 stopparen,
3569                         U32 recursed_depth,
3570 			regnode_ssc *and_withp,
3571 			U32 flags, U32 depth)
3572 			/* scanp: Start here (read-write). */
3573 			/* deltap: Write maxlen-minlen here. */
3574 			/* last: Stop before this one. */
3575 			/* data: string data about the pattern */
3576 			/* stopparen: treat close N as END */
3577 			/* recursed: which subroutines have we recursed into */
3578 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3579 {
3580     dVAR;
3581     /* There must be at least this number of characters to match */
3582     SSize_t min = 0;
3583     I32 pars = 0, code;
3584     regnode *scan = *scanp, *next;
3585     SSize_t delta = 0;
3586     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3587     int is_inf_internal = 0;		/* The studied chunk is infinite */
3588     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3589     scan_data_t data_fake;
3590     SV *re_trie_maxbuff = NULL;
3591     regnode *first_non_open = scan;
3592     SSize_t stopmin = SSize_t_MAX;
3593     scan_frame *frame = NULL;
3594     GET_RE_DEBUG_FLAGS_DECL;
3595 
3596     PERL_ARGS_ASSERT_STUDY_CHUNK;
3597 
3598 #ifdef DEBUGGING
3599     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3600 #endif
3601     if ( depth == 0 ) {
3602         while (first_non_open && OP(first_non_open) == OPEN)
3603             first_non_open=regnext(first_non_open);
3604     }
3605 
3606 
3607   fake_study_recurse:
3608     while ( scan && OP(scan) != END && scan < last ){
3609         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3610                                    node length to get a real minimum (because
3611                                    the folded version may be shorter) */
3612 	bool unfolded_multi_char = FALSE;
3613 	/* Peephole optimizer: */
3614         DEBUG_OPTIMISE_MORE_r(
3615         {
3616             PerlIO_printf(Perl_debug_log,
3617                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3618                 ((int) depth*2), "", (long)stopparen,
3619                 (unsigned long)depth, (unsigned long)recursed_depth);
3620             if (recursed_depth) {
3621                 U32 i;
3622                 U32 j;
3623                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3624                     PerlIO_printf(Perl_debug_log,"[");
3625                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3626                         PerlIO_printf(Perl_debug_log,"%d",
3627                             PAREN_TEST(RExC_study_chunk_recursed +
3628                                        (j * RExC_study_chunk_recursed_bytes), i)
3629                             ? 1 : 0
3630                         );
3631                     PerlIO_printf(Perl_debug_log,"]");
3632                 }
3633             }
3634             PerlIO_printf(Perl_debug_log,"\n");
3635         }
3636         );
3637         DEBUG_STUDYDATA("Peep:", data, depth);
3638         DEBUG_PEEP("Peep", scan, depth);
3639 
3640 
3641         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3642          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3643          * by a different invocation of reg() -- Yves
3644          */
3645         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3646 
3647 	/* Follow the next-chain of the current node and optimize
3648 	   away all the NOTHINGs from it.  */
3649 	if (OP(scan) != CURLYX) {
3650 	    const int max = (reg_off_by_arg[OP(scan)]
3651 		       ? I32_MAX
3652 		       /* I32 may be smaller than U16 on CRAYs! */
3653 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3654 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3655 	    int noff;
3656 	    regnode *n = scan;
3657 
3658 	    /* Skip NOTHING and LONGJMP. */
3659 	    while ((n = regnext(n))
3660 		   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3661 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3662 		   && off + noff < max)
3663 		off += noff;
3664 	    if (reg_off_by_arg[OP(scan)])
3665 		ARG(scan) = off;
3666 	    else
3667 		NEXT_OFF(scan) = off;
3668 	}
3669 
3670 
3671 
3672 	/* The principal pseudo-switch.  Cannot be a switch, since we
3673 	   look into several different things.  */
3674 	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3675 		   || OP(scan) == IFTHEN) {
3676 	    next = regnext(scan);
3677 	    code = OP(scan);
3678             /* demq: the op(next)==code check is to see if we have
3679              * "branch-branch" AFAICT */
3680 
3681 	    if (OP(next) == code || code == IFTHEN) {
3682                 /* NOTE - There is similar code to this block below for
3683                  * handling TRIE nodes on a re-study.  If you change stuff here
3684                  * check there too. */
3685 		SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3686 		regnode_ssc accum;
3687 		regnode * const startbranch=scan;
3688 
3689                 if (flags & SCF_DO_SUBSTR) {
3690                     /* Cannot merge strings after this. */
3691                     scan_commit(pRExC_state, data, minlenp, is_inf);
3692                 }
3693 
3694                 if (flags & SCF_DO_STCLASS)
3695 		    ssc_init_zero(pRExC_state, &accum);
3696 
3697 		while (OP(scan) == code) {
3698 		    SSize_t deltanext, minnext, fake;
3699 		    I32 f = 0;
3700 		    regnode_ssc this_class;
3701 
3702 		    num++;
3703 		    data_fake.flags = 0;
3704 		    if (data) {
3705 			data_fake.whilem_c = data->whilem_c;
3706 			data_fake.last_closep = data->last_closep;
3707 		    }
3708 		    else
3709 			data_fake.last_closep = &fake;
3710 
3711 		    data_fake.pos_delta = delta;
3712 		    next = regnext(scan);
3713 		    scan = NEXTOPER(scan);
3714 		    if (code != BRANCH)
3715 			scan = NEXTOPER(scan);
3716 		    if (flags & SCF_DO_STCLASS) {
3717 			ssc_init(pRExC_state, &this_class);
3718 			data_fake.start_class = &this_class;
3719 			f = SCF_DO_STCLASS_AND;
3720 		    }
3721 		    if (flags & SCF_WHILEM_VISITED_POS)
3722 			f |= SCF_WHILEM_VISITED_POS;
3723 
3724 		    /* we suppose the run is continuous, last=next...*/
3725 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
3726                                       &deltanext, next, &data_fake, stopparen,
3727                                       recursed_depth, NULL, f,depth+1);
3728 		    if (min1 > minnext)
3729 			min1 = minnext;
3730 		    if (deltanext == SSize_t_MAX) {
3731 			is_inf = is_inf_internal = 1;
3732 			max1 = SSize_t_MAX;
3733 		    } else if (max1 < minnext + deltanext)
3734 			max1 = minnext + deltanext;
3735 		    scan = next;
3736 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737 			pars++;
3738 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
3739 	                if ( stopmin > minnext)
3740 	                    stopmin = min + min1;
3741 	                flags &= ~SCF_DO_SUBSTR;
3742 	                if (data)
3743 	                    data->flags |= SCF_SEEN_ACCEPT;
3744 	            }
3745 		    if (data) {
3746 			if (data_fake.flags & SF_HAS_EVAL)
3747 			    data->flags |= SF_HAS_EVAL;
3748 			data->whilem_c = data_fake.whilem_c;
3749 		    }
3750 		    if (flags & SCF_DO_STCLASS)
3751 			ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3752 		}
3753 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3754 		    min1 = 0;
3755 		if (flags & SCF_DO_SUBSTR) {
3756 		    data->pos_min += min1;
3757 		    if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3758 		        data->pos_delta = SSize_t_MAX;
3759 		    else
3760 		        data->pos_delta += max1 - min1;
3761 		    if (max1 != min1 || is_inf)
3762 			data->longest = &(data->longest_float);
3763 		}
3764 		min += min1;
3765 		if (delta == SSize_t_MAX
3766 		 || SSize_t_MAX - delta - (max1 - min1) < 0)
3767 		    delta = SSize_t_MAX;
3768 		else
3769 		    delta += max1 - min1;
3770 		if (flags & SCF_DO_STCLASS_OR) {
3771 		    ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3772 		    if (min1) {
3773 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3774 			flags &= ~SCF_DO_STCLASS;
3775 		    }
3776 		}
3777 		else if (flags & SCF_DO_STCLASS_AND) {
3778 		    if (min1) {
3779 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3780 			flags &= ~SCF_DO_STCLASS;
3781 		    }
3782 		    else {
3783 			/* Switch to OR mode: cache the old value of
3784 			 * data->start_class */
3785 			INIT_AND_WITHP;
3786 			StructCopy(data->start_class, and_withp, regnode_ssc);
3787 			flags &= ~SCF_DO_STCLASS_AND;
3788 			StructCopy(&accum, data->start_class, regnode_ssc);
3789 			flags |= SCF_DO_STCLASS_OR;
3790 		    }
3791 		}
3792 
3793                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3794                         OP( startbranch ) == BRANCH )
3795                 {
3796 		/* demq.
3797 
3798                    Assuming this was/is a branch we are dealing with: 'scan'
3799                    now points at the item that follows the branch sequence,
3800                    whatever it is. We now start at the beginning of the
3801                    sequence and look for subsequences of
3802 
3803 		   BRANCH->EXACT=>x1
3804 		   BRANCH->EXACT=>x2
3805 		   tail
3806 
3807                    which would be constructed from a pattern like
3808                    /A|LIST|OF|WORDS/
3809 
3810 		   If we can find such a subsequence we need to turn the first
3811 		   element into a trie and then add the subsequent branch exact
3812 		   strings to the trie.
3813 
3814 		   We have two cases
3815 
3816                      1. patterns where the whole set of branches can be
3817                         converted.
3818 
3819 		     2. patterns where only a subset can be converted.
3820 
3821 		   In case 1 we can replace the whole set with a single regop
3822 		   for the trie. In case 2 we need to keep the start and end
3823 		   branches so
3824 
3825 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3826 		     becomes BRANCH TRIE; BRANCH X;
3827 
3828 		  There is an additional case, that being where there is a
3829 		  common prefix, which gets split out into an EXACT like node
3830 		  preceding the TRIE node.
3831 
3832 		  If x(1..n)==tail then we can do a simple trie, if not we make
3833 		  a "jump" trie, such that when we match the appropriate word
3834 		  we "jump" to the appropriate tail node. Essentially we turn
3835 		  a nested if into a case structure of sorts.
3836 
3837 		*/
3838 
3839 		    int made=0;
3840 		    if (!re_trie_maxbuff) {
3841 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3842 			if (!SvIOK(re_trie_maxbuff))
3843 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3844 		    }
3845                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3846                         regnode *cur;
3847                         regnode *first = (regnode *)NULL;
3848                         regnode *last = (regnode *)NULL;
3849                         regnode *tail = scan;
3850                         U8 trietype = 0;
3851                         U32 count=0;
3852 
3853 #ifdef DEBUGGING
3854                         SV * const mysv = sv_newmortal();   /* for dumping */
3855 #endif
3856                         /* var tail is used because there may be a TAIL
3857                            regop in the way. Ie, the exacts will point to the
3858                            thing following the TAIL, but the last branch will
3859                            point at the TAIL. So we advance tail. If we
3860                            have nested (?:) we may have to move through several
3861                            tails.
3862                          */
3863 
3864                         while ( OP( tail ) == TAIL ) {
3865                             /* this is the TAIL generated by (?:) */
3866                             tail = regnext( tail );
3867                         }
3868 
3869 
3870                         DEBUG_TRIE_COMPILE_r({
3871                             regprop(RExC_rx, mysv, tail, NULL);
3872                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3873                               (int)depth * 2 + 2, "",
3874                               "Looking for TRIE'able sequences. Tail node is: ",
3875                               SvPV_nolen_const( mysv )
3876                             );
3877                         });
3878 
3879                         /*
3880 
3881                             Step through the branches
3882                                 cur represents each branch,
3883                                 noper is the first thing to be matched as part
3884                                       of that branch
3885                                 noper_next is the regnext() of that node.
3886 
3887                             We normally handle a case like this
3888                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3889                             support building with NOJUMPTRIE, which restricts
3890                             the trie logic to structures like /FOO|BAR/.
3891 
3892                             If noper is a trieable nodetype then the branch is
3893                             a possible optimization target. If we are building
3894                             under NOJUMPTRIE then we require that noper_next is
3895                             the same as scan (our current position in the regex
3896                             program).
3897 
3898                             Once we have two or more consecutive such branches
3899                             we can create a trie of the EXACT's contents and
3900                             stitch it in place into the program.
3901 
3902                             If the sequence represents all of the branches in
3903                             the alternation we replace the entire thing with a
3904                             single TRIE node.
3905 
3906                             Otherwise when it is a subsequence we need to
3907                             stitch it in place and replace only the relevant
3908                             branches. This means the first branch has to remain
3909                             as it is used by the alternation logic, and its
3910                             next pointer, and needs to be repointed at the item
3911                             on the branch chain following the last branch we
3912                             have optimized away.
3913 
3914                             This could be either a BRANCH, in which case the
3915                             subsequence is internal, or it could be the item
3916                             following the branch sequence in which case the
3917                             subsequence is at the end (which does not
3918                             necessarily mean the first node is the start of the
3919                             alternation).
3920 
3921                             TRIE_TYPE(X) is a define which maps the optype to a
3922                             trietype.
3923 
3924                                 optype          |  trietype
3925                                 ----------------+-----------
3926                                 NOTHING         | NOTHING
3927                                 EXACT           | EXACT
3928                                 EXACTFU         | EXACTFU
3929                                 EXACTFU_SS      | EXACTFU
3930                                 EXACTFA         | EXACTFA
3931 
3932 
3933                         */
3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3935                        ( EXACT == (X) )   ? EXACT :        \
3936                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3937                        ( EXACTFA == (X) ) ? EXACTFA :        \
3938                        0 )
3939 
3940                         /* dont use tail as the end marker for this traverse */
3941                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3942                             regnode * const noper = NEXTOPER( cur );
3943                             U8 noper_type = OP( noper );
3944                             U8 noper_trietype = TRIE_TYPE( noper_type );
3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3946                             regnode * const noper_next = regnext( noper );
3947 			    U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3948 			    U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3949 #endif
3950 
3951                             DEBUG_TRIE_COMPILE_r({
3952                                 regprop(RExC_rx, mysv, cur, NULL);
3953                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3954                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3955 
3956                                 regprop(RExC_rx, mysv, noper, NULL);
3957                                 PerlIO_printf( Perl_debug_log, " -> %s",
3958                                     SvPV_nolen_const(mysv));
3959 
3960                                 if ( noper_next ) {
3961                                   regprop(RExC_rx, mysv, noper_next, NULL);
3962                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3963                                     SvPV_nolen_const(mysv));
3964                                 }
3965                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3966                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3967 				   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3968 				);
3969                             });
3970 
3971                             /* Is noper a trieable nodetype that can be merged
3972                              * with the current trie (if there is one)? */
3973                             if ( noper_trietype
3974                                   &&
3975                                   (
3976                                         ( noper_trietype == NOTHING)
3977                                         || ( trietype == NOTHING )
3978                                         || ( trietype == noper_trietype )
3979                                   )
3980 #ifdef NOJUMPTRIE
3981                                   && noper_next == tail
3982 #endif
3983                                   && count < U16_MAX)
3984                             {
3985                                 /* Handle mergable triable node Either we are
3986                                  * the first node in a new trieable sequence,
3987                                  * in which case we do some bookkeeping,
3988                                  * otherwise we update the end pointer. */
3989                                 if ( !first ) {
3990                                     first = cur;
3991 				    if ( noper_trietype == NOTHING ) {
3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3993 					regnode * const noper_next = regnext( noper );
3994                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3995 					U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3996 #endif
3997 
3998                                         if ( noper_next_trietype ) {
3999 					    trietype = noper_next_trietype;
4000                                         } else if (noper_next_type)  {
4001                                             /* a NOTHING regop is 1 regop wide.
4002                                              * We need at least two for a trie
4003                                              * so we can't merge this in */
4004                                             first = NULL;
4005                                         }
4006                                     } else {
4007                                         trietype = noper_trietype;
4008                                     }
4009                                 } else {
4010                                     if ( trietype == NOTHING )
4011                                         trietype = noper_trietype;
4012                                     last = cur;
4013                                 }
4014 				if (first)
4015 				    count++;
4016                             } /* end handle mergable triable node */
4017                             else {
4018                                 /* handle unmergable node -
4019                                  * noper may either be a triable node which can
4020                                  * not be tried together with the current trie,
4021                                  * or a non triable node */
4022                                 if ( last ) {
4023                                     /* If last is set and trietype is not
4024                                      * NOTHING then we have found at least two
4025                                      * triable branch sequences in a row of a
4026                                      * similar trietype so we can turn them
4027                                      * into a trie. If/when we allow NOTHING to
4028                                      * start a trie sequence this condition
4029                                      * will be required, and it isn't expensive
4030                                      * so we leave it in for now. */
4031                                     if ( trietype && trietype != NOTHING )
4032                                         make_trie( pRExC_state,
4033                                                 startbranch, first, cur, tail,
4034                                                 count, trietype, depth+1 );
4035                                     last = NULL; /* note: we clear/update
4036                                                     first, trietype etc below,
4037                                                     so we dont do it here */
4038                                 }
4039                                 if ( noper_trietype
4040 #ifdef NOJUMPTRIE
4041                                      && noper_next == tail
4042 #endif
4043                                 ){
4044                                     /* noper is triable, so we can start a new
4045                                      * trie sequence */
4046                                     count = 1;
4047                                     first = cur;
4048                                     trietype = noper_trietype;
4049                                 } else if (first) {
4050                                     /* if we already saw a first but the
4051                                      * current node is not triable then we have
4052                                      * to reset the first information. */
4053                                     count = 0;
4054                                     first = NULL;
4055                                     trietype = 0;
4056                                 }
4057                             } /* end handle unmergable node */
4058                         } /* loop over branches */
4059                         DEBUG_TRIE_COMPILE_r({
4060                             regprop(RExC_rx, mysv, cur, NULL);
4061                             PerlIO_printf( Perl_debug_log,
4062                               "%*s- %s (%d) <SCAN FINISHED>\n",
4063                               (int)depth * 2 + 2,
4064                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4065 
4066                         });
4067                         if ( last && trietype ) {
4068                             if ( trietype != NOTHING ) {
4069                                 /* the last branch of the sequence was part of
4070                                  * a trie, so we have to construct it here
4071                                  * outside of the loop */
4072                                 made= make_trie( pRExC_state, startbranch,
4073                                                  first, scan, tail, count,
4074                                                  trietype, depth+1 );
4075 #ifdef TRIE_STUDY_OPT
4076                                 if ( ((made == MADE_EXACT_TRIE &&
4077                                      startbranch == first)
4078                                      || ( first_non_open == first )) &&
4079                                      depth==0 ) {
4080                                     flags |= SCF_TRIE_RESTUDY;
4081                                     if ( startbranch == first
4082                                          && scan == tail )
4083                                     {
4084                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4085                                     }
4086                                 }
4087 #endif
4088                             } else {
4089                                 /* at this point we know whatever we have is a
4090                                  * NOTHING sequence/branch AND if 'startbranch'
4091                                  * is 'first' then we can turn the whole thing
4092                                  * into a NOTHING
4093                                  */
4094                                 if ( startbranch == first ) {
4095                                     regnode *opt;
4096                                     /* the entire thing is a NOTHING sequence,
4097                                      * something like this: (?:|) So we can
4098                                      * turn it into a plain NOTHING op. */
4099                                     DEBUG_TRIE_COMPILE_r({
4100                                         regprop(RExC_rx, mysv, cur, NULL);
4101                                         PerlIO_printf( Perl_debug_log,
4102                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4103                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4104 
4105                                     });
4106                                     OP(startbranch)= NOTHING;
4107                                     NEXT_OFF(startbranch)= tail - startbranch;
4108                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4109                                         OP(opt)= OPTIMIZED;
4110                                 }
4111                             }
4112                         } /* end if ( last) */
4113                     } /* TRIE_MAXBUF is non zero */
4114 
4115                 } /* do trie */
4116 
4117 	    }
4118 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4119 		scan = NEXTOPER(NEXTOPER(scan));
4120 	    } else			/* single branch is optimized. */
4121 		scan = NEXTOPER(scan);
4122 	    continue;
4123 	} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4124 	    scan_frame *newframe = NULL;
4125 	    I32 paren;
4126 	    regnode *start;
4127 	    regnode *end;
4128             U32 my_recursed_depth= recursed_depth;
4129 
4130 	    if (OP(scan) != SUSPEND) {
4131                 /* set the pointer */
4132 	        if (OP(scan) == GOSUB) {
4133 	            paren = ARG(scan);
4134 	            RExC_recurse[ARG2L(scan)] = scan;
4135                     start = RExC_open_parens[paren-1];
4136                     end   = RExC_close_parens[paren-1];
4137                 } else {
4138                     paren = 0;
4139                     start = RExC_rxi->program + 1;
4140                     end   = RExC_opend;
4141                 }
4142                 if (!recursed_depth
4143                     ||
4144                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4145                 ) {
4146                     if (!recursed_depth) {
4147                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4148                     } else {
4149                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4150                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed_bytes, U8);
4152                     }
4153                     /* we havent recursed into this paren yet, so recurse into it */
4154 	            DEBUG_STUDYDATA("set:", data,depth);
4155                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4156                     my_recursed_depth= recursed_depth + 1;
4157                     Newx(newframe,1,scan_frame);
4158                 } else {
4159 	            DEBUG_STUDYDATA("inf:", data,depth);
4160                     /* some form of infinite recursion, assume infinite length
4161                      * */
4162                     if (flags & SCF_DO_SUBSTR) {
4163                         scan_commit(pRExC_state, data, minlenp, is_inf);
4164                         data->longest = &(data->longest_float);
4165                     }
4166                     is_inf = is_inf_internal = 1;
4167                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4168                         ssc_anything(data->start_class);
4169                     flags &= ~SCF_DO_STCLASS;
4170 	        }
4171             } else {
4172 	        Newx(newframe,1,scan_frame);
4173 	        paren = stopparen;
4174 	        start = scan+2;
4175 	        end = regnext(scan);
4176 	    }
4177 	    if (newframe) {
4178                 assert(start);
4179                 assert(end);
4180 	        SAVEFREEPV(newframe);
4181 	        newframe->next = regnext(scan);
4182 	        newframe->last = last;
4183 	        newframe->stop = stopparen;
4184 	        newframe->prev = frame;
4185                 newframe->prev_recursed_depth = recursed_depth;
4186 
4187                 DEBUG_STUDYDATA("frame-new:",data,depth);
4188                 DEBUG_PEEP("fnew", scan, depth);
4189 
4190 	        frame = newframe;
4191 	        scan =  start;
4192 	        stopparen = paren;
4193 	        last = end;
4194                 depth = depth + 1;
4195                 recursed_depth= my_recursed_depth;
4196 
4197 	        continue;
4198 	    }
4199 	}
4200 	else if (OP(scan) == EXACT) {
4201 	    SSize_t l = STR_LEN(scan);
4202 	    UV uc;
4203 	    if (UTF) {
4204 		const U8 * const s = (U8*)STRING(scan);
4205 		uc = utf8_to_uvchr_buf(s, s + l, NULL);
4206 		l = utf8_length(s, s + l);
4207 	    } else {
4208 		uc = *((U8*)STRING(scan));
4209 	    }
4210 	    min += l;
4211 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4212 		/* The code below prefers earlier match for fixed
4213 		   offset, later match for variable offset.  */
4214 		if (data->last_end == -1) { /* Update the start info. */
4215 		    data->last_start_min = data->pos_min;
4216  		    data->last_start_max = is_inf
4217  			? SSize_t_MAX : data->pos_min + data->pos_delta;
4218 		}
4219 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4220 		if (UTF)
4221 		    SvUTF8_on(data->last_found);
4222 		{
4223 		    SV * const sv = data->last_found;
4224 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4225 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
4226 		    if (mg && mg->mg_len >= 0)
4227 			mg->mg_len += utf8_length((U8*)STRING(scan),
4228                                               (U8*)STRING(scan)+STR_LEN(scan));
4229 		}
4230 		data->last_end = data->pos_min + l;
4231 		data->pos_min += l; /* As in the first entry. */
4232 		data->flags &= ~SF_BEFORE_EOL;
4233 	    }
4234 
4235             /* ANDing the code point leaves at most it, and not in locale, and
4236              * can't match null string */
4237 	    if (flags & SCF_DO_STCLASS_AND) {
4238                 ssc_cp_and(data->start_class, uc);
4239                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4240                 ssc_clear_locale(data->start_class);
4241 	    }
4242 	    else if (flags & SCF_DO_STCLASS_OR) {
4243                 ssc_add_cp(data->start_class, uc);
4244 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4245 
4246                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4247                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4248 	    }
4249 	    flags &= ~SCF_DO_STCLASS;
4250 	}
4251 	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4252 	    SSize_t l = STR_LEN(scan);
4253 	    UV uc = *((U8*)STRING(scan));
4254             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4255                                                      separate code points */
4256 
4257 	    /* Search for fixed substrings supports EXACT only. */
4258 	    if (flags & SCF_DO_SUBSTR) {
4259 		assert(data);
4260                 scan_commit(pRExC_state, data, minlenp, is_inf);
4261 	    }
4262 	    if (UTF) {
4263 		const U8 * const s = (U8 *)STRING(scan);
4264 		uc = utf8_to_uvchr_buf(s, s + l, NULL);
4265 		l = utf8_length(s, s + l);
4266 	    }
4267 	    if (unfolded_multi_char) {
4268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4269 	    }
4270 	    min += l - min_subtract;
4271             assert (min >= 0);
4272             delta += min_subtract;
4273 	    if (flags & SCF_DO_SUBSTR) {
4274 		data->pos_min += l - min_subtract;
4275 		if (data->pos_min < 0) {
4276                     data->pos_min = 0;
4277                 }
4278                 data->pos_delta += min_subtract;
4279 		if (min_subtract) {
4280 		    data->longest = &(data->longest_float);
4281 		}
4282 	    }
4283             if (OP(scan) == EXACTFL) {
4284 
4285                 /* We don't know what the folds are; it could be anything. XXX
4286                  * Actually, we only support UTF-8 encoding for code points
4287                  * above Latin1, so we could know what those folds are. */
4288                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4289                                                        0,
4290                                                        UV_MAX);
4291             }
4292             else {  /* Non-locale EXACTFish */
4293                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4294                 if (flags & SCF_DO_STCLASS_AND) {
4295                     ssc_clear_locale(data->start_class);
4296                 }
4297                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4298                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4299                                                        know if anything folds
4300                                                        with this */
4301                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4302                                                            PL_fold_latin1[uc]);
4303                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4304                                                       legal under /iaa */
4305                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4306                                 EXACTF_invlist
4307                                     = add_cp_to_invlist(EXACTF_invlist,
4308                                                 LATIN_SMALL_LETTER_SHARP_S);
4309                             }
4310                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4311                                 EXACTF_invlist
4312                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4313                                 EXACTF_invlist
4314                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4315                             }
4316                         }
4317 
4318                         /* We also know if there are above-Latin1 code points
4319                          * that fold to this (none legal for ASCII and /iaa) */
4320                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4321                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4322                         {
4323                             /* XXX We could know exactly what does fold to this
4324                              * if the reverse folds are loaded, as currently in
4325                              * S_regclass() */
4326                             _invlist_union(EXACTF_invlist,
4327                                            PL_AboveLatin1,
4328                                            &EXACTF_invlist);
4329                         }
4330                     }
4331                 }
4332                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4333                            know what participates in folds with this, so have
4334                            to assume anything could */
4335 
4336                     /* XXX We could know exactly what does fold to this if the
4337                      * reverse folds are loaded, as currently in S_regclass().
4338                      * But we do know that under /iaa nothing in the ASCII
4339                      * range can participate */
4340                     if (OP(scan) == EXACTFA) {
4341                         _invlist_union_complement_2nd(EXACTF_invlist,
4342                                                       PL_XPosix_ptrs[_CC_ASCII],
4343                                                       &EXACTF_invlist);
4344                     }
4345                     else {
4346                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4347                                                                0, UV_MAX);
4348                     }
4349                 }
4350             }
4351 	    if (flags & SCF_DO_STCLASS_AND) {
4352                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4353                 ANYOF_POSIXL_ZERO(data->start_class);
4354                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4355 	    }
4356 	    else if (flags & SCF_DO_STCLASS_OR) {
4357                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4358 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4359 
4360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4361                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4362 	    }
4363 	    flags &= ~SCF_DO_STCLASS;
4364             SvREFCNT_dec(EXACTF_invlist);
4365 	}
4366 	else if (REGNODE_VARIES(OP(scan))) {
4367 	    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4368 	    I32 fl = 0, f = flags;
4369 	    regnode * const oscan = scan;
4370 	    regnode_ssc this_class;
4371 	    regnode_ssc *oclass = NULL;
4372 	    I32 next_is_eval = 0;
4373 
4374 	    switch (PL_regkind[OP(scan)]) {
4375 	    case WHILEM:		/* End of (?:...)* . */
4376 		scan = NEXTOPER(scan);
4377 		goto finish;
4378 	    case PLUS:
4379 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4380 		    next = NEXTOPER(scan);
4381 		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4382 			mincount = 1;
4383 			maxcount = REG_INFTY;
4384 			next = regnext(scan);
4385 			scan = NEXTOPER(scan);
4386 			goto do_curly;
4387 		    }
4388 		}
4389 		if (flags & SCF_DO_SUBSTR)
4390 		    data->pos_min++;
4391 		min++;
4392 		/* Fall through. */
4393 	    case STAR:
4394 		if (flags & SCF_DO_STCLASS) {
4395 		    mincount = 0;
4396 		    maxcount = REG_INFTY;
4397 		    next = regnext(scan);
4398 		    scan = NEXTOPER(scan);
4399 		    goto do_curly;
4400 		}
4401 		if (flags & SCF_DO_SUBSTR) {
4402                     scan_commit(pRExC_state, data, minlenp, is_inf);
4403                     /* Cannot extend fixed substrings */
4404 		    data->longest = &(data->longest_float);
4405 		}
4406                 is_inf = is_inf_internal = 1;
4407                 scan = regnext(scan);
4408 		goto optimize_curly_tail;
4409 	    case CURLY:
4410 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4411 	            && (scan->flags == stopparen))
4412 		{
4413 		    mincount = 1;
4414 		    maxcount = 1;
4415 		} else {
4416 		    mincount = ARG1(scan);
4417 		    maxcount = ARG2(scan);
4418 		}
4419 		next = regnext(scan);
4420 		if (OP(scan) == CURLYX) {
4421 		    I32 lp = (data ? *(data->last_closep) : 0);
4422 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4423 		}
4424 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4425 		next_is_eval = (OP(scan) == EVAL);
4426 	      do_curly:
4427 		if (flags & SCF_DO_SUBSTR) {
4428                     if (mincount == 0)
4429                         scan_commit(pRExC_state, data, minlenp, is_inf);
4430                     /* Cannot extend fixed substrings */
4431 		    pos_before = data->pos_min;
4432 		}
4433 		if (data) {
4434 		    fl = data->flags;
4435 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4436 		    if (is_inf)
4437 			data->flags |= SF_IS_INF;
4438 		}
4439 		if (flags & SCF_DO_STCLASS) {
4440 		    ssc_init(pRExC_state, &this_class);
4441 		    oclass = data->start_class;
4442 		    data->start_class = &this_class;
4443 		    f |= SCF_DO_STCLASS_AND;
4444 		    f &= ~SCF_DO_STCLASS_OR;
4445 		}
4446 	        /* Exclude from super-linear cache processing any {n,m}
4447 		   regops for which the combination of input pos and regex
4448 		   pos is not enough information to determine if a match
4449 		   will be possible.
4450 
4451 		   For example, in the regex /foo(bar\s*){4,8}baz/ with the
4452 		   regex pos at the \s*, the prospects for a match depend not
4453 		   only on the input position but also on how many (bar\s*)
4454 		   repeats into the {4,8} we are. */
4455                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4456 		    f &= ~SCF_WHILEM_VISITED_POS;
4457 
4458 		/* This will finish on WHILEM, setting scan, or on NULL: */
4459 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4460                                   last, data, stopparen, recursed_depth, NULL,
4461                                   (mincount == 0
4462                                    ? (f & ~SCF_DO_SUBSTR)
4463                                    : f)
4464                                   ,depth+1);
4465 
4466 		if (flags & SCF_DO_STCLASS)
4467 		    data->start_class = oclass;
4468 		if (mincount == 0 || minnext == 0) {
4469 		    if (flags & SCF_DO_STCLASS_OR) {
4470 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4471 		    }
4472 		    else if (flags & SCF_DO_STCLASS_AND) {
4473 			/* Switch to OR mode: cache the old value of
4474 			 * data->start_class */
4475 			INIT_AND_WITHP;
4476 			StructCopy(data->start_class, and_withp, regnode_ssc);
4477 			flags &= ~SCF_DO_STCLASS_AND;
4478 			StructCopy(&this_class, data->start_class, regnode_ssc);
4479 			flags |= SCF_DO_STCLASS_OR;
4480                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4481 		    }
4482 		} else {		/* Non-zero len */
4483 		    if (flags & SCF_DO_STCLASS_OR) {
4484 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4485 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4486 		    }
4487 		    else if (flags & SCF_DO_STCLASS_AND)
4488 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489 		    flags &= ~SCF_DO_STCLASS;
4490 		}
4491 		if (!scan) 		/* It was not CURLYX, but CURLY. */
4492 		    scan = next;
4493 		if (!(flags & SCF_TRIE_DOING_RESTUDY)
4494 		    /* ? quantifier ok, except for (?{ ... }) */
4495 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
4496 		    && (minnext == 0) && (deltanext == 0)
4497 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4498                     && maxcount <= REG_INFTY/3) /* Complement check for big
4499                                                    count */
4500 		{
4501 		    /* Fatal warnings may leak the regexp without this: */
4502 		    SAVEFREESV(RExC_rx_sv);
4503 		    ckWARNreg(RExC_parse,
4504 			    "Quantifier unexpected on zero-length expression");
4505 		    (void)ReREFCNT_inc(RExC_rx_sv);
4506 		}
4507 
4508 		min += minnext * mincount;
4509 		is_inf_internal |= deltanext == SSize_t_MAX
4510                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4511 		is_inf |= is_inf_internal;
4512                 if (is_inf) {
4513 		    delta = SSize_t_MAX;
4514                 } else {
4515 		    delta += (minnext + deltanext) * maxcount
4516                              - minnext * mincount;
4517                 }
4518 		/* Try powerful optimization CURLYX => CURLYN. */
4519 		if (  OP(oscan) == CURLYX && data
4520 		      && data->flags & SF_IN_PAR
4521 		      && !(data->flags & SF_HAS_EVAL)
4522 		      && !deltanext && minnext == 1 ) {
4523 		    /* Try to optimize to CURLYN.  */
4524 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4525 		    regnode * const nxt1 = nxt;
4526 #ifdef DEBUGGING
4527 		    regnode *nxt2;
4528 #endif
4529 
4530 		    /* Skip open. */
4531 		    nxt = regnext(nxt);
4532 		    if (!REGNODE_SIMPLE(OP(nxt))
4533 			&& !(PL_regkind[OP(nxt)] == EXACT
4534 			     && STR_LEN(nxt) == 1))
4535 			goto nogo;
4536 #ifdef DEBUGGING
4537 		    nxt2 = nxt;
4538 #endif
4539 		    nxt = regnext(nxt);
4540 		    if (OP(nxt) != CLOSE)
4541 			goto nogo;
4542 		    if (RExC_open_parens) {
4543 			RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4544 			RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4545 		    }
4546 		    /* Now we know that nxt2 is the only contents: */
4547 		    oscan->flags = (U8)ARG(nxt);
4548 		    OP(oscan) = CURLYN;
4549 		    OP(nxt1) = NOTHING;	/* was OPEN. */
4550 
4551 #ifdef DEBUGGING
4552 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4553 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4554 		    NEXT_OFF(nxt2) = 0;	/* just for consistency with CURLY. */
4555 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
4556 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
4557 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4558 #endif
4559 		}
4560 	      nogo:
4561 
4562 		/* Try optimization CURLYX => CURLYM. */
4563 		if (  OP(oscan) == CURLYX && data
4564 		      && !(data->flags & SF_HAS_PAR)
4565 		      && !(data->flags & SF_HAS_EVAL)
4566 		      && !deltanext	/* atom is fixed width */
4567 		      && minnext != 0	/* CURLYM can't handle zero width */
4568 
4569                          /* Nor characters whose fold at run-time may be
4570                           * multi-character */
4571                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4572 		) {
4573 		    /* XXXX How to optimize if data == 0? */
4574 		    /* Optimize to a simpler form.  */
4575 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4576 		    regnode *nxt2;
4577 
4578 		    OP(oscan) = CURLYM;
4579 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4580 			    && (OP(nxt2) != WHILEM))
4581 			nxt = nxt2;
4582 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4583 		    /* Need to optimize away parenths. */
4584 		    if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4585 			/* Set the parenth number.  */
4586 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4587 
4588 			oscan->flags = (U8)ARG(nxt);
4589 			if (RExC_open_parens) {
4590 			    RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4591 			    RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4592 			}
4593 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
4594 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
4595 
4596 #ifdef DEBUGGING
4597 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4598 			OP(nxt + 1) = OPTIMIZED; /* was count. */
4599 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4600 			NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4601 #endif
4602 #if 0
4603 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
4604 			    regnode *nnxt = regnext(nxt1);
4605 			    if (nnxt == nxt) {
4606 				if (reg_off_by_arg[OP(nxt1)])
4607 				    ARG_SET(nxt1, nxt2 - nxt1);
4608 				else if (nxt2 - nxt1 < U16_MAX)
4609 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
4610 				else
4611 				    OP(nxt) = NOTHING;	/* Cannot beautify */
4612 			    }
4613 			    nxt1 = nnxt;
4614 			}
4615 #endif
4616 			/* Optimize again: */
4617 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4618                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4619 		    }
4620 		    else
4621 			oscan->flags = 0;
4622 		}
4623 		else if ((OP(oscan) == CURLYX)
4624 			 && (flags & SCF_WHILEM_VISITED_POS)
4625 			 /* See the comment on a similar expression above.
4626 			    However, this time it's not a subexpression
4627 			    we care about, but the expression itself. */
4628 			 && (maxcount == REG_INFTY)
4629 			 && data && ++data->whilem_c < 16) {
4630 		    /* This stays as CURLYX, we can put the count/of pair. */
4631 		    /* Find WHILEM (as in regexec.c) */
4632 		    regnode *nxt = oscan + NEXT_OFF(oscan);
4633 
4634 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4635 			nxt += ARG(nxt);
4636 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
4637 			| (RExC_whilem_seen << 4)); /* On WHILEM */
4638 		}
4639 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4640 		    pars++;
4641 		if (flags & SCF_DO_SUBSTR) {
4642 		    SV *last_str = NULL;
4643                     STRLEN last_chrs = 0;
4644 		    int counted = mincount != 0;
4645 
4646                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4647                                                                   string. */
4648 			SSize_t b = pos_before >= data->last_start_min
4649 			    ? pos_before : data->last_start_min;
4650 			STRLEN l;
4651 			const char * const s = SvPV_const(data->last_found, l);
4652 			SSize_t old = b - data->last_start_min;
4653 
4654 			if (UTF)
4655 			    old = utf8_hop((U8*)s, old) - (U8*)s;
4656 			l -= old;
4657 			/* Get the added string: */
4658 			last_str = newSVpvn_utf8(s  + old, l, UTF);
4659                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4660                                             (U8*)(s + old + l)) : l;
4661 			if (deltanext == 0 && pos_before == b) {
4662 			    /* What was added is a constant string */
4663 			    if (mincount > 1) {
4664 
4665 				SvGROW(last_str, (mincount * l) + 1);
4666 				repeatcpy(SvPVX(last_str) + l,
4667 					  SvPVX_const(last_str), l,
4668                                           mincount - 1);
4669 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
4670 				/* Add additional parts. */
4671 				SvCUR_set(data->last_found,
4672 					  SvCUR(data->last_found) - l);
4673 				sv_catsv(data->last_found, last_str);
4674 				{
4675 				    SV * sv = data->last_found;
4676 				    MAGIC *mg =
4677 					SvUTF8(sv) && SvMAGICAL(sv) ?
4678 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
4679 				    if (mg && mg->mg_len >= 0)
4680 					mg->mg_len += last_chrs * (mincount-1);
4681 				}
4682                                 last_chrs *= mincount;
4683 				data->last_end += l * (mincount - 1);
4684 			    }
4685 			} else {
4686 			    /* start offset must point into the last copy */
4687 			    data->last_start_min += minnext * (mincount - 1);
4688 			    data->last_start_max += is_inf ? SSize_t_MAX
4689 				: (maxcount - 1) * (minnext + data->pos_delta);
4690 			}
4691 		    }
4692 		    /* It is counted once already... */
4693 		    data->pos_min += minnext * (mincount - counted);
4694 #if 0
4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4696                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4697                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4698     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4699     (UV)mincount);
4700 if (deltanext != SSize_t_MAX)
4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4702     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4703           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4704 #endif
4705 		    if (deltanext == SSize_t_MAX
4706                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4707 		        data->pos_delta = SSize_t_MAX;
4708 		    else
4709 		        data->pos_delta += - counted * deltanext +
4710 			(minnext + deltanext) * maxcount - minnext * mincount;
4711 		    if (mincount != maxcount) {
4712 			 /* Cannot extend fixed substrings found inside
4713 			    the group.  */
4714                         scan_commit(pRExC_state, data, minlenp, is_inf);
4715 			if (mincount && last_str) {
4716 			    SV * const sv = data->last_found;
4717 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4718 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
4719 
4720 			    if (mg)
4721 				mg->mg_len = -1;
4722 			    sv_setsv(sv, last_str);
4723 			    data->last_end = data->pos_min;
4724 			    data->last_start_min = data->pos_min - last_chrs;
4725 			    data->last_start_max = is_inf
4726 				? SSize_t_MAX
4727 				: data->pos_min + data->pos_delta - last_chrs;
4728 			}
4729 			data->longest = &(data->longest_float);
4730 		    }
4731 		    SvREFCNT_dec(last_str);
4732 		}
4733 		if (data && (fl & SF_HAS_EVAL))
4734 		    data->flags |= SF_HAS_EVAL;
4735 	      optimize_curly_tail:
4736 		if (OP(oscan) != CURLYX) {
4737 		    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4738 			   && NEXT_OFF(next))
4739 			NEXT_OFF(oscan) += NEXT_OFF(next);
4740 		}
4741 		continue;
4742 
4743 	    default:
4744 #ifdef DEBUGGING
4745                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4746                                                                     OP(scan));
4747 #endif
4748             case REF:
4749             case CLUMP:
4750 		if (flags & SCF_DO_SUBSTR) {
4751                     /* Cannot expect anything... */
4752                     scan_commit(pRExC_state, data, minlenp, is_inf);
4753 		    data->longest = &(data->longest_float);
4754 		}
4755 		is_inf = is_inf_internal = 1;
4756 		if (flags & SCF_DO_STCLASS_OR) {
4757                     if (OP(scan) == CLUMP) {
4758                         /* Actually is any start char, but very few code points
4759                          * aren't start characters */
4760                         ssc_match_all_cp(data->start_class);
4761                     }
4762                     else {
4763                         ssc_anything(data->start_class);
4764                     }
4765                 }
4766 		flags &= ~SCF_DO_STCLASS;
4767 		break;
4768 	    }
4769 	}
4770 	else if (OP(scan) == LNBREAK) {
4771 	    if (flags & SCF_DO_STCLASS) {
4772     	        if (flags & SCF_DO_STCLASS_AND) {
4773                     ssc_intersection(data->start_class,
4774                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4775                     ssc_clear_locale(data->start_class);
4776                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4777                 }
4778                 else if (flags & SCF_DO_STCLASS_OR) {
4779                     ssc_union(data->start_class,
4780                               PL_XPosix_ptrs[_CC_VERTSPACE],
4781                               FALSE);
4782 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4783 
4784                     /* See commit msg for
4785                      * 749e076fceedeb708a624933726e7989f2302f6a */
4786                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4787                 }
4788 		flags &= ~SCF_DO_STCLASS;
4789             }
4790 	    min++;
4791 	    delta++;    /* Because of the 2 char string cr-lf */
4792             if (flags & SCF_DO_SUBSTR) {
4793                 /* Cannot expect anything... */
4794                 scan_commit(pRExC_state, data, minlenp, is_inf);
4795     	        data->pos_min += 1;
4796 	        data->pos_delta += 1;
4797 		data->longest = &(data->longest_float);
4798     	    }
4799 	}
4800 	else if (REGNODE_SIMPLE(OP(scan))) {
4801 
4802 	    if (flags & SCF_DO_SUBSTR) {
4803                 scan_commit(pRExC_state, data, minlenp, is_inf);
4804 		data->pos_min++;
4805 	    }
4806 	    min++;
4807 	    if (flags & SCF_DO_STCLASS) {
4808                 bool invert = 0;
4809                 SV* my_invlist = NULL;
4810                 U8 namedclass;
4811 
4812                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4813                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4814 
4815 		/* Some of the logic below assumes that switching
4816 		   locale on will only add false positives. */
4817 		switch (OP(scan)) {
4818 
4819 		default:
4820 #ifdef DEBUGGING
4821                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4822                                                                      OP(scan));
4823 #endif
4824 		case CANY:
4825 		case SANY:
4826 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4827 			ssc_match_all_cp(data->start_class);
4828 		    break;
4829 
4830 		case REG_ANY:
4831                     {
4832                         SV* REG_ANY_invlist = _new_invlist(2);
4833                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4834                                                             '\n');
4835                         if (flags & SCF_DO_STCLASS_OR) {
4836                             ssc_union(data->start_class,
4837                                       REG_ANY_invlist,
4838                                       TRUE /* TRUE => invert, hence all but \n
4839                                             */
4840                                       );
4841                         }
4842                         else if (flags & SCF_DO_STCLASS_AND) {
4843                             ssc_intersection(data->start_class,
4844                                              REG_ANY_invlist,
4845                                              TRUE  /* TRUE => invert */
4846                                              );
4847                             ssc_clear_locale(data->start_class);
4848                         }
4849                         SvREFCNT_dec_NN(REG_ANY_invlist);
4850 		    }
4851 		    break;
4852 
4853                 case ANYOF:
4854 		    if (flags & SCF_DO_STCLASS_AND)
4855 			ssc_and(pRExC_state, data->start_class,
4856                                 (regnode_charclass *) scan);
4857 		    else
4858 			ssc_or(pRExC_state, data->start_class,
4859                                                           (regnode_charclass *) scan);
4860 		    break;
4861 
4862 		case NPOSIXL:
4863                     invert = 1;
4864                     /* FALL THROUGH */
4865 
4866 		case POSIXL:
4867                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4868                     if (flags & SCF_DO_STCLASS_AND) {
4869                         bool was_there = cBOOL(
4870                                           ANYOF_POSIXL_TEST(data->start_class,
4871                                                                  namedclass));
4872                         ANYOF_POSIXL_ZERO(data->start_class);
4873                         if (was_there) {    /* Do an AND */
4874                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4875                         }
4876                         /* No individual code points can now match */
4877                         data->start_class->invlist
4878                                                 = sv_2mortal(_new_invlist(0));
4879                     }
4880                     else {
4881                         int complement = namedclass + ((invert) ? -1 : 1);
4882 
4883                         assert(flags & SCF_DO_STCLASS_OR);
4884 
4885                         /* If the complement of this class was already there,
4886                          * the result is that they match all code points,
4887                          * (\d + \D == everything).  Remove the classes from
4888                          * future consideration.  Locale is not relevant in
4889                          * this case */
4890                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4891                             ssc_match_all_cp(data->start_class);
4892                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4894                         }
4895                         else {  /* The usual case; just add this class to the
4896                                    existing set */
4897                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4898                         }
4899                     }
4900                     break;
4901 
4902                 case NPOSIXA:   /* For these, we always know the exact set of
4903                                    what's matched */
4904                     invert = 1;
4905                     /* FALL THROUGH */
4906 		case POSIXA:
4907                     if (FLAGS(scan) == _CC_ASCII) {
4908                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
4909                     }
4910                     else {
4911                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4912                                               PL_XPosix_ptrs[_CC_ASCII],
4913                                               &my_invlist);
4914                     }
4915                     goto join_posix;
4916 
4917 		case NPOSIXD:
4918 		case NPOSIXU:
4919                     invert = 1;
4920                     /* FALL THROUGH */
4921 		case POSIXD:
4922 		case POSIXU:
4923                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4924 
4925                     /* NPOSIXD matches all upper Latin1 code points unless the
4926                      * target string being matched is UTF-8, which is
4927                      * unknowable until match time.  Since we are going to
4928                      * invert, we want to get rid of all of them so that the
4929                      * inversion will match all */
4930                     if (OP(scan) == NPOSIXD) {
4931                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4932                                           &my_invlist);
4933                     }
4934 
4935                   join_posix:
4936 
4937                     if (flags & SCF_DO_STCLASS_AND) {
4938                         ssc_intersection(data->start_class, my_invlist, invert);
4939                         ssc_clear_locale(data->start_class);
4940                     }
4941                     else {
4942                         assert(flags & SCF_DO_STCLASS_OR);
4943                         ssc_union(data->start_class, my_invlist, invert);
4944                     }
4945                     SvREFCNT_dec(my_invlist);
4946 		}
4947 		if (flags & SCF_DO_STCLASS_OR)
4948 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4949 		flags &= ~SCF_DO_STCLASS;
4950 	    }
4951 	}
4952 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4953 	    data->flags |= (OP(scan) == MEOL
4954 			    ? SF_BEFORE_MEOL
4955 			    : SF_BEFORE_SEOL);
4956             scan_commit(pRExC_state, data, minlenp, is_inf);
4957 
4958 	}
4959 	else if (  PL_regkind[OP(scan)] == BRANCHJ
4960 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
4961 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
4962 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4963             if ( OP(scan) == UNLESSM &&
4964                  scan->flags == 0 &&
4965                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4966                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4967             ) {
4968                 regnode *opt;
4969                 regnode *upto= regnext(scan);
4970                 DEBUG_PARSE_r({
4971                     SV * const mysv_val=sv_newmortal();
4972                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4973 
4974                     /*DEBUG_PARSE_MSG("opfail");*/
4975                     regprop(RExC_rx, mysv_val, upto, NULL);
4976                     PerlIO_printf(Perl_debug_log,
4977                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4978                         SvPV_nolen_const(mysv_val),
4979                         (IV)REG_NODE_NUM(upto),
4980                         (IV)(upto - scan)
4981                     );
4982                 });
4983                 OP(scan) = OPFAIL;
4984                 NEXT_OFF(scan) = upto - scan;
4985                 for (opt= scan + 1; opt < upto ; opt++)
4986                     OP(opt) = OPTIMIZED;
4987                 scan= upto;
4988                 continue;
4989             }
4990             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4991                 || OP(scan) == UNLESSM )
4992             {
4993                 /* Negative Lookahead/lookbehind
4994                    In this case we can't do fixed string optimisation.
4995                 */
4996 
4997                 SSize_t deltanext, minnext, fake = 0;
4998                 regnode *nscan;
4999                 regnode_ssc intrnl;
5000                 int f = 0;
5001 
5002                 data_fake.flags = 0;
5003                 if (data) {
5004                     data_fake.whilem_c = data->whilem_c;
5005                     data_fake.last_closep = data->last_closep;
5006 		}
5007                 else
5008                     data_fake.last_closep = &fake;
5009 		data_fake.pos_delta = delta;
5010                 if ( flags & SCF_DO_STCLASS && !scan->flags
5011                      && OP(scan) == IFMATCH ) { /* Lookahead */
5012                     ssc_init(pRExC_state, &intrnl);
5013                     data_fake.start_class = &intrnl;
5014                     f |= SCF_DO_STCLASS_AND;
5015 		}
5016                 if (flags & SCF_WHILEM_VISITED_POS)
5017                     f |= SCF_WHILEM_VISITED_POS;
5018                 next = regnext(scan);
5019                 nscan = NEXTOPER(NEXTOPER(scan));
5020                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5021                                       last, &data_fake, stopparen,
5022                                       recursed_depth, NULL, f, depth+1);
5023                 if (scan->flags) {
5024                     if (deltanext) {
5025 			FAIL("Variable length lookbehind not implemented");
5026                     }
5027                     else if (minnext > (I32)U8_MAX) {
5028 			FAIL2("Lookbehind longer than %"UVuf" not implemented",
5029                               (UV)U8_MAX);
5030                     }
5031                     scan->flags = (U8)minnext;
5032                 }
5033                 if (data) {
5034                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5035                         pars++;
5036                     if (data_fake.flags & SF_HAS_EVAL)
5037                         data->flags |= SF_HAS_EVAL;
5038                     data->whilem_c = data_fake.whilem_c;
5039                 }
5040                 if (f & SCF_DO_STCLASS_AND) {
5041 		    if (flags & SCF_DO_STCLASS_OR) {
5042 			/* OR before, AND after: ideally we would recurse with
5043 			 * data_fake to get the AND applied by study of the
5044 			 * remainder of the pattern, and then derecurse;
5045 			 * *** HACK *** for now just treat as "no information".
5046 			 * See [perl #56690].
5047 			 */
5048 			ssc_init(pRExC_state, data->start_class);
5049 		    }  else {
5050                         /* AND before and after: combine and continue.  These
5051                          * assertions are zero-length, so can match an EMPTY
5052                          * string */
5053 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5054                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5055 		    }
5056                 }
5057 	    }
5058 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5059             else {
5060                 /* Positive Lookahead/lookbehind
5061                    In this case we can do fixed string optimisation,
5062                    but we must be careful about it. Note in the case of
5063                    lookbehind the positions will be offset by the minimum
5064                    length of the pattern, something we won't know about
5065                    until after the recurse.
5066                 */
5067                 SSize_t deltanext, fake = 0;
5068                 regnode *nscan;
5069                 regnode_ssc intrnl;
5070                 int f = 0;
5071                 /* We use SAVEFREEPV so that when the full compile
5072                     is finished perl will clean up the allocated
5073                     minlens when it's all done. This way we don't
5074                     have to worry about freeing them when we know
5075                     they wont be used, which would be a pain.
5076                  */
5077                 SSize_t *minnextp;
5078                 Newx( minnextp, 1, SSize_t );
5079                 SAVEFREEPV(minnextp);
5080 
5081                 if (data) {
5082                     StructCopy(data, &data_fake, scan_data_t);
5083                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5084                         f |= SCF_DO_SUBSTR;
5085                         if (scan->flags)
5086                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5087                         data_fake.last_found=newSVsv(data->last_found);
5088                     }
5089                 }
5090                 else
5091                     data_fake.last_closep = &fake;
5092                 data_fake.flags = 0;
5093 		data_fake.pos_delta = delta;
5094                 if (is_inf)
5095 	            data_fake.flags |= SF_IS_INF;
5096                 if ( flags & SCF_DO_STCLASS && !scan->flags
5097                      && OP(scan) == IFMATCH ) { /* Lookahead */
5098                     ssc_init(pRExC_state, &intrnl);
5099                     data_fake.start_class = &intrnl;
5100                     f |= SCF_DO_STCLASS_AND;
5101                 }
5102                 if (flags & SCF_WHILEM_VISITED_POS)
5103                     f |= SCF_WHILEM_VISITED_POS;
5104                 next = regnext(scan);
5105                 nscan = NEXTOPER(NEXTOPER(scan));
5106 
5107                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5108                                         &deltanext, last, &data_fake,
5109                                         stopparen, recursed_depth, NULL,
5110                                         f,depth+1);
5111                 if (scan->flags) {
5112                     if (deltanext) {
5113 			FAIL("Variable length lookbehind not implemented");
5114                     }
5115                     else if (*minnextp > (I32)U8_MAX) {
5116 			FAIL2("Lookbehind longer than %"UVuf" not implemented",
5117                               (UV)U8_MAX);
5118                     }
5119                     scan->flags = (U8)*minnextp;
5120                 }
5121 
5122                 *minnextp += min;
5123 
5124                 if (f & SCF_DO_STCLASS_AND) {
5125                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5126                     ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5127                 }
5128                 if (data) {
5129                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5130                         pars++;
5131                     if (data_fake.flags & SF_HAS_EVAL)
5132                         data->flags |= SF_HAS_EVAL;
5133                     data->whilem_c = data_fake.whilem_c;
5134                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5135                         if (RExC_rx->minlen<*minnextp)
5136                             RExC_rx->minlen=*minnextp;
5137                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5138                         SvREFCNT_dec_NN(data_fake.last_found);
5139 
5140                         if ( data_fake.minlen_fixed != minlenp )
5141                         {
5142                             data->offset_fixed= data_fake.offset_fixed;
5143                             data->minlen_fixed= data_fake.minlen_fixed;
5144                             data->lookbehind_fixed+= scan->flags;
5145                         }
5146                         if ( data_fake.minlen_float != minlenp )
5147                         {
5148                             data->minlen_float= data_fake.minlen_float;
5149                             data->offset_float_min=data_fake.offset_float_min;
5150                             data->offset_float_max=data_fake.offset_float_max;
5151                             data->lookbehind_float+= scan->flags;
5152                         }
5153                     }
5154                 }
5155 	    }
5156 #endif
5157 	}
5158 	else if (OP(scan) == OPEN) {
5159 	    if (stopparen != (I32)ARG(scan))
5160 	        pars++;
5161 	}
5162 	else if (OP(scan) == CLOSE) {
5163 	    if (stopparen == (I32)ARG(scan)) {
5164 	        break;
5165 	    }
5166 	    if ((I32)ARG(scan) == is_par) {
5167 		next = regnext(scan);
5168 
5169 		if ( next && (OP(next) != WHILEM) && next < last)
5170 		    is_par = 0;		/* Disable optimization */
5171 	    }
5172 	    if (data)
5173 		*(data->last_closep) = ARG(scan);
5174 	}
5175 	else if (OP(scan) == EVAL) {
5176 		if (data)
5177 		    data->flags |= SF_HAS_EVAL;
5178 	}
5179 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5180 	    if (flags & SCF_DO_SUBSTR) {
5181                 scan_commit(pRExC_state, data, minlenp, is_inf);
5182 		flags &= ~SCF_DO_SUBSTR;
5183 	    }
5184 	    if (data && OP(scan)==ACCEPT) {
5185 	        data->flags |= SCF_SEEN_ACCEPT;
5186 	        if (stopmin > min)
5187 	            stopmin = min;
5188 	    }
5189 	}
5190 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5191 	{
5192 		if (flags & SCF_DO_SUBSTR) {
5193                     scan_commit(pRExC_state, data, minlenp, is_inf);
5194 		    data->longest = &(data->longest_float);
5195 		}
5196 		is_inf = is_inf_internal = 1;
5197 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5198 		    ssc_anything(data->start_class);
5199 		flags &= ~SCF_DO_STCLASS;
5200 	}
5201 	else if (OP(scan) == GPOS) {
5202             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5203 	        !(delta || is_inf || (data && data->pos_delta)))
5204 	    {
5205                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5206                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5207 	        if (RExC_rx->gofs < (STRLEN)min)
5208 		    RExC_rx->gofs = min;
5209             } else {
5210                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5211                 RExC_rx->gofs = 0;
5212             }
5213 	}
5214 #ifdef TRIE_STUDY_OPT
5215 #ifdef FULL_TRIE_STUDY
5216         else if (PL_regkind[OP(scan)] == TRIE) {
5217             /* NOTE - There is similar code to this block above for handling
5218                BRANCH nodes on the initial study.  If you change stuff here
5219                check there too. */
5220             regnode *trie_node= scan;
5221             regnode *tail= regnext(scan);
5222             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5223             SSize_t max1 = 0, min1 = SSize_t_MAX;
5224             regnode_ssc accum;
5225 
5226             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5227                 /* Cannot merge strings after this. */
5228                 scan_commit(pRExC_state, data, minlenp, is_inf);
5229             }
5230             if (flags & SCF_DO_STCLASS)
5231                 ssc_init_zero(pRExC_state, &accum);
5232 
5233             if (!trie->jump) {
5234                 min1= trie->minlen;
5235                 max1= trie->maxlen;
5236             } else {
5237                 const regnode *nextbranch= NULL;
5238                 U32 word;
5239 
5240                 for ( word=1 ; word <= trie->wordcount ; word++)
5241                 {
5242                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5243                     regnode_ssc this_class;
5244 
5245                     data_fake.flags = 0;
5246                     if (data) {
5247                         data_fake.whilem_c = data->whilem_c;
5248                         data_fake.last_closep = data->last_closep;
5249                     }
5250                     else
5251                         data_fake.last_closep = &fake;
5252 		    data_fake.pos_delta = delta;
5253                     if (flags & SCF_DO_STCLASS) {
5254                         ssc_init(pRExC_state, &this_class);
5255                         data_fake.start_class = &this_class;
5256                         f = SCF_DO_STCLASS_AND;
5257                     }
5258                     if (flags & SCF_WHILEM_VISITED_POS)
5259                         f |= SCF_WHILEM_VISITED_POS;
5260 
5261                     if (trie->jump[word]) {
5262                         if (!nextbranch)
5263                             nextbranch = trie_node + trie->jump[0];
5264                         scan= trie_node + trie->jump[word];
5265                         /* We go from the jump point to the branch that follows
5266                            it. Note this means we need the vestigal unused
5267                            branches even though they arent otherwise used. */
5268                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5269                             &deltanext, (regnode *)nextbranch, &data_fake,
5270                             stopparen, recursed_depth, NULL, f,depth+1);
5271                     }
5272                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5273                         nextbranch= regnext((regnode*)nextbranch);
5274 
5275                     if (min1 > (SSize_t)(minnext + trie->minlen))
5276                         min1 = minnext + trie->minlen;
5277                     if (deltanext == SSize_t_MAX) {
5278                         is_inf = is_inf_internal = 1;
5279                         max1 = SSize_t_MAX;
5280                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5281                         max1 = minnext + deltanext + trie->maxlen;
5282 
5283                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5284                         pars++;
5285                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5286                         if ( stopmin > min + min1)
5287 	                    stopmin = min + min1;
5288 	                flags &= ~SCF_DO_SUBSTR;
5289 	                if (data)
5290 	                    data->flags |= SCF_SEEN_ACCEPT;
5291 	            }
5292                     if (data) {
5293                         if (data_fake.flags & SF_HAS_EVAL)
5294                             data->flags |= SF_HAS_EVAL;
5295                         data->whilem_c = data_fake.whilem_c;
5296                     }
5297                     if (flags & SCF_DO_STCLASS)
5298                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5299                 }
5300             }
5301             if (flags & SCF_DO_SUBSTR) {
5302                 data->pos_min += min1;
5303                 data->pos_delta += max1 - min1;
5304                 if (max1 != min1 || is_inf)
5305                     data->longest = &(data->longest_float);
5306             }
5307             min += min1;
5308             delta += max1 - min1;
5309             if (flags & SCF_DO_STCLASS_OR) {
5310                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5311                 if (min1) {
5312                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5313                     flags &= ~SCF_DO_STCLASS;
5314                 }
5315             }
5316             else if (flags & SCF_DO_STCLASS_AND) {
5317                 if (min1) {
5318                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5319                     flags &= ~SCF_DO_STCLASS;
5320                 }
5321                 else {
5322                     /* Switch to OR mode: cache the old value of
5323                      * data->start_class */
5324 		    INIT_AND_WITHP;
5325                     StructCopy(data->start_class, and_withp, regnode_ssc);
5326                     flags &= ~SCF_DO_STCLASS_AND;
5327                     StructCopy(&accum, data->start_class, regnode_ssc);
5328                     flags |= SCF_DO_STCLASS_OR;
5329                 }
5330             }
5331             scan= tail;
5332             continue;
5333         }
5334 #else
5335 	else if (PL_regkind[OP(scan)] == TRIE) {
5336 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5337 	    U8*bang=NULL;
5338 
5339 	    min += trie->minlen;
5340 	    delta += (trie->maxlen - trie->minlen);
5341 	    flags &= ~SCF_DO_STCLASS; /* xxx */
5342             if (flags & SCF_DO_SUBSTR) {
5343                 /* Cannot expect anything... */
5344                 scan_commit(pRExC_state, data, minlenp, is_inf);
5345     	        data->pos_min += trie->minlen;
5346     	        data->pos_delta += (trie->maxlen - trie->minlen);
5347 		if (trie->maxlen != trie->minlen)
5348 		    data->longest = &(data->longest_float);
5349     	    }
5350     	    if (trie->jump) /* no more substrings -- for now /grr*/
5351                flags &= ~SCF_DO_SUBSTR;
5352 	}
5353 #endif /* old or new */
5354 #endif /* TRIE_STUDY_OPT */
5355 
5356 	/* Else: zero-length, ignore. */
5357 	scan = regnext(scan);
5358     }
5359     /* If we are exiting a recursion we can unset its recursed bit
5360      * and allow ourselves to enter it again - no danger of an
5361      * infinite loop there.
5362     if (stopparen > -1 && recursed) {
5363 	DEBUG_STUDYDATA("unset:", data,depth);
5364         PAREN_UNSET( recursed, stopparen);
5365     }
5366     */
5367     if (frame) {
5368         DEBUG_STUDYDATA("frame-end:",data,depth);
5369         DEBUG_PEEP("fend", scan, depth);
5370         /* restore previous context */
5371         last = frame->last;
5372         scan = frame->next;
5373         stopparen = frame->stop;
5374         recursed_depth = frame->prev_recursed_depth;
5375         depth = depth - 1;
5376 
5377         frame = frame->prev;
5378         goto fake_study_recurse;
5379     }
5380 
5381   finish:
5382     assert(!frame);
5383     DEBUG_STUDYDATA("pre-fin:",data,depth);
5384 
5385     *scanp = scan;
5386     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5387 
5388     if (flags & SCF_DO_SUBSTR && is_inf)
5389 	data->pos_delta = SSize_t_MAX - data->pos_min;
5390     if (is_par > (I32)U8_MAX)
5391 	is_par = 0;
5392     if (is_par && pars==1 && data) {
5393 	data->flags |= SF_IN_PAR;
5394 	data->flags &= ~SF_HAS_PAR;
5395     }
5396     else if (pars && data) {
5397 	data->flags |= SF_HAS_PAR;
5398 	data->flags &= ~SF_IN_PAR;
5399     }
5400     if (flags & SCF_DO_STCLASS_OR)
5401 	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5402     if (flags & SCF_TRIE_RESTUDY)
5403         data->flags |= 	SCF_TRIE_RESTUDY;
5404 
5405     DEBUG_STUDYDATA("post-fin:",data,depth);
5406 
5407     {
5408         SSize_t final_minlen= min < stopmin ? min : stopmin;
5409 
5410         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5411             RExC_maxlen = final_minlen + delta;
5412         }
5413         return final_minlen;
5414     }
5415     /* not-reached */
5416 }
5417 
5418 STATIC U32
5419 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5420 {
5421     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5422 
5423     PERL_ARGS_ASSERT_ADD_DATA;
5424 
5425     Renewc(RExC_rxi->data,
5426 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5427 	   char, struct reg_data);
5428     if(count)
5429 	Renew(RExC_rxi->data->what, count + n, U8);
5430     else
5431 	Newx(RExC_rxi->data->what, n, U8);
5432     RExC_rxi->data->count = count + n;
5433     Copy(s, RExC_rxi->data->what + count, n, U8);
5434     return count;
5435 }
5436 
5437 /*XXX: todo make this not included in a non debugging perl */
5438 #ifndef PERL_IN_XSUB_RE
5439 void
5440 Perl_reginitcolors(pTHX)
5441 {
5442     dVAR;
5443     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5444     if (s) {
5445 	char *t = savepv(s);
5446 	int i = 0;
5447 	PL_colors[0] = t;
5448 	while (++i < 6) {
5449 	    t = strchr(t, '\t');
5450 	    if (t) {
5451 		*t = '\0';
5452 		PL_colors[i] = ++t;
5453 	    }
5454 	    else
5455 		PL_colors[i] = t = (char *)"";
5456 	}
5457     } else {
5458 	int i = 0;
5459 	while (i < 6)
5460 	    PL_colors[i++] = (char *)"";
5461     }
5462     PL_colorset = 1;
5463 }
5464 #endif
5465 
5466 
5467 #ifdef TRIE_STUDY_OPT
5468 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5469     STMT_START {                                            \
5470         if (                                                \
5471               (data.flags & SCF_TRIE_RESTUDY)               \
5472               && ! restudied++                              \
5473         ) {                                                 \
5474             dOsomething;                                    \
5475             goto reStudy;                                   \
5476         }                                                   \
5477     } STMT_END
5478 #else
5479 #define CHECK_RESTUDY_GOTO_butfirst
5480 #endif
5481 
5482 /*
5483  * pregcomp - compile a regular expression into internal code
5484  *
5485  * Decides which engine's compiler to call based on the hint currently in
5486  * scope
5487  */
5488 
5489 #ifndef PERL_IN_XSUB_RE
5490 
5491 /* return the currently in-scope regex engine (or the default if none)  */
5492 
5493 regexp_engine const *
5494 Perl_current_re_engine(pTHX)
5495 {
5496     dVAR;
5497 
5498     if (IN_PERL_COMPILETIME) {
5499 	HV * const table = GvHV(PL_hintgv);
5500 	SV **ptr;
5501 
5502 	if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5503 	    return &PL_core_reg_engine;
5504 	ptr = hv_fetchs(table, "regcomp", FALSE);
5505 	if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5506 	    return &PL_core_reg_engine;
5507 	return INT2PTR(regexp_engine*,SvIV(*ptr));
5508     }
5509     else {
5510 	SV *ptr;
5511 	if (!PL_curcop->cop_hints_hash)
5512 	    return &PL_core_reg_engine;
5513 	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5514 	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5515 	    return &PL_core_reg_engine;
5516 	return INT2PTR(regexp_engine*,SvIV(ptr));
5517     }
5518 }
5519 
5520 
5521 REGEXP *
5522 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5523 {
5524     dVAR;
5525     regexp_engine const *eng = current_re_engine();
5526     GET_RE_DEBUG_FLAGS_DECL;
5527 
5528     PERL_ARGS_ASSERT_PREGCOMP;
5529 
5530     /* Dispatch a request to compile a regexp to correct regexp engine. */
5531     DEBUG_COMPILE_r({
5532 	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5533 			PTR2UV(eng));
5534     });
5535     return CALLREGCOMP_ENG(eng, pattern, flags);
5536 }
5537 #endif
5538 
5539 /* public(ish) entry point for the perl core's own regex compiling code.
5540  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5541  * pattern rather than a list of OPs, and uses the internal engine rather
5542  * than the current one */
5543 
5544 REGEXP *
5545 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5546 {
5547     SV *pat = pattern; /* defeat constness! */
5548     PERL_ARGS_ASSERT_RE_COMPILE;
5549     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5550 #ifdef PERL_IN_XSUB_RE
5551                                 &my_reg_engine,
5552 #else
5553                                 &PL_core_reg_engine,
5554 #endif
5555                                 NULL, NULL, rx_flags, 0);
5556 }
5557 
5558 
5559 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5560  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5561  * point to the realloced string and length.
5562  *
5563  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5564  * stuff added */
5565 
5566 static void
5567 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5568 		    char **pat_p, STRLEN *plen_p, int num_code_blocks)
5569 {
5570     U8 *const src = (U8*)*pat_p;
5571     U8 *dst;
5572     int n=0;
5573     STRLEN s = 0, d = 0;
5574     bool do_end = 0;
5575     GET_RE_DEBUG_FLAGS_DECL;
5576 
5577     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5578         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5579 
5580     Newx(dst, *plen_p * 2 + 1, U8);
5581 
5582     while (s < *plen_p) {
5583         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5584             dst[d]   = src[s];
5585         else {
5586             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5587             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5588         }
5589         if (n < num_code_blocks) {
5590             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5591                 pRExC_state->code_blocks[n].start = d;
5592                 assert(dst[d] == '(');
5593                 do_end = 1;
5594             }
5595             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5596                 pRExC_state->code_blocks[n].end = d;
5597                 assert(dst[d] == ')');
5598                 do_end = 0;
5599                 n++;
5600             }
5601         }
5602         s++;
5603         d++;
5604     }
5605     dst[d] = '\0';
5606     *plen_p = d;
5607     *pat_p = (char*) dst;
5608     SAVEFREEPV(*pat_p);
5609     RExC_orig_utf8 = RExC_utf8 = 1;
5610 }
5611 
5612 
5613 
5614 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5615  * while recording any code block indices, and handling overloading,
5616  * nested qr// objects etc.  If pat is null, it will allocate a new
5617  * string, or just return the first arg, if there's only one.
5618  *
5619  * Returns the malloced/updated pat.
5620  * patternp and pat_count is the array of SVs to be concatted;
5621  * oplist is the optional list of ops that generated the SVs;
5622  * recompile_p is a pointer to a boolean that will be set if
5623  *   the regex will need to be recompiled.
5624  * delim, if non-null is an SV that will be inserted between each element
5625  */
5626 
5627 static SV*
5628 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5629                 SV *pat, SV ** const patternp, int pat_count,
5630                 OP *oplist, bool *recompile_p, SV *delim)
5631 {
5632     SV **svp;
5633     int n = 0;
5634     bool use_delim = FALSE;
5635     bool alloced = FALSE;
5636 
5637     /* if we know we have at least two args, create an empty string,
5638      * then concatenate args to that. For no args, return an empty string */
5639     if (!pat && pat_count != 1) {
5640         pat = newSVpvn("", 0);
5641         SAVEFREESV(pat);
5642         alloced = TRUE;
5643     }
5644 
5645     for (svp = patternp; svp < patternp + pat_count; svp++) {
5646         SV *sv;
5647         SV *rx  = NULL;
5648         STRLEN orig_patlen = 0;
5649         bool code = 0;
5650         SV *msv = use_delim ? delim : *svp;
5651         if (!msv) msv = &PL_sv_undef;
5652 
5653         /* if we've got a delimiter, we go round the loop twice for each
5654          * svp slot (except the last), using the delimiter the second
5655          * time round */
5656         if (use_delim) {
5657             svp--;
5658             use_delim = FALSE;
5659         }
5660         else if (delim)
5661             use_delim = TRUE;
5662 
5663         if (SvTYPE(msv) == SVt_PVAV) {
5664             /* we've encountered an interpolated array within
5665              * the pattern, e.g. /...@a..../. Expand the list of elements,
5666              * then recursively append elements.
5667              * The code in this block is based on S_pushav() */
5668 
5669             AV *const av = (AV*)msv;
5670             const SSize_t maxarg = AvFILL(av) + 1;
5671             SV **array;
5672 
5673             if (oplist) {
5674                 assert(oplist->op_type == OP_PADAV
5675                     || oplist->op_type == OP_RV2AV);
5676                 oplist = oplist->op_sibling;;
5677             }
5678 
5679             if (SvRMAGICAL(av)) {
5680                 SSize_t i;
5681 
5682                 Newx(array, maxarg, SV*);
5683                 SAVEFREEPV(array);
5684                 for (i=0; i < maxarg; i++) {
5685                     SV ** const svp = av_fetch(av, i, FALSE);
5686                     array[i] = svp ? *svp : &PL_sv_undef;
5687                 }
5688             }
5689             else
5690                 array = AvARRAY(av);
5691 
5692             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5693                                 array, maxarg, NULL, recompile_p,
5694                                 /* $" */
5695                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5696 
5697             continue;
5698         }
5699 
5700 
5701         /* we make the assumption here that each op in the list of
5702          * op_siblings maps to one SV pushed onto the stack,
5703          * except for code blocks, with have both an OP_NULL and
5704          * and OP_CONST.
5705          * This allows us to match up the list of SVs against the
5706          * list of OPs to find the next code block.
5707          *
5708          * Note that       PUSHMARK PADSV PADSV ..
5709          * is optimised to
5710          *                 PADRANGE PADSV  PADSV  ..
5711          * so the alignment still works. */
5712 
5713         if (oplist) {
5714             if (oplist->op_type == OP_NULL
5715                 && (oplist->op_flags & OPf_SPECIAL))
5716             {
5717                 assert(n < pRExC_state->num_code_blocks);
5718                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5719                 pRExC_state->code_blocks[n].block = oplist;
5720                 pRExC_state->code_blocks[n].src_regex = NULL;
5721                 n++;
5722                 code = 1;
5723                 oplist = oplist->op_sibling; /* skip CONST */
5724                 assert(oplist);
5725             }
5726             oplist = oplist->op_sibling;;
5727         }
5728 
5729 	/* apply magic and QR overloading to arg */
5730 
5731         SvGETMAGIC(msv);
5732         if (SvROK(msv) && SvAMAGIC(msv)) {
5733             SV *sv = AMG_CALLunary(msv, regexp_amg);
5734             if (sv) {
5735                 if (SvROK(sv))
5736                     sv = SvRV(sv);
5737                 if (SvTYPE(sv) != SVt_REGEXP)
5738                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5739                 msv = sv;
5740             }
5741         }
5742 
5743         /* try concatenation overload ... */
5744         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5745                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5746         {
5747             sv_setsv(pat, sv);
5748             /* overloading involved: all bets are off over literal
5749              * code. Pretend we haven't seen it */
5750             pRExC_state->num_code_blocks -= n;
5751             n = 0;
5752         }
5753         else  {
5754             /* ... or failing that, try "" overload */
5755             while (SvAMAGIC(msv)
5756                     && (sv = AMG_CALLunary(msv, string_amg))
5757                     && sv != msv
5758                     &&  !(   SvROK(msv)
5759                           && SvROK(sv)
5760                           && SvRV(msv) == SvRV(sv))
5761             ) {
5762                 msv = sv;
5763                 SvGETMAGIC(msv);
5764             }
5765             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5766                 msv = SvRV(msv);
5767 
5768             if (pat) {
5769                 /* this is a partially unrolled
5770                  *     sv_catsv_nomg(pat, msv);
5771                  * that allows us to adjust code block indices if
5772                  * needed */
5773                 STRLEN dlen;
5774                 char *dst = SvPV_force_nomg(pat, dlen);
5775                 orig_patlen = dlen;
5776                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5777                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5778                     sv_setpvn(pat, dst, dlen);
5779                     SvUTF8_on(pat);
5780                 }
5781                 sv_catsv_nomg(pat, msv);
5782                 rx = msv;
5783             }
5784             else
5785                 pat = msv;
5786 
5787             if (code)
5788                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5789         }
5790 
5791         /* extract any code blocks within any embedded qr//'s */
5792         if (rx && SvTYPE(rx) == SVt_REGEXP
5793             && RX_ENGINE((REGEXP*)rx)->op_comp)
5794         {
5795 
5796             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5797             if (ri->num_code_blocks) {
5798                 int i;
5799                 /* the presence of an embedded qr// with code means
5800                  * we should always recompile: the text of the
5801                  * qr// may not have changed, but it may be a
5802                  * different closure than last time */
5803                 *recompile_p = 1;
5804                 Renew(pRExC_state->code_blocks,
5805                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5806                     struct reg_code_block);
5807                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5808 
5809                 for (i=0; i < ri->num_code_blocks; i++) {
5810                     struct reg_code_block *src, *dst;
5811                     STRLEN offset =  orig_patlen
5812                         + ReANY((REGEXP *)rx)->pre_prefix;
5813                     assert(n < pRExC_state->num_code_blocks);
5814                     src = &ri->code_blocks[i];
5815                     dst = &pRExC_state->code_blocks[n];
5816                     dst->start	    = src->start + offset;
5817                     dst->end	    = src->end   + offset;
5818                     dst->block	    = src->block;
5819                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5820                                             src->src_regex
5821                                                 ? src->src_regex
5822                                                 : (REGEXP*)rx);
5823                     n++;
5824                 }
5825             }
5826         }
5827     }
5828     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5829     if (alloced)
5830         SvSETMAGIC(pat);
5831 
5832     return pat;
5833 }
5834 
5835 
5836 
5837 /* see if there are any run-time code blocks in the pattern.
5838  * False positives are allowed */
5839 
5840 static bool
5841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5842 		    char *pat, STRLEN plen)
5843 {
5844     int n = 0;
5845     STRLEN s;
5846 
5847     for (s = 0; s < plen; s++) {
5848 	if (n < pRExC_state->num_code_blocks
5849 	    && s == pRExC_state->code_blocks[n].start)
5850 	{
5851 	    s = pRExC_state->code_blocks[n].end;
5852 	    n++;
5853 	    continue;
5854 	}
5855 	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
5856 	 * positives here */
5857 	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5858 	    (pat[s+2] == '{'
5859                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5860 	)
5861 	    return 1;
5862     }
5863     return 0;
5864 }
5865 
5866 /* Handle run-time code blocks. We will already have compiled any direct
5867  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5868  * copy of it, but with any literal code blocks blanked out and
5869  * appropriate chars escaped; then feed it into
5870  *
5871  *    eval "qr'modified_pattern'"
5872  *
5873  * For example,
5874  *
5875  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5876  *
5877  * becomes
5878  *
5879  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5880  *
5881  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5882  * and merge them with any code blocks of the original regexp.
5883  *
5884  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5885  * instead, just save the qr and return FALSE; this tells our caller that
5886  * the original pattern needs upgrading to utf8.
5887  */
5888 
5889 static bool
5890 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5891     char *pat, STRLEN plen)
5892 {
5893     SV *qr;
5894 
5895     GET_RE_DEBUG_FLAGS_DECL;
5896 
5897     if (pRExC_state->runtime_code_qr) {
5898 	/* this is the second time we've been called; this should
5899 	 * only happen if the main pattern got upgraded to utf8
5900 	 * during compilation; re-use the qr we compiled first time
5901 	 * round (which should be utf8 too)
5902 	 */
5903 	qr = pRExC_state->runtime_code_qr;
5904 	pRExC_state->runtime_code_qr = NULL;
5905 	assert(RExC_utf8 && SvUTF8(qr));
5906     }
5907     else {
5908 	int n = 0;
5909 	STRLEN s;
5910 	char *p, *newpat;
5911 	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5912 	SV *sv, *qr_ref;
5913 	dSP;
5914 
5915 	/* determine how many extra chars we need for ' and \ escaping */
5916 	for (s = 0; s < plen; s++) {
5917 	    if (pat[s] == '\'' || pat[s] == '\\')
5918 		newlen++;
5919 	}
5920 
5921 	Newx(newpat, newlen, char);
5922 	p = newpat;
5923 	*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5924 
5925 	for (s = 0; s < plen; s++) {
5926 	    if (n < pRExC_state->num_code_blocks
5927 		&& s == pRExC_state->code_blocks[n].start)
5928 	    {
5929 		/* blank out literal code block */
5930 		assert(pat[s] == '(');
5931 		while (s <= pRExC_state->code_blocks[n].end) {
5932 		    *p++ = '_';
5933 		    s++;
5934 		}
5935 		s--;
5936 		n++;
5937 		continue;
5938 	    }
5939 	    if (pat[s] == '\'' || pat[s] == '\\')
5940 		*p++ = '\\';
5941 	    *p++ = pat[s];
5942 	}
5943 	*p++ = '\'';
5944 	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5945 	    *p++ = 'x';
5946 	*p++ = '\0';
5947 	DEBUG_COMPILE_r({
5948 	    PerlIO_printf(Perl_debug_log,
5949 		"%sre-parsing pattern for runtime code:%s %s\n",
5950 		PL_colors[4],PL_colors[5],newpat);
5951 	});
5952 
5953 	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5954 	Safefree(newpat);
5955 
5956 	ENTER;
5957 	SAVETMPS;
5958 	save_re_context();
5959 	PUSHSTACKi(PERLSI_REQUIRE);
5960         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5961          * parsing qr''; normally only q'' does this. It also alters
5962          * hints handling */
5963 	eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5964 	SvREFCNT_dec_NN(sv);
5965 	SPAGAIN;
5966 	qr_ref = POPs;
5967 	PUTBACK;
5968 	{
5969 	    SV * const errsv = ERRSV;
5970 	    if (SvTRUE_NN(errsv))
5971 	    {
5972 		Safefree(pRExC_state->code_blocks);
5973                 /* use croak_sv ? */
5974 		Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5975 	    }
5976 	}
5977 	assert(SvROK(qr_ref));
5978 	qr = SvRV(qr_ref);
5979 	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5980 	/* the leaving below frees the tmp qr_ref.
5981 	 * Give qr a life of its own */
5982 	SvREFCNT_inc(qr);
5983 	POPSTACK;
5984 	FREETMPS;
5985 	LEAVE;
5986 
5987     }
5988 
5989     if (!RExC_utf8 && SvUTF8(qr)) {
5990 	/* first time through; the pattern got upgraded; save the
5991 	 * qr for the next time through */
5992 	assert(!pRExC_state->runtime_code_qr);
5993 	pRExC_state->runtime_code_qr = qr;
5994 	return 0;
5995     }
5996 
5997 
5998     /* extract any code blocks within the returned qr//  */
5999 
6000 
6001     /* merge the main (r1) and run-time (r2) code blocks into one */
6002     {
6003 	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6004 	struct reg_code_block *new_block, *dst;
6005 	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6006 	int i1 = 0, i2 = 0;
6007 
6008 	if (!r2->num_code_blocks) /* we guessed wrong */
6009 	{
6010 	    SvREFCNT_dec_NN(qr);
6011 	    return 1;
6012 	}
6013 
6014 	Newx(new_block,
6015 	    r1->num_code_blocks + r2->num_code_blocks,
6016 	    struct reg_code_block);
6017 	dst = new_block;
6018 
6019 	while (    i1 < r1->num_code_blocks
6020 		|| i2 < r2->num_code_blocks)
6021 	{
6022 	    struct reg_code_block *src;
6023 	    bool is_qr = 0;
6024 
6025 	    if (i1 == r1->num_code_blocks) {
6026 		src = &r2->code_blocks[i2++];
6027 		is_qr = 1;
6028 	    }
6029 	    else if (i2 == r2->num_code_blocks)
6030 		src = &r1->code_blocks[i1++];
6031 	    else if (  r1->code_blocks[i1].start
6032 	             < r2->code_blocks[i2].start)
6033 	    {
6034 		src = &r1->code_blocks[i1++];
6035 		assert(src->end < r2->code_blocks[i2].start);
6036 	    }
6037 	    else {
6038 		assert(  r1->code_blocks[i1].start
6039 		       > r2->code_blocks[i2].start);
6040 		src = &r2->code_blocks[i2++];
6041 		is_qr = 1;
6042 		assert(src->end < r1->code_blocks[i1].start);
6043 	    }
6044 
6045 	    assert(pat[src->start] == '(');
6046 	    assert(pat[src->end]   == ')');
6047 	    dst->start	    = src->start;
6048 	    dst->end	    = src->end;
6049 	    dst->block	    = src->block;
6050 	    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6051 				    : src->src_regex;
6052 	    dst++;
6053 	}
6054 	r1->num_code_blocks += r2->num_code_blocks;
6055 	Safefree(r1->code_blocks);
6056 	r1->code_blocks = new_block;
6057     }
6058 
6059     SvREFCNT_dec_NN(qr);
6060     return 1;
6061 }
6062 
6063 
6064 STATIC bool
6065 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6066                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6067 		      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6068                       STRLEN longest_length, bool eol, bool meol)
6069 {
6070     /* This is the common code for setting up the floating and fixed length
6071      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6072      * as to whether succeeded or not */
6073 
6074     I32 t;
6075     SSize_t ml;
6076 
6077     if (! (longest_length
6078            || (eol /* Can't have SEOL and MULTI */
6079                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6080           )
6081             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6082         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6083     {
6084         return FALSE;
6085     }
6086 
6087     /* copy the information about the longest from the reg_scan_data
6088         over to the program. */
6089     if (SvUTF8(sv_longest)) {
6090         *rx_utf8 = sv_longest;
6091         *rx_substr = NULL;
6092     } else {
6093         *rx_substr = sv_longest;
6094         *rx_utf8 = NULL;
6095     }
6096     /* end_shift is how many chars that must be matched that
6097         follow this item. We calculate it ahead of time as once the
6098         lookbehind offset is added in we lose the ability to correctly
6099         calculate it.*/
6100     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6101     *rx_end_shift = ml - offset
6102         - longest_length + (SvTAIL(sv_longest) != 0)
6103         + lookbehind;
6104 
6105     t = (eol/* Can't have SEOL and MULTI */
6106          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6107     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6108 
6109     return TRUE;
6110 }
6111 
6112 /*
6113  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6114  * regular expression into internal code.
6115  * The pattern may be passed either as:
6116  *    a list of SVs (patternp plus pat_count)
6117  *    a list of OPs (expr)
6118  * If both are passed, the SV list is used, but the OP list indicates
6119  * which SVs are actually pre-compiled code blocks
6120  *
6121  * The SVs in the list have magic and qr overloading applied to them (and
6122  * the list may be modified in-place with replacement SVs in the latter
6123  * case).
6124  *
6125  * If the pattern hasn't changed from old_re, then old_re will be
6126  * returned.
6127  *
6128  * eng is the current engine. If that engine has an op_comp method, then
6129  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6130  * do the initial concatenation of arguments and pass on to the external
6131  * engine.
6132  *
6133  * If is_bare_re is not null, set it to a boolean indicating whether the
6134  * arg list reduced (after overloading) to a single bare regex which has
6135  * been returned (i.e. /$qr/).
6136  *
6137  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6138  *
6139  * pm_flags contains the PMf_* flags, typically based on those from the
6140  * pm_flags field of the related PMOP. Currently we're only interested in
6141  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6142  *
6143  * We can't allocate space until we know how big the compiled form will be,
6144  * but we can't compile it (and thus know how big it is) until we've got a
6145  * place to put the code.  So we cheat:  we compile it twice, once with code
6146  * generation turned off and size counting turned on, and once "for real".
6147  * This also means that we don't allocate space until we are sure that the
6148  * thing really will compile successfully, and we never have to move the
6149  * code and thus invalidate pointers into it.  (Note that it has to be in
6150  * one piece because free() must be able to free it all.) [NB: not true in perl]
6151  *
6152  * Beware that the optimization-preparation code in here knows about some
6153  * of the structure of the compiled regexp.  [I'll say.]
6154  */
6155 
6156 REGEXP *
6157 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6158 		    OP *expr, const regexp_engine* eng, REGEXP *old_re,
6159 		     bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6160 {
6161     dVAR;
6162     REGEXP *rx;
6163     struct regexp *r;
6164     regexp_internal *ri;
6165     STRLEN plen;
6166     char *exp;
6167     regnode *scan;
6168     I32 flags;
6169     SSize_t minlen = 0;
6170     U32 rx_flags;
6171     SV *pat;
6172     SV *code_blocksv = NULL;
6173     SV** new_patternp = patternp;
6174 
6175     /* these are all flags - maybe they should be turned
6176      * into a single int with different bit masks */
6177     I32 sawlookahead = 0;
6178     I32 sawplus = 0;
6179     I32 sawopen = 0;
6180     I32 sawminmod = 0;
6181 
6182     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6183     bool recompile = 0;
6184     bool runtime_code = 0;
6185     scan_data_t data;
6186     RExC_state_t RExC_state;
6187     RExC_state_t * const pRExC_state = &RExC_state;
6188 #ifdef TRIE_STUDY_OPT
6189     int restudied = 0;
6190     RExC_state_t copyRExC_state;
6191 #endif
6192     GET_RE_DEBUG_FLAGS_DECL;
6193 
6194     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6195 
6196     DEBUG_r(if (!PL_colorset) reginitcolors());
6197 
6198 #ifndef PERL_IN_XSUB_RE
6199     /* Initialize these here instead of as-needed, as is quick and avoids
6200      * having to test them each time otherwise */
6201     if (! PL_AboveLatin1) {
6202 	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6203 	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6204 	PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6205         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6206         PL_HasMultiCharFold =
6207                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6208     }
6209 #endif
6210 
6211     pRExC_state->code_blocks = NULL;
6212     pRExC_state->num_code_blocks = 0;
6213 
6214     if (is_bare_re)
6215 	*is_bare_re = FALSE;
6216 
6217     if (expr && (expr->op_type == OP_LIST ||
6218 		(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6219 	/* allocate code_blocks if needed */
6220 	OP *o;
6221 	int ncode = 0;
6222 
6223 	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6224 	    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6225 		ncode++; /* count of DO blocks */
6226 	if (ncode) {
6227 	    pRExC_state->num_code_blocks = ncode;
6228 	    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6229 	}
6230     }
6231 
6232     if (!pat_count) {
6233         /* compile-time pattern with just OP_CONSTs and DO blocks */
6234 
6235         int n;
6236         OP *o;
6237 
6238         /* find how many CONSTs there are */
6239         assert(expr);
6240         n = 0;
6241         if (expr->op_type == OP_CONST)
6242             n = 1;
6243         else
6244             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6245                 if (o->op_type == OP_CONST)
6246                     n++;
6247             }
6248 
6249         /* fake up an SV array */
6250 
6251         assert(!new_patternp);
6252         Newx(new_patternp, n, SV*);
6253         SAVEFREEPV(new_patternp);
6254         pat_count = n;
6255 
6256         n = 0;
6257         if (expr->op_type == OP_CONST)
6258             new_patternp[n] = cSVOPx_sv(expr);
6259         else
6260             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6261                 if (o->op_type == OP_CONST)
6262                     new_patternp[n++] = cSVOPo_sv;
6263             }
6264 
6265     }
6266 
6267     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6268         "Assembling pattern from %d elements%s\n", pat_count,
6269             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6270 
6271     /* set expr to the first arg op */
6272 
6273     if (pRExC_state->num_code_blocks
6274          && expr->op_type != OP_CONST)
6275     {
6276             expr = cLISTOPx(expr)->op_first;
6277             assert(   expr->op_type == OP_PUSHMARK
6278                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6279                    || expr->op_type == OP_PADRANGE);
6280             expr = expr->op_sibling;
6281     }
6282 
6283     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6284                         expr, &recompile, NULL);
6285 
6286     /* handle bare (possibly after overloading) regex: foo =~ $re */
6287     {
6288         SV *re = pat;
6289         if (SvROK(re))
6290             re = SvRV(re);
6291         if (SvTYPE(re) == SVt_REGEXP) {
6292             if (is_bare_re)
6293                 *is_bare_re = TRUE;
6294             SvREFCNT_inc(re);
6295             Safefree(pRExC_state->code_blocks);
6296             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6297                 "Precompiled pattern%s\n",
6298                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6299 
6300             return (REGEXP*)re;
6301         }
6302     }
6303 
6304     exp = SvPV_nomg(pat, plen);
6305 
6306     if (!eng->op_comp) {
6307 	if ((SvUTF8(pat) && IN_BYTES)
6308 		|| SvGMAGICAL(pat) || SvAMAGIC(pat))
6309 	{
6310 	    /* make a temporary copy; either to convert to bytes,
6311 	     * or to avoid repeating get-magic / overloaded stringify */
6312 	    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6313 					(IN_BYTES ? 0 : SvUTF8(pat)));
6314 	}
6315 	Safefree(pRExC_state->code_blocks);
6316 	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6317     }
6318 
6319     /* ignore the utf8ness if the pattern is 0 length */
6320     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6321     RExC_uni_semantics = 0;
6322     RExC_contains_locale = 0;
6323     RExC_contains_i = 0;
6324     pRExC_state->runtime_code_qr = NULL;
6325 
6326     DEBUG_COMPILE_r({
6327             SV *dsv= sv_newmortal();
6328             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6329             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6330                           PL_colors[4],PL_colors[5],s);
6331         });
6332 
6333   redo_first_pass:
6334     /* we jump here if we upgrade the pattern to utf8 and have to
6335      * recompile */
6336 
6337     if ((pm_flags & PMf_USE_RE_EVAL)
6338 		/* this second condition covers the non-regex literal case,
6339 		 * i.e.  $foo =~ '(?{})'. */
6340 		|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6341     )
6342 	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6343 
6344     /* return old regex if pattern hasn't changed */
6345     /* XXX: note in the below we have to check the flags as well as the
6346      * pattern.
6347      *
6348      * Things get a touch tricky as we have to compare the utf8 flag
6349      * independently from the compile flags.  */
6350 
6351     if (   old_re
6352         && !recompile
6353         && !!RX_UTF8(old_re) == !!RExC_utf8
6354         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6355 	&& RX_PRECOMP(old_re)
6356 	&& RX_PRELEN(old_re) == plen
6357         && memEQ(RX_PRECOMP(old_re), exp, plen)
6358 	&& !runtime_code /* with runtime code, always recompile */ )
6359     {
6360         Safefree(pRExC_state->code_blocks);
6361         return old_re;
6362     }
6363 
6364     rx_flags = orig_rx_flags;
6365 
6366     if (rx_flags & PMf_FOLD) {
6367         RExC_contains_i = 1;
6368     }
6369     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6370 
6371 	/* Set to use unicode semantics if the pattern is in utf8 and has the
6372 	 * 'depends' charset specified, as it means unicode when utf8  */
6373 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6374     }
6375 
6376     RExC_precomp = exp;
6377     RExC_flags = rx_flags;
6378     RExC_pm_flags = pm_flags;
6379 
6380     if (runtime_code) {
6381 	if (TAINTING_get && TAINT_get)
6382 	    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6383 
6384 	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6385 	    /* whoops, we have a non-utf8 pattern, whilst run-time code
6386 	     * got compiled as utf8. Try again with a utf8 pattern */
6387             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6388                                     pRExC_state->num_code_blocks);
6389             goto redo_first_pass;
6390 	}
6391     }
6392     assert(!pRExC_state->runtime_code_qr);
6393 
6394     RExC_sawback = 0;
6395 
6396     RExC_seen = 0;
6397     RExC_maxlen = 0;
6398     RExC_in_lookbehind = 0;
6399     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6400     RExC_extralen = 0;
6401     RExC_override_recoding = 0;
6402     RExC_in_multi_char_class = 0;
6403 
6404     /* First pass: determine size, legality. */
6405     RExC_parse = exp;
6406     RExC_start = exp;
6407     RExC_end = exp + plen;
6408     RExC_naughty = 0;
6409     RExC_npar = 1;
6410     RExC_nestroot = 0;
6411     RExC_size = 0L;
6412     RExC_emit = (regnode *) &RExC_emit_dummy;
6413     RExC_whilem_seen = 0;
6414     RExC_open_parens = NULL;
6415     RExC_close_parens = NULL;
6416     RExC_opend = NULL;
6417     RExC_paren_names = NULL;
6418 #ifdef DEBUGGING
6419     RExC_paren_name_list = NULL;
6420 #endif
6421     RExC_recurse = NULL;
6422     RExC_study_chunk_recursed = NULL;
6423     RExC_study_chunk_recursed_bytes= 0;
6424     RExC_recurse_count = 0;
6425     pRExC_state->code_index = 0;
6426 
6427 #if 0 /* REGC() is (currently) a NOP at the first pass.
6428        * Clever compilers notice this and complain. --jhi */
6429     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6430 #endif
6431     DEBUG_PARSE_r(
6432 	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6433         RExC_lastnum=0;
6434         RExC_lastparse=NULL;
6435     );
6436     /* reg may croak on us, not giving us a chance to free
6437        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6438        need it to survive as long as the regexp (qr/(?{})/).
6439        We must check that code_blocksv is not already set, because we may
6440        have jumped back to restart the sizing pass. */
6441     if (pRExC_state->code_blocks && !code_blocksv) {
6442 	code_blocksv = newSV_type(SVt_PV);
6443 	SAVEFREESV(code_blocksv);
6444 	SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6445 	SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6446     }
6447     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6448         /* It's possible to write a regexp in ascii that represents Unicode
6449         codepoints outside of the byte range, such as via \x{100}. If we
6450         detect such a sequence we have to convert the entire pattern to utf8
6451         and then recompile, as our sizing calculation will have been based
6452         on 1 byte == 1 character, but we will need to use utf8 to encode
6453         at least some part of the pattern, and therefore must convert the whole
6454         thing.
6455         -- dmq */
6456         if (flags & RESTART_UTF8) {
6457             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6458                                     pRExC_state->num_code_blocks);
6459             goto redo_first_pass;
6460         }
6461         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6462     }
6463     if (code_blocksv)
6464 	SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6465 
6466     DEBUG_PARSE_r({
6467         PerlIO_printf(Perl_debug_log,
6468             "Required size %"IVdf" nodes\n"
6469             "Starting second pass (creation)\n",
6470             (IV)RExC_size);
6471         RExC_lastnum=0;
6472         RExC_lastparse=NULL;
6473     });
6474 
6475     /* The first pass could have found things that force Unicode semantics */
6476     if ((RExC_utf8 || RExC_uni_semantics)
6477 	 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6478     {
6479 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6480     }
6481 
6482     /* Small enough for pointer-storage convention?
6483        If extralen==0, this means that we will not need long jumps. */
6484     if (RExC_size >= 0x10000L && RExC_extralen)
6485         RExC_size += RExC_extralen;
6486     else
6487 	RExC_extralen = 0;
6488     if (RExC_whilem_seen > 15)
6489 	RExC_whilem_seen = 15;
6490 
6491     /* Allocate space and zero-initialize. Note, the two step process
6492        of zeroing when in debug mode, thus anything assigned has to
6493        happen after that */
6494     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6495     r = ReANY(rx);
6496     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6497 	 char, regexp_internal);
6498     if ( r == NULL || ri == NULL )
6499 	FAIL("Regexp out of space");
6500 #ifdef DEBUGGING
6501     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6502     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6503          char);
6504 #else
6505     /* bulk initialize base fields with 0. */
6506     Zero(ri, sizeof(regexp_internal), char);
6507 #endif
6508 
6509     /* non-zero initialization begins here */
6510     RXi_SET( r, ri );
6511     r->engine= eng;
6512     r->extflags = rx_flags;
6513     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6514 
6515     if (pm_flags & PMf_IS_QR) {
6516 	ri->code_blocks = pRExC_state->code_blocks;
6517 	ri->num_code_blocks = pRExC_state->num_code_blocks;
6518     }
6519     else
6520     {
6521 	int n;
6522 	for (n = 0; n < pRExC_state->num_code_blocks; n++)
6523 	    if (pRExC_state->code_blocks[n].src_regex)
6524 		SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6525 	SAVEFREEPV(pRExC_state->code_blocks);
6526     }
6527 
6528     {
6529         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6530         bool has_charset = (get_regex_charset(r->extflags)
6531                                                     != REGEX_DEPENDS_CHARSET);
6532 
6533         /* The caret is output if there are any defaults: if not all the STD
6534          * flags are set, or if no character set specifier is needed */
6535         bool has_default =
6536                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6537                     || ! has_charset);
6538         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6539                                                    == REG_RUN_ON_COMMENT_SEEN);
6540 	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6541 			    >> RXf_PMf_STD_PMMOD_SHIFT);
6542 	const char *fptr = STD_PAT_MODS;        /*"msix"*/
6543 	char *p;
6544         /* Allocate for the worst case, which is all the std flags are turned
6545          * on.  If more precision is desired, we could do a population count of
6546          * the flags set.  This could be done with a small lookup table, or by
6547          * shifting, masking and adding, or even, when available, assembly
6548          * language for a machine-language population count.
6549          * We never output a minus, as all those are defaults, so are
6550          * covered by the caret */
6551 	const STRLEN wraplen = plen + has_p + has_runon
6552             + has_default       /* If needs a caret */
6553 
6554 		/* If needs a character set specifier */
6555 	    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6556             + (sizeof(STD_PAT_MODS) - 1)
6557             + (sizeof("(?:)") - 1);
6558 
6559         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6560 	r->xpv_len_u.xpvlenu_pv = p;
6561 	if (RExC_utf8)
6562 	    SvFLAGS(rx) |= SVf_UTF8;
6563         *p++='('; *p++='?';
6564 
6565         /* If a default, cover it using the caret */
6566         if (has_default) {
6567             *p++= DEFAULT_PAT_MOD;
6568         }
6569         if (has_charset) {
6570 	    STRLEN len;
6571 	    const char* const name = get_regex_charset_name(r->extflags, &len);
6572 	    Copy(name, p, len, char);
6573 	    p += len;
6574         }
6575         if (has_p)
6576             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6577         {
6578             char ch;
6579             while((ch = *fptr++)) {
6580                 if(reganch & 1)
6581                     *p++ = ch;
6582                 reganch >>= 1;
6583             }
6584         }
6585 
6586         *p++ = ':';
6587         Copy(RExC_precomp, p, plen, char);
6588 	assert ((RX_WRAPPED(rx) - p) < 16);
6589 	r->pre_prefix = p - RX_WRAPPED(rx);
6590         p += plen;
6591         if (has_runon)
6592             *p++ = '\n';
6593         *p++ = ')';
6594         *p = 0;
6595 	SvCUR_set(rx, p - RX_WRAPPED(rx));
6596     }
6597 
6598     r->intflags = 0;
6599     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
6600 
6601     /* setup various meta data about recursion, this all requires
6602      * RExC_npar to be correctly set, and a bit later on we clear it */
6603     if (RExC_seen & REG_RECURSE_SEEN) {
6604         Newxz(RExC_open_parens, RExC_npar,regnode *);
6605         SAVEFREEPV(RExC_open_parens);
6606         Newxz(RExC_close_parens,RExC_npar,regnode *);
6607         SAVEFREEPV(RExC_close_parens);
6608     }
6609     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6610         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6611          * So its 1 if there are no parens. */
6612         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6613                                          ((RExC_npar & 0x07) != 0);
6614         Newx(RExC_study_chunk_recursed,
6615              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6616         SAVEFREEPV(RExC_study_chunk_recursed);
6617     }
6618 
6619     /* Useful during FAIL. */
6620 #ifdef RE_TRACK_PATTERN_OFFSETS
6621     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6622     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6623                           "%s %"UVuf" bytes for offset annotations.\n",
6624                           ri->u.offsets ? "Got" : "Couldn't get",
6625                           (UV)((2*RExC_size+1) * sizeof(U32))));
6626 #endif
6627     SetProgLen(ri,RExC_size);
6628     RExC_rx_sv = rx;
6629     RExC_rx = r;
6630     RExC_rxi = ri;
6631 
6632     /* Second pass: emit code. */
6633     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
6634     RExC_pm_flags = pm_flags;
6635     RExC_parse = exp;
6636     RExC_end = exp + plen;
6637     RExC_naughty = 0;
6638     RExC_npar = 1;
6639     RExC_emit_start = ri->program;
6640     RExC_emit = ri->program;
6641     RExC_emit_bound = ri->program + RExC_size + 1;
6642     pRExC_state->code_index = 0;
6643 
6644     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6645     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6646 	ReREFCNT_dec(rx);
6647         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6648     }
6649     /* XXXX To minimize changes to RE engine we always allocate
6650        3-units-long substrs field. */
6651     Newx(r->substrs, 1, struct reg_substr_data);
6652     if (RExC_recurse_count) {
6653         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6654         SAVEFREEPV(RExC_recurse);
6655     }
6656 
6657 reStudy:
6658     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6659     Zero(r->substrs, 1, struct reg_substr_data);
6660     if (RExC_study_chunk_recursed)
6661         Zero(RExC_study_chunk_recursed,
6662              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6663 
6664 #ifdef TRIE_STUDY_OPT
6665     if (!restudied) {
6666         StructCopy(&zero_scan_data, &data, scan_data_t);
6667         copyRExC_state = RExC_state;
6668     } else {
6669         U32 seen=RExC_seen;
6670         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6671 
6672         RExC_state = copyRExC_state;
6673         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6674             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6675         else
6676             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6677 	StructCopy(&zero_scan_data, &data, scan_data_t);
6678     }
6679 #else
6680     StructCopy(&zero_scan_data, &data, scan_data_t);
6681 #endif
6682 
6683     /* Dig out information for optimizations. */
6684     r->extflags = RExC_flags; /* was pm_op */
6685     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6686 
6687     if (UTF)
6688 	SvUTF8_on(rx);	/* Unicode in it? */
6689     ri->regstclass = NULL;
6690     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
6691 	r->intflags |= PREGf_NAUGHTY;
6692     scan = ri->program + 1;		/* First BRANCH. */
6693 
6694     /* testing for BRANCH here tells us whether there is "must appear"
6695        data in the pattern. If there is then we can use it for optimisations */
6696     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6697                                                   */
6698 	SSize_t fake;
6699 	STRLEN longest_float_length, longest_fixed_length;
6700 	regnode_ssc ch_class; /* pointed to by data */
6701 	int stclass_flag;
6702 	SSize_t last_close = 0; /* pointed to by data */
6703         regnode *first= scan;
6704         regnode *first_next= regnext(first);
6705 	/*
6706 	 * Skip introductions and multiplicators >= 1
6707 	 * so that we can extract the 'meat' of the pattern that must
6708 	 * match in the large if() sequence following.
6709 	 * NOTE that EXACT is NOT covered here, as it is normally
6710 	 * picked up by the optimiser separately.
6711 	 *
6712 	 * This is unfortunate as the optimiser isnt handling lookahead
6713 	 * properly currently.
6714 	 *
6715 	 */
6716 	while ((OP(first) == OPEN && (sawopen = 1)) ||
6717 	       /* An OR of *one* alternative - should not happen now. */
6718 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6719 	    /* for now we can't handle lookbehind IFMATCH*/
6720 	    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6721 	    (OP(first) == PLUS) ||
6722 	    (OP(first) == MINMOD) ||
6723 	       /* An {n,m} with n>0 */
6724 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6725 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6726 	{
6727 		/*
6728 		 * the only op that could be a regnode is PLUS, all the rest
6729 		 * will be regnode_1 or regnode_2.
6730 		 *
6731                  * (yves doesn't think this is true)
6732 		 */
6733 		if (OP(first) == PLUS)
6734 		    sawplus = 1;
6735                 else {
6736                     if (OP(first) == MINMOD)
6737                         sawminmod = 1;
6738 		    first += regarglen[OP(first)];
6739                 }
6740 		first = NEXTOPER(first);
6741 		first_next= regnext(first);
6742 	}
6743 
6744 	/* Starting-point info. */
6745       again:
6746         DEBUG_PEEP("first:",first,0);
6747         /* Ignore EXACT as we deal with it later. */
6748 	if (PL_regkind[OP(first)] == EXACT) {
6749 	    if (OP(first) == EXACT)
6750 		NOOP;	/* Empty, get anchored substr later. */
6751 	    else
6752 		ri->regstclass = first;
6753 	}
6754 #ifdef TRIE_STCLASS
6755 	else if (PL_regkind[OP(first)] == TRIE &&
6756 	        ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6757 	{
6758 	    regnode *trie_op;
6759 	    /* this can happen only on restudy */
6760 	    if ( OP(first) == TRIE ) {
6761                 struct regnode_1 *trieop = (struct regnode_1 *)
6762 		    PerlMemShared_calloc(1, sizeof(struct regnode_1));
6763                 StructCopy(first,trieop,struct regnode_1);
6764                 trie_op=(regnode *)trieop;
6765             } else {
6766                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6767 		    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6768                 StructCopy(first,trieop,struct regnode_charclass);
6769                 trie_op=(regnode *)trieop;
6770             }
6771             OP(trie_op)+=2;
6772             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6773 	    ri->regstclass = trie_op;
6774 	}
6775 #endif
6776 	else if (REGNODE_SIMPLE(OP(first)))
6777 	    ri->regstclass = first;
6778 	else if (PL_regkind[OP(first)] == BOUND ||
6779 		 PL_regkind[OP(first)] == NBOUND)
6780 	    ri->regstclass = first;
6781 	else if (PL_regkind[OP(first)] == BOL) {
6782             r->intflags |= (OP(first) == MBOL
6783                            ? PREGf_ANCH_MBOL
6784 			   : (OP(first) == SBOL
6785                               ? PREGf_ANCH_SBOL
6786                               : PREGf_ANCH_BOL));
6787 	    first = NEXTOPER(first);
6788 	    goto again;
6789 	}
6790 	else if (OP(first) == GPOS) {
6791             r->intflags |= PREGf_ANCH_GPOS;
6792 	    first = NEXTOPER(first);
6793 	    goto again;
6794 	}
6795 	else if ((!sawopen || !RExC_sawback) &&
6796 	    (OP(first) == STAR &&
6797 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6798             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6799 	{
6800 	    /* turn .* into ^.* with an implied $*=1 */
6801 	    const int type =
6802 		(OP(NEXTOPER(first)) == REG_ANY)
6803                     ? PREGf_ANCH_MBOL
6804                     : PREGf_ANCH_SBOL;
6805             r->intflags |= (type | PREGf_IMPLICIT);
6806 	    first = NEXTOPER(first);
6807 	    goto again;
6808 	}
6809         if (sawplus && !sawminmod && !sawlookahead
6810             && (!sawopen || !RExC_sawback)
6811 	    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6812 	    /* x+ must match at the 1st pos of run of x's */
6813 	    r->intflags |= PREGf_SKIP;
6814 
6815 	/* Scan is after the zeroth branch, first is atomic matcher. */
6816 #ifdef TRIE_STUDY_OPT
6817 	DEBUG_PARSE_r(
6818 	    if (!restudied)
6819 	        PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6820 			      (IV)(first - scan + 1))
6821         );
6822 #else
6823 	DEBUG_PARSE_r(
6824 	    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6825 	        (IV)(first - scan + 1))
6826         );
6827 #endif
6828 
6829 
6830 	/*
6831 	* If there's something expensive in the r.e., find the
6832 	* longest literal string that must appear and make it the
6833 	* regmust.  Resolve ties in favor of later strings, since
6834 	* the regstart check works with the beginning of the r.e.
6835 	* and avoiding duplication strengthens checking.  Not a
6836 	* strong reason, but sufficient in the absence of others.
6837 	* [Now we resolve ties in favor of the earlier string if
6838 	* it happens that c_offset_min has been invalidated, since the
6839 	* earlier string may buy us something the later one won't.]
6840 	*/
6841 
6842 	data.longest_fixed = newSVpvs("");
6843 	data.longest_float = newSVpvs("");
6844 	data.last_found = newSVpvs("");
6845 	data.longest = &(data.longest_fixed);
6846 	ENTER_with_name("study_chunk");
6847 	SAVEFREESV(data.longest_fixed);
6848 	SAVEFREESV(data.longest_float);
6849 	SAVEFREESV(data.last_found);
6850 	first = scan;
6851 	if (!ri->regstclass) {
6852 	    ssc_init(pRExC_state, &ch_class);
6853 	    data.start_class = &ch_class;
6854 	    stclass_flag = SCF_DO_STCLASS_AND;
6855 	} else				/* XXXX Check for BOUND? */
6856 	    stclass_flag = 0;
6857 	data.last_closep = &last_close;
6858 
6859         DEBUG_RExC_seen();
6860 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6861                              scan + RExC_size, /* Up to end */
6862             &data, -1, 0, NULL,
6863             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6864                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6865             0);
6866 
6867 
6868         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6869 
6870 
6871 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6872 	     && data.last_start_min == 0 && data.last_end > 0
6873 	     && !RExC_seen_zerolen
6874              && !(RExC_seen & REG_VERBARG_SEEN)
6875              && !(RExC_seen & REG_GPOS_SEEN)
6876         ){
6877 	    r->extflags |= RXf_CHECK_ALL;
6878         }
6879 	scan_commit(pRExC_state, &data,&minlen,0);
6880 
6881 	longest_float_length = CHR_SVLEN(data.longest_float);
6882 
6883         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6884                    && data.offset_fixed == data.offset_float_min
6885                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6886             && S_setup_longest (aTHX_ pRExC_state,
6887                                     data.longest_float,
6888                                     &(r->float_utf8),
6889                                     &(r->float_substr),
6890                                     &(r->float_end_shift),
6891                                     data.lookbehind_float,
6892                                     data.offset_float_min,
6893                                     data.minlen_float,
6894                                     longest_float_length,
6895                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6896                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6897         {
6898 	    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6899 	    r->float_max_offset = data.offset_float_max;
6900 	    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6901 	        r->float_max_offset -= data.lookbehind_float;
6902 	    SvREFCNT_inc_simple_void_NN(data.longest_float);
6903 	}
6904 	else {
6905 	    r->float_substr = r->float_utf8 = NULL;
6906 	    longest_float_length = 0;
6907 	}
6908 
6909 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6910 
6911         if (S_setup_longest (aTHX_ pRExC_state,
6912                                 data.longest_fixed,
6913                                 &(r->anchored_utf8),
6914                                 &(r->anchored_substr),
6915                                 &(r->anchored_end_shift),
6916                                 data.lookbehind_fixed,
6917                                 data.offset_fixed,
6918                                 data.minlen_fixed,
6919                                 longest_fixed_length,
6920                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6921                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6922         {
6923 	    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6924 	    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6925 	}
6926 	else {
6927 	    r->anchored_substr = r->anchored_utf8 = NULL;
6928 	    longest_fixed_length = 0;
6929 	}
6930 	LEAVE_with_name("study_chunk");
6931 
6932 	if (ri->regstclass
6933 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6934 	    ri->regstclass = NULL;
6935 
6936 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6937 	    && stclass_flag
6938             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6939 	    && !ssc_is_anything(data.start_class))
6940 	{
6941 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6942 
6943             ssc_finalize(pRExC_state, data.start_class);
6944 
6945 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6946 	    StructCopy(data.start_class,
6947 		       (regnode_ssc*)RExC_rxi->data->data[n],
6948 		       regnode_ssc);
6949 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6950 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
6951 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6952 	              regprop(r, sv, (regnode*)data.start_class, NULL);
6953 		      PerlIO_printf(Perl_debug_log,
6954 				    "synthetic stclass \"%s\".\n",
6955 				    SvPVX_const(sv));});
6956             data.start_class = NULL;
6957 	}
6958 
6959         /* A temporary algorithm prefers floated substr to fixed one to dig
6960          * more info. */
6961 	if (longest_fixed_length > longest_float_length) {
6962 	    r->substrs->check_ix = 0;
6963 	    r->check_end_shift = r->anchored_end_shift;
6964 	    r->check_substr = r->anchored_substr;
6965 	    r->check_utf8 = r->anchored_utf8;
6966 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6967             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6968                 r->intflags |= PREGf_NOSCAN;
6969 	}
6970 	else {
6971 	    r->substrs->check_ix = 1;
6972 	    r->check_end_shift = r->float_end_shift;
6973 	    r->check_substr = r->float_substr;
6974 	    r->check_utf8 = r->float_utf8;
6975 	    r->check_offset_min = r->float_min_offset;
6976 	    r->check_offset_max = r->float_max_offset;
6977 	}
6978 	if ((r->check_substr || r->check_utf8) ) {
6979 	    r->extflags |= RXf_USE_INTUIT;
6980 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6981 		r->extflags |= RXf_INTUIT_TAIL;
6982 	}
6983         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6984 
6985 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6986 	if ( (STRLEN)minlen < longest_float_length )
6987             minlen= longest_float_length;
6988         if ( (STRLEN)minlen < longest_fixed_length )
6989             minlen= longest_fixed_length;
6990         */
6991     }
6992     else {
6993 	/* Several toplevels. Best we can is to set minlen. */
6994 	SSize_t fake;
6995 	regnode_ssc ch_class;
6996 	SSize_t last_close = 0;
6997 
6998 	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6999 
7000 	scan = ri->program + 1;
7001 	ssc_init(pRExC_state, &ch_class);
7002 	data.start_class = &ch_class;
7003 	data.last_closep = &last_close;
7004 
7005         DEBUG_RExC_seen();
7006 	minlen = study_chunk(pRExC_state,
7007             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7008             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7009                                                       ? SCF_TRIE_DOING_RESTUDY
7010                                                       : 0),
7011             0);
7012 
7013         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7014 
7015 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7016 		= r->float_substr = r->float_utf8 = NULL;
7017 
7018         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7019             && ! ssc_is_anything(data.start_class))
7020         {
7021 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7022 
7023             ssc_finalize(pRExC_state, data.start_class);
7024 
7025 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7026 	    StructCopy(data.start_class,
7027 		       (regnode_ssc*)RExC_rxi->data->data[n],
7028 		       regnode_ssc);
7029 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7030 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
7031 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7032 	              regprop(r, sv, (regnode*)data.start_class, NULL);
7033 		      PerlIO_printf(Perl_debug_log,
7034 				    "synthetic stclass \"%s\".\n",
7035 				    SvPVX_const(sv));});
7036             data.start_class = NULL;
7037 	}
7038     }
7039 
7040     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7041         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7042         r->maxlen = REG_INFTY;
7043     }
7044     else {
7045         r->maxlen = RExC_maxlen;
7046     }
7047 
7048     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7049        the "real" pattern. */
7050     DEBUG_OPTIMISE_r({
7051         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7052                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7053     });
7054     r->minlenret = minlen;
7055     if (r->minlen < minlen)
7056         r->minlen = minlen;
7057 
7058     if (RExC_seen & REG_GPOS_SEEN)
7059         r->intflags |= PREGf_GPOS_SEEN;
7060     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7061         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7062                                                 lookbehind */
7063     if (pRExC_state->num_code_blocks)
7064 	r->extflags |= RXf_EVAL_SEEN;
7065     if (RExC_seen & REG_CANY_SEEN)
7066         r->intflags |= PREGf_CANY_SEEN;
7067     if (RExC_seen & REG_VERBARG_SEEN)
7068     {
7069 	r->intflags |= PREGf_VERBARG_SEEN;
7070         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7071     }
7072     if (RExC_seen & REG_CUTGROUP_SEEN)
7073 	r->intflags |= PREGf_CUTGROUP_SEEN;
7074     if (pm_flags & PMf_USE_RE_EVAL)
7075 	r->intflags |= PREGf_USE_RE_EVAL;
7076     if (RExC_paren_names)
7077         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7078     else
7079         RXp_PAREN_NAMES(r) = NULL;
7080 
7081     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7082      * so it can be used in pp.c */
7083     if (r->intflags & PREGf_ANCH)
7084         r->extflags |= RXf_IS_ANCHORED;
7085 
7086 
7087     {
7088         /* this is used to identify "special" patterns that might result
7089          * in Perl NOT calling the regex engine and instead doing the match "itself",
7090          * particularly special cases in split//. By having the regex compiler
7091          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7092          * we avoid weird issues with equivalent patterns resulting in different behavior,
7093          * AND we allow non Perl engines to get the same optimizations by the setting the
7094          * flags appropriately - Yves */
7095         regnode *first = ri->program + 1;
7096         U8 fop = OP(first);
7097         regnode *next = NEXTOPER(first);
7098         U8 nop = OP(next);
7099 
7100         if (PL_regkind[fop] == NOTHING && nop == END)
7101             r->extflags |= RXf_NULL;
7102         else if (PL_regkind[fop] == BOL && nop == END)
7103             r->extflags |= RXf_START_ONLY;
7104         else if (fop == PLUS
7105                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7106                  && OP(regnext(first)) == END)
7107             r->extflags |= RXf_WHITE;
7108         else if ( r->extflags & RXf_SPLIT
7109                   && fop == EXACT
7110                   && STR_LEN(first) == 1
7111                   && *(STRING(first)) == ' '
7112                   && OP(regnext(first)) == END )
7113             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7114 
7115     }
7116 
7117     if (RExC_contains_locale) {
7118         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7119     }
7120 
7121 #ifdef DEBUGGING
7122     if (RExC_paren_names) {
7123         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7124         ri->data->data[ri->name_list_idx]
7125                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7126     } else
7127 #endif
7128         ri->name_list_idx = 0;
7129 
7130     if (RExC_recurse_count) {
7131         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7132             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7133             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7134         }
7135     }
7136     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7137     /* assume we don't need to swap parens around before we match */
7138 
7139     DEBUG_DUMP_r({
7140         DEBUG_RExC_seen();
7141         PerlIO_printf(Perl_debug_log,"Final program:\n");
7142         regdump(r);
7143     });
7144 #ifdef RE_TRACK_PATTERN_OFFSETS
7145     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7146         const STRLEN len = ri->u.offsets[0];
7147         STRLEN i;
7148         GET_RE_DEBUG_FLAGS_DECL;
7149         PerlIO_printf(Perl_debug_log,
7150                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7151         for (i = 1; i <= len; i++) {
7152             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7153                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7154                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7155             }
7156         PerlIO_printf(Perl_debug_log, "\n");
7157     });
7158 #endif
7159 
7160 #ifdef USE_ITHREADS
7161     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7162      * by setting the regexp SV to readonly-only instead. If the
7163      * pattern's been recompiled, the USEDness should remain. */
7164     if (old_re && SvREADONLY(old_re))
7165         SvREADONLY_on(rx);
7166 #endif
7167     return rx;
7168 }
7169 
7170 
7171 SV*
7172 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7173                     const U32 flags)
7174 {
7175     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7176 
7177     PERL_UNUSED_ARG(value);
7178 
7179     if (flags & RXapif_FETCH) {
7180         return reg_named_buff_fetch(rx, key, flags);
7181     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7182         Perl_croak_no_modify();
7183         return NULL;
7184     } else if (flags & RXapif_EXISTS) {
7185         return reg_named_buff_exists(rx, key, flags)
7186             ? &PL_sv_yes
7187             : &PL_sv_no;
7188     } else if (flags & RXapif_REGNAMES) {
7189         return reg_named_buff_all(rx, flags);
7190     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7191         return reg_named_buff_scalar(rx, flags);
7192     } else {
7193         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7194         return NULL;
7195     }
7196 }
7197 
7198 SV*
7199 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7200                          const U32 flags)
7201 {
7202     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7203     PERL_UNUSED_ARG(lastkey);
7204 
7205     if (flags & RXapif_FIRSTKEY)
7206         return reg_named_buff_firstkey(rx, flags);
7207     else if (flags & RXapif_NEXTKEY)
7208         return reg_named_buff_nextkey(rx, flags);
7209     else {
7210         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7211                                             (int)flags);
7212         return NULL;
7213     }
7214 }
7215 
7216 SV*
7217 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7218 			  const U32 flags)
7219 {
7220     AV *retarray = NULL;
7221     SV *ret;
7222     struct regexp *const rx = ReANY(r);
7223 
7224     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7225 
7226     if (flags & RXapif_ALL)
7227         retarray=newAV();
7228 
7229     if (rx && RXp_PAREN_NAMES(rx)) {
7230         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7231         if (he_str) {
7232             IV i;
7233             SV* sv_dat=HeVAL(he_str);
7234             I32 *nums=(I32*)SvPVX(sv_dat);
7235             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7236                 if ((I32)(rx->nparens) >= nums[i]
7237                     && rx->offs[nums[i]].start != -1
7238                     && rx->offs[nums[i]].end != -1)
7239                 {
7240                     ret = newSVpvs("");
7241                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7242                     if (!retarray)
7243                         return ret;
7244                 } else {
7245                     if (retarray)
7246                         ret = newSVsv(&PL_sv_undef);
7247                 }
7248                 if (retarray)
7249                     av_push(retarray, ret);
7250             }
7251             if (retarray)
7252                 return newRV_noinc(MUTABLE_SV(retarray));
7253         }
7254     }
7255     return NULL;
7256 }
7257 
7258 bool
7259 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7260                            const U32 flags)
7261 {
7262     struct regexp *const rx = ReANY(r);
7263 
7264     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7265 
7266     if (rx && RXp_PAREN_NAMES(rx)) {
7267         if (flags & RXapif_ALL) {
7268             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7269         } else {
7270 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7271             if (sv) {
7272 		SvREFCNT_dec_NN(sv);
7273                 return TRUE;
7274             } else {
7275                 return FALSE;
7276             }
7277         }
7278     } else {
7279         return FALSE;
7280     }
7281 }
7282 
7283 SV*
7284 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7285 {
7286     struct regexp *const rx = ReANY(r);
7287 
7288     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7289 
7290     if ( rx && RXp_PAREN_NAMES(rx) ) {
7291 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
7292 
7293 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7294     } else {
7295 	return FALSE;
7296     }
7297 }
7298 
7299 SV*
7300 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7301 {
7302     struct regexp *const rx = ReANY(r);
7303     GET_RE_DEBUG_FLAGS_DECL;
7304 
7305     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7306 
7307     if (rx && RXp_PAREN_NAMES(rx)) {
7308         HV *hv = RXp_PAREN_NAMES(rx);
7309         HE *temphe;
7310         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7311             IV i;
7312             IV parno = 0;
7313             SV* sv_dat = HeVAL(temphe);
7314             I32 *nums = (I32*)SvPVX(sv_dat);
7315             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7316                 if ((I32)(rx->lastparen) >= nums[i] &&
7317                     rx->offs[nums[i]].start != -1 &&
7318                     rx->offs[nums[i]].end != -1)
7319                 {
7320                     parno = nums[i];
7321                     break;
7322                 }
7323             }
7324             if (parno || flags & RXapif_ALL) {
7325 		return newSVhek(HeKEY_hek(temphe));
7326             }
7327         }
7328     }
7329     return NULL;
7330 }
7331 
7332 SV*
7333 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7334 {
7335     SV *ret;
7336     AV *av;
7337     SSize_t length;
7338     struct regexp *const rx = ReANY(r);
7339 
7340     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7341 
7342     if (rx && RXp_PAREN_NAMES(rx)) {
7343         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7344             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7345         } else if (flags & RXapif_ONE) {
7346             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7347             av = MUTABLE_AV(SvRV(ret));
7348             length = av_tindex(av);
7349 	    SvREFCNT_dec_NN(ret);
7350             return newSViv(length + 1);
7351         } else {
7352             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7353                                                 (int)flags);
7354             return NULL;
7355         }
7356     }
7357     return &PL_sv_undef;
7358 }
7359 
7360 SV*
7361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7362 {
7363     struct regexp *const rx = ReANY(r);
7364     AV *av = newAV();
7365 
7366     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7367 
7368     if (rx && RXp_PAREN_NAMES(rx)) {
7369         HV *hv= RXp_PAREN_NAMES(rx);
7370         HE *temphe;
7371         (void)hv_iterinit(hv);
7372         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7373             IV i;
7374             IV parno = 0;
7375             SV* sv_dat = HeVAL(temphe);
7376             I32 *nums = (I32*)SvPVX(sv_dat);
7377             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7378                 if ((I32)(rx->lastparen) >= nums[i] &&
7379                     rx->offs[nums[i]].start != -1 &&
7380                     rx->offs[nums[i]].end != -1)
7381                 {
7382                     parno = nums[i];
7383                     break;
7384                 }
7385             }
7386             if (parno || flags & RXapif_ALL) {
7387                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7388             }
7389         }
7390     }
7391 
7392     return newRV_noinc(MUTABLE_SV(av));
7393 }
7394 
7395 void
7396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7397 			     SV * const sv)
7398 {
7399     struct regexp *const rx = ReANY(r);
7400     char *s = NULL;
7401     SSize_t i = 0;
7402     SSize_t s1, t1;
7403     I32 n = paren;
7404 
7405     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7406 
7407     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7408            || n == RX_BUFF_IDX_CARET_FULLMATCH
7409            || n == RX_BUFF_IDX_CARET_POSTMATCH
7410        )
7411     {
7412         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7413         if (!keepcopy) {
7414             /* on something like
7415              *    $r = qr/.../;
7416              *    /$qr/p;
7417              * the KEEPCOPY is set on the PMOP rather than the regex */
7418             if (PL_curpm && r == PM_GETRE(PL_curpm))
7419                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7420         }
7421         if (!keepcopy)
7422             goto ret_undef;
7423     }
7424 
7425     if (!rx->subbeg)
7426         goto ret_undef;
7427 
7428     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7429         /* no need to distinguish between them any more */
7430         n = RX_BUFF_IDX_FULLMATCH;
7431 
7432     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7433         && rx->offs[0].start != -1)
7434     {
7435         /* $`, ${^PREMATCH} */
7436 	i = rx->offs[0].start;
7437 	s = rx->subbeg;
7438     }
7439     else
7440     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7441         && rx->offs[0].end != -1)
7442     {
7443         /* $', ${^POSTMATCH} */
7444 	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7445 	i = rx->sublen + rx->suboffset - rx->offs[0].end;
7446     }
7447     else
7448     if ( 0 <= n && n <= (I32)rx->nparens &&
7449         (s1 = rx->offs[n].start) != -1 &&
7450         (t1 = rx->offs[n].end) != -1)
7451     {
7452         /* $&, ${^MATCH},  $1 ... */
7453         i = t1 - s1;
7454         s = rx->subbeg + s1 - rx->suboffset;
7455     } else {
7456         goto ret_undef;
7457     }
7458 
7459     assert(s >= rx->subbeg);
7460     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7461     if (i >= 0) {
7462 #ifdef NO_TAINT_SUPPORT
7463         sv_setpvn(sv, s, i);
7464 #else
7465         const int oldtainted = TAINT_get;
7466         TAINT_NOT;
7467         sv_setpvn(sv, s, i);
7468         TAINT_set(oldtainted);
7469 #endif
7470         if ( (rx->intflags & PREGf_CANY_SEEN)
7471             ? (RXp_MATCH_UTF8(rx)
7472                         && (!i || is_utf8_string((U8*)s, i)))
7473             : (RXp_MATCH_UTF8(rx)) )
7474         {
7475             SvUTF8_on(sv);
7476         }
7477         else
7478             SvUTF8_off(sv);
7479         if (TAINTING_get) {
7480             if (RXp_MATCH_TAINTED(rx)) {
7481                 if (SvTYPE(sv) >= SVt_PVMG) {
7482                     MAGIC* const mg = SvMAGIC(sv);
7483                     MAGIC* mgt;
7484                     TAINT;
7485                     SvMAGIC_set(sv, mg->mg_moremagic);
7486                     SvTAINT(sv);
7487                     if ((mgt = SvMAGIC(sv))) {
7488                         mg->mg_moremagic = mgt;
7489                         SvMAGIC_set(sv, mg);
7490                     }
7491                 } else {
7492                     TAINT;
7493                     SvTAINT(sv);
7494                 }
7495             } else
7496                 SvTAINTED_off(sv);
7497         }
7498     } else {
7499       ret_undef:
7500         sv_setsv(sv,&PL_sv_undef);
7501         return;
7502     }
7503 }
7504 
7505 void
7506 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7507 							 SV const * const value)
7508 {
7509     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7510 
7511     PERL_UNUSED_ARG(rx);
7512     PERL_UNUSED_ARG(paren);
7513     PERL_UNUSED_ARG(value);
7514 
7515     if (!PL_localizing)
7516         Perl_croak_no_modify();
7517 }
7518 
7519 I32
7520 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7521                               const I32 paren)
7522 {
7523     struct regexp *const rx = ReANY(r);
7524     I32 i;
7525     I32 s1, t1;
7526 
7527     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7528 
7529     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7530         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7531         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7532     )
7533     {
7534         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7535         if (!keepcopy) {
7536             /* on something like
7537              *    $r = qr/.../;
7538              *    /$qr/p;
7539              * the KEEPCOPY is set on the PMOP rather than the regex */
7540             if (PL_curpm && r == PM_GETRE(PL_curpm))
7541                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7542         }
7543         if (!keepcopy)
7544             goto warn_undef;
7545     }
7546 
7547     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7548     switch (paren) {
7549       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7550       case RX_BUFF_IDX_PREMATCH:       /* $` */
7551         if (rx->offs[0].start != -1) {
7552 			i = rx->offs[0].start;
7553 			if (i > 0) {
7554 				s1 = 0;
7555 				t1 = i;
7556 				goto getlen;
7557 			}
7558 	    }
7559         return 0;
7560 
7561       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7562       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7563 	    if (rx->offs[0].end != -1) {
7564 			i = rx->sublen - rx->offs[0].end;
7565 			if (i > 0) {
7566 				s1 = rx->offs[0].end;
7567 				t1 = rx->sublen;
7568 				goto getlen;
7569 			}
7570 	    }
7571         return 0;
7572 
7573       default: /* $& / ${^MATCH}, $1, $2, ... */
7574 	    if (paren <= (I32)rx->nparens &&
7575             (s1 = rx->offs[paren].start) != -1 &&
7576             (t1 = rx->offs[paren].end) != -1)
7577 	    {
7578             i = t1 - s1;
7579             goto getlen;
7580         } else {
7581           warn_undef:
7582             if (ckWARN(WARN_UNINITIALIZED))
7583                 report_uninit((const SV *)sv);
7584             return 0;
7585         }
7586     }
7587   getlen:
7588     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7589         const char * const s = rx->subbeg - rx->suboffset + s1;
7590         const U8 *ep;
7591         STRLEN el;
7592 
7593         i = t1 - s1;
7594         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7595 			i = el;
7596     }
7597     return i;
7598 }
7599 
7600 SV*
7601 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7602 {
7603     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7604 	PERL_UNUSED_ARG(rx);
7605 	if (0)
7606 	    return NULL;
7607 	else
7608 	    return newSVpvs("Regexp");
7609 }
7610 
7611 /* Scans the name of a named buffer from the pattern.
7612  * If flags is REG_RSN_RETURN_NULL returns null.
7613  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7614  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7615  * to the parsed name as looked up in the RExC_paren_names hash.
7616  * If there is an error throws a vFAIL().. type exception.
7617  */
7618 
7619 #define REG_RSN_RETURN_NULL    0
7620 #define REG_RSN_RETURN_NAME    1
7621 #define REG_RSN_RETURN_DATA    2
7622 
7623 STATIC SV*
7624 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7625 {
7626     char *name_start = RExC_parse;
7627 
7628     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7629 
7630     assert (RExC_parse <= RExC_end);
7631     if (RExC_parse == RExC_end) NOOP;
7632     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7633 	 /* skip IDFIRST by using do...while */
7634 	if (UTF)
7635 	    do {
7636 		RExC_parse += UTF8SKIP(RExC_parse);
7637 	    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7638 	else
7639 	    do {
7640 		RExC_parse++;
7641 	    } while (isWORDCHAR(*RExC_parse));
7642     } else {
7643         RExC_parse++; /* so the <- from the vFAIL is after the offending
7644                          character */
7645         vFAIL("Group name must start with a non-digit word character");
7646     }
7647     if ( flags ) {
7648         SV* sv_name
7649 	    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7650 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7651         if ( flags == REG_RSN_RETURN_NAME)
7652             return sv_name;
7653         else if (flags==REG_RSN_RETURN_DATA) {
7654             HE *he_str = NULL;
7655             SV *sv_dat = NULL;
7656             if ( ! sv_name )      /* should not happen*/
7657                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7658             if (RExC_paren_names)
7659                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7660             if ( he_str )
7661                 sv_dat = HeVAL(he_str);
7662             if ( ! sv_dat )
7663                 vFAIL("Reference to nonexistent named group");
7664             return sv_dat;
7665         }
7666         else {
7667             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7668 		       (unsigned long) flags);
7669         }
7670         assert(0); /* NOT REACHED */
7671     }
7672     return NULL;
7673 }
7674 
7675 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7676     int rem=(int)(RExC_end - RExC_parse);                       \
7677     int cut;                                                    \
7678     int num;                                                    \
7679     int iscut=0;                                                \
7680     if (rem>10) {                                               \
7681         rem=10;                                                 \
7682         iscut=1;                                                \
7683     }                                                           \
7684     cut=10-rem;                                                 \
7685     if (RExC_lastparse!=RExC_parse)                             \
7686         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7687             rem, RExC_parse,                                    \
7688             cut + 4,                                            \
7689             iscut ? "..." : "<"                                 \
7690         );                                                      \
7691     else                                                        \
7692         PerlIO_printf(Perl_debug_log,"%16s","");                \
7693                                                                 \
7694     if (SIZE_ONLY)                                              \
7695        num = RExC_size + 1;                                     \
7696     else                                                        \
7697        num=REG_NODE_NUM(RExC_emit);                             \
7698     if (RExC_lastnum!=num)                                      \
7699        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7700     else                                                        \
7701        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7702     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7703         (int)((depth*2)), "",                                   \
7704         (funcname)                                              \
7705     );                                                          \
7706     RExC_lastnum=num;                                           \
7707     RExC_lastparse=RExC_parse;                                  \
7708 })
7709 
7710 
7711 
7712 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7713     DEBUG_PARSE_MSG((funcname));                            \
7714     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7715 })
7716 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7717     DEBUG_PARSE_MSG((funcname));                            \
7718     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7719 })
7720 
7721 /* This section of code defines the inversion list object and its methods.  The
7722  * interfaces are highly subject to change, so as much as possible is static to
7723  * this file.  An inversion list is here implemented as a malloc'd C UV array
7724  * as an SVt_INVLIST scalar.
7725  *
7726  * An inversion list for Unicode is an array of code points, sorted by ordinal
7727  * number.  The zeroth element is the first code point in the list.  The 1th
7728  * element is the first element beyond that not in the list.  In other words,
7729  * the first range is
7730  *  invlist[0]..(invlist[1]-1)
7731  * The other ranges follow.  Thus every element whose index is divisible by two
7732  * marks the beginning of a range that is in the list, and every element not
7733  * divisible by two marks the beginning of a range not in the list.  A single
7734  * element inversion list that contains the single code point N generally
7735  * consists of two elements
7736  *  invlist[0] == N
7737  *  invlist[1] == N+1
7738  * (The exception is when N is the highest representable value on the
7739  * machine, in which case the list containing just it would be a single
7740  * element, itself.  By extension, if the last range in the list extends to
7741  * infinity, then the first element of that range will be in the inversion list
7742  * at a position that is divisible by two, and is the final element in the
7743  * list.)
7744  * Taking the complement (inverting) an inversion list is quite simple, if the
7745  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7746  * This implementation reserves an element at the beginning of each inversion
7747  * list to always contain 0; there is an additional flag in the header which
7748  * indicates if the list begins at the 0, or is offset to begin at the next
7749  * element.
7750  *
7751  * More about inversion lists can be found in "Unicode Demystified"
7752  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7753  * More will be coming when functionality is added later.
7754  *
7755  * The inversion list data structure is currently implemented as an SV pointing
7756  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7757  * array of UV whose memory management is automatically handled by the existing
7758  * facilities for SV's.
7759  *
7760  * Some of the methods should always be private to the implementation, and some
7761  * should eventually be made public */
7762 
7763 /* The header definitions are in F<inline_invlist.c> */
7764 
7765 PERL_STATIC_INLINE UV*
7766 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7767 {
7768     /* Returns a pointer to the first element in the inversion list's array.
7769      * This is called upon initialization of an inversion list.  Where the
7770      * array begins depends on whether the list has the code point U+0000 in it
7771      * or not.  The other parameter tells it whether the code that follows this
7772      * call is about to put a 0 in the inversion list or not.  The first
7773      * element is either the element reserved for 0, if TRUE, or the element
7774      * after it, if FALSE */
7775 
7776     bool* offset = get_invlist_offset_addr(invlist);
7777     UV* zero_addr = (UV *) SvPVX(invlist);
7778 
7779     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7780 
7781     /* Must be empty */
7782     assert(! _invlist_len(invlist));
7783 
7784     *zero_addr = 0;
7785 
7786     /* 1^1 = 0; 1^0 = 1 */
7787     *offset = 1 ^ will_have_0;
7788     return zero_addr + *offset;
7789 }
7790 
7791 PERL_STATIC_INLINE UV*
7792 S_invlist_array(pTHX_ SV* const invlist)
7793 {
7794     /* Returns the pointer to the inversion list's array.  Every time the
7795      * length changes, this needs to be called in case malloc or realloc moved
7796      * it */
7797 
7798     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7799 
7800     /* Must not be empty.  If these fail, you probably didn't check for <len>
7801      * being non-zero before trying to get the array */
7802     assert(_invlist_len(invlist));
7803 
7804     /* The very first element always contains zero, The array begins either
7805      * there, or if the inversion list is offset, at the element after it.
7806      * The offset header field determines which; it contains 0 or 1 to indicate
7807      * how much additionally to add */
7808     assert(0 == *(SvPVX(invlist)));
7809     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7810 }
7811 
7812 PERL_STATIC_INLINE void
7813 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7814 {
7815     /* Sets the current number of elements stored in the inversion list.
7816      * Updates SvCUR correspondingly */
7817 
7818     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7819 
7820     assert(SvTYPE(invlist) == SVt_INVLIST);
7821 
7822     SvCUR_set(invlist,
7823               (len == 0)
7824                ? 0
7825                : TO_INTERNAL_SIZE(len + offset));
7826     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7827 }
7828 
7829 PERL_STATIC_INLINE IV*
7830 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7831 {
7832     /* Return the address of the IV that is reserved to hold the cached index
7833      * */
7834 
7835     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7836 
7837     assert(SvTYPE(invlist) == SVt_INVLIST);
7838 
7839     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7840 }
7841 
7842 PERL_STATIC_INLINE IV
7843 S_invlist_previous_index(pTHX_ SV* const invlist)
7844 {
7845     /* Returns cached index of previous search */
7846 
7847     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7848 
7849     return *get_invlist_previous_index_addr(invlist);
7850 }
7851 
7852 PERL_STATIC_INLINE void
7853 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7854 {
7855     /* Caches <index> for later retrieval */
7856 
7857     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7858 
7859     assert(index == 0 || index < (int) _invlist_len(invlist));
7860 
7861     *get_invlist_previous_index_addr(invlist) = index;
7862 }
7863 
7864 PERL_STATIC_INLINE UV
7865 S_invlist_max(pTHX_ SV* const invlist)
7866 {
7867     /* Returns the maximum number of elements storable in the inversion list's
7868      * array, without having to realloc() */
7869 
7870     PERL_ARGS_ASSERT_INVLIST_MAX;
7871 
7872     assert(SvTYPE(invlist) == SVt_INVLIST);
7873 
7874     /* Assumes worst case, in which the 0 element is not counted in the
7875      * inversion list, so subtracts 1 for that */
7876     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7877            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7878            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7879 }
7880 
7881 #ifndef PERL_IN_XSUB_RE
7882 SV*
7883 Perl__new_invlist(pTHX_ IV initial_size)
7884 {
7885 
7886     /* Return a pointer to a newly constructed inversion list, with enough
7887      * space to store 'initial_size' elements.  If that number is negative, a
7888      * system default is used instead */
7889 
7890     SV* new_list;
7891 
7892     if (initial_size < 0) {
7893 	initial_size = 10;
7894     }
7895 
7896     /* Allocate the initial space */
7897     new_list = newSV_type(SVt_INVLIST);
7898 
7899     /* First 1 is in case the zero element isn't in the list; second 1 is for
7900      * trailing NUL */
7901     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7902     invlist_set_len(new_list, 0, 0);
7903 
7904     /* Force iterinit() to be used to get iteration to work */
7905     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7906 
7907     *get_invlist_previous_index_addr(new_list) = 0;
7908 
7909     return new_list;
7910 }
7911 
7912 SV*
7913 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7914 {
7915     /* Return a pointer to a newly constructed inversion list, initialized to
7916      * point to <list>, which has to be in the exact correct inversion list
7917      * form, including internal fields.  Thus this is a dangerous routine that
7918      * should not be used in the wrong hands.  The passed in 'list' contains
7919      * several header fields at the beginning that are not part of the
7920      * inversion list body proper */
7921 
7922     const STRLEN length = (STRLEN) list[0];
7923     const UV version_id =          list[1];
7924     const bool offset   =    cBOOL(list[2]);
7925 #define HEADER_LENGTH 3
7926     /* If any of the above changes in any way, you must change HEADER_LENGTH
7927      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7928      *      perl -E 'say int(rand 2**31-1)'
7929      */
7930 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7931                                         data structure type, so that one being
7932                                         passed in can be validated to be an
7933                                         inversion list of the correct vintage.
7934                                        */
7935 
7936     SV* invlist = newSV_type(SVt_INVLIST);
7937 
7938     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7939 
7940     if (version_id != INVLIST_VERSION_ID) {
7941         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7942     }
7943 
7944     /* The generated array passed in includes header elements that aren't part
7945      * of the list proper, so start it just after them */
7946     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7947 
7948     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7949 			       shouldn't touch it */
7950 
7951     *(get_invlist_offset_addr(invlist)) = offset;
7952 
7953     /* The 'length' passed to us is the physical number of elements in the
7954      * inversion list.  But if there is an offset the logical number is one
7955      * less than that */
7956     invlist_set_len(invlist, length  - offset, offset);
7957 
7958     invlist_set_previous_index(invlist, 0);
7959 
7960     /* Initialize the iteration pointer. */
7961     invlist_iterfinish(invlist);
7962 
7963     SvREADONLY_on(invlist);
7964 
7965     return invlist;
7966 }
7967 #endif /* ifndef PERL_IN_XSUB_RE */
7968 
7969 STATIC void
7970 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7971 {
7972     /* Grow the maximum size of an inversion list */
7973 
7974     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7975 
7976     assert(SvTYPE(invlist) == SVt_INVLIST);
7977 
7978     /* Add one to account for the zero element at the beginning which may not
7979      * be counted by the calling parameters */
7980     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7981 }
7982 
7983 PERL_STATIC_INLINE void
7984 S_invlist_trim(pTHX_ SV* const invlist)
7985 {
7986     PERL_ARGS_ASSERT_INVLIST_TRIM;
7987 
7988     assert(SvTYPE(invlist) == SVt_INVLIST);
7989 
7990     /* Change the length of the inversion list to how many entries it currently
7991      * has */
7992     SvPV_shrink_to_cur((SV *) invlist);
7993 }
7994 
7995 STATIC void
7996 S__append_range_to_invlist(pTHX_ SV* const invlist,
7997                                  const UV start, const UV end)
7998 {
7999    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8000     * the end of the inversion list.  The range must be above any existing
8001     * ones. */
8002 
8003     UV* array;
8004     UV max = invlist_max(invlist);
8005     UV len = _invlist_len(invlist);
8006     bool offset;
8007 
8008     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8009 
8010     if (len == 0) { /* Empty lists must be initialized */
8011         offset = start != 0;
8012         array = _invlist_array_init(invlist, ! offset);
8013     }
8014     else {
8015 	/* Here, the existing list is non-empty. The current max entry in the
8016 	 * list is generally the first value not in the set, except when the
8017 	 * set extends to the end of permissible values, in which case it is
8018 	 * the first entry in that final set, and so this call is an attempt to
8019 	 * append out-of-order */
8020 
8021 	UV final_element = len - 1;
8022 	array = invlist_array(invlist);
8023 	if (array[final_element] > start
8024 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8025 	{
8026 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8027 		     array[final_element], start,
8028 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8029 	}
8030 
8031 	/* Here, it is a legal append.  If the new range begins with the first
8032 	 * value not in the set, it is extending the set, so the new first
8033 	 * value not in the set is one greater than the newly extended range.
8034 	 * */
8035         offset = *get_invlist_offset_addr(invlist);
8036 	if (array[final_element] == start) {
8037 	    if (end != UV_MAX) {
8038 		array[final_element] = end + 1;
8039 	    }
8040 	    else {
8041 		/* But if the end is the maximum representable on the machine,
8042 		 * just let the range that this would extend to have no end */
8043 		invlist_set_len(invlist, len - 1, offset);
8044 	    }
8045 	    return;
8046 	}
8047     }
8048 
8049     /* Here the new range doesn't extend any existing set.  Add it */
8050 
8051     len += 2;	/* Includes an element each for the start and end of range */
8052 
8053     /* If wll overflow the existing space, extend, which may cause the array to
8054      * be moved */
8055     if (max < len) {
8056 	invlist_extend(invlist, len);
8057 
8058         /* Have to set len here to avoid assert failure in invlist_array() */
8059         invlist_set_len(invlist, len, offset);
8060 
8061 	array = invlist_array(invlist);
8062     }
8063     else {
8064 	invlist_set_len(invlist, len, offset);
8065     }
8066 
8067     /* The next item on the list starts the range, the one after that is
8068      * one past the new range.  */
8069     array[len - 2] = start;
8070     if (end != UV_MAX) {
8071 	array[len - 1] = end + 1;
8072     }
8073     else {
8074 	/* But if the end is the maximum representable on the machine, just let
8075 	 * the range have no end */
8076 	invlist_set_len(invlist, len - 1, offset);
8077     }
8078 }
8079 
8080 #ifndef PERL_IN_XSUB_RE
8081 
8082 IV
8083 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8084 {
8085     /* Searches the inversion list for the entry that contains the input code
8086      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8087      * return value is the index into the list's array of the range that
8088      * contains <cp> */
8089 
8090     IV low = 0;
8091     IV mid;
8092     IV high = _invlist_len(invlist);
8093     const IV highest_element = high - 1;
8094     const UV* array;
8095 
8096     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8097 
8098     /* If list is empty, return failure. */
8099     if (high == 0) {
8100 	return -1;
8101     }
8102 
8103     /* (We can't get the array unless we know the list is non-empty) */
8104     array = invlist_array(invlist);
8105 
8106     mid = invlist_previous_index(invlist);
8107     assert(mid >=0 && mid <= highest_element);
8108 
8109     /* <mid> contains the cache of the result of the previous call to this
8110      * function (0 the first time).  See if this call is for the same result,
8111      * or if it is for mid-1.  This is under the theory that calls to this
8112      * function will often be for related code points that are near each other.
8113      * And benchmarks show that caching gives better results.  We also test
8114      * here if the code point is within the bounds of the list.  These tests
8115      * replace others that would have had to be made anyway to make sure that
8116      * the array bounds were not exceeded, and these give us extra information
8117      * at the same time */
8118     if (cp >= array[mid]) {
8119         if (cp >= array[highest_element]) {
8120             return highest_element;
8121         }
8122 
8123         /* Here, array[mid] <= cp < array[highest_element].  This means that
8124          * the final element is not the answer, so can exclude it; it also
8125          * means that <mid> is not the final element, so can refer to 'mid + 1'
8126          * safely */
8127         if (cp < array[mid + 1]) {
8128             return mid;
8129         }
8130         high--;
8131         low = mid + 1;
8132     }
8133     else { /* cp < aray[mid] */
8134         if (cp < array[0]) { /* Fail if outside the array */
8135             return -1;
8136         }
8137         high = mid;
8138         if (cp >= array[mid - 1]) {
8139             goto found_entry;
8140         }
8141     }
8142 
8143     /* Binary search.  What we are looking for is <i> such that
8144      *	array[i] <= cp < array[i+1]
8145      * The loop below converges on the i+1.  Note that there may not be an
8146      * (i+1)th element in the array, and things work nonetheless */
8147     while (low < high) {
8148 	mid = (low + high) / 2;
8149         assert(mid <= highest_element);
8150 	if (array[mid] <= cp) { /* cp >= array[mid] */
8151 	    low = mid + 1;
8152 
8153 	    /* We could do this extra test to exit the loop early.
8154 	    if (cp < array[low]) {
8155 		return mid;
8156 	    }
8157 	    */
8158 	}
8159 	else { /* cp < array[mid] */
8160 	    high = mid;
8161 	}
8162     }
8163 
8164   found_entry:
8165     high--;
8166     invlist_set_previous_index(invlist, high);
8167     return high;
8168 }
8169 
8170 void
8171 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8172                                     const UV start, const UV end, U8* swatch)
8173 {
8174     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8175      * but is used when the swash has an inversion list.  This makes this much
8176      * faster, as it uses a binary search instead of a linear one.  This is
8177      * intimately tied to that function, and perhaps should be in utf8.c,
8178      * except it is intimately tied to inversion lists as well.  It assumes
8179      * that <swatch> is all 0's on input */
8180 
8181     UV current = start;
8182     const IV len = _invlist_len(invlist);
8183     IV i;
8184     const UV * array;
8185 
8186     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8187 
8188     if (len == 0) { /* Empty inversion list */
8189         return;
8190     }
8191 
8192     array = invlist_array(invlist);
8193 
8194     /* Find which element it is */
8195     i = _invlist_search(invlist, start);
8196 
8197     /* We populate from <start> to <end> */
8198     while (current < end) {
8199         UV upper;
8200 
8201 	/* The inversion list gives the results for every possible code point
8202 	 * after the first one in the list.  Only those ranges whose index is
8203 	 * even are ones that the inversion list matches.  For the odd ones,
8204 	 * and if the initial code point is not in the list, we have to skip
8205 	 * forward to the next element */
8206         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8207             i++;
8208             if (i >= len) { /* Finished if beyond the end of the array */
8209                 return;
8210             }
8211             current = array[i];
8212 	    if (current >= end) {   /* Finished if beyond the end of what we
8213 				       are populating */
8214                 if (LIKELY(end < UV_MAX)) {
8215                     return;
8216                 }
8217 
8218                 /* We get here when the upper bound is the maximum
8219                  * representable on the machine, and we are looking for just
8220                  * that code point.  Have to special case it */
8221                 i = len;
8222                 goto join_end_of_list;
8223             }
8224         }
8225         assert(current >= start);
8226 
8227 	/* The current range ends one below the next one, except don't go past
8228 	 * <end> */
8229         i++;
8230         upper = (i < len && array[i] < end) ? array[i] : end;
8231 
8232 	/* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8233 	 * for each code point in it */
8234         for (; current < upper; current++) {
8235             const STRLEN offset = (STRLEN)(current - start);
8236             swatch[offset >> 3] |= 1 << (offset & 7);
8237         }
8238 
8239     join_end_of_list:
8240 
8241 	/* Quit if at the end of the list */
8242         if (i >= len) {
8243 
8244 	    /* But first, have to deal with the highest possible code point on
8245 	     * the platform.  The previous code assumes that <end> is one
8246 	     * beyond where we want to populate, but that is impossible at the
8247 	     * platform's infinity, so have to handle it specially */
8248             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8249 	    {
8250                 const STRLEN offset = (STRLEN)(end - start);
8251                 swatch[offset >> 3] |= 1 << (offset & 7);
8252             }
8253             return;
8254         }
8255 
8256 	/* Advance to the next range, which will be for code points not in the
8257 	 * inversion list */
8258         current = array[i];
8259     }
8260 
8261     return;
8262 }
8263 
8264 void
8265 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8266                                          const bool complement_b, SV** output)
8267 {
8268     /* Take the union of two inversion lists and point <output> to it.  *output
8269      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8270      * the reference count to that list will be decremented if not already a
8271      * temporary (mortal); otherwise *output will be made correspondingly
8272      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8273      * second list is returned.  If <complement_b> is TRUE, the union is taken
8274      * of the complement (inversion) of <b> instead of b itself.
8275      *
8276      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8277      * Richard Gillam, published by Addison-Wesley, and explained at some
8278      * length there.  The preface says to incorporate its examples into your
8279      * code at your own risk.
8280      *
8281      * The algorithm is like a merge sort.
8282      *
8283      * XXX A potential performance improvement is to keep track as we go along
8284      * if only one of the inputs contributes to the result, meaning the other
8285      * is a subset of that one.  In that case, we can skip the final copy and
8286      * return the larger of the input lists, but then outside code might need
8287      * to keep track of whether to free the input list or not */
8288 
8289     const UV* array_a;    /* a's array */
8290     const UV* array_b;
8291     UV len_a;	    /* length of a's array */
8292     UV len_b;
8293 
8294     SV* u;			/* the resulting union */
8295     UV* array_u;
8296     UV len_u;
8297 
8298     UV i_a = 0;		    /* current index into a's array */
8299     UV i_b = 0;
8300     UV i_u = 0;
8301 
8302     /* running count, as explained in the algorithm source book; items are
8303      * stopped accumulating and are output when the count changes to/from 0.
8304      * The count is incremented when we start a range that's in the set, and
8305      * decremented when we start a range that's not in the set.  So its range
8306      * is 0 to 2.  Only when the count is zero is something not in the set.
8307      */
8308     UV count = 0;
8309 
8310     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8311     assert(a != b);
8312 
8313     /* If either one is empty, the union is the other one */
8314     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8315         bool make_temp = FALSE; /* Should we mortalize the result? */
8316 
8317 	if (*output == a) {
8318             if (a != NULL) {
8319                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8320                     SvREFCNT_dec_NN(a);
8321                 }
8322             }
8323 	}
8324 	if (*output != b) {
8325 	    *output = invlist_clone(b);
8326             if (complement_b) {
8327                 _invlist_invert(*output);
8328             }
8329 	} /* else *output already = b; */
8330 
8331         if (make_temp) {
8332             sv_2mortal(*output);
8333         }
8334 	return;
8335     }
8336     else if ((len_b = _invlist_len(b)) == 0) {
8337         bool make_temp = FALSE;
8338 	if (*output == b) {
8339             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8340                 SvREFCNT_dec_NN(b);
8341             }
8342 	}
8343 
8344         /* The complement of an empty list is a list that has everything in it,
8345          * so the union with <a> includes everything too */
8346         if (complement_b) {
8347             if (a == *output) {
8348                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8349                     SvREFCNT_dec_NN(a);
8350                 }
8351             }
8352             *output = _new_invlist(1);
8353             _append_range_to_invlist(*output, 0, UV_MAX);
8354         }
8355         else if (*output != a) {
8356             *output = invlist_clone(a);
8357         }
8358         /* else *output already = a; */
8359 
8360         if (make_temp) {
8361             sv_2mortal(*output);
8362         }
8363 	return;
8364     }
8365 
8366     /* Here both lists exist and are non-empty */
8367     array_a = invlist_array(a);
8368     array_b = invlist_array(b);
8369 
8370     /* If are to take the union of 'a' with the complement of b, set it
8371      * up so are looking at b's complement. */
8372     if (complement_b) {
8373 
8374 	/* To complement, we invert: if the first element is 0, remove it.  To
8375 	 * do this, we just pretend the array starts one later */
8376         if (array_b[0] == 0) {
8377             array_b++;
8378             len_b--;
8379         }
8380         else {
8381 
8382             /* But if the first element is not zero, we pretend the list starts
8383              * at the 0 that is always stored immediately before the array. */
8384             array_b--;
8385             len_b++;
8386         }
8387     }
8388 
8389     /* Size the union for the worst case: that the sets are completely
8390      * disjoint */
8391     u = _new_invlist(len_a + len_b);
8392 
8393     /* Will contain U+0000 if either component does */
8394     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8395 				      || (len_b > 0 && array_b[0] == 0));
8396 
8397     /* Go through each list item by item, stopping when exhausted one of
8398      * them */
8399     while (i_a < len_a && i_b < len_b) {
8400 	UV cp;	    /* The element to potentially add to the union's array */
8401 	bool cp_in_set;   /* is it in the the input list's set or not */
8402 
8403 	/* We need to take one or the other of the two inputs for the union.
8404 	 * Since we are merging two sorted lists, we take the smaller of the
8405 	 * next items.  In case of a tie, we take the one that is in its set
8406 	 * first.  If we took one not in the set first, it would decrement the
8407 	 * count, possibly to 0 which would cause it to be output as ending the
8408 	 * range, and the next time through we would take the same number, and
8409 	 * output it again as beginning the next range.  By doing it the
8410 	 * opposite way, there is no possibility that the count will be
8411 	 * momentarily decremented to 0, and thus the two adjoining ranges will
8412 	 * be seamlessly merged.  (In a tie and both are in the set or both not
8413 	 * in the set, it doesn't matter which we take first.) */
8414 	if (array_a[i_a] < array_b[i_b]
8415 	    || (array_a[i_a] == array_b[i_b]
8416 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8417 	{
8418 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8419 	    cp= array_a[i_a++];
8420 	}
8421 	else {
8422 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8423 	    cp = array_b[i_b++];
8424 	}
8425 
8426 	/* Here, have chosen which of the two inputs to look at.  Only output
8427 	 * if the running count changes to/from 0, which marks the
8428 	 * beginning/end of a range in that's in the set */
8429 	if (cp_in_set) {
8430 	    if (count == 0) {
8431 		array_u[i_u++] = cp;
8432 	    }
8433 	    count++;
8434 	}
8435 	else {
8436 	    count--;
8437 	    if (count == 0) {
8438 		array_u[i_u++] = cp;
8439 	    }
8440 	}
8441     }
8442 
8443     /* Here, we are finished going through at least one of the lists, which
8444      * means there is something remaining in at most one.  We check if the list
8445      * that hasn't been exhausted is positioned such that we are in the middle
8446      * of a range in its set or not.  (i_a and i_b point to the element beyond
8447      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8448      * is potentially more to output.
8449      * There are four cases:
8450      *	1) Both weren't in their sets, count is 0, and remains 0.  What's left
8451      *	   in the union is entirely from the non-exhausted set.
8452      *	2) Both were in their sets, count is 2.  Nothing further should
8453      *	   be output, as everything that remains will be in the exhausted
8454      *	   list's set, hence in the union; decrementing to 1 but not 0 insures
8455      *	   that
8456      *	3) the exhausted was in its set, non-exhausted isn't, count is 1.
8457      *	   Nothing further should be output because the union includes
8458      *	   everything from the exhausted set.  Not decrementing ensures that.
8459      *	4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8460      *	   decrementing to 0 insures that we look at the remainder of the
8461      *	   non-exhausted set */
8462     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8463 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8464     {
8465 	count--;
8466     }
8467 
8468     /* The final length is what we've output so far, plus what else is about to
8469      * be output.  (If 'count' is non-zero, then the input list we exhausted
8470      * has everything remaining up to the machine's limit in its set, and hence
8471      * in the union, so there will be no further output. */
8472     len_u = i_u;
8473     if (count == 0) {
8474 	/* At most one of the subexpressions will be non-zero */
8475 	len_u += (len_a - i_a) + (len_b - i_b);
8476     }
8477 
8478     /* Set result to final length, which can change the pointer to array_u, so
8479      * re-find it */
8480     if (len_u != _invlist_len(u)) {
8481 	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8482 	invlist_trim(u);
8483 	array_u = invlist_array(u);
8484     }
8485 
8486     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8487      * the other) ended with everything above it not in its set.  That means
8488      * that the remaining part of the union is precisely the same as the
8489      * non-exhausted list, so can just copy it unchanged.  (If both list were
8490      * exhausted at the same time, then the operations below will be both 0.)
8491      */
8492     if (count == 0) {
8493 	IV copy_count; /* At most one will have a non-zero copy count */
8494 	if ((copy_count = len_a - i_a) > 0) {
8495 	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8496 	}
8497 	else if ((copy_count = len_b - i_b) > 0) {
8498 	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8499 	}
8500     }
8501 
8502     /*  We may be removing a reference to one of the inputs.  If so, the output
8503      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8504      *  count decremented) */
8505     if (a == *output || b == *output) {
8506         assert(! invlist_is_iterating(*output));
8507         if ((SvTEMP(*output))) {
8508             sv_2mortal(u);
8509         }
8510         else {
8511             SvREFCNT_dec_NN(*output);
8512         }
8513     }
8514 
8515     *output = u;
8516 
8517     return;
8518 }
8519 
8520 void
8521 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8522                                                const bool complement_b, SV** i)
8523 {
8524     /* Take the intersection of two inversion lists and point <i> to it.  *i
8525      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8526      * the reference count to that list will be decremented if not already a
8527      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8528      * The first list, <a>, may be NULL, in which case an empty list is
8529      * returned.  If <complement_b> is TRUE, the result will be the
8530      * intersection of <a> and the complement (or inversion) of <b> instead of
8531      * <b> directly.
8532      *
8533      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8534      * Richard Gillam, published by Addison-Wesley, and explained at some
8535      * length there.  The preface says to incorporate its examples into your
8536      * code at your own risk.  In fact, it had bugs
8537      *
8538      * The algorithm is like a merge sort, and is essentially the same as the
8539      * union above
8540      */
8541 
8542     const UV* array_a;		/* a's array */
8543     const UV* array_b;
8544     UV len_a;	/* length of a's array */
8545     UV len_b;
8546 
8547     SV* r;		     /* the resulting intersection */
8548     UV* array_r;
8549     UV len_r;
8550 
8551     UV i_a = 0;		    /* current index into a's array */
8552     UV i_b = 0;
8553     UV i_r = 0;
8554 
8555     /* running count, as explained in the algorithm source book; items are
8556      * stopped accumulating and are output when the count changes to/from 2.
8557      * The count is incremented when we start a range that's in the set, and
8558      * decremented when we start a range that's not in the set.  So its range
8559      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8560      */
8561     UV count = 0;
8562 
8563     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8564     assert(a != b);
8565 
8566     /* Special case if either one is empty */
8567     len_a = (a == NULL) ? 0 : _invlist_len(a);
8568     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8569         bool make_temp = FALSE;
8570 
8571         if (len_a != 0 && complement_b) {
8572 
8573             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8574              * be empty.  Here, also we are using 'b's complement, which hence
8575              * must be every possible code point.  Thus the intersection is
8576              * simply 'a'. */
8577             if (*i != a) {
8578                 if (*i == b) {
8579                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8580                         SvREFCNT_dec_NN(b);
8581                     }
8582                 }
8583 
8584                 *i = invlist_clone(a);
8585             }
8586             /* else *i is already 'a' */
8587 
8588             if (make_temp) {
8589                 sv_2mortal(*i);
8590             }
8591             return;
8592         }
8593 
8594         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8595          * intersection must be empty */
8596 	if (*i == a) {
8597             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8598                 SvREFCNT_dec_NN(a);
8599             }
8600 	}
8601 	else if (*i == b) {
8602             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8603                 SvREFCNT_dec_NN(b);
8604             }
8605 	}
8606 	*i = _new_invlist(0);
8607         if (make_temp) {
8608             sv_2mortal(*i);
8609         }
8610 
8611 	return;
8612     }
8613 
8614     /* Here both lists exist and are non-empty */
8615     array_a = invlist_array(a);
8616     array_b = invlist_array(b);
8617 
8618     /* If are to take the intersection of 'a' with the complement of b, set it
8619      * up so are looking at b's complement. */
8620     if (complement_b) {
8621 
8622 	/* To complement, we invert: if the first element is 0, remove it.  To
8623 	 * do this, we just pretend the array starts one later */
8624         if (array_b[0] == 0) {
8625             array_b++;
8626             len_b--;
8627         }
8628         else {
8629 
8630             /* But if the first element is not zero, we pretend the list starts
8631              * at the 0 that is always stored immediately before the array. */
8632             array_b--;
8633             len_b++;
8634         }
8635     }
8636 
8637     /* Size the intersection for the worst case: that the intersection ends up
8638      * fragmenting everything to be completely disjoint */
8639     r= _new_invlist(len_a + len_b);
8640 
8641     /* Will contain U+0000 iff both components do */
8642     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8643 				     && len_b > 0 && array_b[0] == 0);
8644 
8645     /* Go through each list item by item, stopping when exhausted one of
8646      * them */
8647     while (i_a < len_a && i_b < len_b) {
8648 	UV cp;	    /* The element to potentially add to the intersection's
8649 		       array */
8650 	bool cp_in_set;	/* Is it in the input list's set or not */
8651 
8652 	/* We need to take one or the other of the two inputs for the
8653 	 * intersection.  Since we are merging two sorted lists, we take the
8654 	 * smaller of the next items.  In case of a tie, we take the one that
8655 	 * is not in its set first (a difference from the union algorithm).  If
8656 	 * we took one in the set first, it would increment the count, possibly
8657 	 * to 2 which would cause it to be output as starting a range in the
8658 	 * intersection, and the next time through we would take that same
8659 	 * number, and output it again as ending the set.  By doing it the
8660 	 * opposite of this, there is no possibility that the count will be
8661 	 * momentarily incremented to 2.  (In a tie and both are in the set or
8662 	 * both not in the set, it doesn't matter which we take first.) */
8663 	if (array_a[i_a] < array_b[i_b]
8664 	    || (array_a[i_a] == array_b[i_b]
8665 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8666 	{
8667 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8668 	    cp= array_a[i_a++];
8669 	}
8670 	else {
8671 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8672 	    cp= array_b[i_b++];
8673 	}
8674 
8675 	/* Here, have chosen which of the two inputs to look at.  Only output
8676 	 * if the running count changes to/from 2, which marks the
8677 	 * beginning/end of a range that's in the intersection */
8678 	if (cp_in_set) {
8679 	    count++;
8680 	    if (count == 2) {
8681 		array_r[i_r++] = cp;
8682 	    }
8683 	}
8684 	else {
8685 	    if (count == 2) {
8686 		array_r[i_r++] = cp;
8687 	    }
8688 	    count--;
8689 	}
8690     }
8691 
8692     /* Here, we are finished going through at least one of the lists, which
8693      * means there is something remaining in at most one.  We check if the list
8694      * that has been exhausted is positioned such that we are in the middle
8695      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8696      * the ones we care about.)  There are four cases:
8697      *	1) Both weren't in their sets, count is 0, and remains 0.  There's
8698      *	   nothing left in the intersection.
8699      *	2) Both were in their sets, count is 2 and perhaps is incremented to
8700      *	   above 2.  What should be output is exactly that which is in the
8701      *	   non-exhausted set, as everything it has is also in the intersection
8702      *	   set, and everything it doesn't have can't be in the intersection
8703      *	3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8704      *	   gets incremented to 2.  Like the previous case, the intersection is
8705      *	   everything that remains in the non-exhausted set.
8706      *	4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8707      *	   remains 1.  And the intersection has nothing more. */
8708     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8709 	|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8710     {
8711 	count++;
8712     }
8713 
8714     /* The final length is what we've output so far plus what else is in the
8715      * intersection.  At most one of the subexpressions below will be non-zero
8716      * */
8717     len_r = i_r;
8718     if (count >= 2) {
8719 	len_r += (len_a - i_a) + (len_b - i_b);
8720     }
8721 
8722     /* Set result to final length, which can change the pointer to array_r, so
8723      * re-find it */
8724     if (len_r != _invlist_len(r)) {
8725 	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8726 	invlist_trim(r);
8727 	array_r = invlist_array(r);
8728     }
8729 
8730     /* Finish outputting any remaining */
8731     if (count >= 2) { /* At most one will have a non-zero copy count */
8732 	IV copy_count;
8733 	if ((copy_count = len_a - i_a) > 0) {
8734 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8735 	}
8736 	else if ((copy_count = len_b - i_b) > 0) {
8737 	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8738 	}
8739     }
8740 
8741     /*  We may be removing a reference to one of the inputs.  If so, the output
8742      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8743      *  count decremented) */
8744     if (a == *i || b == *i) {
8745         assert(! invlist_is_iterating(*i));
8746         if (SvTEMP(*i)) {
8747             sv_2mortal(r);
8748         }
8749         else {
8750             SvREFCNT_dec_NN(*i);
8751         }
8752     }
8753 
8754     *i = r;
8755 
8756     return;
8757 }
8758 
8759 SV*
8760 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8761 {
8762     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8763      * set.  A pointer to the inversion list is returned.  This may actually be
8764      * a new list, in which case the passed in one has been destroyed.  The
8765      * passed in inversion list can be NULL, in which case a new one is created
8766      * with just the one range in it */
8767 
8768     SV* range_invlist;
8769     UV len;
8770 
8771     if (invlist == NULL) {
8772 	invlist = _new_invlist(2);
8773 	len = 0;
8774     }
8775     else {
8776 	len = _invlist_len(invlist);
8777     }
8778 
8779     /* If comes after the final entry actually in the list, can just append it
8780      * to the end, */
8781     if (len == 0
8782 	|| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8783             && start >= invlist_array(invlist)[len - 1]))
8784     {
8785 	_append_range_to_invlist(invlist, start, end);
8786 	return invlist;
8787     }
8788 
8789     /* Here, can't just append things, create and return a new inversion list
8790      * which is the union of this range and the existing inversion list */
8791     range_invlist = _new_invlist(2);
8792     _append_range_to_invlist(range_invlist, start, end);
8793 
8794     _invlist_union(invlist, range_invlist, &invlist);
8795 
8796     /* The temporary can be freed */
8797     SvREFCNT_dec_NN(range_invlist);
8798 
8799     return invlist;
8800 }
8801 
8802 SV*
8803 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8804                                  UV** other_elements_ptr)
8805 {
8806     /* Create and return an inversion list whose contents are to be populated
8807      * by the caller.  The caller gives the number of elements (in 'size') and
8808      * the very first element ('element0').  This function will set
8809      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8810      * are to be placed.
8811      *
8812      * Obviously there is some trust involved that the caller will properly
8813      * fill in the other elements of the array.
8814      *
8815      * (The first element needs to be passed in, as the underlying code does
8816      * things differently depending on whether it is zero or non-zero) */
8817 
8818     SV* invlist = _new_invlist(size);
8819     bool offset;
8820 
8821     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8822 
8823     _append_range_to_invlist(invlist, element0, element0);
8824     offset = *get_invlist_offset_addr(invlist);
8825 
8826     invlist_set_len(invlist, size, offset);
8827     *other_elements_ptr = invlist_array(invlist) + 1;
8828     return invlist;
8829 }
8830 
8831 #endif
8832 
8833 PERL_STATIC_INLINE SV*
8834 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8835     return _add_range_to_invlist(invlist, cp, cp);
8836 }
8837 
8838 #ifndef PERL_IN_XSUB_RE
8839 void
8840 Perl__invlist_invert(pTHX_ SV* const invlist)
8841 {
8842     /* Complement the input inversion list.  This adds a 0 if the list didn't
8843      * have a zero; removes it otherwise.  As described above, the data
8844      * structure is set up so that this is very efficient */
8845 
8846     PERL_ARGS_ASSERT__INVLIST_INVERT;
8847 
8848     assert(! invlist_is_iterating(invlist));
8849 
8850     /* The inverse of matching nothing is matching everything */
8851     if (_invlist_len(invlist) == 0) {
8852 	_append_range_to_invlist(invlist, 0, UV_MAX);
8853 	return;
8854     }
8855 
8856     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8857 }
8858 
8859 #endif
8860 
8861 PERL_STATIC_INLINE SV*
8862 S_invlist_clone(pTHX_ SV* const invlist)
8863 {
8864 
8865     /* Return a new inversion list that is a copy of the input one, which is
8866      * unchanged.  The new list will not be mortal even if the old one was. */
8867 
8868     /* Need to allocate extra space to accommodate Perl's addition of a
8869      * trailing NUL to SvPV's, since it thinks they are always strings */
8870     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8871     STRLEN physical_length = SvCUR(invlist);
8872     bool offset = *(get_invlist_offset_addr(invlist));
8873 
8874     PERL_ARGS_ASSERT_INVLIST_CLONE;
8875 
8876     *(get_invlist_offset_addr(new_invlist)) = offset;
8877     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8878     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8879 
8880     return new_invlist;
8881 }
8882 
8883 PERL_STATIC_INLINE STRLEN*
8884 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8885 {
8886     /* Return the address of the UV that contains the current iteration
8887      * position */
8888 
8889     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8890 
8891     assert(SvTYPE(invlist) == SVt_INVLIST);
8892 
8893     return &(((XINVLIST*) SvANY(invlist))->iterator);
8894 }
8895 
8896 PERL_STATIC_INLINE void
8897 S_invlist_iterinit(pTHX_ SV* invlist)	/* Initialize iterator for invlist */
8898 {
8899     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8900 
8901     *get_invlist_iter_addr(invlist) = 0;
8902 }
8903 
8904 PERL_STATIC_INLINE void
8905 S_invlist_iterfinish(pTHX_ SV* invlist)
8906 {
8907     /* Terminate iterator for invlist.  This is to catch development errors.
8908      * Any iteration that is interrupted before completed should call this
8909      * function.  Functions that add code points anywhere else but to the end
8910      * of an inversion list assert that they are not in the middle of an
8911      * iteration.  If they were, the addition would make the iteration
8912      * problematical: if the iteration hadn't reached the place where things
8913      * were being added, it would be ok */
8914 
8915     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8916 
8917     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8918 }
8919 
8920 STATIC bool
8921 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8922 {
8923     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8924      * This call sets in <*start> and <*end>, the next range in <invlist>.
8925      * Returns <TRUE> if successful and the next call will return the next
8926      * range; <FALSE> if was already at the end of the list.  If the latter,
8927      * <*start> and <*end> are unchanged, and the next call to this function
8928      * will start over at the beginning of the list */
8929 
8930     STRLEN* pos = get_invlist_iter_addr(invlist);
8931     UV len = _invlist_len(invlist);
8932     UV *array;
8933 
8934     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8935 
8936     if (*pos >= len) {
8937 	*pos = (STRLEN) UV_MAX;	/* Force iterinit() to be required next time */
8938 	return FALSE;
8939     }
8940 
8941     array = invlist_array(invlist);
8942 
8943     *start = array[(*pos)++];
8944 
8945     if (*pos >= len) {
8946 	*end = UV_MAX;
8947     }
8948     else {
8949 	*end = array[(*pos)++] - 1;
8950     }
8951 
8952     return TRUE;
8953 }
8954 
8955 PERL_STATIC_INLINE bool
8956 S_invlist_is_iterating(pTHX_ SV* const invlist)
8957 {
8958     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8959 
8960     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8961 }
8962 
8963 PERL_STATIC_INLINE UV
8964 S_invlist_highest(pTHX_ SV* const invlist)
8965 {
8966     /* Returns the highest code point that matches an inversion list.  This API
8967      * has an ambiguity, as it returns 0 under either the highest is actually
8968      * 0, or if the list is empty.  If this distinction matters to you, check
8969      * for emptiness before calling this function */
8970 
8971     UV len = _invlist_len(invlist);
8972     UV *array;
8973 
8974     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8975 
8976     if (len == 0) {
8977 	return 0;
8978     }
8979 
8980     array = invlist_array(invlist);
8981 
8982     /* The last element in the array in the inversion list always starts a
8983      * range that goes to infinity.  That range may be for code points that are
8984      * matched in the inversion list, or it may be for ones that aren't
8985      * matched.  In the latter case, the highest code point in the set is one
8986      * less than the beginning of this range; otherwise it is the final element
8987      * of this range: infinity */
8988     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8989            ? UV_MAX
8990            : array[len - 1] - 1;
8991 }
8992 
8993 #ifndef PERL_IN_XSUB_RE
8994 SV *
8995 Perl__invlist_contents(pTHX_ SV* const invlist)
8996 {
8997     /* Get the contents of an inversion list into a string SV so that they can
8998      * be printed out.  It uses the format traditionally done for debug tracing
8999      */
9000 
9001     UV start, end;
9002     SV* output = newSVpvs("\n");
9003 
9004     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9005 
9006     assert(! invlist_is_iterating(invlist));
9007 
9008     invlist_iterinit(invlist);
9009     while (invlist_iternext(invlist, &start, &end)) {
9010 	if (end == UV_MAX) {
9011 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9012 	}
9013 	else if (end != start) {
9014 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9015 		    start,       end);
9016 	}
9017 	else {
9018 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9019 	}
9020     }
9021 
9022     return output;
9023 }
9024 #endif
9025 
9026 #ifndef PERL_IN_XSUB_RE
9027 void
9028 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9029                          const char * const indent, SV* const invlist)
9030 {
9031     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9032      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9033      * the string 'indent'.  The output looks like this:
9034          [0] 0x000A .. 0x000D
9035          [2] 0x0085
9036          [4] 0x2028 .. 0x2029
9037          [6] 0x3104 .. INFINITY
9038      * This means that the first range of code points matched by the list are
9039      * 0xA through 0xD; the second range contains only the single code point
9040      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9041      * are used to define each range (except if the final range extends to
9042      * infinity, only a single element is needed).  The array index of the
9043      * first element for the corresponding range is given in brackets. */
9044 
9045     UV start, end;
9046     STRLEN count = 0;
9047 
9048     PERL_ARGS_ASSERT__INVLIST_DUMP;
9049 
9050     if (invlist_is_iterating(invlist)) {
9051         Perl_dump_indent(aTHX_ level, file,
9052              "%sCan't dump inversion list because is in middle of iterating\n",
9053              indent);
9054         return;
9055     }
9056 
9057     invlist_iterinit(invlist);
9058     while (invlist_iternext(invlist, &start, &end)) {
9059 	if (end == UV_MAX) {
9060 	    Perl_dump_indent(aTHX_ level, file,
9061                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9062                                    indent, (UV)count, start);
9063 	}
9064 	else if (end != start) {
9065 	    Perl_dump_indent(aTHX_ level, file,
9066                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9067 		                indent, (UV)count, start,         end);
9068 	}
9069 	else {
9070 	    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9071                                             indent, (UV)count, start);
9072 	}
9073         count += 2;
9074     }
9075 }
9076 #endif
9077 
9078 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9079 bool
9080 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9081 {
9082     /* Return a boolean as to if the two passed in inversion lists are
9083      * identical.  The final argument, if TRUE, says to take the complement of
9084      * the second inversion list before doing the comparison */
9085 
9086     const UV* array_a = invlist_array(a);
9087     const UV* array_b = invlist_array(b);
9088     UV len_a = _invlist_len(a);
9089     UV len_b = _invlist_len(b);
9090 
9091     UV i = 0;		    /* current index into the arrays */
9092     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9093 
9094     PERL_ARGS_ASSERT__INVLISTEQ;
9095 
9096     /* If are to compare 'a' with the complement of b, set it
9097      * up so are looking at b's complement. */
9098     if (complement_b) {
9099 
9100         /* The complement of nothing is everything, so <a> would have to have
9101          * just one element, starting at zero (ending at infinity) */
9102         if (len_b == 0) {
9103             return (len_a == 1 && array_a[0] == 0);
9104         }
9105         else if (array_b[0] == 0) {
9106 
9107             /* Otherwise, to complement, we invert.  Here, the first element is
9108              * 0, just remove it.  To do this, we just pretend the array starts
9109              * one later */
9110 
9111             array_b++;
9112             len_b--;
9113         }
9114         else {
9115 
9116             /* But if the first element is not zero, we pretend the list starts
9117              * at the 0 that is always stored immediately before the array. */
9118             array_b--;
9119             len_b++;
9120         }
9121     }
9122 
9123     /* Make sure that the lengths are the same, as well as the final element
9124      * before looping through the remainder.  (Thus we test the length, final,
9125      * and first elements right off the bat) */
9126     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9127         retval = FALSE;
9128     }
9129     else for (i = 0; i < len_a - 1; i++) {
9130         if (array_a[i] != array_b[i]) {
9131             retval = FALSE;
9132             break;
9133         }
9134     }
9135 
9136     return retval;
9137 }
9138 #endif
9139 
9140 #undef HEADER_LENGTH
9141 #undef TO_INTERNAL_SIZE
9142 #undef FROM_INTERNAL_SIZE
9143 #undef INVLIST_VERSION_ID
9144 
9145 /* End of inversion list object */
9146 
9147 STATIC void
9148 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9149 {
9150     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9151      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9152      * should point to the first flag; it is updated on output to point to the
9153      * final ')' or ':'.  There needs to be at least one flag, or this will
9154      * abort */
9155 
9156     /* for (?g), (?gc), and (?o) warnings; warning
9157        about (?c) will warn about (?g) -- japhy    */
9158 
9159 #define WASTED_O  0x01
9160 #define WASTED_G  0x02
9161 #define WASTED_C  0x04
9162 #define WASTED_GC (WASTED_G|WASTED_C)
9163     I32 wastedflags = 0x00;
9164     U32 posflags = 0, negflags = 0;
9165     U32 *flagsp = &posflags;
9166     char has_charset_modifier = '\0';
9167     regex_charset cs;
9168     bool has_use_defaults = FALSE;
9169     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9170 
9171     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9172 
9173     /* '^' as an initial flag sets certain defaults */
9174     if (UCHARAT(RExC_parse) == '^') {
9175         RExC_parse++;
9176         has_use_defaults = TRUE;
9177         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9178         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9179                                         ? REGEX_UNICODE_CHARSET
9180                                         : REGEX_DEPENDS_CHARSET);
9181     }
9182 
9183     cs = get_regex_charset(RExC_flags);
9184     if (cs == REGEX_DEPENDS_CHARSET
9185         && (RExC_utf8 || RExC_uni_semantics))
9186     {
9187         cs = REGEX_UNICODE_CHARSET;
9188     }
9189 
9190     while (*RExC_parse) {
9191         /* && strchr("iogcmsx", *RExC_parse) */
9192         /* (?g), (?gc) and (?o) are useless here
9193            and must be globally applied -- japhy */
9194         switch (*RExC_parse) {
9195 
9196             /* Code for the imsx flags */
9197             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9198 
9199             case LOCALE_PAT_MOD:
9200                 if (has_charset_modifier) {
9201                     goto excess_modifier;
9202                 }
9203                 else if (flagsp == &negflags) {
9204                     goto neg_modifier;
9205                 }
9206                 cs = REGEX_LOCALE_CHARSET;
9207                 has_charset_modifier = LOCALE_PAT_MOD;
9208                 break;
9209             case UNICODE_PAT_MOD:
9210                 if (has_charset_modifier) {
9211                     goto excess_modifier;
9212                 }
9213                 else if (flagsp == &negflags) {
9214                     goto neg_modifier;
9215                 }
9216                 cs = REGEX_UNICODE_CHARSET;
9217                 has_charset_modifier = UNICODE_PAT_MOD;
9218                 break;
9219             case ASCII_RESTRICT_PAT_MOD:
9220                 if (flagsp == &negflags) {
9221                     goto neg_modifier;
9222                 }
9223                 if (has_charset_modifier) {
9224                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9225                         goto excess_modifier;
9226                     }
9227                     /* Doubled modifier implies more restricted */
9228                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9229                 }
9230                 else {
9231                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9232                 }
9233                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9234                 break;
9235             case DEPENDS_PAT_MOD:
9236                 if (has_use_defaults) {
9237                     goto fail_modifiers;
9238                 }
9239                 else if (flagsp == &negflags) {
9240                     goto neg_modifier;
9241                 }
9242                 else if (has_charset_modifier) {
9243                     goto excess_modifier;
9244                 }
9245 
9246                 /* The dual charset means unicode semantics if the
9247                  * pattern (or target, not known until runtime) are
9248                  * utf8, or something in the pattern indicates unicode
9249                  * semantics */
9250                 cs = (RExC_utf8 || RExC_uni_semantics)
9251                      ? REGEX_UNICODE_CHARSET
9252                      : REGEX_DEPENDS_CHARSET;
9253                 has_charset_modifier = DEPENDS_PAT_MOD;
9254                 break;
9255             excess_modifier:
9256                 RExC_parse++;
9257                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9258                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9259                 }
9260                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9261                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9262                                         *(RExC_parse - 1));
9263                 }
9264                 else {
9265                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9266                 }
9267                 /*NOTREACHED*/
9268             neg_modifier:
9269                 RExC_parse++;
9270                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9271                                     *(RExC_parse - 1));
9272                 /*NOTREACHED*/
9273             case ONCE_PAT_MOD: /* 'o' */
9274             case GLOBAL_PAT_MOD: /* 'g' */
9275                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9276                     const I32 wflagbit = *RExC_parse == 'o'
9277                                          ? WASTED_O
9278                                          : WASTED_G;
9279                     if (! (wastedflags & wflagbit) ) {
9280                         wastedflags |= wflagbit;
9281 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9282                         vWARN5(
9283                             RExC_parse + 1,
9284                             "Useless (%s%c) - %suse /%c modifier",
9285                             flagsp == &negflags ? "?-" : "?",
9286                             *RExC_parse,
9287                             flagsp == &negflags ? "don't " : "",
9288                             *RExC_parse
9289                         );
9290                     }
9291                 }
9292                 break;
9293 
9294             case CONTINUE_PAT_MOD: /* 'c' */
9295                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9296                     if (! (wastedflags & WASTED_C) ) {
9297                         wastedflags |= WASTED_GC;
9298 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9299                         vWARN3(
9300                             RExC_parse + 1,
9301                             "Useless (%sc) - %suse /gc modifier",
9302                             flagsp == &negflags ? "?-" : "?",
9303                             flagsp == &negflags ? "don't " : ""
9304                         );
9305                     }
9306                 }
9307                 break;
9308             case KEEPCOPY_PAT_MOD: /* 'p' */
9309                 if (flagsp == &negflags) {
9310                     if (SIZE_ONLY)
9311                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9312                 } else {
9313                     *flagsp |= RXf_PMf_KEEPCOPY;
9314                 }
9315                 break;
9316             case '-':
9317                 /* A flag is a default iff it is following a minus, so
9318                  * if there is a minus, it means will be trying to
9319                  * re-specify a default which is an error */
9320                 if (has_use_defaults || flagsp == &negflags) {
9321                     goto fail_modifiers;
9322                 }
9323                 flagsp = &negflags;
9324                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9325                 break;
9326             case ':':
9327             case ')':
9328                 RExC_flags |= posflags;
9329                 RExC_flags &= ~negflags;
9330                 set_regex_charset(&RExC_flags, cs);
9331                 if (RExC_flags & RXf_PMf_FOLD) {
9332                     RExC_contains_i = 1;
9333                 }
9334                 return;
9335                 /*NOTREACHED*/
9336             default:
9337             fail_modifiers:
9338                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9339 		/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9340                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9341                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9342                 /*NOTREACHED*/
9343         }
9344 
9345         ++RExC_parse;
9346     }
9347 }
9348 
9349 /*
9350  - reg - regular expression, i.e. main body or parenthesized thing
9351  *
9352  * Caller must absorb opening parenthesis.
9353  *
9354  * Combining parenthesis handling with the base level of regular expression
9355  * is a trifle forced, but the need to tie the tails of the branches to what
9356  * follows makes it hard to avoid.
9357  */
9358 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9359 #ifdef DEBUGGING
9360 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9361 #else
9362 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9363 #endif
9364 
9365 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9366    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9367    needs to be restarted.
9368    Otherwise would only return NULL if regbranch() returns NULL, which
9369    cannot happen.  */
9370 STATIC regnode *
9371 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9372     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9373      * 2 is like 1, but indicates that nextchar() has been called to advance
9374      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9375      * this flag alerts us to the need to check for that */
9376 {
9377     dVAR;
9378     regnode *ret;		/* Will be the head of the group. */
9379     regnode *br;
9380     regnode *lastbr;
9381     regnode *ender = NULL;
9382     I32 parno = 0;
9383     I32 flags;
9384     U32 oregflags = RExC_flags;
9385     bool have_branch = 0;
9386     bool is_open = 0;
9387     I32 freeze_paren = 0;
9388     I32 after_freeze = 0;
9389 
9390     char * parse_start = RExC_parse; /* MJD */
9391     char * const oregcomp_parse = RExC_parse;
9392 
9393     GET_RE_DEBUG_FLAGS_DECL;
9394 
9395     PERL_ARGS_ASSERT_REG;
9396     DEBUG_PARSE("reg ");
9397 
9398     *flagp = 0;				/* Tentatively. */
9399 
9400 
9401     /* Make an OPEN node, if parenthesized. */
9402     if (paren) {
9403 
9404         /* Under /x, space and comments can be gobbled up between the '(' and
9405          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9406          * intervening space, as the sequence is a token, and a token should be
9407          * indivisible */
9408         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9409 
9410         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9411 	    char *start_verb = RExC_parse;
9412 	    STRLEN verb_len = 0;
9413 	    char *start_arg = NULL;
9414 	    unsigned char op = 0;
9415 	    int argok = 1;
9416             int internal_argval = 0; /* internal_argval is only useful if
9417                                         !argok */
9418 
9419             if (has_intervening_patws && SIZE_ONLY) {
9420                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9421             }
9422 	    while ( *RExC_parse && *RExC_parse != ')' ) {
9423 	        if ( *RExC_parse == ':' ) {
9424 	            start_arg = RExC_parse + 1;
9425 	            break;
9426 	        }
9427 	        RExC_parse++;
9428 	    }
9429 	    ++start_verb;
9430 	    verb_len = RExC_parse - start_verb;
9431 	    if ( start_arg ) {
9432 	        RExC_parse++;
9433 	        while ( *RExC_parse && *RExC_parse != ')' )
9434 	            RExC_parse++;
9435 	        if ( *RExC_parse != ')' )
9436 	            vFAIL("Unterminated verb pattern argument");
9437 	        if ( RExC_parse == start_arg )
9438 	            start_arg = NULL;
9439 	    } else {
9440 	        if ( *RExC_parse != ')' )
9441 	            vFAIL("Unterminated verb pattern");
9442 	    }
9443 
9444 	    switch ( *start_verb ) {
9445             case 'A':  /* (*ACCEPT) */
9446                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9447 		    op = ACCEPT;
9448 		    internal_argval = RExC_nestroot;
9449 		}
9450 		break;
9451             case 'C':  /* (*COMMIT) */
9452                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9453                     op = COMMIT;
9454                 break;
9455             case 'F':  /* (*FAIL) */
9456                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9457 		    op = OPFAIL;
9458 		    argok = 0;
9459 		}
9460 		break;
9461             case ':':  /* (*:NAME) */
9462 	    case 'M':  /* (*MARK:NAME) */
9463 	        if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9464                     op = MARKPOINT;
9465                     argok = -1;
9466                 }
9467                 break;
9468             case 'P':  /* (*PRUNE) */
9469                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9470                     op = PRUNE;
9471                 break;
9472             case 'S':   /* (*SKIP) */
9473                 if ( memEQs(start_verb,verb_len,"SKIP") )
9474                     op = SKIP;
9475                 break;
9476             case 'T':  /* (*THEN) */
9477                 /* [19:06] <TimToady> :: is then */
9478                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9479                     op = CUTGROUP;
9480                     RExC_seen |= REG_CUTGROUP_SEEN;
9481                 }
9482                 break;
9483 	    }
9484 	    if ( ! op ) {
9485 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9486                 vFAIL2utf8f(
9487                     "Unknown verb pattern '%"UTF8f"'",
9488                     UTF8fARG(UTF, verb_len, start_verb));
9489 	    }
9490 	    if ( argok ) {
9491                 if ( start_arg && internal_argval ) {
9492 	            vFAIL3("Verb pattern '%.*s' may not have an argument",
9493 	                verb_len, start_verb);
9494 	        } else if ( argok < 0 && !start_arg ) {
9495                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9496 	                verb_len, start_verb);
9497 	        } else {
9498 	            ret = reganode(pRExC_state, op, internal_argval);
9499 	            if ( ! internal_argval && ! SIZE_ONLY ) {
9500                         if (start_arg) {
9501                             SV *sv = newSVpvn( start_arg,
9502                                                RExC_parse - start_arg);
9503                             ARG(ret) = add_data( pRExC_state,
9504                                                  STR_WITH_LEN("S"));
9505                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9506                             ret->flags = 0;
9507                         } else {
9508                             ret->flags = 1;
9509                         }
9510                     }
9511 	        }
9512 	        if (!internal_argval)
9513                     RExC_seen |= REG_VERBARG_SEEN;
9514 	    } else if ( start_arg ) {
9515 	        vFAIL3("Verb pattern '%.*s' may not have an argument",
9516 	                verb_len, start_verb);
9517 	    } else {
9518 	        ret = reg_node(pRExC_state, op);
9519 	    }
9520 	    nextchar(pRExC_state);
9521 	    return ret;
9522         }
9523         else if (*RExC_parse == '?') { /* (?...) */
9524 	    bool is_logical = 0;
9525 	    const char * const seqstart = RExC_parse;
9526             if (has_intervening_patws && SIZE_ONLY) {
9527                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9528             }
9529 
9530 	    RExC_parse++;
9531 	    paren = *RExC_parse++;
9532 	    ret = NULL;			/* For look-ahead/behind. */
9533 	    switch (paren) {
9534 
9535 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
9536 	        paren = *RExC_parse++;
9537 		if ( paren == '<')         /* (?P<...>) named capture */
9538 		    goto named_capture;
9539                 else if (paren == '>') {   /* (?P>name) named recursion */
9540                     goto named_recursion;
9541                 }
9542                 else if (paren == '=') {   /* (?P=...)  named backref */
9543                     /* this pretty much dupes the code for \k<NAME> in
9544                      * regatom(), if you change this make sure you change that
9545                      * */
9546                     char* name_start = RExC_parse;
9547 		    U32 num = 0;
9548                     SV *sv_dat = reg_scan_name(pRExC_state,
9549                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9550                     if (RExC_parse == name_start || *RExC_parse != ')')
9551                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9552                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9553 
9554                     if (!SIZE_ONLY) {
9555                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9556                         RExC_rxi->data->data[num]=(void*)sv_dat;
9557                         SvREFCNT_inc_simple_void(sv_dat);
9558                     }
9559                     RExC_sawback = 1;
9560 		    ret = reganode(pRExC_state,
9561 				   ((! FOLD)
9562 				     ? NREF
9563 				     : (ASCII_FOLD_RESTRICTED)
9564 				       ? NREFFA
9565                                        : (AT_LEAST_UNI_SEMANTICS)
9566                                          ? NREFFU
9567                                          : (LOC)
9568                                            ? NREFFL
9569                                            : NREFF),
9570 				    num);
9571                     *flagp |= HASWIDTH;
9572 
9573                     Set_Node_Offset(ret, parse_start+1);
9574                     Set_Node_Cur_Length(ret, parse_start);
9575 
9576                     nextchar(pRExC_state);
9577                     return ret;
9578                 }
9579                 RExC_parse++;
9580                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9581 		vFAIL3("Sequence (%.*s...) not recognized",
9582                                 RExC_parse-seqstart, seqstart);
9583 		/*NOTREACHED*/
9584             case '<':           /* (?<...) */
9585 		if (*RExC_parse == '!')
9586 		    paren = ',';
9587 		else if (*RExC_parse != '=')
9588               named_capture:
9589 		{               /* (?<...>) */
9590 		    char *name_start;
9591 		    SV *svname;
9592 		    paren= '>';
9593             case '\'':          /* (?'...') */
9594     		    name_start= RExC_parse;
9595     		    svname = reg_scan_name(pRExC_state,
9596                         SIZE_ONLY    /* reverse test from the others */
9597                         ? REG_RSN_RETURN_NAME
9598                         : REG_RSN_RETURN_NULL);
9599 		    if (RExC_parse == name_start || *RExC_parse != paren)
9600 		        vFAIL2("Sequence (?%c... not terminated",
9601 		            paren=='>' ? '<' : paren);
9602 		    if (SIZE_ONLY) {
9603 			HE *he_str;
9604 			SV *sv_dat = NULL;
9605                         if (!svname) /* shouldn't happen */
9606                             Perl_croak(aTHX_
9607                                 "panic: reg_scan_name returned NULL");
9608                         if (!RExC_paren_names) {
9609                             RExC_paren_names= newHV();
9610                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9611 #ifdef DEBUGGING
9612                             RExC_paren_name_list= newAV();
9613                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9614 #endif
9615                         }
9616                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9617                         if ( he_str )
9618                             sv_dat = HeVAL(he_str);
9619                         if ( ! sv_dat ) {
9620                             /* croak baby croak */
9621                             Perl_croak(aTHX_
9622                                 "panic: paren_name hash element allocation failed");
9623                         } else if ( SvPOK(sv_dat) ) {
9624                             /* (?|...) can mean we have dupes so scan to check
9625                                its already been stored. Maybe a flag indicating
9626                                we are inside such a construct would be useful,
9627                                but the arrays are likely to be quite small, so
9628                                for now we punt -- dmq */
9629                             IV count = SvIV(sv_dat);
9630                             I32 *pv = (I32*)SvPVX(sv_dat);
9631                             IV i;
9632                             for ( i = 0 ; i < count ; i++ ) {
9633                                 if ( pv[i] == RExC_npar ) {
9634                                     count = 0;
9635                                     break;
9636                                 }
9637                             }
9638                             if ( count ) {
9639                                 pv = (I32*)SvGROW(sv_dat,
9640                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9641                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9642                                 pv[count] = RExC_npar;
9643                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9644                             }
9645                         } else {
9646                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9647                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9648                                                                 sizeof(I32));
9649                             SvIOK_on(sv_dat);
9650                             SvIV_set(sv_dat, 1);
9651                         }
9652 #ifdef DEBUGGING
9653                         /* Yes this does cause a memory leak in debugging Perls
9654                          * */
9655                         if (!av_store(RExC_paren_name_list,
9656                                       RExC_npar, SvREFCNT_inc(svname)))
9657                             SvREFCNT_dec_NN(svname);
9658 #endif
9659 
9660                         /*sv_dump(sv_dat);*/
9661                     }
9662                     nextchar(pRExC_state);
9663 		    paren = 1;
9664 		    goto capturing_parens;
9665 		}
9666                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9667 		RExC_in_lookbehind++;
9668 		RExC_parse++;
9669 	    case '=':           /* (?=...) */
9670 		RExC_seen_zerolen++;
9671                 break;
9672 	    case '!':           /* (?!...) */
9673 		RExC_seen_zerolen++;
9674 	        if (*RExC_parse == ')') {
9675 	            ret=reg_node(pRExC_state, OPFAIL);
9676 	            nextchar(pRExC_state);
9677 	            return ret;
9678 	        }
9679 	        break;
9680 	    case '|':           /* (?|...) */
9681 	        /* branch reset, behave like a (?:...) except that
9682 	           buffers in alternations share the same numbers */
9683 	        paren = ':';
9684 	        after_freeze = freeze_paren = RExC_npar;
9685 	        break;
9686 	    case ':':           /* (?:...) */
9687 	    case '>':           /* (?>...) */
9688 		break;
9689 	    case '$':           /* (?$...) */
9690 	    case '@':           /* (?@...) */
9691 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9692 		break;
9693             case '#':           /* (?#...) */
9694                 /* XXX As soon as we disallow separating the '?' and '*' (by
9695                  * spaces or (?#...) comment), it is believed that this case
9696                  * will be unreachable and can be removed.  See
9697                  * [perl #117327] */
9698                 while (*RExC_parse && *RExC_parse != ')')
9699 		    RExC_parse++;
9700 		if (*RExC_parse != ')')
9701 		    FAIL("Sequence (?#... not terminated");
9702 		nextchar(pRExC_state);
9703 		*flagp = TRYAGAIN;
9704 		return NULL;
9705 	    case '0' :           /* (?0) */
9706 	    case 'R' :           /* (?R) */
9707 		if (*RExC_parse != ')')
9708 		    FAIL("Sequence (?R) not terminated");
9709 		ret = reg_node(pRExC_state, GOSTART);
9710                     RExC_seen |= REG_GOSTART_SEEN;
9711 		*flagp |= POSTPONED;
9712 		nextchar(pRExC_state);
9713 		return ret;
9714 		/*notreached*/
9715             { /* named and numeric backreferences */
9716                 I32 num;
9717             case '&':            /* (?&NAME) */
9718                 parse_start = RExC_parse - 1;
9719               named_recursion:
9720                 {
9721     		    SV *sv_dat = reg_scan_name(pRExC_state,
9722     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9723     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9724                 }
9725                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9726                     vFAIL("Sequence (?&... not terminated");
9727                 goto gen_recurse_regop;
9728                 assert(0); /* NOT REACHED */
9729             case '+':
9730                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9731                     RExC_parse++;
9732                     vFAIL("Illegal pattern");
9733                 }
9734                 goto parse_recursion;
9735                 /* NOT REACHED*/
9736             case '-': /* (?-1) */
9737                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9738                     RExC_parse--; /* rewind to let it be handled later */
9739                     goto parse_flags;
9740                 }
9741                 /*FALLTHROUGH */
9742             case '1': case '2': case '3': case '4': /* (?1) */
9743 	    case '5': case '6': case '7': case '8': case '9':
9744 	        RExC_parse--;
9745               parse_recursion:
9746 		num = atoi(RExC_parse);
9747   	        parse_start = RExC_parse - 1; /* MJD */
9748 	        if (*RExC_parse == '-')
9749 	            RExC_parse++;
9750 		while (isDIGIT(*RExC_parse))
9751 			RExC_parse++;
9752 	        if (*RExC_parse!=')')
9753 	            vFAIL("Expecting close bracket");
9754 
9755               gen_recurse_regop:
9756                 if ( paren == '-' ) {
9757                     /*
9758                     Diagram of capture buffer numbering.
9759                     Top line is the normal capture buffer numbers
9760                     Bottom line is the negative indexing as from
9761                     the X (the (?-2))
9762 
9763                     +   1 2    3 4 5 X          6 7
9764                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9765                     -   5 4    3 2 1 X          x x
9766 
9767                     */
9768                     num = RExC_npar + num;
9769                     if (num < 1)  {
9770                         RExC_parse++;
9771                         vFAIL("Reference to nonexistent group");
9772                     }
9773                 } else if ( paren == '+' ) {
9774                     num = RExC_npar + num - 1;
9775                 }
9776 
9777                 ret = reganode(pRExC_state, GOSUB, num);
9778                 if (!SIZE_ONLY) {
9779 		    if (num > (I32)RExC_rx->nparens) {
9780 			RExC_parse++;
9781 			vFAIL("Reference to nonexistent group");
9782 	            }
9783 	            ARG2L_SET( ret, RExC_recurse_count++);
9784                     RExC_emit++;
9785 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9786 			"Recurse #%"UVuf" to %"IVdf"\n",
9787                               (UV)ARG(ret), (IV)ARG2L(ret)));
9788 		} else {
9789 		    RExC_size++;
9790     		}
9791                     RExC_seen |= REG_RECURSE_SEEN;
9792                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9793 		Set_Node_Offset(ret, parse_start); /* MJD */
9794 
9795                 *flagp |= POSTPONED;
9796                 nextchar(pRExC_state);
9797                 return ret;
9798             } /* named and numeric backreferences */
9799             assert(0); /* NOT REACHED */
9800 
9801 	    case '?':           /* (??...) */
9802 		is_logical = 1;
9803 		if (*RExC_parse != '{') {
9804 		    RExC_parse++;
9805                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9806                     vFAIL2utf8f(
9807                         "Sequence (%"UTF8f"...) not recognized",
9808                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9809 		    /*NOTREACHED*/
9810 		}
9811 		*flagp |= POSTPONED;
9812 		paren = *RExC_parse++;
9813 		/* FALL THROUGH */
9814 	    case '{':           /* (?{...}) */
9815 	    {
9816 		U32 n = 0;
9817 		struct reg_code_block *cb;
9818 
9819 		RExC_seen_zerolen++;
9820 
9821 		if (   !pRExC_state->num_code_blocks
9822 		    || pRExC_state->code_index >= pRExC_state->num_code_blocks
9823 		    || pRExC_state->code_blocks[pRExC_state->code_index].start
9824 			!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9825 			    - RExC_start)
9826 		) {
9827 		    if (RExC_pm_flags & PMf_USE_RE_EVAL)
9828 			FAIL("panic: Sequence (?{...}): no code block found\n");
9829 		    FAIL("Eval-group not allowed at runtime, use re 'eval'");
9830 		}
9831 		/* this is a pre-compiled code block (?{...}) */
9832 		cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9833 		RExC_parse = RExC_start + cb->end;
9834 		if (!SIZE_ONLY) {
9835 		    OP *o = cb->block;
9836 		    if (cb->src_regex) {
9837 			n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9838 			RExC_rxi->data->data[n] =
9839 			    (void*)SvREFCNT_inc((SV*)cb->src_regex);
9840 			RExC_rxi->data->data[n+1] = (void*)o;
9841 		    }
9842 		    else {
9843 			n = add_data(pRExC_state,
9844 			       (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9845 			RExC_rxi->data->data[n] = (void*)o;
9846 		    }
9847 		}
9848 		pRExC_state->code_index++;
9849 		nextchar(pRExC_state);
9850 
9851 		if (is_logical) {
9852                     regnode *eval;
9853 		    ret = reg_node(pRExC_state, LOGICAL);
9854                     eval = reganode(pRExC_state, EVAL, n);
9855 		    if (!SIZE_ONLY) {
9856 			ret->flags = 2;
9857                         /* for later propagation into (??{}) return value */
9858                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9859                     }
9860                     REGTAIL(pRExC_state, ret, eval);
9861                     /* deal with the length of this later - MJD */
9862 		    return ret;
9863 		}
9864 		ret = reganode(pRExC_state, EVAL, n);
9865 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
9866 		Set_Node_Offset(ret, parse_start);
9867 		return ret;
9868 	    }
9869 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9870 	    {
9871 	        int is_define= 0;
9872 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
9873 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9874 			|| RExC_parse[1] == '<'
9875 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
9876 			I32 flag;
9877                         regnode *tail;
9878 
9879 			ret = reg_node(pRExC_state, LOGICAL);
9880 			if (!SIZE_ONLY)
9881 			    ret->flags = 1;
9882 
9883                         tail = reg(pRExC_state, 1, &flag, depth+1);
9884                         if (flag & RESTART_UTF8) {
9885                             *flagp = RESTART_UTF8;
9886                             return NULL;
9887                         }
9888                         REGTAIL(pRExC_state, ret, tail);
9889 			goto insert_if;
9890 		    }
9891 		}
9892 		else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9893 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9894 	        {
9895 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
9896 	            char *name_start= RExC_parse++;
9897 	            U32 num = 0;
9898 	            SV *sv_dat=reg_scan_name(pRExC_state,
9899 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9900 	            if (RExC_parse == name_start || *RExC_parse != ch)
9901                         vFAIL2("Sequence (?(%c... not terminated",
9902                             (ch == '>' ? '<' : ch));
9903                     RExC_parse++;
9904 	            if (!SIZE_ONLY) {
9905                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9906                         RExC_rxi->data->data[num]=(void*)sv_dat;
9907                         SvREFCNT_inc_simple_void(sv_dat);
9908                     }
9909                     ret = reganode(pRExC_state,NGROUPP,num);
9910                     goto insert_if_check_paren;
9911 		}
9912 		else if (RExC_parse[0] == 'D' &&
9913 		         RExC_parse[1] == 'E' &&
9914 		         RExC_parse[2] == 'F' &&
9915 		         RExC_parse[3] == 'I' &&
9916 		         RExC_parse[4] == 'N' &&
9917 		         RExC_parse[5] == 'E')
9918 		{
9919 		    ret = reganode(pRExC_state,DEFINEP,0);
9920 		    RExC_parse +=6 ;
9921 		    is_define = 1;
9922 		    goto insert_if_check_paren;
9923 		}
9924 		else if (RExC_parse[0] == 'R') {
9925 		    RExC_parse++;
9926 		    parno = 0;
9927 		    if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9928 		        parno = atoi(RExC_parse++);
9929 		        while (isDIGIT(*RExC_parse))
9930 			    RExC_parse++;
9931 		    } else if (RExC_parse[0] == '&') {
9932 		        SV *sv_dat;
9933 		        RExC_parse++;
9934 		        sv_dat = reg_scan_name(pRExC_state,
9935                             SIZE_ONLY
9936                             ? REG_RSN_RETURN_NULL
9937                             : REG_RSN_RETURN_DATA);
9938     		        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9939 		    }
9940 		    ret = reganode(pRExC_state,INSUBP,parno);
9941 		    goto insert_if_check_paren;
9942 		}
9943 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9944                     /* (?(1)...) */
9945 		    char c;
9946 		    char *tmp;
9947 		    parno = atoi(RExC_parse++);
9948 
9949 		    while (isDIGIT(*RExC_parse))
9950 			RExC_parse++;
9951                     ret = reganode(pRExC_state, GROUPP, parno);
9952 
9953                  insert_if_check_paren:
9954 		    if (*(tmp = nextchar(pRExC_state)) != ')') {
9955                         /* nextchar also skips comments, so undo its work
9956                          * and skip over the the next character.
9957                          */
9958                         RExC_parse = tmp;
9959                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9960 			vFAIL("Switch condition not recognized");
9961 		    }
9962 		  insert_if:
9963                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9964                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9965 		    if (br == NULL) {
9966                         if (flags & RESTART_UTF8) {
9967                             *flagp = RESTART_UTF8;
9968                             return NULL;
9969                         }
9970                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9971                               (UV) flags);
9972                     } else
9973                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9974                                                           LONGJMP, 0));
9975 		    c = *nextchar(pRExC_state);
9976 		    if (flags&HASWIDTH)
9977 			*flagp |= HASWIDTH;
9978 		    if (c == '|') {
9979 		        if (is_define)
9980 		            vFAIL("(?(DEFINE)....) does not allow branches");
9981 
9982                         /* Fake one for optimizer.  */
9983                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9984 
9985                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9986                             if (flags & RESTART_UTF8) {
9987                                 *flagp = RESTART_UTF8;
9988                                 return NULL;
9989                             }
9990                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9991                                   (UV) flags);
9992                         }
9993                         REGTAIL(pRExC_state, ret, lastbr);
9994 		 	if (flags&HASWIDTH)
9995 			    *flagp |= HASWIDTH;
9996 			c = *nextchar(pRExC_state);
9997 		    }
9998 		    else
9999 			lastbr = NULL;
10000 		    if (c != ')')
10001 			vFAIL("Switch (?(condition)... contains too many branches");
10002 		    ender = reg_node(pRExC_state, TAIL);
10003                     REGTAIL(pRExC_state, br, ender);
10004 		    if (lastbr) {
10005                         REGTAIL(pRExC_state, lastbr, ender);
10006                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10007 		    }
10008 		    else
10009                         REGTAIL(pRExC_state, ret, ender);
10010                     RExC_size++; /* XXX WHY do we need this?!!
10011                                     For large programs it seems to be required
10012                                     but I can't figure out why. -- dmq*/
10013 		    return ret;
10014 		}
10015 		else {
10016                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10017                     vFAIL("Unknown switch condition (?(...))");
10018 		}
10019 	    }
10020 	    case '[':           /* (?[ ... ]) */
10021                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10022                                          oregcomp_parse);
10023             case 0:
10024 		RExC_parse--; /* for vFAIL to print correctly */
10025                 vFAIL("Sequence (? incomplete");
10026                 break;
10027 	    default: /* e.g., (?i) */
10028 	        --RExC_parse;
10029               parse_flags:
10030 		parse_lparen_question_flags(pRExC_state);
10031                 if (UCHARAT(RExC_parse) != ':') {
10032                     nextchar(pRExC_state);
10033                     *flagp = TRYAGAIN;
10034                     return NULL;
10035                 }
10036                 paren = ':';
10037                 nextchar(pRExC_state);
10038                 ret = NULL;
10039                 goto parse_rest;
10040             } /* end switch */
10041 	}
10042 	else {                  /* (...) */
10043 	  capturing_parens:
10044 	    parno = RExC_npar;
10045 	    RExC_npar++;
10046 
10047 	    ret = reganode(pRExC_state, OPEN, parno);
10048 	    if (!SIZE_ONLY ){
10049 	        if (!RExC_nestroot)
10050 	            RExC_nestroot = parno;
10051                 if (RExC_seen & REG_RECURSE_SEEN
10052 	            && !RExC_open_parens[parno-1])
10053 	        {
10054 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10055 			"Setting open paren #%"IVdf" to %d\n",
10056 			(IV)parno, REG_NODE_NUM(ret)));
10057 	            RExC_open_parens[parno-1]= ret;
10058 	        }
10059 	    }
10060             Set_Node_Length(ret, 1); /* MJD */
10061             Set_Node_Offset(ret, RExC_parse); /* MJD */
10062 	    is_open = 1;
10063 	}
10064     }
10065     else                        /* ! paren */
10066 	ret = NULL;
10067 
10068    parse_rest:
10069     /* Pick up the branches, linking them together. */
10070     parse_start = RExC_parse;   /* MJD */
10071     br = regbranch(pRExC_state, &flags, 1,depth+1);
10072 
10073     /*     branch_len = (paren != 0); */
10074 
10075     if (br == NULL) {
10076         if (flags & RESTART_UTF8) {
10077             *flagp = RESTART_UTF8;
10078             return NULL;
10079         }
10080         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10081     }
10082     if (*RExC_parse == '|') {
10083 	if (!SIZE_ONLY && RExC_extralen) {
10084 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10085 	}
10086 	else {                  /* MJD */
10087 	    reginsert(pRExC_state, BRANCH, br, depth+1);
10088             Set_Node_Length(br, paren != 0);
10089             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10090         }
10091 	have_branch = 1;
10092 	if (SIZE_ONLY)
10093 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
10094     }
10095     else if (paren == ':') {
10096 	*flagp |= flags&SIMPLE;
10097     }
10098     if (is_open) {				/* Starts with OPEN. */
10099         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10100     }
10101     else if (paren != '?')		/* Not Conditional */
10102 	ret = br;
10103     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10104     lastbr = br;
10105     while (*RExC_parse == '|') {
10106 	if (!SIZE_ONLY && RExC_extralen) {
10107 	    ender = reganode(pRExC_state, LONGJMP,0);
10108 
10109             /* Append to the previous. */
10110             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10111 	}
10112 	if (SIZE_ONLY)
10113 	    RExC_extralen += 2;		/* Account for LONGJMP. */
10114 	nextchar(pRExC_state);
10115 	if (freeze_paren) {
10116 	    if (RExC_npar > after_freeze)
10117 	        after_freeze = RExC_npar;
10118             RExC_npar = freeze_paren;
10119         }
10120         br = regbranch(pRExC_state, &flags, 0, depth+1);
10121 
10122 	if (br == NULL) {
10123             if (flags & RESTART_UTF8) {
10124                 *flagp = RESTART_UTF8;
10125                 return NULL;
10126             }
10127             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10128         }
10129         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10130 	lastbr = br;
10131 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10132     }
10133 
10134     if (have_branch || paren != ':') {
10135 	/* Make a closing node, and hook it on the end. */
10136 	switch (paren) {
10137 	case ':':
10138 	    ender = reg_node(pRExC_state, TAIL);
10139 	    break;
10140 	case 1: case 2:
10141 	    ender = reganode(pRExC_state, CLOSE, parno);
10142             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10143 		DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10144 			"Setting close paren #%"IVdf" to %d\n",
10145 			(IV)parno, REG_NODE_NUM(ender)));
10146 	        RExC_close_parens[parno-1]= ender;
10147 	        if (RExC_nestroot == parno)
10148 	            RExC_nestroot = 0;
10149 	    }
10150             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10151             Set_Node_Length(ender,1); /* MJD */
10152 	    break;
10153 	case '<':
10154 	case ',':
10155 	case '=':
10156 	case '!':
10157 	    *flagp &= ~HASWIDTH;
10158 	    /* FALL THROUGH */
10159 	case '>':
10160 	    ender = reg_node(pRExC_state, SUCCEED);
10161 	    break;
10162 	case 0:
10163 	    ender = reg_node(pRExC_state, END);
10164 	    if (!SIZE_ONLY) {
10165                 assert(!RExC_opend); /* there can only be one! */
10166                 RExC_opend = ender;
10167             }
10168 	    break;
10169 	}
10170         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10171             SV * const mysv_val1=sv_newmortal();
10172             SV * const mysv_val2=sv_newmortal();
10173             DEBUG_PARSE_MSG("lsbr");
10174             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10175             regprop(RExC_rx, mysv_val2, ender, NULL);
10176             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10177                           SvPV_nolen_const(mysv_val1),
10178                           (IV)REG_NODE_NUM(lastbr),
10179                           SvPV_nolen_const(mysv_val2),
10180                           (IV)REG_NODE_NUM(ender),
10181                           (IV)(ender - lastbr)
10182             );
10183         });
10184         REGTAIL(pRExC_state, lastbr, ender);
10185 
10186 	if (have_branch && !SIZE_ONLY) {
10187             char is_nothing= 1;
10188 	    if (depth==1)
10189                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10190 
10191 	    /* Hook the tails of the branches to the closing node. */
10192 	    for (br = ret; br; br = regnext(br)) {
10193 		const U8 op = PL_regkind[OP(br)];
10194 		if (op == BRANCH) {
10195                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10196                     if ( OP(NEXTOPER(br)) != NOTHING
10197                          || regnext(NEXTOPER(br)) != ender)
10198                         is_nothing= 0;
10199 		}
10200 		else if (op == BRANCHJ) {
10201                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10202                     /* for now we always disable this optimisation * /
10203                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10204                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10205                     */
10206                         is_nothing= 0;
10207 		}
10208 	    }
10209             if (is_nothing) {
10210                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10211                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10212                     SV * const mysv_val1=sv_newmortal();
10213                     SV * const mysv_val2=sv_newmortal();
10214                     DEBUG_PARSE_MSG("NADA");
10215                     regprop(RExC_rx, mysv_val1, ret, NULL);
10216                     regprop(RExC_rx, mysv_val2, ender, NULL);
10217                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10218                                   SvPV_nolen_const(mysv_val1),
10219                                   (IV)REG_NODE_NUM(ret),
10220                                   SvPV_nolen_const(mysv_val2),
10221                                   (IV)REG_NODE_NUM(ender),
10222                                   (IV)(ender - ret)
10223                     );
10224                 });
10225                 OP(br)= NOTHING;
10226                 if (OP(ender) == TAIL) {
10227                     NEXT_OFF(br)= 0;
10228                     RExC_emit= br + 1;
10229                 } else {
10230                     regnode *opt;
10231                     for ( opt= br + 1; opt < ender ; opt++ )
10232                         OP(opt)= OPTIMIZED;
10233                     NEXT_OFF(br)= ender - br;
10234                 }
10235             }
10236 	}
10237     }
10238 
10239     {
10240         const char *p;
10241         static const char parens[] = "=!<,>";
10242 
10243 	if (paren && (p = strchr(parens, paren))) {
10244 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10245 	    int flag = (p - parens) > 1;
10246 
10247 	    if (paren == '>')
10248 		node = SUSPEND, flag = 0;
10249 	    reginsert(pRExC_state, node,ret, depth+1);
10250             Set_Node_Cur_Length(ret, parse_start);
10251 	    Set_Node_Offset(ret, parse_start + 1);
10252 	    ret->flags = flag;
10253             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10254 	}
10255     }
10256 
10257     /* Check for proper termination. */
10258     if (paren) {
10259         /* restore original flags, but keep (?p) */
10260 	RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10261 	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10262 	    RExC_parse = oregcomp_parse;
10263 	    vFAIL("Unmatched (");
10264 	}
10265     }
10266     else if (!paren && RExC_parse < RExC_end) {
10267 	if (*RExC_parse == ')') {
10268 	    RExC_parse++;
10269 	    vFAIL("Unmatched )");
10270 	}
10271 	else
10272 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
10273 	assert(0); /* NOTREACHED */
10274     }
10275 
10276     if (RExC_in_lookbehind) {
10277 	RExC_in_lookbehind--;
10278     }
10279     if (after_freeze > RExC_npar)
10280         RExC_npar = after_freeze;
10281     return(ret);
10282 }
10283 
10284 /*
10285  - regbranch - one alternative of an | operator
10286  *
10287  * Implements the concatenation operator.
10288  *
10289  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10290  * restarted.
10291  */
10292 STATIC regnode *
10293 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10294 {
10295     dVAR;
10296     regnode *ret;
10297     regnode *chain = NULL;
10298     regnode *latest;
10299     I32 flags = 0, c = 0;
10300     GET_RE_DEBUG_FLAGS_DECL;
10301 
10302     PERL_ARGS_ASSERT_REGBRANCH;
10303 
10304     DEBUG_PARSE("brnc");
10305 
10306     if (first)
10307 	ret = NULL;
10308     else {
10309 	if (!SIZE_ONLY && RExC_extralen)
10310 	    ret = reganode(pRExC_state, BRANCHJ,0);
10311 	else {
10312 	    ret = reg_node(pRExC_state, BRANCH);
10313             Set_Node_Length(ret, 1);
10314         }
10315     }
10316 
10317     if (!first && SIZE_ONLY)
10318 	RExC_extralen += 1;			/* BRANCHJ */
10319 
10320     *flagp = WORST;			/* Tentatively. */
10321 
10322     RExC_parse--;
10323     nextchar(pRExC_state);
10324     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10325 	flags &= ~TRYAGAIN;
10326         latest = regpiece(pRExC_state, &flags,depth+1);
10327 	if (latest == NULL) {
10328 	    if (flags & TRYAGAIN)
10329 		continue;
10330             if (flags & RESTART_UTF8) {
10331                 *flagp = RESTART_UTF8;
10332                 return NULL;
10333             }
10334             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10335 	}
10336 	else if (ret == NULL)
10337 	    ret = latest;
10338 	*flagp |= flags&(HASWIDTH|POSTPONED);
10339 	if (chain == NULL) 	/* First piece. */
10340 	    *flagp |= flags&SPSTART;
10341 	else {
10342 	    RExC_naughty++;
10343             REGTAIL(pRExC_state, chain, latest);
10344 	}
10345 	chain = latest;
10346 	c++;
10347     }
10348     if (chain == NULL) {	/* Loop ran zero times. */
10349 	chain = reg_node(pRExC_state, NOTHING);
10350 	if (ret == NULL)
10351 	    ret = chain;
10352     }
10353     if (c == 1) {
10354 	*flagp |= flags&SIMPLE;
10355     }
10356 
10357     return ret;
10358 }
10359 
10360 /*
10361  - regpiece - something followed by possible [*+?]
10362  *
10363  * Note that the branching code sequences used for ? and the general cases
10364  * of * and + are somewhat optimized:  they use the same NOTHING node as
10365  * both the endmarker for their branch list and the body of the last branch.
10366  * It might seem that this node could be dispensed with entirely, but the
10367  * endmarker role is not redundant.
10368  *
10369  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10370  * TRYAGAIN.
10371  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10372  * restarted.
10373  */
10374 STATIC regnode *
10375 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10376 {
10377     dVAR;
10378     regnode *ret;
10379     char op;
10380     char *next;
10381     I32 flags;
10382     const char * const origparse = RExC_parse;
10383     I32 min;
10384     I32 max = REG_INFTY;
10385 #ifdef RE_TRACK_PATTERN_OFFSETS
10386     char *parse_start;
10387 #endif
10388     const char *maxpos = NULL;
10389 
10390     /* Save the original in case we change the emitted regop to a FAIL. */
10391     regnode * const orig_emit = RExC_emit;
10392 
10393     GET_RE_DEBUG_FLAGS_DECL;
10394 
10395     PERL_ARGS_ASSERT_REGPIECE;
10396 
10397     DEBUG_PARSE("piec");
10398 
10399     ret = regatom(pRExC_state, &flags,depth+1);
10400     if (ret == NULL) {
10401 	if (flags & (TRYAGAIN|RESTART_UTF8))
10402 	    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10403         else
10404             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10405 	return(NULL);
10406     }
10407 
10408     op = *RExC_parse;
10409 
10410     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10411 	maxpos = NULL;
10412 #ifdef RE_TRACK_PATTERN_OFFSETS
10413         parse_start = RExC_parse; /* MJD */
10414 #endif
10415 	next = RExC_parse + 1;
10416 	while (isDIGIT(*next) || *next == ',') {
10417 	    if (*next == ',') {
10418 		if (maxpos)
10419 		    break;
10420 		else
10421 		    maxpos = next;
10422 	    }
10423 	    next++;
10424 	}
10425 	if (*next == '}') {		/* got one */
10426 	    if (!maxpos)
10427 		maxpos = next;
10428 	    RExC_parse++;
10429 	    min = atoi(RExC_parse);
10430 	    if (*maxpos == ',')
10431 		maxpos++;
10432 	    else
10433 		maxpos = RExC_parse;
10434 	    max = atoi(maxpos);
10435 	    if (!max && *maxpos != '0')
10436 		max = REG_INFTY;		/* meaning "infinity" */
10437 	    else if (max >= REG_INFTY)
10438 		vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10439 	    RExC_parse = next;
10440 	    nextchar(pRExC_state);
10441             if (max < min) {    /* If can't match, warn and optimize to fail
10442                                    unconditionally */
10443                 if (SIZE_ONLY) {
10444                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10445 
10446                     /* We can't back off the size because we have to reserve
10447                      * enough space for all the things we are about to throw
10448                      * away, but we can shrink it by the ammount we are about
10449                      * to re-use here */
10450                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10451                 }
10452                 else {
10453                     RExC_emit = orig_emit;
10454                 }
10455                 ret = reg_node(pRExC_state, OPFAIL);
10456                 return ret;
10457             }
10458             else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
10459             {
10460                 if (SIZE_ONLY) {
10461                     ckWARN2reg(RExC_parse + 1,
10462                                "Useless use of greediness modifier '%c'",
10463                                *RExC_parse);
10464                 }
10465                 /* Absorb the modifier, so later code doesn't see nor use
10466                     * it */
10467                 nextchar(pRExC_state);
10468             }
10469 
10470 	do_curly:
10471 	    if ((flags&SIMPLE)) {
10472 		RExC_naughty += 2 + RExC_naughty / 2;
10473 		reginsert(pRExC_state, CURLY, ret, depth+1);
10474                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10475                 Set_Node_Cur_Length(ret, parse_start);
10476 	    }
10477 	    else {
10478 		regnode * const w = reg_node(pRExC_state, WHILEM);
10479 
10480 		w->flags = 0;
10481                 REGTAIL(pRExC_state, ret, w);
10482 		if (!SIZE_ONLY && RExC_extralen) {
10483 		    reginsert(pRExC_state, LONGJMP,ret, depth+1);
10484 		    reginsert(pRExC_state, NOTHING,ret, depth+1);
10485 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
10486 		}
10487 		reginsert(pRExC_state, CURLYX,ret, depth+1);
10488                                 /* MJD hk */
10489                 Set_Node_Offset(ret, parse_start+1);
10490                 Set_Node_Length(ret,
10491                                 op == '{' ? (RExC_parse - parse_start) : 1);
10492 
10493 		if (!SIZE_ONLY && RExC_extralen)
10494 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
10495                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10496 		if (SIZE_ONLY)
10497 		    RExC_whilem_seen++, RExC_extralen += 3;
10498 		RExC_naughty += 4 + RExC_naughty;	/* compound interest */
10499 	    }
10500 	    ret->flags = 0;
10501 
10502 	    if (min > 0)
10503 		*flagp = WORST;
10504 	    if (max > 0)
10505 		*flagp |= HASWIDTH;
10506 	    if (!SIZE_ONLY) {
10507 		ARG1_SET(ret, (U16)min);
10508 		ARG2_SET(ret, (U16)max);
10509 	    }
10510             if (max == REG_INFTY)
10511                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10512 
10513 	    goto nest_check;
10514 	}
10515     }
10516 
10517     if (!ISMULT1(op)) {
10518 	*flagp = flags;
10519 	return(ret);
10520     }
10521 
10522 #if 0				/* Now runtime fix should be reliable. */
10523 
10524     /* if this is reinstated, don't forget to put this back into perldiag:
10525 
10526 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10527 
10528 	   (F) The part of the regexp subject to either the * or + quantifier
10529            could match an empty string. The {#} shows in the regular
10530            expression about where the problem was discovered.
10531 
10532     */
10533 
10534     if (!(flags&HASWIDTH) && op != '?')
10535       vFAIL("Regexp *+ operand could be empty");
10536 #endif
10537 
10538 #ifdef RE_TRACK_PATTERN_OFFSETS
10539     parse_start = RExC_parse;
10540 #endif
10541     nextchar(pRExC_state);
10542 
10543     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10544 
10545     if (op == '*' && (flags&SIMPLE)) {
10546 	reginsert(pRExC_state, STAR, ret, depth+1);
10547 	ret->flags = 0;
10548 	RExC_naughty += 4;
10549         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10550     }
10551     else if (op == '*') {
10552 	min = 0;
10553 	goto do_curly;
10554     }
10555     else if (op == '+' && (flags&SIMPLE)) {
10556 	reginsert(pRExC_state, PLUS, ret, depth+1);
10557 	ret->flags = 0;
10558 	RExC_naughty += 3;
10559         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10560     }
10561     else if (op == '+') {
10562 	min = 1;
10563 	goto do_curly;
10564     }
10565     else if (op == '?') {
10566 	min = 0; max = 1;
10567 	goto do_curly;
10568     }
10569   nest_check:
10570     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10571 	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10572 	ckWARN2reg(RExC_parse,
10573 		   "%"UTF8f" matches null string many times",
10574 		   UTF8fARG(UTF, (RExC_parse >= origparse
10575                                  ? RExC_parse - origparse
10576                                  : 0),
10577 		   origparse));
10578 	(void)ReREFCNT_inc(RExC_rx_sv);
10579     }
10580 
10581     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10582 	nextchar(pRExC_state);
10583 	reginsert(pRExC_state, MINMOD, ret, depth+1);
10584         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10585     }
10586     else
10587     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10588         regnode *ender;
10589         nextchar(pRExC_state);
10590         ender = reg_node(pRExC_state, SUCCEED);
10591         REGTAIL(pRExC_state, ret, ender);
10592         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10593         ret->flags = 0;
10594         ender = reg_node(pRExC_state, TAIL);
10595         REGTAIL(pRExC_state, ret, ender);
10596     }
10597 
10598     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10599 	RExC_parse++;
10600 	vFAIL("Nested quantifiers");
10601     }
10602 
10603     return(ret);
10604 }
10605 
10606 STATIC bool
10607 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10608                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10609                       const bool strict   /* Apply stricter parsing rules? */
10610     )
10611 {
10612 
10613  /* This is expected to be called by a parser routine that has recognized '\N'
10614    and needs to handle the rest. RExC_parse is expected to point at the first
10615    char following the N at the time of the call.  On successful return,
10616    RExC_parse has been updated to point to just after the sequence identified
10617    by this routine, and <*flagp> has been updated.
10618 
10619    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10620    character class.
10621 
10622    \N may begin either a named sequence, or if outside a character class, mean
10623    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10624    attempted to decide which, and in the case of a named sequence, converted it
10625    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10626    where c1... are the characters in the sequence.  For single-quoted regexes,
10627    the tokenizer passes the \N sequence through unchanged; this code will not
10628    attempt to determine this nor expand those, instead raising a syntax error.
10629    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10630    or there is no '}', it signals that this \N occurrence means to match a
10631    non-newline.
10632 
10633    Only the \N{U+...} form should occur in a character class, for the same
10634    reason that '.' inside a character class means to just match a period: it
10635    just doesn't make sense.
10636 
10637    The function raises an error (via vFAIL), and doesn't return for various
10638    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10639    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10640    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10641    only possible if node_p is non-NULL.
10642 
10643 
10644    If <valuep> is non-null, it means the caller can accept an input sequence
10645    consisting of a just a single code point; <*valuep> is set to that value
10646    if the input is such.
10647 
10648    If <node_p> is non-null it signifies that the caller can accept any other
10649    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10650    is set as follows:
10651     1) \N means not-a-NL: points to a newly created REG_ANY node;
10652     2) \N{}:              points to a new NOTHING node;
10653     3) otherwise:         points to a new EXACT node containing the resolved
10654                           string.
10655    Note that FALSE is returned for single code point sequences if <valuep> is
10656    null.
10657  */
10658 
10659     char * endbrace;    /* '}' following the name */
10660     char* p;
10661     char *endchar;	/* Points to '.' or '}' ending cur char in the input
10662                            stream */
10663     bool has_multiple_chars; /* true if the input stream contains a sequence of
10664                                 more than one character */
10665 
10666     GET_RE_DEBUG_FLAGS_DECL;
10667 
10668     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10669 
10670     GET_RE_DEBUG_FLAGS;
10671 
10672     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10673 
10674     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10675      * modifier.  The other meaning does not, so use a temporary until we find
10676      * out which we are being called with */
10677     p = (RExC_flags & RXf_PMf_EXTENDED)
10678 	? regwhite( pRExC_state, RExC_parse )
10679 	: RExC_parse;
10680 
10681     /* Disambiguate between \N meaning a named character versus \N meaning
10682      * [^\n].  The former is assumed when it can't be the latter. */
10683     if (*p != '{' || regcurly(p, FALSE)) {
10684 	RExC_parse = p;
10685 	if (! node_p) {
10686 	    /* no bare \N allowed in a charclass */
10687             if (in_char_class) {
10688                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10689             }
10690             return FALSE;
10691         }
10692         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10693                            current char */
10694 	nextchar(pRExC_state);
10695 	*node_p = reg_node(pRExC_state, REG_ANY);
10696 	*flagp |= HASWIDTH|SIMPLE;
10697 	RExC_naughty++;
10698         Set_Node_Length(*node_p, 1); /* MJD */
10699 	return TRUE;
10700     }
10701 
10702     /* Here, we have decided it should be a named character or sequence */
10703 
10704     /* The test above made sure that the next real character is a '{', but
10705      * under the /x modifier, it could be separated by space (or a comment and
10706      * \n) and this is not allowed (for consistency with \x{...} and the
10707      * tokenizer handling of \N{NAME}). */
10708     if (*RExC_parse != '{') {
10709 	vFAIL("Missing braces on \\N{}");
10710     }
10711 
10712     RExC_parse++;	/* Skip past the '{' */
10713 
10714     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10715 	|| ! (endbrace == RExC_parse		/* nothing between the {} */
10716               || (endbrace - RExC_parse >= 2	/* U+ (bad hex is checked below
10717                                                  */
10718                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10719                                                      */
10720     {
10721 	if (endbrace) RExC_parse = endbrace;	/* position msg's '<--HERE' */
10722 	vFAIL("\\N{NAME} must be resolved by the lexer");
10723     }
10724 
10725     if (endbrace == RExC_parse) {   /* empty: \N{} */
10726         bool ret = TRUE;
10727 	if (node_p) {
10728 	    *node_p = reg_node(pRExC_state,NOTHING);
10729 	}
10730         else if (in_char_class) {
10731             if (SIZE_ONLY && in_char_class) {
10732                 if (strict) {
10733                     RExC_parse++;   /* Position after the "}" */
10734                     vFAIL("Zero length \\N{}");
10735                 }
10736                 else {
10737                     ckWARNreg(RExC_parse,
10738                               "Ignoring zero length \\N{} in character class");
10739                 }
10740             }
10741             ret = FALSE;
10742 	}
10743         else {
10744             return FALSE;
10745         }
10746         nextchar(pRExC_state);
10747         return ret;
10748     }
10749 
10750     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10751     RExC_parse += 2;	/* Skip past the 'U+' */
10752 
10753     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10754 
10755     /* Code points are separated by dots.  If none, there is only one code
10756      * point, and is terminated by the brace */
10757     has_multiple_chars = (endchar < endbrace);
10758 
10759     if (valuep && (! has_multiple_chars || in_char_class)) {
10760 	/* We only pay attention to the first char of
10761         multichar strings being returned in char classes. I kinda wonder
10762 	if this makes sense as it does change the behaviour
10763 	from earlier versions, OTOH that behaviour was broken
10764 	as well. XXX Solution is to recharacterize as
10765 	[rest-of-class]|multi1|multi2... */
10766 
10767 	STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10768 	I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10769 	    | PERL_SCAN_DISALLOW_PREFIX
10770 	    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10771 
10772 	*valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10773 
10774 	/* The tokenizer should have guaranteed validity, but it's possible to
10775 	 * bypass it by using single quoting, so check */
10776 	if (length_of_hex == 0
10777 	    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10778 	{
10779 	    RExC_parse += length_of_hex;	/* Includes all the valid */
10780 	    RExC_parse += (RExC_orig_utf8)	/* point to after 1st invalid */
10781 			    ? UTF8SKIP(RExC_parse)
10782 			    : 1;
10783 	    /* Guard against malformed utf8 */
10784 	    if (RExC_parse >= endchar) {
10785                 RExC_parse = endchar;
10786             }
10787 	    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10788 	}
10789 
10790         if (in_char_class && has_multiple_chars) {
10791             if (strict) {
10792                 RExC_parse = endbrace;
10793                 vFAIL("\\N{} in character class restricted to one character");
10794             }
10795             else {
10796                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10797             }
10798         }
10799 
10800         RExC_parse = endbrace + 1;
10801     }
10802     else if (! node_p || ! has_multiple_chars) {
10803 
10804         /* Here, the input is legal, but not according to the caller's
10805          * options.  We fail without advancing the parse, so that the
10806          * caller can try again */
10807         RExC_parse = p;
10808         return FALSE;
10809     }
10810     else {
10811 
10812 	/* What is done here is to convert this to a sub-pattern of the form
10813 	 * (?:\x{char1}\x{char2}...)
10814 	 * and then call reg recursively.  That way, it retains its atomicness,
10815 	 * while not having to worry about special handling that some code
10816 	 * points may have.  toke.c has converted the original Unicode values
10817 	 * to native, so that we can just pass on the hex values unchanged.  We
10818 	 * do have to set a flag to keep recoding from happening in the
10819 	 * recursion */
10820 
10821 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10822 	STRLEN len;
10823 	char *orig_end = RExC_end;
10824         I32 flags;
10825 
10826 	while (RExC_parse < endbrace) {
10827 
10828 	    /* Convert to notation the rest of the code understands */
10829 	    sv_catpv(substitute_parse, "\\x{");
10830 	    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10831 	    sv_catpv(substitute_parse, "}");
10832 
10833 	    /* Point to the beginning of the next character in the sequence. */
10834 	    RExC_parse = endchar + 1;
10835 	    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10836 	}
10837 	sv_catpv(substitute_parse, ")");
10838 
10839 	RExC_parse = SvPV(substitute_parse, len);
10840 
10841 	/* Don't allow empty number */
10842 	if (len < 8) {
10843 	    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10844 	}
10845 	RExC_end = RExC_parse + len;
10846 
10847 	/* The values are Unicode, and therefore not subject to recoding */
10848 	RExC_override_recoding = 1;
10849 
10850 	if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10851             if (flags & RESTART_UTF8) {
10852                 *flagp = RESTART_UTF8;
10853                 return FALSE;
10854             }
10855             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10856                   (UV) flags);
10857         }
10858 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10859 
10860 	RExC_parse = endbrace;
10861 	RExC_end = orig_end;
10862 	RExC_override_recoding = 0;
10863 
10864         nextchar(pRExC_state);
10865     }
10866 
10867     return TRUE;
10868 }
10869 
10870 
10871 /*
10872  * reg_recode
10873  *
10874  * It returns the code point in utf8 for the value in *encp.
10875  *    value: a code value in the source encoding
10876  *    encp:  a pointer to an Encode object
10877  *
10878  * If the result from Encode is not a single character,
10879  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10880  */
10881 STATIC UV
10882 S_reg_recode(pTHX_ const char value, SV **encp)
10883 {
10884     STRLEN numlen = 1;
10885     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10886     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10887     const STRLEN newlen = SvCUR(sv);
10888     UV uv = UNICODE_REPLACEMENT;
10889 
10890     PERL_ARGS_ASSERT_REG_RECODE;
10891 
10892     if (newlen)
10893 	uv = SvUTF8(sv)
10894 	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10895 	     : *(U8*)s;
10896 
10897     if (!newlen || numlen != newlen) {
10898 	uv = UNICODE_REPLACEMENT;
10899 	*encp = NULL;
10900     }
10901     return uv;
10902 }
10903 
10904 PERL_STATIC_INLINE U8
10905 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10906 {
10907     U8 op;
10908 
10909     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10910 
10911     if (! FOLD) {
10912         return EXACT;
10913     }
10914 
10915     op = get_regex_charset(RExC_flags);
10916     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10917         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10918                  been, so there is no hole */
10919     }
10920 
10921     return op + EXACTF;
10922 }
10923 
10924 PERL_STATIC_INLINE void
10925 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10926                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10927                          bool downgradable)
10928 {
10929     /* This knows the details about sizing an EXACTish node, setting flags for
10930      * it (by setting <*flagp>, and potentially populating it with a single
10931      * character.
10932      *
10933      * If <len> (the length in bytes) is non-zero, this function assumes that
10934      * the node has already been populated, and just does the sizing.  In this
10935      * case <code_point> should be the final code point that has already been
10936      * placed into the node.  This value will be ignored except that under some
10937      * circumstances <*flagp> is set based on it.
10938      *
10939      * If <len> is zero, the function assumes that the node is to contain only
10940      * the single character given by <code_point> and calculates what <len>
10941      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10942      * additionally will populate the node's STRING with <code_point> or its
10943      * fold if folding.
10944      *
10945      * In both cases <*flagp> is appropriately set
10946      *
10947      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10948      * 255, must be folded (the former only when the rules indicate it can
10949      * match 'ss')
10950      *
10951      * When it does the populating, it looks at the flag 'downgradable'.  If
10952      * true with a node that folds, it checks if the single code point
10953      * participates in a fold, and if not downgrades the node to an EXACT.
10954      * This helps the optimizer */
10955 
10956     bool len_passed_in = cBOOL(len != 0);
10957     U8 character[UTF8_MAXBYTES_CASE+1];
10958 
10959     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10960 
10961     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10962      * sizing difference, and is extra work that is thrown away */
10963     if (downgradable && ! PASS2) {
10964         downgradable = FALSE;
10965     }
10966 
10967     if (! len_passed_in) {
10968         if (UTF) {
10969             if (UNI_IS_INVARIANT(code_point)) {
10970                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10971                     *character = (U8) code_point;
10972                 }
10973                 else { /* Here is /i and not /l (toFOLD() is defined on just
10974                           ASCII, which isn't the same thing as INVARIANT on
10975                           EBCDIC, but it works there, as the extra invariants
10976                           fold to themselves) */
10977                     *character = toFOLD((U8) code_point);
10978 
10979                     /* We can downgrade to an EXACT node if this character
10980                      * isn't a folding one.  Note that this assumes that
10981                      * nothing above Latin1 folds to some other invariant than
10982                      * one of these alphabetics; otherwise we would also have
10983                      * to check:
10984                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10985                      *      || ASCII_FOLD_RESTRICTED))
10986                      */
10987                     if (downgradable && PL_fold[code_point] == code_point) {
10988                         OP(node) = EXACT;
10989                     }
10990                 }
10991                 len = 1;
10992             }
10993             else if (FOLD && (! LOC
10994                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10995             {   /* Folding, and ok to do so now */
10996                 UV folded = _to_uni_fold_flags(
10997                                    code_point,
10998                                    character,
10999                                    &len,
11000                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11001                                                       ? FOLD_FLAGS_NOMIX_ASCII
11002                                                       : 0));
11003                 if (downgradable
11004                     && folded == code_point
11005                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11006                 {
11007                     OP(node) = EXACT;
11008                 }
11009             }
11010             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11011 
11012                 /* Not folding this cp, and can output it directly */
11013                 *character = UTF8_TWO_BYTE_HI(code_point);
11014                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11015                 len = 2;
11016             }
11017             else {
11018                 uvchr_to_utf8( character, code_point);
11019                 len = UTF8SKIP(character);
11020             }
11021         } /* Else pattern isn't UTF8.  */
11022         else if (! FOLD) {
11023             *character = (U8) code_point;
11024             len = 1;
11025         } /* Else is folded non-UTF8 */
11026         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11027 
11028             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11029              * comments at join_exact()); */
11030             *character = (U8) code_point;
11031             len = 1;
11032 
11033             /* Can turn into an EXACT node if we know the fold at compile time,
11034              * and it folds to itself and doesn't particpate in other folds */
11035             if (downgradable
11036                 && ! LOC
11037                 && PL_fold_latin1[code_point] == code_point
11038                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11039                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11040             {
11041                 OP(node) = EXACT;
11042             }
11043         } /* else is Sharp s.  May need to fold it */
11044         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11045             *character = 's';
11046             *(character + 1) = 's';
11047             len = 2;
11048         }
11049         else {
11050             *character = LATIN_SMALL_LETTER_SHARP_S;
11051             len = 1;
11052         }
11053     }
11054 
11055     if (SIZE_ONLY) {
11056         RExC_size += STR_SZ(len);
11057     }
11058     else {
11059         RExC_emit += STR_SZ(len);
11060         STR_LEN(node) = len;
11061         if (! len_passed_in) {
11062             Copy((char *) character, STRING(node), len, char);
11063         }
11064     }
11065 
11066     *flagp |= HASWIDTH;
11067 
11068     /* A single character node is SIMPLE, except for the special-cased SHARP S
11069      * under /di. */
11070     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11071         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11072             || ! FOLD || ! DEPENDS_SEMANTICS))
11073     {
11074         *flagp |= SIMPLE;
11075     }
11076 
11077     /* The OP may not be well defined in PASS1 */
11078     if (PASS2 && OP(node) == EXACTFL) {
11079         RExC_contains_locale = 1;
11080     }
11081 }
11082 
11083 
11084 /* return atoi(p), unless it's too big to sensibly be a backref,
11085  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11086 
11087 static I32
11088 S_backref_value(char *p)
11089 {
11090     char *q = p;
11091 
11092     for (;isDIGIT(*q); q++); /* calculate length of num */
11093     if (q - p == 0 || q - p > 9)
11094         return I32_MAX;
11095     return atoi(p);
11096 }
11097 
11098 
11099 /*
11100  - regatom - the lowest level
11101 
11102    Try to identify anything special at the start of the pattern. If there
11103    is, then handle it as required. This may involve generating a single regop,
11104    such as for an assertion; or it may involve recursing, such as to
11105    handle a () structure.
11106 
11107    If the string doesn't start with something special then we gobble up
11108    as much literal text as we can.
11109 
11110    Once we have been able to handle whatever type of thing started the
11111    sequence, we return.
11112 
11113    Note: we have to be careful with escapes, as they can be both literal
11114    and special, and in the case of \10 and friends, context determines which.
11115 
11116    A summary of the code structure is:
11117 
11118    switch (first_byte) {
11119 	cases for each special:
11120 	    handle this special;
11121 	    break;
11122 	case '\\':
11123 	    switch (2nd byte) {
11124 		cases for each unambiguous special:
11125 		    handle this special;
11126 		    break;
11127 		cases for each ambigous special/literal:
11128 		    disambiguate;
11129 		    if (special)  handle here
11130 		    else goto defchar;
11131 		default: // unambiguously literal:
11132 		    goto defchar;
11133 	    }
11134 	default:  // is a literal char
11135 	    // FALL THROUGH
11136 	defchar:
11137 	    create EXACTish node for literal;
11138 	    while (more input and node isn't full) {
11139 		switch (input_byte) {
11140 		   cases for each special;
11141                        make sure parse pointer is set so that the next call to
11142                            regatom will see this special first
11143                        goto loopdone; // EXACTish node terminated by prev. char
11144 		   default:
11145 		       append char to EXACTISH node;
11146 		}
11147 	        get next input byte;
11148 	    }
11149         loopdone:
11150    }
11151    return the generated node;
11152 
11153    Specifically there are two separate switches for handling
11154    escape sequences, with the one for handling literal escapes requiring
11155    a dummy entry for all of the special escapes that are actually handled
11156    by the other.
11157 
11158    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11159    TRYAGAIN.
11160    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11161    restarted.
11162    Otherwise does not return NULL.
11163 */
11164 
11165 STATIC regnode *
11166 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11167 {
11168     dVAR;
11169     regnode *ret = NULL;
11170     I32 flags = 0;
11171     char *parse_start = RExC_parse;
11172     U8 op;
11173     int invert = 0;
11174 
11175     GET_RE_DEBUG_FLAGS_DECL;
11176 
11177     *flagp = WORST;		/* Tentatively. */
11178 
11179     DEBUG_PARSE("atom");
11180 
11181     PERL_ARGS_ASSERT_REGATOM;
11182 
11183 tryagain:
11184     switch ((U8)*RExC_parse) {
11185     case '^':
11186 	RExC_seen_zerolen++;
11187 	nextchar(pRExC_state);
11188 	if (RExC_flags & RXf_PMf_MULTILINE)
11189 	    ret = reg_node(pRExC_state, MBOL);
11190 	else if (RExC_flags & RXf_PMf_SINGLELINE)
11191 	    ret = reg_node(pRExC_state, SBOL);
11192 	else
11193 	    ret = reg_node(pRExC_state, BOL);
11194         Set_Node_Length(ret, 1); /* MJD */
11195 	break;
11196     case '$':
11197 	nextchar(pRExC_state);
11198 	if (*RExC_parse)
11199 	    RExC_seen_zerolen++;
11200 	if (RExC_flags & RXf_PMf_MULTILINE)
11201 	    ret = reg_node(pRExC_state, MEOL);
11202 	else if (RExC_flags & RXf_PMf_SINGLELINE)
11203 	    ret = reg_node(pRExC_state, SEOL);
11204 	else
11205 	    ret = reg_node(pRExC_state, EOL);
11206         Set_Node_Length(ret, 1); /* MJD */
11207 	break;
11208     case '.':
11209 	nextchar(pRExC_state);
11210 	if (RExC_flags & RXf_PMf_SINGLELINE)
11211 	    ret = reg_node(pRExC_state, SANY);
11212 	else
11213 	    ret = reg_node(pRExC_state, REG_ANY);
11214 	*flagp |= HASWIDTH|SIMPLE;
11215 	RExC_naughty++;
11216         Set_Node_Length(ret, 1); /* MJD */
11217 	break;
11218     case '[':
11219     {
11220 	char * const oregcomp_parse = ++RExC_parse;
11221         ret = regclass(pRExC_state, flagp,depth+1,
11222                        FALSE, /* means parse the whole char class */
11223                        TRUE, /* allow multi-char folds */
11224                        FALSE, /* don't silence non-portable warnings. */
11225                        NULL);
11226 	if (*RExC_parse != ']') {
11227 	    RExC_parse = oregcomp_parse;
11228 	    vFAIL("Unmatched [");
11229 	}
11230         if (ret == NULL) {
11231             if (*flagp & RESTART_UTF8)
11232                 return NULL;
11233             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11234                   (UV) *flagp);
11235         }
11236 	nextchar(pRExC_state);
11237         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11238 	break;
11239     }
11240     case '(':
11241 	nextchar(pRExC_state);
11242         ret = reg(pRExC_state, 2, &flags,depth+1);
11243 	if (ret == NULL) {
11244 		if (flags & TRYAGAIN) {
11245 		    if (RExC_parse == RExC_end) {
11246 			 /* Make parent create an empty node if needed. */
11247 			*flagp |= TRYAGAIN;
11248 			return(NULL);
11249 		    }
11250 		    goto tryagain;
11251 		}
11252                 if (flags & RESTART_UTF8) {
11253                     *flagp = RESTART_UTF8;
11254                     return NULL;
11255                 }
11256                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11257                                                                  (UV) flags);
11258 	}
11259 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11260 	break;
11261     case '|':
11262     case ')':
11263 	if (flags & TRYAGAIN) {
11264 	    *flagp |= TRYAGAIN;
11265 	    return NULL;
11266 	}
11267 	vFAIL("Internal urp");
11268 				/* Supposed to be caught earlier. */
11269 	break;
11270     case '{':
11271 	if (!regcurly(RExC_parse, FALSE)) {
11272 	    RExC_parse++;
11273 	    goto defchar;
11274 	}
11275 	/* FALL THROUGH */
11276     case '?':
11277     case '+':
11278     case '*':
11279 	RExC_parse++;
11280 	vFAIL("Quantifier follows nothing");
11281 	break;
11282     case '\\':
11283 	/* Special Escapes
11284 
11285 	   This switch handles escape sequences that resolve to some kind
11286 	   of special regop and not to literal text. Escape sequnces that
11287 	   resolve to literal text are handled below in the switch marked
11288 	   "Literal Escapes".
11289 
11290 	   Every entry in this switch *must* have a corresponding entry
11291 	   in the literal escape switch. However, the opposite is not
11292 	   required, as the default for this switch is to jump to the
11293 	   literal text handling code.
11294 	*/
11295 	switch ((U8)*++RExC_parse) {
11296             U8 arg;
11297 	/* Special Escapes */
11298 	case 'A':
11299 	    RExC_seen_zerolen++;
11300 	    ret = reg_node(pRExC_state, SBOL);
11301 	    *flagp |= SIMPLE;
11302 	    goto finish_meta_pat;
11303 	case 'G':
11304 	    ret = reg_node(pRExC_state, GPOS);
11305             RExC_seen |= REG_GPOS_SEEN;
11306 	    *flagp |= SIMPLE;
11307 	    goto finish_meta_pat;
11308 	case 'K':
11309 	    RExC_seen_zerolen++;
11310 	    ret = reg_node(pRExC_state, KEEPS);
11311 	    *flagp |= SIMPLE;
11312 	    /* XXX:dmq : disabling in-place substitution seems to
11313 	     * be necessary here to avoid cases of memory corruption, as
11314 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11315 	     */
11316             RExC_seen |= REG_LOOKBEHIND_SEEN;
11317 	    goto finish_meta_pat;
11318 	case 'Z':
11319 	    ret = reg_node(pRExC_state, SEOL);
11320 	    *flagp |= SIMPLE;
11321 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
11322 	    goto finish_meta_pat;
11323 	case 'z':
11324 	    ret = reg_node(pRExC_state, EOS);
11325 	    *flagp |= SIMPLE;
11326 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
11327 	    goto finish_meta_pat;
11328 	case 'C':
11329 	    ret = reg_node(pRExC_state, CANY);
11330             RExC_seen |= REG_CANY_SEEN;
11331 	    *flagp |= HASWIDTH|SIMPLE;
11332 	    goto finish_meta_pat;
11333 	case 'X':
11334 	    ret = reg_node(pRExC_state, CLUMP);
11335 	    *flagp |= HASWIDTH;
11336 	    goto finish_meta_pat;
11337 
11338 	case 'W':
11339             invert = 1;
11340             /* FALLTHROUGH */
11341 	case 'w':
11342             arg = ANYOF_WORDCHAR;
11343             goto join_posix;
11344 
11345 	case 'b':
11346 	    RExC_seen_zerolen++;
11347             RExC_seen |= REG_LOOKBEHIND_SEEN;
11348 	    op = BOUND + get_regex_charset(RExC_flags);
11349             if (op > BOUNDA) {  /* /aa is same as /a */
11350                 op = BOUNDA;
11351             }
11352             else if (op == BOUNDL) {
11353                 RExC_contains_locale = 1;
11354             }
11355 	    ret = reg_node(pRExC_state, op);
11356 	    FLAGS(ret) = get_regex_charset(RExC_flags);
11357 	    *flagp |= SIMPLE;
11358 	    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11359                 /* diag_listed_as: Use "%s" instead of "%s" */
11360 	        vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11361 	    }
11362 	    goto finish_meta_pat;
11363 	case 'B':
11364 	    RExC_seen_zerolen++;
11365             RExC_seen |= REG_LOOKBEHIND_SEEN;
11366 	    op = NBOUND + get_regex_charset(RExC_flags);
11367             if (op > NBOUNDA) { /* /aa is same as /a */
11368                 op = NBOUNDA;
11369             }
11370             else if (op == NBOUNDL) {
11371                 RExC_contains_locale = 1;
11372             }
11373 	    ret = reg_node(pRExC_state, op);
11374 	    FLAGS(ret) = get_regex_charset(RExC_flags);
11375 	    *flagp |= SIMPLE;
11376 	    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11377                 /* diag_listed_as: Use "%s" instead of "%s" */
11378 	        vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11379 	    }
11380 	    goto finish_meta_pat;
11381 
11382 	case 'D':
11383             invert = 1;
11384             /* FALLTHROUGH */
11385 	case 'd':
11386             arg = ANYOF_DIGIT;
11387             goto join_posix;
11388 
11389 	case 'R':
11390 	    ret = reg_node(pRExC_state, LNBREAK);
11391 	    *flagp |= HASWIDTH|SIMPLE;
11392 	    goto finish_meta_pat;
11393 
11394 	case 'H':
11395             invert = 1;
11396             /* FALLTHROUGH */
11397 	case 'h':
11398 	    arg = ANYOF_BLANK;
11399             op = POSIXU;
11400             goto join_posix_op_known;
11401 
11402 	case 'V':
11403             invert = 1;
11404             /* FALLTHROUGH */
11405 	case 'v':
11406 	    arg = ANYOF_VERTWS;
11407             op = POSIXU;
11408             goto join_posix_op_known;
11409 
11410 	case 'S':
11411             invert = 1;
11412             /* FALLTHROUGH */
11413 	case 's':
11414             arg = ANYOF_SPACE;
11415 
11416         join_posix:
11417 
11418 	    op = POSIXD + get_regex_charset(RExC_flags);
11419             if (op > POSIXA) {  /* /aa is same as /a */
11420                 op = POSIXA;
11421             }
11422             else if (op == POSIXL) {
11423                 RExC_contains_locale = 1;
11424             }
11425 
11426         join_posix_op_known:
11427 
11428             if (invert) {
11429                 op += NPOSIXD - POSIXD;
11430             }
11431 
11432 	    ret = reg_node(pRExC_state, op);
11433             if (! SIZE_ONLY) {
11434                 FLAGS(ret) = namedclass_to_classnum(arg);
11435             }
11436 
11437 	    *flagp |= HASWIDTH|SIMPLE;
11438             /* FALL THROUGH */
11439 
11440          finish_meta_pat:
11441 	    nextchar(pRExC_state);
11442             Set_Node_Length(ret, 2); /* MJD */
11443 	    break;
11444 	case 'p':
11445 	case 'P':
11446 	    {
11447 #ifdef DEBUGGING
11448 		char* parse_start = RExC_parse - 2;
11449 #endif
11450 
11451 		RExC_parse--;
11452 
11453                 ret = regclass(pRExC_state, flagp,depth+1,
11454                                TRUE, /* means just parse this element */
11455                                FALSE, /* don't allow multi-char folds */
11456                                FALSE, /* don't silence non-portable warnings.
11457                                          It would be a bug if these returned
11458                                          non-portables */
11459                                NULL);
11460                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11461                    are allowed.  */
11462                 if (!ret)
11463                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11464                           (UV) *flagp);
11465 
11466 		RExC_parse--;
11467 
11468 		Set_Node_Offset(ret, parse_start + 2);
11469                 Set_Node_Cur_Length(ret, parse_start);
11470 		nextchar(pRExC_state);
11471 	    }
11472 	    break;
11473         case 'N':
11474             /* Handle \N and \N{NAME} with multiple code points here and not
11475              * below because it can be multicharacter. join_exact() will join
11476              * them up later on.  Also this makes sure that things like
11477              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11478              * The options to the grok function call causes it to fail if the
11479              * sequence is just a single code point.  We then go treat it as
11480              * just another character in the current EXACT node, and hence it
11481              * gets uniform treatment with all the other characters.  The
11482              * special treatment for quantifiers is not needed for such single
11483              * character sequences */
11484             ++RExC_parse;
11485             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11486                                 FALSE /* not strict */ )) {
11487                 if (*flagp & RESTART_UTF8)
11488                     return NULL;
11489                 RExC_parse--;
11490                 goto defchar;
11491             }
11492             break;
11493 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
11494 	parse_named_seq:
11495         {
11496             char ch= RExC_parse[1];
11497 	    if (ch != '<' && ch != '\'' && ch != '{') {
11498 	        RExC_parse++;
11499 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11500 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
11501 	    } else {
11502 	        /* this pretty much dupes the code for (?P=...) in reg(), if
11503                    you change this make sure you change that */
11504 		char* name_start = (RExC_parse += 2);
11505 		U32 num = 0;
11506                 SV *sv_dat = reg_scan_name(pRExC_state,
11507                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11508                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11509                 if (RExC_parse == name_start || *RExC_parse != ch)
11510                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11511                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11512 
11513                 if (!SIZE_ONLY) {
11514                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11515                     RExC_rxi->data->data[num]=(void*)sv_dat;
11516                     SvREFCNT_inc_simple_void(sv_dat);
11517                 }
11518 
11519                 RExC_sawback = 1;
11520                 ret = reganode(pRExC_state,
11521                                ((! FOLD)
11522                                  ? NREF
11523 				 : (ASCII_FOLD_RESTRICTED)
11524 				   ? NREFFA
11525                                    : (AT_LEAST_UNI_SEMANTICS)
11526                                      ? NREFFU
11527                                      : (LOC)
11528                                        ? NREFFL
11529                                        : NREFF),
11530                                 num);
11531                 *flagp |= HASWIDTH;
11532 
11533                 /* override incorrect value set in reganode MJD */
11534                 Set_Node_Offset(ret, parse_start+1);
11535                 Set_Node_Cur_Length(ret, parse_start);
11536                 nextchar(pRExC_state);
11537 
11538             }
11539             break;
11540 	}
11541 	case 'g':
11542 	case '1': case '2': case '3': case '4':
11543 	case '5': case '6': case '7': case '8': case '9':
11544 	    {
11545 		I32 num;
11546 		bool hasbrace = 0;
11547 
11548 		if (*RExC_parse == 'g') {
11549                     bool isrel = 0;
11550 
11551 		    RExC_parse++;
11552 		    if (*RExC_parse == '{') {
11553 		        RExC_parse++;
11554 		        hasbrace = 1;
11555 		    }
11556 		    if (*RExC_parse == '-') {
11557 		        RExC_parse++;
11558 		        isrel = 1;
11559 		    }
11560 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
11561 		        if (isrel) RExC_parse--;
11562                         RExC_parse -= 2;
11563 		        goto parse_named_seq;
11564                     }
11565 
11566                     num = S_backref_value(RExC_parse);
11567                     if (num == 0)
11568                         vFAIL("Reference to invalid group 0");
11569                     else if (num == I32_MAX) {
11570                          if (isDIGIT(*RExC_parse))
11571 			    vFAIL("Reference to nonexistent group");
11572                         else
11573                             vFAIL("Unterminated \\g... pattern");
11574                     }
11575 
11576                     if (isrel) {
11577                         num = RExC_npar - num;
11578                         if (num < 1)
11579                             vFAIL("Reference to nonexistent or unclosed group");
11580                     }
11581                 }
11582                 else {
11583                     num = S_backref_value(RExC_parse);
11584                     /* bare \NNN might be backref or octal - if it is larger than or equal
11585                      * RExC_npar then it is assumed to be and octal escape.
11586                      * Note RExC_npar is +1 from the actual number of parens*/
11587                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11588                             && *RExC_parse != '8' && *RExC_parse != '9'))
11589                     {
11590                         /* Probably a character specified in octal, e.g. \35 */
11591                         goto defchar;
11592                     }
11593                 }
11594 
11595                 /* at this point RExC_parse definitely points to a backref
11596                  * number */
11597 		{
11598 #ifdef RE_TRACK_PATTERN_OFFSETS
11599 		    char * const parse_start = RExC_parse - 1; /* MJD */
11600 #endif
11601 		    while (isDIGIT(*RExC_parse))
11602 			RExC_parse++;
11603                     if (hasbrace) {
11604                         if (*RExC_parse != '}')
11605                             vFAIL("Unterminated \\g{...} pattern");
11606                         RExC_parse++;
11607                     }
11608 		    if (!SIZE_ONLY) {
11609 		        if (num > (I32)RExC_rx->nparens)
11610 			    vFAIL("Reference to nonexistent group");
11611 		    }
11612 		    RExC_sawback = 1;
11613 		    ret = reganode(pRExC_state,
11614 				   ((! FOLD)
11615 				     ? REF
11616 				     : (ASCII_FOLD_RESTRICTED)
11617 				       ? REFFA
11618                                        : (AT_LEAST_UNI_SEMANTICS)
11619                                          ? REFFU
11620                                          : (LOC)
11621                                            ? REFFL
11622                                            : REFF),
11623 				    num);
11624 		    *flagp |= HASWIDTH;
11625 
11626                     /* override incorrect value set in reganode MJD */
11627                     Set_Node_Offset(ret, parse_start+1);
11628                     Set_Node_Cur_Length(ret, parse_start);
11629 		    RExC_parse--;
11630 		    nextchar(pRExC_state);
11631 		}
11632 	    }
11633 	    break;
11634 	case '\0':
11635 	    if (RExC_parse >= RExC_end)
11636 		FAIL("Trailing \\");
11637 	    /* FALL THROUGH */
11638 	default:
11639 	    /* Do not generate "unrecognized" warnings here, we fall
11640 	       back into the quick-grab loop below */
11641 	    parse_start--;
11642 	    goto defchar;
11643 	}
11644 	break;
11645 
11646     case '#':
11647 	if (RExC_flags & RXf_PMf_EXTENDED) {
11648 	    if ( reg_skipcomment( pRExC_state ) )
11649 		goto tryagain;
11650 	}
11651 	/* FALL THROUGH */
11652 
11653     default:
11654 
11655             parse_start = RExC_parse - 1;
11656 
11657 	    RExC_parse++;
11658 
11659 	defchar: {
11660 	    STRLEN len = 0;
11661 	    UV ender = 0;
11662 	    char *p;
11663 	    char *s;
11664 #define MAX_NODE_STRING_SIZE 127
11665 	    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11666 	    char *s0;
11667 	    U8 upper_parse = MAX_NODE_STRING_SIZE;
11668             U8 node_type = compute_EXACTish(pRExC_state);
11669             bool next_is_quantifier;
11670             char * oldp = NULL;
11671 
11672             /* We can convert EXACTF nodes to EXACTFU if they contain only
11673              * characters that match identically regardless of the target
11674              * string's UTF8ness.  The reason to do this is that EXACTF is not
11675              * trie-able, EXACTFU is.
11676              *
11677              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11678              * contain only above-Latin1 characters (hence must be in UTF8),
11679              * which don't participate in folds with Latin1-range characters,
11680              * as the latter's folds aren't known until runtime.  (We don't
11681              * need to figure this out until pass 2) */
11682             bool maybe_exactfu = PASS2
11683                                && (node_type == EXACTF || node_type == EXACTFL);
11684 
11685             /* If a folding node contains only code points that don't
11686              * participate in folds, it can be changed into an EXACT node,
11687              * which allows the optimizer more things to look for */
11688             bool maybe_exact;
11689 
11690 	    ret = reg_node(pRExC_state, node_type);
11691 
11692             /* In pass1, folded, we use a temporary buffer instead of the
11693              * actual node, as the node doesn't exist yet */
11694 	    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11695 
11696             s0 = s;
11697 
11698 	reparse:
11699 
11700             /* We do the EXACTFish to EXACT node only if folding.  (And we
11701              * don't need to figure this out until pass 2) */
11702             maybe_exact = FOLD && PASS2;
11703 
11704 	    /* XXX The node can hold up to 255 bytes, yet this only goes to
11705              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11706              * 255 allows us to not have to worry about overflow due to
11707              * converting to utf8 and fold expansion, but that value is
11708              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11709              * split up by this limit into a single one using the real max of
11710              * 255.  Even at 127, this breaks under rare circumstances.  If
11711              * folding, we do not want to split a node at a character that is a
11712              * non-final in a multi-char fold, as an input string could just
11713              * happen to want to match across the node boundary.  The join
11714              * would solve that problem if the join actually happens.  But a
11715              * series of more than two nodes in a row each of 127 would cause
11716              * the first join to succeed to get to 254, but then there wouldn't
11717              * be room for the next one, which could at be one of those split
11718              * multi-char folds.  I don't know of any fool-proof solution.  One
11719              * could back off to end with only a code point that isn't such a
11720              * non-final, but it is possible for there not to be any in the
11721              * entire node. */
11722 	    for (p = RExC_parse - 1;
11723 	         len < upper_parse && p < RExC_end;
11724 	         len++)
11725 	    {
11726 		oldp = p;
11727 
11728 		if (RExC_flags & RXf_PMf_EXTENDED)
11729 		    p = regwhite( pRExC_state, p );
11730 		switch ((U8)*p) {
11731 		case '^':
11732 		case '$':
11733 		case '.':
11734 		case '[':
11735 		case '(':
11736 		case ')':
11737 		case '|':
11738 		    goto loopdone;
11739 		case '\\':
11740 		    /* Literal Escapes Switch
11741 
11742 		       This switch is meant to handle escape sequences that
11743 		       resolve to a literal character.
11744 
11745 		       Every escape sequence that represents something
11746 		       else, like an assertion or a char class, is handled
11747 		       in the switch marked 'Special Escapes' above in this
11748 		       routine, but also has an entry here as anything that
11749 		       isn't explicitly mentioned here will be treated as
11750 		       an unescaped equivalent literal.
11751 		    */
11752 
11753 		    switch ((U8)*++p) {
11754 		    /* These are all the special escapes. */
11755 		    case 'A':             /* Start assertion */
11756 		    case 'b': case 'B':   /* Word-boundary assertion*/
11757 		    case 'C':             /* Single char !DANGEROUS! */
11758 		    case 'd': case 'D':   /* digit class */
11759 		    case 'g': case 'G':   /* generic-backref, pos assertion */
11760 		    case 'h': case 'H':   /* HORIZWS */
11761 		    case 'k': case 'K':   /* named backref, keep marker */
11762 		    case 'p': case 'P':   /* Unicode property */
11763 		              case 'R':   /* LNBREAK */
11764 		    case 's': case 'S':   /* space class */
11765 		    case 'v': case 'V':   /* VERTWS */
11766 		    case 'w': case 'W':   /* word class */
11767                     case 'X':             /* eXtended Unicode "combining
11768                                              character sequence" */
11769 		    case 'z': case 'Z':   /* End of line/string assertion */
11770 			--p;
11771 			goto loopdone;
11772 
11773 	            /* Anything after here is an escape that resolves to a
11774 	               literal. (Except digits, which may or may not)
11775 	             */
11776 		    case 'n':
11777 			ender = '\n';
11778 			p++;
11779 			break;
11780 		    case 'N': /* Handle a single-code point named character. */
11781                         /* The options cause it to fail if a multiple code
11782                          * point sequence.  Handle those in the switch() above
11783                          * */
11784                         RExC_parse = p + 1;
11785                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11786                                             flagp, depth, FALSE,
11787                                             FALSE /* not strict */ ))
11788                         {
11789                             if (*flagp & RESTART_UTF8)
11790                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11791                             RExC_parse = p = oldp;
11792                             goto loopdone;
11793                         }
11794                         p = RExC_parse;
11795                         if (ender > 0xff) {
11796                             REQUIRE_UTF8;
11797                         }
11798                         break;
11799 		    case 'r':
11800 			ender = '\r';
11801 			p++;
11802 			break;
11803 		    case 't':
11804 			ender = '\t';
11805 			p++;
11806 			break;
11807 		    case 'f':
11808 			ender = '\f';
11809 			p++;
11810 			break;
11811 		    case 'e':
11812 			  ender = ASCII_TO_NATIVE('\033');
11813 			p++;
11814 			break;
11815 		    case 'a':
11816 			  ender = '\a';
11817 			p++;
11818 			break;
11819 		    case 'o':
11820 			{
11821 			    UV result;
11822 			    const char* error_msg;
11823 
11824 			    bool valid = grok_bslash_o(&p,
11825 						       &result,
11826 						       &error_msg,
11827 						       TRUE, /* out warnings */
11828                                                        FALSE, /* not strict */
11829                                                        TRUE, /* Output warnings
11830                                                                 for non-
11831                                                                 portables */
11832                                                        UTF);
11833 			    if (! valid) {
11834 				RExC_parse = p;	/* going to die anyway; point
11835 						   to exact spot of failure */
11836 				vFAIL(error_msg);
11837 			    }
11838                             ender = result;
11839 			    if (PL_encoding && ender < 0x100) {
11840 				goto recode_encoding;
11841 			    }
11842 			    if (ender > 0xff) {
11843 				REQUIRE_UTF8;
11844 			    }
11845 			    break;
11846 			}
11847 		    case 'x':
11848 			{
11849                             UV result = UV_MAX; /* initialize to erroneous
11850                                                    value */
11851 			    const char* error_msg;
11852 
11853 			    bool valid = grok_bslash_x(&p,
11854 						       &result,
11855 						       &error_msg,
11856 						       TRUE, /* out warnings */
11857                                                        FALSE, /* not strict */
11858                                                        TRUE, /* Output warnings
11859                                                                 for non-
11860                                                                 portables */
11861                                                        UTF);
11862 			    if (! valid) {
11863 				RExC_parse = p;	/* going to die anyway; point
11864 						   to exact spot of failure */
11865 				vFAIL(error_msg);
11866 			    }
11867                             ender = result;
11868 
11869 			    if (PL_encoding && ender < 0x100) {
11870 				goto recode_encoding;
11871 			    }
11872 			    if (ender > 0xff) {
11873 				REQUIRE_UTF8;
11874 			    }
11875 			    break;
11876 			}
11877 		    case 'c':
11878 			p++;
11879 			ender = grok_bslash_c(*p++, SIZE_ONLY);
11880 			break;
11881                     case '8': case '9': /* must be a backreference */
11882                         --p;
11883                         goto loopdone;
11884                     case '1': case '2': case '3':case '4':
11885 		    case '5': case '6': case '7':
11886                         /* When we parse backslash escapes there is ambiguity
11887                          * between backreferences and octal escapes. Any escape
11888                          * from \1 - \9 is a backreference, any multi-digit
11889                          * escape which does not start with 0 and which when
11890                          * evaluated as decimal could refer to an already
11891                          * parsed capture buffer is a backslash. Anything else
11892                          * is octal.
11893                          *
11894                          * Note this implies that \118 could be interpreted as
11895                          * 118 OR as "\11" . "8" depending on whether there
11896                          * were 118 capture buffers defined already in the
11897                          * pattern.  */
11898 
11899                         /* NOTE, RExC_npar is 1 more than the actual number of
11900                          * parens we have seen so far, hence the < RExC_npar below. */
11901 
11902                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11903                         {  /* Not to be treated as an octal constant, go
11904                                    find backref */
11905                             --p;
11906                             goto loopdone;
11907                         }
11908                     case '0':
11909 			{
11910 			    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11911 			    STRLEN numlen = 3;
11912 			    ender = grok_oct(p, &numlen, &flags, NULL);
11913 			    if (ender > 0xff) {
11914 				REQUIRE_UTF8;
11915 			    }
11916 			    p += numlen;
11917                             if (SIZE_ONLY   /* like \08, \178 */
11918                                 && numlen < 3
11919                                 && p < RExC_end
11920                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11921                             {
11922 				reg_warn_non_literal_string(
11923                                          p + 1,
11924                                          form_short_octal_warning(p, numlen));
11925                             }
11926 			}
11927 			if (PL_encoding && ender < 0x100)
11928 			    goto recode_encoding;
11929 			break;
11930 		    recode_encoding:
11931 			if (! RExC_override_recoding) {
11932 			    SV* enc = PL_encoding;
11933 			    ender = reg_recode((const char)(U8)ender, &enc);
11934 			    if (!enc && SIZE_ONLY)
11935 				ckWARNreg(p, "Invalid escape in the specified encoding");
11936 			    REQUIRE_UTF8;
11937 			}
11938 			break;
11939 		    case '\0':
11940 			if (p >= RExC_end)
11941 			    FAIL("Trailing \\");
11942 			/* FALL THROUGH */
11943 		    default:
11944 			if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11945 			    /* Include any { following the alpha to emphasize
11946 			     * that it could be part of an escape at some point
11947 			     * in the future */
11948 			    int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11949 			    ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11950 			}
11951 			goto normal_default;
11952 		    } /* End of switch on '\' */
11953 		    break;
11954 		default:    /* A literal character */
11955 
11956                     if (! SIZE_ONLY
11957                         && RExC_flags & RXf_PMf_EXTENDED
11958                         && ckWARN_d(WARN_DEPRECATED)
11959                         && is_PATWS_non_low_safe(p, RExC_end, UTF))
11960                     {
11961                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11962                                 "Escape literal pattern white space under /x");
11963                     }
11964 
11965 		  normal_default:
11966 		    if (UTF8_IS_START(*p) && UTF) {
11967 			STRLEN numlen;
11968 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11969 					       &numlen, UTF8_ALLOW_DEFAULT);
11970 			p += numlen;
11971 		    }
11972 		    else
11973 			ender = (U8) *p++;
11974 		    break;
11975 		} /* End of switch on the literal */
11976 
11977 		/* Here, have looked at the literal character and <ender>
11978 		 * contains its ordinal, <p> points to the character after it
11979 		 */
11980 
11981 		if ( RExC_flags & RXf_PMf_EXTENDED)
11982 		    p = regwhite( pRExC_state, p );
11983 
11984                 /* If the next thing is a quantifier, it applies to this
11985                  * character only, which means that this character has to be in
11986                  * its own node and can't just be appended to the string in an
11987                  * existing node, so if there are already other characters in
11988                  * the node, close the node with just them, and set up to do
11989                  * this character again next time through, when it will be the
11990                  * only thing in its new node */
11991                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11992 		{
11993                     p = oldp;
11994                     goto loopdone;
11995                 }
11996 
11997                 if (! FOLD   /* The simple case, just append the literal */
11998                     || (LOC  /* Also don't fold for tricky chars under /l */
11999                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12000                 {
12001                     if (UTF) {
12002 
12003                         /* Normally, we don't need the representation of the
12004                          * character in the sizing pass--just its size, but if
12005                          * folding, we have to actually put the character out
12006                          * even in the sizing pass, because the size could
12007                          * change as we juggle things at the end of this loop
12008                          * to avoid splitting a too-full node in the middle of
12009                          * a potential multi-char fold [perl #123539] */
12010                         const STRLEN unilen = (SIZE_ONLY && ! FOLD)
12011                                                ? UNISKIP(ender)
12012                                                : (uvchr_to_utf8((U8*)s, ender) - (U8*)s);
12013                         if (unilen > 0) {
12014                            s   += unilen;
12015                            len += unilen;
12016                         }
12017 
12018                         /* The loop increments <len> each time, as all but this
12019                          * path (and one other) through it add a single byte to
12020                          * the EXACTish node.  But this one has changed len to
12021                          * be the correct final value, so subtract one to
12022                          * cancel out the increment that follows */
12023                         len--;
12024                     }
12025                     else if (FOLD) {
12026                         /* See comment above for [perl #123539] */
12027                         *(s++) = (char) ender;
12028                     }
12029                     else {
12030                         REGC((char)ender, s++);
12031                     }
12032 
12033                     /* Can get here if folding only if is one of the /l
12034                      * characters whose fold depends on the locale.  The
12035                      * occurrence of any of these indicate that we can't
12036                      * simplify things */
12037                     if (FOLD) {
12038                         maybe_exact = FALSE;
12039                         maybe_exactfu = FALSE;
12040                     }
12041                 }
12042                 else             /* FOLD */
12043                      if (! ( UTF
12044                         /* See comments for join_exact() as to why we fold this
12045                          * non-UTF at compile time */
12046                         || (node_type == EXACTFU
12047                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12048                 {
12049                     /* Here, are folding and are not UTF-8 encoded; therefore
12050                      * the character must be in the range 0-255, and is not /l
12051                      * (Not /l because we already handled these under /l in
12052                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12053                     if (IS_IN_SOME_FOLD_L1(ender)) {
12054                         maybe_exact = FALSE;
12055 
12056                         /* See if the character's fold differs between /d and
12057                          * /u.  This includes the multi-char fold SHARP S to
12058                          * 'ss' */
12059                         if (maybe_exactfu
12060                             && (PL_fold[ender] != PL_fold_latin1[ender]
12061                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12062                                 || (len > 0
12063                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12064                                    && isARG2_lower_or_UPPER_ARG1('s',
12065                                                                  *(s-1)))))
12066                         {
12067                             maybe_exactfu = FALSE;
12068                         }
12069                     }
12070 
12071                     /* Even when folding, we store just the input character, as
12072                      * we have an array that finds its fold quickly */
12073                     *(s++) = (char) ender;
12074                 }
12075                 else {  /* FOLD and UTF */
12076                     /* Unlike the non-fold case, we do actually have to
12077                      * calculate the results here in pass 1.  This is for two
12078                      * reasons, the folded length may be longer than the
12079                      * unfolded, and we have to calculate how many EXACTish
12080                      * nodes it will take; and we may run out of room in a node
12081                      * in the middle of a potential multi-char fold, and have
12082                      * to back off accordingly.  (Hence we can't use REGC for
12083                      * the simple case just below.) */
12084 
12085                     UV folded;
12086                     if (isASCII(ender)) {
12087                         folded = toFOLD(ender);
12088                         *(s)++ = (U8) folded;
12089                     }
12090                     else {
12091                         STRLEN foldlen;
12092 
12093                         folded = _to_uni_fold_flags(
12094                                      ender,
12095                                      (U8 *) s,
12096                                      &foldlen,
12097                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12098                                                         ? FOLD_FLAGS_NOMIX_ASCII
12099                                                         : 0));
12100                         s += foldlen;
12101 
12102                         /* The loop increments <len> each time, as all but this
12103                          * path (and one other) through it add a single byte to
12104                          * the EXACTish node.  But this one has changed len to
12105                          * be the correct final value, so subtract one to
12106                          * cancel out the increment that follows */
12107                         len += foldlen - 1;
12108                     }
12109                     /* If this node only contains non-folding code points so
12110                      * far, see if this new one is also non-folding */
12111                     if (maybe_exact) {
12112                         if (folded != ender) {
12113                             maybe_exact = FALSE;
12114                         }
12115                         else {
12116                             /* Here the fold is the original; we have to check
12117                              * further to see if anything folds to it */
12118                             if (_invlist_contains_cp(PL_utf8_foldable,
12119                                                         ender))
12120                             {
12121                                 maybe_exact = FALSE;
12122                             }
12123                         }
12124                     }
12125                     ender = folded;
12126 		}
12127 
12128 		if (next_is_quantifier) {
12129 
12130                     /* Here, the next input is a quantifier, and to get here,
12131                      * the current character is the only one in the node.
12132                      * Also, here <len> doesn't include the final byte for this
12133                      * character */
12134                     len++;
12135                     goto loopdone;
12136 		}
12137 
12138 	    } /* End of loop through literal characters */
12139 
12140             /* Here we have either exhausted the input or ran out of room in
12141              * the node.  (If we encountered a character that can't be in the
12142              * node, transfer is made directly to <loopdone>, and so we
12143              * wouldn't have fallen off the end of the loop.)  In the latter
12144              * case, we artificially have to split the node into two, because
12145              * we just don't have enough space to hold everything.  This
12146              * creates a problem if the final character participates in a
12147              * multi-character fold in the non-final position, as a match that
12148              * should have occurred won't, due to the way nodes are matched,
12149              * and our artificial boundary.  So back off until we find a non-
12150              * problematic character -- one that isn't at the beginning or
12151              * middle of such a fold.  (Either it doesn't participate in any
12152              * folds, or appears only in the final position of all the folds it
12153              * does participate in.)  A better solution with far fewer false
12154              * positives, and that would fill the nodes more completely, would
12155              * be to actually have available all the multi-character folds to
12156              * test against, and to back-off only far enough to be sure that
12157              * this node isn't ending with a partial one.  <upper_parse> is set
12158              * further below (if we need to reparse the node) to include just
12159              * up through that final non-problematic character that this code
12160              * identifies, so when it is set to less than the full node, we can
12161              * skip the rest of this */
12162             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12163 
12164                 const STRLEN full_len = len;
12165 
12166 		assert(len >= MAX_NODE_STRING_SIZE);
12167 
12168                 /* Here, <s> points to the final byte of the final character.
12169                  * Look backwards through the string until find a non-
12170                  * problematic character */
12171 
12172 		if (! UTF) {
12173 
12174                     /* This has no multi-char folds to non-UTF characters */
12175                     if (ASCII_FOLD_RESTRICTED) {
12176                         goto loopdone;
12177                     }
12178 
12179                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12180                     len = s - s0 + 1;
12181 		}
12182                 else {
12183                     if (!  PL_NonL1NonFinalFold) {
12184                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12185                                         NonL1_Perl_Non_Final_Folds_invlist);
12186                     }
12187 
12188                     /* Point to the first byte of the final character */
12189                     s = (char *) utf8_hop((U8 *) s, -1);
12190 
12191                     while (s >= s0) {   /* Search backwards until find
12192                                            non-problematic char */
12193                         if (UTF8_IS_INVARIANT(*s)) {
12194 
12195                             /* There are no ascii characters that participate
12196                              * in multi-char folds under /aa.  In EBCDIC, the
12197                              * non-ascii invariants are all control characters,
12198                              * so don't ever participate in any folds. */
12199                             if (ASCII_FOLD_RESTRICTED
12200                                 || ! IS_NON_FINAL_FOLD(*s))
12201                             {
12202                                 break;
12203                             }
12204                         }
12205                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12206                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12207                                                                   *s, *(s+1))))
12208                             {
12209                                 break;
12210                             }
12211                         }
12212                         else if (! _invlist_contains_cp(
12213                                         PL_NonL1NonFinalFold,
12214                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12215                         {
12216                             break;
12217                         }
12218 
12219                         /* Here, the current character is problematic in that
12220                          * it does occur in the non-final position of some
12221                          * fold, so try the character before it, but have to
12222                          * special case the very first byte in the string, so
12223                          * we don't read outside the string */
12224                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12225                     } /* End of loop backwards through the string */
12226 
12227                     /* If there were only problematic characters in the string,
12228                      * <s> will point to before s0, in which case the length
12229                      * should be 0, otherwise include the length of the
12230                      * non-problematic character just found */
12231                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12232 		}
12233 
12234                 /* Here, have found the final character, if any, that is
12235                  * non-problematic as far as ending the node without splitting
12236                  * it across a potential multi-char fold.  <len> contains the
12237                  * number of bytes in the node up-to and including that
12238                  * character, or is 0 if there is no such character, meaning
12239                  * the whole node contains only problematic characters.  In
12240                  * this case, give up and just take the node as-is.  We can't
12241                  * do any better */
12242                 if (len == 0) {
12243                     len = full_len;
12244 
12245                     /* If the node ends in an 's' we make sure it stays EXACTF,
12246                      * as if it turns into an EXACTFU, it could later get
12247                      * joined with another 's' that would then wrongly match
12248                      * the sharp s */
12249                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12250                     {
12251                         maybe_exactfu = FALSE;
12252                     }
12253                 } else {
12254 
12255                     /* Here, the node does contain some characters that aren't
12256                      * problematic.  If one such is the final character in the
12257                      * node, we are done */
12258                     if (len == full_len) {
12259                         goto loopdone;
12260                     }
12261                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12262 
12263                         /* If the final character is problematic, but the
12264                          * penultimate is not, back-off that last character to
12265                          * later start a new node with it */
12266                         p = oldp;
12267                         goto loopdone;
12268                     }
12269 
12270                     /* Here, the final non-problematic character is earlier
12271                      * in the input than the penultimate character.  What we do
12272                      * is reparse from the beginning, going up only as far as
12273                      * this final ok one, thus guaranteeing that the node ends
12274                      * in an acceptable character.  The reason we reparse is
12275                      * that we know how far in the character is, but we don't
12276                      * know how to correlate its position with the input parse.
12277                      * An alternate implementation would be to build that
12278                      * correlation as we go along during the original parse,
12279                      * but that would entail extra work for every node, whereas
12280                      * this code gets executed only when the string is too
12281                      * large for the node, and the final two characters are
12282                      * problematic, an infrequent occurrence.  Yet another
12283                      * possible strategy would be to save the tail of the
12284                      * string, and the next time regatom is called, initialize
12285                      * with that.  The problem with this is that unless you
12286                      * back off one more character, you won't be guaranteed
12287                      * regatom will get called again, unless regbranch,
12288                      * regpiece ... are also changed.  If you do back off that
12289                      * extra character, so that there is input guaranteed to
12290                      * force calling regatom, you can't handle the case where
12291                      * just the first character in the node is acceptable.  I
12292                      * (khw) decided to try this method which doesn't have that
12293                      * pitfall; if performance issues are found, we can do a
12294                      * combination of the current approach plus that one */
12295                     upper_parse = len;
12296                     len = 0;
12297                     s = s0;
12298                     goto reparse;
12299                 }
12300 	    }   /* End of verifying node ends with an appropriate char */
12301 
12302 	loopdone:   /* Jumped to when encounters something that shouldn't be in
12303 		       the node */
12304 
12305             /* I (khw) don't know if you can get here with zero length, but the
12306              * old code handled this situation by creating a zero-length EXACT
12307              * node.  Might as well be NOTHING instead */
12308             if (len == 0) {
12309                 OP(ret) = NOTHING;
12310             }
12311             else {
12312                 if (FOLD) {
12313                     /* If 'maybe_exact' is still set here, means there are no
12314                      * code points in the node that participate in folds;
12315                      * similarly for 'maybe_exactfu' and code points that match
12316                      * differently depending on UTF8ness of the target string
12317                      * (for /u), or depending on locale for /l */
12318                     if (maybe_exact) {
12319                         OP(ret) = EXACT;
12320                     }
12321                     else if (maybe_exactfu) {
12322                         OP(ret) = EXACTFU;
12323                     }
12324                 }
12325                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12326                                            FALSE /* Don't look to see if could
12327                                                     be turned into an EXACT
12328                                                     node, as we have already
12329                                                     computed that */
12330                                           );
12331             }
12332 
12333 	    RExC_parse = p - 1;
12334             Set_Node_Cur_Length(ret, parse_start);
12335 	    nextchar(pRExC_state);
12336 	    {
12337 		/* len is STRLEN which is unsigned, need to copy to signed */
12338 		IV iv = len;
12339 		if (iv < 0)
12340 		    vFAIL("Internal disaster");
12341 	    }
12342 
12343 	} /* End of label 'defchar:' */
12344 	break;
12345     } /* End of giant switch on input character */
12346 
12347     return(ret);
12348 }
12349 
12350 STATIC char *
12351 S_regwhite( RExC_state_t *pRExC_state, char *p )
12352 {
12353     const char *e = RExC_end;
12354 
12355     PERL_ARGS_ASSERT_REGWHITE;
12356 
12357     while (p < e) {
12358 	if (isSPACE(*p))
12359 	    ++p;
12360 	else if (*p == '#') {
12361             bool ended = 0;
12362 	    do {
12363 		if (*p++ == '\n') {
12364 		    ended = 1;
12365 		    break;
12366 		}
12367 	    } while (p < e);
12368 	    if (!ended)
12369                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12370 	}
12371 	else
12372 	    break;
12373     }
12374     return p;
12375 }
12376 
12377 STATIC char *
12378 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12379 {
12380     /* Returns the next non-pattern-white space, non-comment character (the
12381      * latter only if 'recognize_comment is true) in the string p, which is
12382      * ended by RExC_end.  If there is no line break ending a comment,
12383      * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12384     const char *e = RExC_end;
12385 
12386     PERL_ARGS_ASSERT_REGPATWS;
12387 
12388     while (p < e) {
12389         STRLEN len;
12390 	if ((len = is_PATWS_safe(p, e, UTF))) {
12391 	    p += len;
12392         }
12393 	else if (recognize_comment && *p == '#') {
12394             bool ended = 0;
12395 	    do {
12396                 p++;
12397                 if (is_LNBREAK_safe(p, e, UTF)) {
12398 		    ended = 1;
12399 		    break;
12400 		}
12401 	    } while (p < e);
12402 	    if (!ended)
12403                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12404 	}
12405 	else
12406 	    break;
12407     }
12408     return p;
12409 }
12410 
12411 STATIC void
12412 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12413 {
12414     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12415      * sets up the bitmap and any flags, removing those code points from the
12416      * inversion list, setting it to NULL should it become completely empty */
12417 
12418     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12419     assert(PL_regkind[OP(node)] == ANYOF);
12420 
12421     ANYOF_BITMAP_ZERO(node);
12422     if (*invlist_ptr) {
12423 
12424 	/* This gets set if we actually need to modify things */
12425 	bool change_invlist = FALSE;
12426 
12427 	UV start, end;
12428 
12429 	/* Start looking through *invlist_ptr */
12430 	invlist_iterinit(*invlist_ptr);
12431 	while (invlist_iternext(*invlist_ptr, &start, &end)) {
12432 	    UV high;
12433 	    int i;
12434 
12435             if (end == UV_MAX && start <= 256) {
12436                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12437             }
12438             else if (end >= 256) {
12439                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12440             }
12441 
12442 	    /* Quit if are above what we should change */
12443 	    if (start > 255) {
12444 		break;
12445 	    }
12446 
12447 	    change_invlist = TRUE;
12448 
12449 	    /* Set all the bits in the range, up to the max that we are doing */
12450 	    high = (end < 255) ? end : 255;
12451 	    for (i = start; i <= (int) high; i++) {
12452 		if (! ANYOF_BITMAP_TEST(node, i)) {
12453 		    ANYOF_BITMAP_SET(node, i);
12454 		}
12455 	    }
12456 	}
12457 	invlist_iterfinish(*invlist_ptr);
12458 
12459         /* Done with loop; remove any code points that are in the bitmap from
12460          * *invlist_ptr; similarly for code points above latin1 if we have a
12461          * flag to match all of them anyways */
12462 	if (change_invlist) {
12463 	    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12464 	}
12465         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12466 	    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12467 	}
12468 
12469 	/* If have completely emptied it, remove it completely */
12470 	if (_invlist_len(*invlist_ptr) == 0) {
12471 	    SvREFCNT_dec_NN(*invlist_ptr);
12472 	    *invlist_ptr = NULL;
12473 	}
12474     }
12475 }
12476 
12477 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12478    Character classes ([:foo:]) can also be negated ([:^foo:]).
12479    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12480    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12481    but trigger failures because they are currently unimplemented. */
12482 
12483 #define POSIXCC_DONE(c)   ((c) == ':')
12484 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12485 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12486 
12487 PERL_STATIC_INLINE I32
12488 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12489 {
12490     dVAR;
12491     I32 namedclass = OOB_NAMEDCLASS;
12492 
12493     PERL_ARGS_ASSERT_REGPPOSIXCC;
12494 
12495     if (value == '[' && RExC_parse + 1 < RExC_end &&
12496 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
12497 	POSIXCC(UCHARAT(RExC_parse)))
12498     {
12499 	const char c = UCHARAT(RExC_parse);
12500 	char* const s = RExC_parse++;
12501 
12502 	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12503 	    RExC_parse++;
12504 	if (RExC_parse == RExC_end) {
12505             if (strict) {
12506 
12507                 /* Try to give a better location for the error (than the end of
12508                  * the string) by looking for the matching ']' */
12509                 RExC_parse = s;
12510                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12511                     RExC_parse++;
12512                 }
12513                 vFAIL2("Unmatched '%c' in POSIX class", c);
12514             }
12515 	    /* Grandfather lone [:, [=, [. */
12516 	    RExC_parse = s;
12517         }
12518 	else {
12519 	    const char* const t = RExC_parse++; /* skip over the c */
12520 	    assert(*t == c);
12521 
12522   	    if (UCHARAT(RExC_parse) == ']') {
12523 		const char *posixcc = s + 1;
12524   		RExC_parse++; /* skip over the ending ] */
12525 
12526 		if (*s == ':') {
12527 		    const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12528 		    const I32 skip = t - posixcc;
12529 
12530 		    /* Initially switch on the length of the name.  */
12531 		    switch (skip) {
12532 		    case 4:
12533                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12534                                                           this is the Perl \w
12535                                                         */
12536 			    namedclass = ANYOF_WORDCHAR;
12537 			break;
12538 		    case 5:
12539 			/* Names all of length 5.  */
12540 			/* alnum alpha ascii blank cntrl digit graph lower
12541 			   print punct space upper  */
12542 			/* Offset 4 gives the best switch position.  */
12543 			switch (posixcc[4]) {
12544 			case 'a':
12545 			    if (memEQ(posixcc, "alph", 4)) /* alpha */
12546 				namedclass = ANYOF_ALPHA;
12547 			    break;
12548 			case 'e':
12549 			    if (memEQ(posixcc, "spac", 4)) /* space */
12550 				namedclass = ANYOF_PSXSPC;
12551 			    break;
12552 			case 'h':
12553 			    if (memEQ(posixcc, "grap", 4)) /* graph */
12554 				namedclass = ANYOF_GRAPH;
12555 			    break;
12556 			case 'i':
12557 			    if (memEQ(posixcc, "asci", 4)) /* ascii */
12558 				namedclass = ANYOF_ASCII;
12559 			    break;
12560 			case 'k':
12561 			    if (memEQ(posixcc, "blan", 4)) /* blank */
12562 				namedclass = ANYOF_BLANK;
12563 			    break;
12564 			case 'l':
12565 			    if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12566 				namedclass = ANYOF_CNTRL;
12567 			    break;
12568 			case 'm':
12569 			    if (memEQ(posixcc, "alnu", 4)) /* alnum */
12570 				namedclass = ANYOF_ALPHANUMERIC;
12571 			    break;
12572 			case 'r':
12573 			    if (memEQ(posixcc, "lowe", 4)) /* lower */
12574 				namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12575 			    else if (memEQ(posixcc, "uppe", 4)) /* upper */
12576 				namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12577 			    break;
12578 			case 't':
12579 			    if (memEQ(posixcc, "digi", 4)) /* digit */
12580 				namedclass = ANYOF_DIGIT;
12581 			    else if (memEQ(posixcc, "prin", 4)) /* print */
12582 				namedclass = ANYOF_PRINT;
12583 			    else if (memEQ(posixcc, "punc", 4)) /* punct */
12584 				namedclass = ANYOF_PUNCT;
12585 			    break;
12586 			}
12587 			break;
12588 		    case 6:
12589 			if (memEQ(posixcc, "xdigit", 6))
12590 			    namedclass = ANYOF_XDIGIT;
12591 			break;
12592 		    }
12593 
12594 		    if (namedclass == OOB_NAMEDCLASS)
12595 			vFAIL2utf8f(
12596                             "POSIX class [:%"UTF8f":] unknown",
12597 			    UTF8fARG(UTF, t - s - 1, s + 1));
12598 
12599                     /* The #defines are structured so each complement is +1 to
12600                      * the normal one */
12601                     if (complement) {
12602                         namedclass++;
12603                     }
12604 		    assert (posixcc[skip] == ':');
12605 		    assert (posixcc[skip+1] == ']');
12606 		} else if (!SIZE_ONLY) {
12607 		    /* [[=foo=]] and [[.foo.]] are still future. */
12608 
12609 		    /* adjust RExC_parse so the warning shows after
12610 		       the class closes */
12611 		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12612 			RExC_parse++;
12613 		    vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12614 		}
12615 	    } else {
12616 		/* Maternal grandfather:
12617 		 * "[:" ending in ":" but not in ":]" */
12618                 if (strict) {
12619                     vFAIL("Unmatched '[' in POSIX class");
12620                 }
12621 
12622                 /* Grandfather lone [:, [=, [. */
12623 		RExC_parse = s;
12624 	    }
12625 	}
12626     }
12627 
12628     return namedclass;
12629 }
12630 
12631 STATIC bool
12632 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12633 {
12634     /* This applies some heuristics at the current parse position (which should
12635      * be at a '[') to see if what follows might be intended to be a [:posix:]
12636      * class.  It returns true if it really is a posix class, of course, but it
12637      * also can return true if it thinks that what was intended was a posix
12638      * class that didn't quite make it.
12639      *
12640      * It will return true for
12641      *      [:alphanumerics:
12642      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12643      *                         ')' indicating the end of the (?[
12644      *      [:any garbage including %^&$ punctuation:]
12645      *
12646      * This is designed to be called only from S_handle_regex_sets; it could be
12647      * easily adapted to be called from the spot at the beginning of regclass()
12648      * that checks to see in a normal bracketed class if the surrounding []
12649      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12650      * change long-standing behavior, so I (khw) didn't do that */
12651     char* p = RExC_parse + 1;
12652     char first_char = *p;
12653 
12654     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12655 
12656     assert(*(p - 1) == '[');
12657 
12658     if (! POSIXCC(first_char)) {
12659         return FALSE;
12660     }
12661 
12662     p++;
12663     while (p < RExC_end && isWORDCHAR(*p)) p++;
12664 
12665     if (p >= RExC_end) {
12666         return FALSE;
12667     }
12668 
12669     if (p - RExC_parse > 2    /* Got at least 1 word character */
12670         && (*p == first_char
12671             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12672     {
12673         return TRUE;
12674     }
12675 
12676     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12677 
12678     return (p
12679             && p - RExC_parse > 2 /* [:] evaluates to colon;
12680                                       [::] is a bad posix class. */
12681             && first_char == *(p - 1));
12682 }
12683 
12684 STATIC regnode *
12685 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12686                     I32 *flagp, U32 depth,
12687                     char * const oregcomp_parse)
12688 {
12689     /* Handle the (?[...]) construct to do set operations */
12690 
12691     U8 curchar;
12692     UV start, end;	/* End points of code point ranges */
12693     SV* result_string;
12694     char *save_end, *save_parse;
12695     SV* final;
12696     STRLEN len;
12697     regnode* node;
12698     AV* stack;
12699     const bool save_fold = FOLD;
12700 
12701     GET_RE_DEBUG_FLAGS_DECL;
12702 
12703     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12704 
12705     if (LOC) {
12706         vFAIL("(?[...]) not valid in locale");
12707     }
12708     RExC_uni_semantics = 1;
12709 
12710     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12711      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12712      * call regclass to handle '[]' so as to not have to reinvent its parsing
12713      * rules here (throwing away the size it computes each time).  And, we exit
12714      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12715      * these things, we need to realize that something preceded by a backslash
12716      * is escaped, so we have to keep track of backslashes */
12717     if (SIZE_ONLY) {
12718         UV depth = 0; /* how many nested (?[...]) constructs */
12719 
12720         Perl_ck_warner_d(aTHX_
12721             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12722             "The regex_sets feature is experimental" REPORT_LOCATION,
12723                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12724                 UTF8fARG(UTF,
12725                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12726                          RExC_precomp + (RExC_parse - RExC_precomp)));
12727 
12728         while (RExC_parse < RExC_end) {
12729             SV* current = NULL;
12730             RExC_parse = regpatws(pRExC_state, RExC_parse,
12731                                 TRUE); /* means recognize comments */
12732             switch (*RExC_parse) {
12733                 case '?':
12734                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12735                     /* FALL THROUGH */
12736                 default:
12737                     break;
12738                 case '\\':
12739                     /* Skip the next byte (which could cause us to end up in
12740                      * the middle of a UTF-8 character, but since none of those
12741                      * are confusable with anything we currently handle in this
12742                      * switch (invariants all), it's safe.  We'll just hit the
12743                      * default: case next time and keep on incrementing until
12744                      * we find one of the invariants we do handle. */
12745                     RExC_parse++;
12746                     break;
12747                 case '[':
12748                 {
12749                     /* If this looks like it is a [:posix:] class, leave the
12750                      * parse pointer at the '[' to fool regclass() into
12751                      * thinking it is part of a '[[:posix:]]'.  That function
12752                      * will use strict checking to force a syntax error if it
12753                      * doesn't work out to a legitimate class */
12754                     bool is_posix_class
12755                                     = could_it_be_a_POSIX_class(pRExC_state);
12756                     if (! is_posix_class) {
12757                         RExC_parse++;
12758                     }
12759 
12760                     /* regclass() can only return RESTART_UTF8 if multi-char
12761                        folds are allowed.  */
12762                     if (!regclass(pRExC_state, flagp,depth+1,
12763                                   is_posix_class, /* parse the whole char
12764                                                      class only if not a
12765                                                      posix class */
12766                                   FALSE, /* don't allow multi-char folds */
12767                                   TRUE, /* silence non-portable warnings. */
12768                                   &current))
12769                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12770                               (UV) *flagp);
12771 
12772                     /* function call leaves parse pointing to the ']', except
12773                      * if we faked it */
12774                     if (is_posix_class) {
12775                         RExC_parse--;
12776                     }
12777 
12778                     SvREFCNT_dec(current);   /* In case it returned something */
12779                     break;
12780                 }
12781 
12782                 case ']':
12783                     if (depth--) break;
12784                     RExC_parse++;
12785                     if (RExC_parse < RExC_end
12786                         && *RExC_parse == ')')
12787                     {
12788                         node = reganode(pRExC_state, ANYOF, 0);
12789                         RExC_size += ANYOF_SKIP;
12790                         nextchar(pRExC_state);
12791                         Set_Node_Length(node,
12792                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12793                         return node;
12794                     }
12795                     goto no_close;
12796             }
12797             RExC_parse++;
12798         }
12799 
12800         no_close:
12801         FAIL("Syntax error in (?[...])");
12802     }
12803 
12804     /* Pass 2 only after this.  Everything in this construct is a
12805      * metacharacter.  Operands begin with either a '\' (for an escape
12806      * sequence), or a '[' for a bracketed character class.  Any other
12807      * character should be an operator, or parenthesis for grouping.  Both
12808      * types of operands are handled by calling regclass() to parse them.  It
12809      * is called with a parameter to indicate to return the computed inversion
12810      * list.  The parsing here is implemented via a stack.  Each entry on the
12811      * stack is a single character representing one of the operators, or the
12812      * '('; or else a pointer to an operand inversion list. */
12813 
12814 #define IS_OPERAND(a)  (! SvIOK(a))
12815 
12816     /* The stack starts empty.  It is a syntax error if the first thing parsed
12817      * is a binary operator; everything else is pushed on the stack.  When an
12818      * operand is parsed, the top of the stack is examined.  If it is a binary
12819      * operator, the item before it should be an operand, and both are replaced
12820      * by the result of doing that operation on the new operand and the one on
12821      * the stack.   Thus a sequence of binary operands is reduced to a single
12822      * one before the next one is parsed.
12823      *
12824      * A unary operator may immediately follow a binary in the input, for
12825      * example
12826      *      [a] + ! [b]
12827      * When an operand is parsed and the top of the stack is a unary operator,
12828      * the operation is performed, and then the stack is rechecked to see if
12829      * this new operand is part of a binary operation; if so, it is handled as
12830      * above.
12831      *
12832      * A '(' is simply pushed on the stack; it is valid only if the stack is
12833      * empty, or the top element of the stack is an operator or another '('
12834      * (for which the parenthesized expression will become an operand).  By the
12835      * time the corresponding ')' is parsed everything in between should have
12836      * been parsed and evaluated to a single operand (or else is a syntax
12837      * error), and is handled as a regular operand */
12838 
12839     sv_2mortal((SV *)(stack = newAV()));
12840 
12841     while (RExC_parse < RExC_end) {
12842         I32 top_index = av_tindex(stack);
12843         SV** top_ptr;
12844         SV* current = NULL;
12845 
12846         /* Skip white space */
12847         RExC_parse = regpatws(pRExC_state, RExC_parse,
12848                                 TRUE); /* means recognize comments */
12849         if (RExC_parse >= RExC_end) {
12850             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12851         }
12852         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12853             break;
12854         }
12855 
12856         switch (curchar) {
12857 
12858             case '?':
12859                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12860                                                safely subtract 1 from
12861                                                RExC_parse in the next clause.
12862                                                If we have something on the
12863                                                stack, we have parsed something
12864                                              */
12865                     && UCHARAT(RExC_parse - 1) == '('
12866                     && RExC_parse < RExC_end)
12867                 {
12868                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12869                      * This happens when we have some thing like
12870                      *
12871                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12872                      *   ...
12873                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12874                      *
12875                      * Here we would be handling the interpolated
12876                      * '$thai_or_lao'.  We handle this by a recursive call to
12877                      * ourselves which returns the inversion list the
12878                      * interpolated expression evaluates to.  We use the flags
12879                      * from the interpolated pattern. */
12880                     U32 save_flags = RExC_flags;
12881                     const char * const save_parse = ++RExC_parse;
12882 
12883                     parse_lparen_question_flags(pRExC_state);
12884 
12885                     if (RExC_parse == save_parse  /* Makes sure there was at
12886                                                      least one flag (or this
12887                                                      embedding wasn't compiled)
12888                                                    */
12889                         || RExC_parse >= RExC_end - 4
12890                         || UCHARAT(RExC_parse) != ':'
12891                         || UCHARAT(++RExC_parse) != '('
12892                         || UCHARAT(++RExC_parse) != '?'
12893                         || UCHARAT(++RExC_parse) != '[')
12894                     {
12895 
12896                         /* In combination with the above, this moves the
12897                          * pointer to the point just after the first erroneous
12898                          * character (or if there are no flags, to where they
12899                          * should have been) */
12900                         if (RExC_parse >= RExC_end - 4) {
12901                             RExC_parse = RExC_end;
12902                         }
12903                         else if (RExC_parse != save_parse) {
12904                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12905                         }
12906                         vFAIL("Expecting '(?flags:(?[...'");
12907                     }
12908                     RExC_parse++;
12909                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12910                                                     depth+1, oregcomp_parse);
12911 
12912                     /* Here, 'current' contains the embedded expression's
12913                      * inversion list, and RExC_parse points to the trailing
12914                      * ']'; the next character should be the ')' which will be
12915                      * paired with the '(' that has been put on the stack, so
12916                      * the whole embedded expression reduces to '(operand)' */
12917                     RExC_parse++;
12918 
12919                     RExC_flags = save_flags;
12920                     goto handle_operand;
12921                 }
12922                 /* FALL THROUGH */
12923 
12924             default:
12925                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12926                 vFAIL("Unexpected character");
12927 
12928             case '\\':
12929                 /* regclass() can only return RESTART_UTF8 if multi-char
12930                    folds are allowed.  */
12931                 if (!regclass(pRExC_state, flagp,depth+1,
12932                               TRUE, /* means parse just the next thing */
12933                               FALSE, /* don't allow multi-char folds */
12934                               FALSE, /* don't silence non-portable warnings.  */
12935                               &current))
12936                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12937                           (UV) *flagp);
12938                 /* regclass() will return with parsing just the \ sequence,
12939                  * leaving the parse pointer at the next thing to parse */
12940                 RExC_parse--;
12941                 goto handle_operand;
12942 
12943             case '[':   /* Is a bracketed character class */
12944             {
12945                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12946 
12947                 if (! is_posix_class) {
12948                     RExC_parse++;
12949                 }
12950 
12951                 /* regclass() can only return RESTART_UTF8 if multi-char
12952                    folds are allowed.  */
12953                 if(!regclass(pRExC_state, flagp,depth+1,
12954                              is_posix_class, /* parse the whole char class
12955                                                 only if not a posix class */
12956                              FALSE, /* don't allow multi-char folds */
12957                              FALSE, /* don't silence non-portable warnings.  */
12958                              &current))
12959                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12960                           (UV) *flagp);
12961                 /* function call leaves parse pointing to the ']', except if we
12962                  * faked it */
12963                 if (is_posix_class) {
12964                     RExC_parse--;
12965                 }
12966 
12967                 goto handle_operand;
12968             }
12969 
12970             case '&':
12971             case '|':
12972             case '+':
12973             case '-':
12974             case '^':
12975                 if (top_index < 0
12976                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12977                     || ! IS_OPERAND(*top_ptr))
12978                 {
12979                     RExC_parse++;
12980                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12981                 }
12982                 av_push(stack, newSVuv(curchar));
12983                 break;
12984 
12985             case '!':
12986                 av_push(stack, newSVuv(curchar));
12987                 break;
12988 
12989             case '(':
12990                 if (top_index >= 0) {
12991                     top_ptr = av_fetch(stack, top_index, FALSE);
12992                     assert(top_ptr);
12993                     if (IS_OPERAND(*top_ptr)) {
12994                         RExC_parse++;
12995                         vFAIL("Unexpected '(' with no preceding operator");
12996                     }
12997                 }
12998                 av_push(stack, newSVuv(curchar));
12999                 break;
13000 
13001             case ')':
13002             {
13003                 SV* lparen;
13004                 if (top_index < 1
13005                     || ! (current = av_pop(stack))
13006                     || ! IS_OPERAND(current)
13007                     || ! (lparen = av_pop(stack))
13008                     || IS_OPERAND(lparen)
13009                     || SvUV(lparen) != '(')
13010                 {
13011                     SvREFCNT_dec(current);
13012                     RExC_parse++;
13013                     vFAIL("Unexpected ')'");
13014                 }
13015                 top_index -= 2;
13016                 SvREFCNT_dec_NN(lparen);
13017 
13018                 /* FALL THROUGH */
13019             }
13020 
13021               handle_operand:
13022 
13023                 /* Here, we have an operand to process, in 'current' */
13024 
13025                 if (top_index < 0) {    /* Just push if stack is empty */
13026                     av_push(stack, current);
13027                 }
13028                 else {
13029                     SV* top = av_pop(stack);
13030                     SV *prev = NULL;
13031                     char current_operator;
13032 
13033                     if (IS_OPERAND(top)) {
13034                         SvREFCNT_dec_NN(top);
13035                         SvREFCNT_dec_NN(current);
13036                         vFAIL("Operand with no preceding operator");
13037                     }
13038                     current_operator = (char) SvUV(top);
13039                     switch (current_operator) {
13040                         case '(':   /* Push the '(' back on followed by the new
13041                                        operand */
13042                             av_push(stack, top);
13043                             av_push(stack, current);
13044                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13045                                                    just after the 'break', so
13046                                                    it doesn't get wrongly freed
13047                                                  */
13048                             break;
13049 
13050                         case '!':
13051                             _invlist_invert(current);
13052 
13053                             /* Unlike binary operators, the top of the stack,
13054                              * now that this unary one has been popped off, may
13055                              * legally be an operator, and we now have operand
13056                              * for it. */
13057                             top_index--;
13058                             SvREFCNT_dec_NN(top);
13059                             goto handle_operand;
13060 
13061                         case '&':
13062                             prev = av_pop(stack);
13063                             _invlist_intersection(prev,
13064                                                    current,
13065                                                    &current);
13066                             av_push(stack, current);
13067                             break;
13068 
13069                         case '|':
13070                         case '+':
13071                             prev = av_pop(stack);
13072                             _invlist_union(prev, current, &current);
13073                             av_push(stack, current);
13074                             break;
13075 
13076                         case '-':
13077                             prev = av_pop(stack);;
13078                             _invlist_subtract(prev, current, &current);
13079                             av_push(stack, current);
13080                             break;
13081 
13082                         case '^':   /* The union minus the intersection */
13083                         {
13084                             SV* i = NULL;
13085                             SV* u = NULL;
13086                             SV* element;
13087 
13088                             prev = av_pop(stack);
13089                             _invlist_union(prev, current, &u);
13090                             _invlist_intersection(prev, current, &i);
13091                             /* _invlist_subtract will overwrite current
13092                                 without freeing what it already contains */
13093                             element = current;
13094                             _invlist_subtract(u, i, &current);
13095                             av_push(stack, current);
13096                             SvREFCNT_dec_NN(i);
13097                             SvREFCNT_dec_NN(u);
13098                             SvREFCNT_dec_NN(element);
13099                             break;
13100                         }
13101 
13102                         default:
13103                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13104                 }
13105                 SvREFCNT_dec_NN(top);
13106                 SvREFCNT_dec(prev);
13107             }
13108         }
13109 
13110         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13111     }
13112 
13113     if (av_tindex(stack) < 0   /* Was empty */
13114         || ((final = av_pop(stack)) == NULL)
13115         || ! IS_OPERAND(final)
13116         || av_tindex(stack) >= 0)  /* More left on stack */
13117     {
13118         vFAIL("Incomplete expression within '(?[ ])'");
13119     }
13120 
13121     /* Here, 'final' is the resultant inversion list from evaluating the
13122      * expression.  Return it if so requested */
13123     if (return_invlist) {
13124         *return_invlist = final;
13125         return END;
13126     }
13127 
13128     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13129      * expecting a string of ranges and individual code points */
13130     invlist_iterinit(final);
13131     result_string = newSVpvs("");
13132     while (invlist_iternext(final, &start, &end)) {
13133         if (start == end) {
13134             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13135         }
13136         else {
13137             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13138                                                      start,          end);
13139         }
13140     }
13141 
13142     save_parse = RExC_parse;
13143     RExC_parse = SvPV(result_string, len);
13144     save_end = RExC_end;
13145     RExC_end = RExC_parse + len;
13146 
13147     /* We turn off folding around the call, as the class we have constructed
13148      * already has all folding taken into consideration, and we don't want
13149      * regclass() to add to that */
13150     RExC_flags &= ~RXf_PMf_FOLD;
13151     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13152      */
13153     node = regclass(pRExC_state, flagp,depth+1,
13154                     FALSE, /* means parse the whole char class */
13155                     FALSE, /* don't allow multi-char folds */
13156                     TRUE, /* silence non-portable warnings.  The above may very
13157                              well have generated non-portable code points, but
13158                              they're valid on this machine */
13159                     NULL);
13160     if (!node)
13161         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13162                     PTR2UV(flagp));
13163     if (save_fold) {
13164         RExC_flags |= RXf_PMf_FOLD;
13165     }
13166     RExC_parse = save_parse + 1;
13167     RExC_end = save_end;
13168     SvREFCNT_dec_NN(final);
13169     SvREFCNT_dec_NN(result_string);
13170 
13171     nextchar(pRExC_state);
13172     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13173     return node;
13174 }
13175 #undef IS_OPERAND
13176 
13177 /* The names of properties whose definitions are not known at compile time are
13178  * stored in this SV, after a constant heading.  So if the length has been
13179  * changed since initialization, then there is a run-time definition. */
13180 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13181                                         (SvCUR(listsv) != initial_listsv_len)
13182 
13183 STATIC regnode *
13184 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13185                  const bool stop_at_1,  /* Just parse the next thing, don't
13186                                            look for a full character class */
13187                  bool allow_multi_folds,
13188                  const bool silence_non_portable,   /* Don't output warnings
13189                                                        about too large
13190                                                        characters */
13191                  SV** ret_invlist)  /* Return an inversion list, not a node */
13192 {
13193     /* parse a bracketed class specification.  Most of these will produce an
13194      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13195      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13196      * under /i with multi-character folds: it will be rewritten following the
13197      * paradigm of this example, where the <multi-fold>s are characters which
13198      * fold to multiple character sequences:
13199      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13200      * gets effectively rewritten as:
13201      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13202      * reg() gets called (recursively) on the rewritten version, and this
13203      * function will return what it constructs.  (Actually the <multi-fold>s
13204      * aren't physically removed from the [abcdefghi], it's just that they are
13205      * ignored in the recursion by means of a flag:
13206      * <RExC_in_multi_char_class>.)
13207      *
13208      * ANYOF nodes contain a bit map for the first 256 characters, with the
13209      * corresponding bit set if that character is in the list.  For characters
13210      * above 255, a range list or swash is used.  There are extra bits for \w,
13211      * etc. in locale ANYOFs, as what these match is not determinable at
13212      * compile time
13213      *
13214      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13215      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13216      */
13217 
13218     dVAR;
13219     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13220     IV range = 0;
13221     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13222     regnode *ret;
13223     STRLEN numlen;
13224     IV namedclass = OOB_NAMEDCLASS;
13225     char *rangebegin = NULL;
13226     bool need_class = 0;
13227     SV *listsv = NULL;
13228     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13229 				      than just initialized.  */
13230     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13231     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13232                                extended beyond the Latin1 range.  These have to
13233                                be kept separate from other code points for much
13234                                of this function because their handling  is
13235                                different under /i, and for most classes under
13236                                /d as well */
13237     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13238                                separate for a while from the non-complemented
13239                                versions because of complications with /d
13240                                matching */
13241     UV element_count = 0;   /* Number of distinct elements in the class.
13242 			       Optimizations may be possible if this is tiny */
13243     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13244                                        character; used under /i */
13245     UV n;
13246     char * stop_ptr = RExC_end;    /* where to stop parsing */
13247     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13248                                                    space? */
13249     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13250 
13251     /* Unicode properties are stored in a swash; this holds the current one
13252      * being parsed.  If this swash is the only above-latin1 component of the
13253      * character class, an optimization is to pass it directly on to the
13254      * execution engine.  Otherwise, it is set to NULL to indicate that there
13255      * are other things in the class that have to be dealt with at execution
13256      * time */
13257     SV* swash = NULL;		/* Code points that match \p{} \P{} */
13258 
13259     /* Set if a component of this character class is user-defined; just passed
13260      * on to the engine */
13261     bool has_user_defined_property = FALSE;
13262 
13263     /* inversion list of code points this node matches only when the target
13264      * string is in UTF-8.  (Because is under /d) */
13265     SV* depends_list = NULL;
13266 
13267     /* Inversion list of code points this node matches regardless of things
13268      * like locale, folding, utf8ness of the target string */
13269     SV* cp_list = NULL;
13270 
13271     /* Like cp_list, but code points on this list need to be checked for things
13272      * that fold to/from them under /i */
13273     SV* cp_foldable_list = NULL;
13274 
13275     /* Like cp_list, but code points on this list are valid only when the
13276      * runtime locale is UTF-8 */
13277     SV* only_utf8_locale_list = NULL;
13278 
13279 #ifdef EBCDIC
13280     /* In a range, counts how many 0-2 of the ends of it came from literals,
13281      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13282     UV literal_endpoint = 0;
13283 #endif
13284     bool invert = FALSE;    /* Is this class to be complemented */
13285 
13286     bool warn_super = ALWAYS_WARN_SUPER;
13287 
13288     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13289         case we need to change the emitted regop to an EXACT. */
13290     const char * orig_parse = RExC_parse;
13291     const SSize_t orig_size = RExC_size;
13292     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13293     GET_RE_DEBUG_FLAGS_DECL;
13294 
13295     PERL_ARGS_ASSERT_REGCLASS;
13296 #ifndef DEBUGGING
13297     PERL_UNUSED_ARG(depth);
13298 #endif
13299 
13300     DEBUG_PARSE("clas");
13301 
13302     /* Assume we are going to generate an ANYOF node. */
13303     ret = reganode(pRExC_state, ANYOF, 0);
13304 
13305     if (SIZE_ONLY) {
13306 	RExC_size += ANYOF_SKIP;
13307 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13308     }
13309     else {
13310         ANYOF_FLAGS(ret) = 0;
13311 
13312  	RExC_emit += ANYOF_SKIP;
13313 	listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13314 	initial_listsv_len = SvCUR(listsv);
13315         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13316     }
13317 
13318     if (skip_white) {
13319         RExC_parse = regpatws(pRExC_state, RExC_parse,
13320                               FALSE /* means don't recognize comments */);
13321     }
13322 
13323     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
13324 	RExC_parse++;
13325         invert = TRUE;
13326         allow_multi_folds = FALSE;
13327         RExC_naughty++;
13328         if (skip_white) {
13329             RExC_parse = regpatws(pRExC_state, RExC_parse,
13330                                   FALSE /* means don't recognize comments */);
13331         }
13332     }
13333 
13334     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13335     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13336 	const char *s = RExC_parse;
13337 	const char  c = *s++;
13338 
13339 	while (isWORDCHAR(*s))
13340 	    s++;
13341 	if (*s && c == *s && s[1] == ']') {
13342 	    SAVEFREESV(RExC_rx_sv);
13343 	    ckWARN3reg(s+2,
13344 		       "POSIX syntax [%c %c] belongs inside character classes",
13345 		       c, c);
13346 	    (void)ReREFCNT_inc(RExC_rx_sv);
13347 	}
13348     }
13349 
13350     /* If the caller wants us to just parse a single element, accomplish this
13351      * by faking the loop ending condition */
13352     if (stop_at_1 && RExC_end > RExC_parse) {
13353         stop_ptr = RExC_parse + 1;
13354     }
13355 
13356     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13357     if (UCHARAT(RExC_parse) == ']')
13358 	goto charclassloop;
13359 
13360 parseit:
13361     while (1) {
13362         if  (RExC_parse >= stop_ptr) {
13363             break;
13364         }
13365 
13366         if (skip_white) {
13367             RExC_parse = regpatws(pRExC_state, RExC_parse,
13368                                   FALSE /* means don't recognize comments */);
13369         }
13370 
13371         if  (UCHARAT(RExC_parse) == ']') {
13372             break;
13373         }
13374 
13375     charclassloop:
13376 
13377 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13378         save_value = value;
13379         save_prevvalue = prevvalue;
13380 
13381 	if (!range) {
13382 	    rangebegin = RExC_parse;
13383 	    element_count++;
13384 	}
13385 	if (UTF) {
13386 	    value = utf8n_to_uvchr((U8*)RExC_parse,
13387 				   RExC_end - RExC_parse,
13388 				   &numlen, UTF8_ALLOW_DEFAULT);
13389 	    RExC_parse += numlen;
13390 	}
13391 	else
13392 	    value = UCHARAT(RExC_parse++);
13393 
13394         if (value == '['
13395             && RExC_parse < RExC_end
13396             && POSIXCC(UCHARAT(RExC_parse)))
13397         {
13398             namedclass = regpposixcc(pRExC_state, value, strict);
13399         }
13400         else if (value == '\\') {
13401 	    if (UTF) {
13402 		value = utf8n_to_uvchr((U8*)RExC_parse,
13403 				   RExC_end - RExC_parse,
13404 				   &numlen, UTF8_ALLOW_DEFAULT);
13405 		RExC_parse += numlen;
13406 	    }
13407 	    else
13408 		value = UCHARAT(RExC_parse++);
13409 
13410 	    /* Some compilers cannot handle switching on 64-bit integer
13411 	     * values, therefore value cannot be an UV.  Yes, this will
13412 	     * be a problem later if we want switch on Unicode.
13413 	     * A similar issue a little bit later when switching on
13414 	     * namedclass. --jhi */
13415 
13416             /* If the \ is escaping white space when white space is being
13417              * skipped, it means that that white space is wanted literally, and
13418              * is already in 'value'.  Otherwise, need to translate the escape
13419              * into what it signifies. */
13420             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13421 
13422 	    case 'w':	namedclass = ANYOF_WORDCHAR;	break;
13423 	    case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
13424 	    case 's':	namedclass = ANYOF_SPACE;	break;
13425 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
13426 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
13427 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
13428 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
13429 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
13430 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
13431 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
13432             case 'N':  /* Handle \N{NAME} in class */
13433                 {
13434                     /* We only pay attention to the first char of
13435                     multichar strings being returned. I kinda wonder
13436                     if this makes sense as it does change the behaviour
13437                     from earlier versions, OTOH that behaviour was broken
13438                     as well. */
13439                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13440                                       TRUE, /* => charclass */
13441                                       strict))
13442                     {
13443                         if (*flagp & RESTART_UTF8)
13444                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13445                         goto parseit;
13446                     }
13447                 }
13448                 break;
13449 	    case 'p':
13450 	    case 'P':
13451 		{
13452 		char *e;
13453 
13454                 /* We will handle any undefined properties ourselves */
13455                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13456                                        /* And we actually would prefer to get
13457                                         * the straight inversion list of the
13458                                         * swash, since we will be accessing it
13459                                         * anyway, to save a little time */
13460                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13461 
13462 		if (RExC_parse >= RExC_end)
13463 		    vFAIL2("Empty \\%c{}", (U8)value);
13464 		if (*RExC_parse == '{') {
13465 		    const U8 c = (U8)value;
13466 		    e = strchr(RExC_parse++, '}');
13467                     if (!e)
13468                         vFAIL2("Missing right brace on \\%c{}", c);
13469 		    while (isSPACE(UCHARAT(RExC_parse)))
13470 		        RExC_parse++;
13471                     if (e == RExC_parse)
13472                         vFAIL2("Empty \\%c{}", c);
13473 		    n = e - RExC_parse;
13474 		    while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13475 		        n--;
13476 		}
13477 		else {
13478 		    e = RExC_parse;
13479 		    n = 1;
13480 		}
13481 		if (!SIZE_ONLY) {
13482                     SV* invlist;
13483                     char* formatted;
13484                     char* name;
13485 
13486 		    if (UCHARAT(RExC_parse) == '^') {
13487 			 RExC_parse++;
13488 			 n--;
13489                          /* toggle.  (The rhs xor gets the single bit that
13490                           * differs between P and p; the other xor inverts just
13491                           * that bit) */
13492                          value ^= 'P' ^ 'p';
13493 
13494 			 while (isSPACE(UCHARAT(RExC_parse))) {
13495 			      RExC_parse++;
13496 			      n--;
13497 			 }
13498 		    }
13499                     /* Try to get the definition of the property into
13500                      * <invlist>.  If /i is in effect, the effective property
13501                      * will have its name be <__NAME_i>.  The design is
13502                      * discussed in commit
13503                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13504                     formatted = Perl_form(aTHX_
13505                                           "%s%.*s%s\n",
13506                                           (FOLD) ? "__" : "",
13507                                           (int)n,
13508                                           RExC_parse,
13509                                           (FOLD) ? "_i" : ""
13510                                 );
13511                     name = savepvn(formatted, strlen(formatted));
13512 
13513                     /* Look up the property name, and get its swash and
13514                      * inversion list, if the property is found  */
13515                     if (swash) {
13516                         SvREFCNT_dec_NN(swash);
13517                     }
13518                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13519                                              1, /* binary */
13520                                              0, /* not tr/// */
13521                                              NULL, /* No inversion list */
13522                                              &swash_init_flags
13523                                             );
13524                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13525                         if (swash) {
13526                             SvREFCNT_dec_NN(swash);
13527                             swash = NULL;
13528                         }
13529 
13530                         /* Here didn't find it.  It could be a user-defined
13531                          * property that will be available at run-time.  If we
13532                          * accept only compile-time properties, is an error;
13533                          * otherwise add it to the list for run-time look up */
13534                         if (ret_invlist) {
13535                             RExC_parse = e + 1;
13536                             vFAIL2utf8f(
13537                                 "Property '%"UTF8f"' is unknown",
13538                                 UTF8fARG(UTF, n, name));
13539                         }
13540                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13541                                         (value == 'p' ? '+' : '!'),
13542                                         UTF8fARG(UTF, n, name));
13543                         has_user_defined_property = TRUE;
13544 
13545                         /* We don't know yet, so have to assume that the
13546                          * property could match something in the Latin1 range,
13547                          * hence something that isn't utf8.  Note that this
13548                          * would cause things in <depends_list> to match
13549                          * inappropriately, except that any \p{}, including
13550                          * this one forces Unicode semantics, which means there
13551                          * is no <depends_list> */
13552                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13553                     }
13554                     else {
13555 
13556                         /* Here, did get the swash and its inversion list.  If
13557                          * the swash is from a user-defined property, then this
13558                          * whole character class should be regarded as such */
13559                         if (swash_init_flags
13560                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13561                         {
13562                             has_user_defined_property = TRUE;
13563                         }
13564                         else if
13565                             /* We warn on matching an above-Unicode code point
13566                              * if the match would return true, except don't
13567                              * warn for \p{All}, which has exactly one element
13568                              * = 0 */
13569                             (_invlist_contains_cp(invlist, 0x110000)
13570                                 && (! (_invlist_len(invlist) == 1
13571                                        && *invlist_array(invlist) == 0)))
13572                         {
13573                             warn_super = TRUE;
13574                         }
13575 
13576 
13577                         /* Invert if asking for the complement */
13578                         if (value == 'P') {
13579 			    _invlist_union_complement_2nd(properties,
13580                                                           invlist,
13581                                                           &properties);
13582 
13583                             /* The swash can't be used as-is, because we've
13584 			     * inverted things; delay removing it to here after
13585 			     * have copied its invlist above */
13586                             SvREFCNT_dec_NN(swash);
13587                             swash = NULL;
13588                         }
13589                         else {
13590                             _invlist_union(properties, invlist, &properties);
13591 			}
13592 		    }
13593 		    Safefree(name);
13594 		}
13595 		RExC_parse = e + 1;
13596                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13597                                                 named */
13598 
13599 		/* \p means they want Unicode semantics */
13600 		RExC_uni_semantics = 1;
13601 		}
13602 		break;
13603 	    case 'n':	value = '\n';			break;
13604 	    case 'r':	value = '\r';			break;
13605 	    case 't':	value = '\t';			break;
13606 	    case 'f':	value = '\f';			break;
13607 	    case 'b':	value = '\b';			break;
13608 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
13609 	    case 'a':	value = '\a';                   break;
13610 	    case 'o':
13611 		RExC_parse--;	/* function expects to be pointed at the 'o' */
13612 		{
13613 		    const char* error_msg;
13614 		    bool valid = grok_bslash_o(&RExC_parse,
13615 					       &value,
13616 					       &error_msg,
13617                                                SIZE_ONLY,   /* warnings in pass
13618                                                                1 only */
13619                                                strict,
13620                                                silence_non_portable,
13621                                                UTF);
13622 		    if (! valid) {
13623 			vFAIL(error_msg);
13624 		    }
13625 		}
13626 		if (PL_encoding && value < 0x100) {
13627 		    goto recode_encoding;
13628 		}
13629 		break;
13630 	    case 'x':
13631 		RExC_parse--;	/* function expects to be pointed at the 'x' */
13632 		{
13633 		    const char* error_msg;
13634 		    bool valid = grok_bslash_x(&RExC_parse,
13635 					       &value,
13636 					       &error_msg,
13637 					       TRUE, /* Output warnings */
13638                                                strict,
13639                                                silence_non_portable,
13640                                                UTF);
13641                     if (! valid) {
13642 			vFAIL(error_msg);
13643 		    }
13644 		}
13645 		if (PL_encoding && value < 0x100)
13646 		    goto recode_encoding;
13647 		break;
13648 	    case 'c':
13649 		value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13650 		break;
13651 	    case '0': case '1': case '2': case '3': case '4':
13652 	    case '5': case '6': case '7':
13653 		{
13654 		    /* Take 1-3 octal digits */
13655 		    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13656                     numlen = (strict) ? 4 : 3;
13657                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13658 		    RExC_parse += numlen;
13659                     if (numlen != 3) {
13660                         if (strict) {
13661                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13662                             vFAIL("Need exactly 3 octal digits");
13663                         }
13664                         else if (! SIZE_ONLY /* like \08, \178 */
13665                                  && numlen < 3
13666                                  && RExC_parse < RExC_end
13667                                  && isDIGIT(*RExC_parse)
13668                                  && ckWARN(WARN_REGEXP))
13669                         {
13670                             SAVEFREESV(RExC_rx_sv);
13671                             reg_warn_non_literal_string(
13672                                  RExC_parse + 1,
13673                                  form_short_octal_warning(RExC_parse, numlen));
13674                             (void)ReREFCNT_inc(RExC_rx_sv);
13675                         }
13676                     }
13677 		    if (PL_encoding && value < 0x100)
13678 			goto recode_encoding;
13679 		    break;
13680 		}
13681 	    recode_encoding:
13682 		if (! RExC_override_recoding) {
13683 		    SV* enc = PL_encoding;
13684 		    value = reg_recode((const char)(U8)value, &enc);
13685 		    if (!enc) {
13686                         if (strict) {
13687                             vFAIL("Invalid escape in the specified encoding");
13688                         }
13689                         else if (SIZE_ONLY) {
13690                             ckWARNreg(RExC_parse,
13691 				  "Invalid escape in the specified encoding");
13692                         }
13693                     }
13694 		    break;
13695 		}
13696 	    default:
13697 		/* Allow \_ to not give an error */
13698 		if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13699                     if (strict) {
13700                         vFAIL2("Unrecognized escape \\%c in character class",
13701                                (int)value);
13702                     }
13703                     else {
13704                         SAVEFREESV(RExC_rx_sv);
13705                         ckWARN2reg(RExC_parse,
13706                             "Unrecognized escape \\%c in character class passed through",
13707                             (int)value);
13708                         (void)ReREFCNT_inc(RExC_rx_sv);
13709                     }
13710 		}
13711 		break;
13712 	    }   /* End of switch on char following backslash */
13713 	} /* end of handling backslash escape sequences */
13714 #ifdef EBCDIC
13715         else
13716             literal_endpoint++;
13717 #endif
13718 
13719         /* Here, we have the current token in 'value' */
13720 
13721 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13722             U8 classnum;
13723 
13724 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13725 	     * literal, as is the character that began the false range, i.e.
13726 	     * the 'a' in the examples */
13727 	    if (range) {
13728 		if (!SIZE_ONLY) {
13729 		    const int w = (RExC_parse >= rangebegin)
13730                                   ? RExC_parse - rangebegin
13731                                   : 0;
13732                     if (strict) {
13733                         vFAIL2utf8f(
13734                             "False [] range \"%"UTF8f"\"",
13735                             UTF8fARG(UTF, w, rangebegin));
13736                     }
13737                     else {
13738                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13739                         ckWARN2reg(RExC_parse,
13740                             "False [] range \"%"UTF8f"\"",
13741                             UTF8fARG(UTF, w, rangebegin));
13742                         (void)ReREFCNT_inc(RExC_rx_sv);
13743                         cp_list = add_cp_to_invlist(cp_list, '-');
13744                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13745                                                              prevvalue);
13746                     }
13747 		}
13748 
13749 		range = 0; /* this was not a true range */
13750                 element_count += 2; /* So counts for three values */
13751 	    }
13752 
13753             classnum = namedclass_to_classnum(namedclass);
13754 
13755 	    if (LOC && namedclass < ANYOF_POSIXL_MAX
13756 #ifndef HAS_ISASCII
13757                 && classnum != _CC_ASCII
13758 #endif
13759             ) {
13760                 /* What the Posix classes (like \w, [:space:]) match in locale
13761                  * isn't knowable under locale until actual match time.  Room
13762                  * must be reserved (one time per outer bracketed class) to
13763                  * store such classes.  The space will contain a bit for each
13764                  * named class that is to be matched against.  This isn't
13765                  * needed for \p{} and pseudo-classes, as they are not affected
13766                  * by locale, and hence are dealt with separately */
13767                 if (! need_class) {
13768                     need_class = 1;
13769                     if (SIZE_ONLY) {
13770                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13771                     }
13772                     else {
13773                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13774                     }
13775                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13776                     ANYOF_POSIXL_ZERO(ret);
13777                 }
13778 
13779                 /* See if it already matches the complement of this POSIX
13780                  * class */
13781                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13782                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13783                                                             ? -1
13784                                                             : 1)))
13785                 {
13786                     posixl_matches_all = TRUE;
13787                     break;  /* No need to continue.  Since it matches both
13788                                e.g., \w and \W, it matches everything, and the
13789                                bracketed class can be optimized into qr/./s */
13790                 }
13791 
13792                 /* Add this class to those that should be checked at runtime */
13793                 ANYOF_POSIXL_SET(ret, namedclass);
13794 
13795                 /* The above-Latin1 characters are not subject to locale rules.
13796                  * Just add them, in the second pass, to the
13797                  * unconditionally-matched list */
13798                 if (! SIZE_ONLY) {
13799                     SV* scratch_list = NULL;
13800 
13801                     /* Get the list of the above-Latin1 code points this
13802                      * matches */
13803                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13804                                           PL_XPosix_ptrs[classnum],
13805 
13806                                           /* Odd numbers are complements, like
13807                                            * NDIGIT, NASCII, ... */
13808                                           namedclass % 2 != 0,
13809                                           &scratch_list);
13810                     /* Checking if 'cp_list' is NULL first saves an extra
13811                      * clone.  Its reference count will be decremented at the
13812                      * next union, etc, or if this is the only instance, at the
13813                      * end of the routine */
13814                     if (! cp_list) {
13815                         cp_list = scratch_list;
13816                     }
13817                     else {
13818                         _invlist_union(cp_list, scratch_list, &cp_list);
13819                         SvREFCNT_dec_NN(scratch_list);
13820                     }
13821                     continue;   /* Go get next character */
13822                 }
13823             }
13824             else if (! SIZE_ONLY) {
13825 
13826                 /* Here, not in pass1 (in that pass we skip calculating the
13827                  * contents of this class), and is /l, or is a POSIX class for
13828                  * which /l doesn't matter (or is a Unicode property, which is
13829                  * skipped here). */
13830                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13831                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13832 
13833                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13834                          * nor /l make a difference in what these match,
13835                          * therefore we just add what they match to cp_list. */
13836                         if (classnum != _CC_VERTSPACE) {
13837                             assert(   namedclass == ANYOF_HORIZWS
13838                                    || namedclass == ANYOF_NHORIZWS);
13839 
13840                             /* It turns out that \h is just a synonym for
13841                              * XPosixBlank */
13842                             classnum = _CC_BLANK;
13843                         }
13844 
13845                         _invlist_union_maybe_complement_2nd(
13846                                 cp_list,
13847                                 PL_XPosix_ptrs[classnum],
13848                                 namedclass % 2 != 0,    /* Complement if odd
13849                                                           (NHORIZWS, NVERTWS)
13850                                                         */
13851                                 &cp_list);
13852                     }
13853                 }
13854                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13855                            complement and use nposixes */
13856                     SV** posixes_ptr = namedclass % 2 == 0
13857                                        ? &posixes
13858                                        : &nposixes;
13859                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13860                     _invlist_union_maybe_complement_2nd(
13861                                                      *posixes_ptr,
13862                                                      *source_ptr,
13863                                                      namedclass % 2 != 0,
13864                                                      posixes_ptr);
13865                 }
13866                 continue;   /* Go get next character */
13867 	    }
13868 	} /* end of namedclass \blah */
13869 
13870         /* Here, we have a single value.  If 'range' is set, it is the ending
13871          * of a range--check its validity.  Later, we will handle each
13872          * individual code point in the range.  If 'range' isn't set, this
13873          * could be the beginning of a range, so check for that by looking
13874          * ahead to see if the next real character to be processed is the range
13875          * indicator--the minus sign */
13876 
13877         if (skip_white) {
13878             RExC_parse = regpatws(pRExC_state, RExC_parse,
13879                                 FALSE /* means don't recognize comments */);
13880         }
13881 
13882 	if (range) {
13883 	    if (prevvalue > value) /* b-a */ {
13884 		const int w = RExC_parse - rangebegin;
13885                 vFAIL2utf8f(
13886                     "Invalid [] range \"%"UTF8f"\"",
13887                     UTF8fARG(UTF, w, rangebegin));
13888 		range = 0; /* not a valid range */
13889 	    }
13890 	}
13891 	else {
13892             prevvalue = value; /* save the beginning of the potential range */
13893             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13894                 && *RExC_parse == '-')
13895             {
13896                 char* next_char_ptr = RExC_parse + 1;
13897                 if (skip_white) {   /* Get the next real char after the '-' */
13898                     next_char_ptr = regpatws(pRExC_state,
13899                                              RExC_parse + 1,
13900                                              FALSE); /* means don't recognize
13901                                                         comments */
13902                 }
13903 
13904                 /* If the '-' is at the end of the class (just before the ']',
13905                  * it is a literal minus; otherwise it is a range */
13906                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13907                     RExC_parse = next_char_ptr;
13908 
13909                     /* a bad range like \w-, [:word:]- ? */
13910                     if (namedclass > OOB_NAMEDCLASS) {
13911                         if (strict || ckWARN(WARN_REGEXP)) {
13912                             const int w =
13913                                 RExC_parse >= rangebegin ?
13914                                 RExC_parse - rangebegin : 0;
13915                             if (strict) {
13916                                 vFAIL4("False [] range \"%*.*s\"",
13917                                     w, w, rangebegin);
13918                             }
13919                             else {
13920                                 vWARN4(RExC_parse,
13921                                     "False [] range \"%*.*s\"",
13922                                     w, w, rangebegin);
13923                             }
13924                         }
13925                         if (!SIZE_ONLY) {
13926                             cp_list = add_cp_to_invlist(cp_list, '-');
13927                         }
13928                         element_count++;
13929                     } else
13930                         range = 1;	/* yeah, it's a range! */
13931                     continue;	/* but do it the next time */
13932                 }
13933 	    }
13934 	}
13935 
13936         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13937          * if not */
13938 
13939 	/* non-Latin1 code point implies unicode semantics.  Must be set in
13940 	 * pass1 so is there for the whole of pass 2 */
13941 	if (value > 255) {
13942 	    RExC_uni_semantics = 1;
13943 	}
13944 
13945         /* Ready to process either the single value, or the completed range.
13946          * For single-valued non-inverted ranges, we consider the possibility
13947          * of multi-char folds.  (We made a conscious decision to not do this
13948          * for the other cases because it can often lead to non-intuitive
13949          * results.  For example, you have the peculiar case that:
13950          *  "s s" =~ /^[^\xDF]+$/i => Y
13951          *  "ss"  =~ /^[^\xDF]+$/i => N
13952          *
13953          * See [perl #89750] */
13954         if (FOLD && allow_multi_folds && value == prevvalue) {
13955             if (value == LATIN_SMALL_LETTER_SHARP_S
13956                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13957                                                         value)))
13958             {
13959                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13960 
13961                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13962                 STRLEN foldlen;
13963 
13964                 UV folded = _to_uni_fold_flags(
13965                                 value,
13966                                 foldbuf,
13967                                 &foldlen,
13968                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13969                                                    ? FOLD_FLAGS_NOMIX_ASCII
13970                                                    : 0)
13971                                 );
13972 
13973                 /* Here, <folded> should be the first character of the
13974                  * multi-char fold of <value>, with <foldbuf> containing the
13975                  * whole thing.  But, if this fold is not allowed (because of
13976                  * the flags), <fold> will be the same as <value>, and should
13977                  * be processed like any other character, so skip the special
13978                  * handling */
13979                 if (folded != value) {
13980 
13981                     /* Skip if we are recursed, currently parsing the class
13982                      * again.  Otherwise add this character to the list of
13983                      * multi-char folds. */
13984                     if (! RExC_in_multi_char_class) {
13985                         AV** this_array_ptr;
13986                         AV* this_array;
13987                         STRLEN cp_count = utf8_length(foldbuf,
13988                                                       foldbuf + foldlen);
13989                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13990 
13991                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13992 
13993 
13994                         if (! multi_char_matches) {
13995                             multi_char_matches = newAV();
13996                         }
13997 
13998                         /* <multi_char_matches> is actually an array of arrays.
13999                          * There will be one or two top-level elements: [2],
14000                          * and/or [3].  The [2] element is an array, each
14001                          * element thereof is a character which folds to TWO
14002                          * characters; [3] is for folds to THREE characters.
14003                          * (Unicode guarantees a maximum of 3 characters in any
14004                          * fold.)  When we rewrite the character class below,
14005                          * we will do so such that the longest folds are
14006                          * written first, so that it prefers the longest
14007                          * matching strings first.  This is done even if it
14008                          * turns out that any quantifier is non-greedy, out of
14009                          * programmer laziness.  Tom Christiansen has agreed
14010                          * that this is ok.  This makes the test for the
14011                          * ligature 'ffi' come before the test for 'ff' */
14012                         if (av_exists(multi_char_matches, cp_count)) {
14013                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14014                                                              cp_count, FALSE);
14015                             this_array = *this_array_ptr;
14016                         }
14017                         else {
14018                             this_array = newAV();
14019                             av_store(multi_char_matches, cp_count,
14020                                      (SV*) this_array);
14021                         }
14022                         av_push(this_array, multi_fold);
14023                     }
14024 
14025                     /* This element should not be processed further in this
14026                      * class */
14027                     element_count--;
14028                     value = save_value;
14029                     prevvalue = save_prevvalue;
14030                     continue;
14031                 }
14032             }
14033         }
14034 
14035         /* Deal with this element of the class */
14036 	if (! SIZE_ONLY) {
14037 #ifndef EBCDIC
14038             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14039                                                      prevvalue, value);
14040 #else
14041             SV* this_range = _new_invlist(1);
14042             _append_range_to_invlist(this_range, prevvalue, value);
14043 
14044             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14045              * If this range was specified using something like 'i-j', we want
14046              * to include only the 'i' and the 'j', and not anything in
14047              * between, so exclude non-ASCII, non-alphabetics from it.
14048              * However, if the range was specified with something like
14049              * [\x89-\x91] or [\x89-j], all code points within it should be
14050              * included.  literal_endpoint==2 means both ends of the range used
14051              * a literal character, not \x{foo} */
14052 	    if (literal_endpoint == 2
14053                 && ((prevvalue >= 'a' && value <= 'z')
14054                     || (prevvalue >= 'A' && value <= 'Z')))
14055             {
14056                 _invlist_intersection(this_range, PL_ASCII,
14057                                       &this_range);
14058 
14059                 /* Since this above only contains ascii, the intersection of it
14060                  * with anything will still yield only ascii */
14061                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14062                                       &this_range);
14063             }
14064             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14065             literal_endpoint = 0;
14066 #endif
14067         }
14068 
14069 	range = 0; /* this range (if it was one) is done now */
14070     } /* End of loop through all the text within the brackets */
14071 
14072     /* If anything in the class expands to more than one character, we have to
14073      * deal with them by building up a substitute parse string, and recursively
14074      * calling reg() on it, instead of proceeding */
14075     if (multi_char_matches) {
14076 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14077         I32 cp_count;
14078 	STRLEN len;
14079 	char *save_end = RExC_end;
14080 	char *save_parse = RExC_parse;
14081         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14082                                        a "|" */
14083         I32 reg_flags;
14084 
14085         assert(! invert);
14086 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14087            because too confusing */
14088         if (invert) {
14089             sv_catpv(substitute_parse, "(?:");
14090         }
14091 #endif
14092 
14093         /* Look at the longest folds first */
14094         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14095 
14096             if (av_exists(multi_char_matches, cp_count)) {
14097                 AV** this_array_ptr;
14098                 SV* this_sequence;
14099 
14100                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14101                                                  cp_count, FALSE);
14102                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14103                                                                 &PL_sv_undef)
14104                 {
14105                     if (! first_time) {
14106                         sv_catpv(substitute_parse, "|");
14107                     }
14108                     first_time = FALSE;
14109 
14110                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14111                 }
14112             }
14113         }
14114 
14115         /* If the character class contains anything else besides these
14116          * multi-character folds, have to include it in recursive parsing */
14117         if (element_count) {
14118             sv_catpv(substitute_parse, "|[");
14119             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14120             sv_catpv(substitute_parse, "]");
14121         }
14122 
14123         sv_catpv(substitute_parse, ")");
14124 #if 0
14125         if (invert) {
14126             /* This is a way to get the parse to skip forward a whole named
14127              * sequence instead of matching the 2nd character when it fails the
14128              * first */
14129             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14130         }
14131 #endif
14132 
14133 	RExC_parse = SvPV(substitute_parse, len);
14134 	RExC_end = RExC_parse + len;
14135         RExC_in_multi_char_class = 1;
14136         RExC_emit = (regnode *)orig_emit;
14137 
14138 	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14139 
14140 	*flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14141 
14142 	RExC_parse = save_parse;
14143 	RExC_end = save_end;
14144 	RExC_in_multi_char_class = 0;
14145         SvREFCNT_dec_NN(multi_char_matches);
14146         return ret;
14147     }
14148 
14149     /* Here, we've gone through the entire class and dealt with multi-char
14150      * folds.  We are now in a position that we can do some checks to see if we
14151      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14152      * Currently we only do two checks:
14153      * 1) is in the unlikely event that the user has specified both, eg. \w and
14154      *    \W under /l, then the class matches everything.  (This optimization
14155      *    is done only to make the optimizer code run later work.)
14156      * 2) if the character class contains only a single element (including a
14157      *    single range), we see if there is an equivalent node for it.
14158      * Other checks are possible */
14159     if (! ret_invlist   /* Can't optimize if returning the constructed
14160                            inversion list */
14161         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14162     {
14163         U8 op = END;
14164         U8 arg = 0;
14165 
14166         if (UNLIKELY(posixl_matches_all)) {
14167             op = SANY;
14168         }
14169         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14170                                                    \w or [:digit:] or \p{foo}
14171                                                  */
14172 
14173             /* All named classes are mapped into POSIXish nodes, with its FLAG
14174              * argument giving which class it is */
14175             switch ((I32)namedclass) {
14176                 case ANYOF_UNIPROP:
14177                     break;
14178 
14179                 /* These don't depend on the charset modifiers.  They always
14180                  * match under /u rules */
14181                 case ANYOF_NHORIZWS:
14182                 case ANYOF_HORIZWS:
14183                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14184                     /* FALLTHROUGH */
14185 
14186                 case ANYOF_NVERTWS:
14187                 case ANYOF_VERTWS:
14188                     op = POSIXU;
14189                     goto join_posix;
14190 
14191                 /* The actual POSIXish node for all the rest depends on the
14192                  * charset modifier.  The ones in the first set depend only on
14193                  * ASCII or, if available on this platform, locale */
14194                 case ANYOF_ASCII:
14195                 case ANYOF_NASCII:
14196 #ifdef HAS_ISASCII
14197                     op = (LOC) ? POSIXL : POSIXA;
14198 #else
14199                     op = POSIXA;
14200 #endif
14201                     goto join_posix;
14202 
14203                 case ANYOF_NCASED:
14204                 case ANYOF_LOWER:
14205                 case ANYOF_NLOWER:
14206                 case ANYOF_UPPER:
14207                 case ANYOF_NUPPER:
14208                     /* under /a could be alpha */
14209                     if (FOLD) {
14210                         if (ASCII_RESTRICTED) {
14211                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14212                         }
14213                         else if (! LOC) {
14214                             break;
14215                         }
14216                     }
14217                     /* FALLTHROUGH */
14218 
14219                 /* The rest have more possibilities depending on the charset.
14220                  * We take advantage of the enum ordering of the charset
14221                  * modifiers to get the exact node type, */
14222                 default:
14223                     op = POSIXD + get_regex_charset(RExC_flags);
14224                     if (op > POSIXA) { /* /aa is same as /a */
14225                         op = POSIXA;
14226                     }
14227 
14228                 join_posix:
14229                     /* The odd numbered ones are the complements of the
14230                      * next-lower even number one */
14231                     if (namedclass % 2 == 1) {
14232                         invert = ! invert;
14233                         namedclass--;
14234                     }
14235                     arg = namedclass_to_classnum(namedclass);
14236                     break;
14237             }
14238         }
14239         else if (value == prevvalue) {
14240 
14241             /* Here, the class consists of just a single code point */
14242 
14243             if (invert) {
14244                 if (! LOC && value == '\n') {
14245                     op = REG_ANY; /* Optimize [^\n] */
14246                     *flagp |= HASWIDTH|SIMPLE;
14247                     RExC_naughty++;
14248                 }
14249             }
14250             else if (value < 256 || UTF) {
14251 
14252                 /* Optimize a single value into an EXACTish node, but not if it
14253                  * would require converting the pattern to UTF-8. */
14254                 op = compute_EXACTish(pRExC_state);
14255             }
14256         } /* Otherwise is a range */
14257         else if (! LOC) {   /* locale could vary these */
14258             if (prevvalue == '0') {
14259                 if (value == '9') {
14260                     arg = _CC_DIGIT;
14261                     op = POSIXA;
14262                 }
14263             }
14264         }
14265 
14266         /* Here, we have changed <op> away from its initial value iff we found
14267          * an optimization */
14268         if (op != END) {
14269 
14270             /* Throw away this ANYOF regnode, and emit the calculated one,
14271              * which should correspond to the beginning, not current, state of
14272              * the parse */
14273             const char * cur_parse = RExC_parse;
14274             RExC_parse = (char *)orig_parse;
14275             if ( SIZE_ONLY) {
14276                 if (! LOC) {
14277 
14278                     /* To get locale nodes to not use the full ANYOF size would
14279                      * require moving the code above that writes the portions
14280                      * of it that aren't in other nodes to after this point.
14281                      * e.g.  ANYOF_POSIXL_SET */
14282                     RExC_size = orig_size;
14283                 }
14284             }
14285             else {
14286                 RExC_emit = (regnode *)orig_emit;
14287                 if (PL_regkind[op] == POSIXD) {
14288                     if (op == POSIXL) {
14289                         RExC_contains_locale = 1;
14290                     }
14291                     if (invert) {
14292                         op += NPOSIXD - POSIXD;
14293                     }
14294                 }
14295             }
14296 
14297             ret = reg_node(pRExC_state, op);
14298 
14299             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14300                 if (! SIZE_ONLY) {
14301                     FLAGS(ret) = arg;
14302                 }
14303                 *flagp |= HASWIDTH|SIMPLE;
14304             }
14305             else if (PL_regkind[op] == EXACT) {
14306                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14307                                            TRUE /* downgradable to EXACT */
14308                                            );
14309             }
14310 
14311             RExC_parse = (char *) cur_parse;
14312 
14313             SvREFCNT_dec(posixes);
14314             SvREFCNT_dec(nposixes);
14315             SvREFCNT_dec(cp_list);
14316             SvREFCNT_dec(cp_foldable_list);
14317             return ret;
14318         }
14319     }
14320 
14321     if (SIZE_ONLY)
14322         return ret;
14323     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14324 
14325     /* If folding, we calculate all characters that could fold to or from the
14326      * ones already on the list */
14327     if (cp_foldable_list) {
14328         if (FOLD) {
14329             UV start, end;	/* End points of code point ranges */
14330 
14331             SV* fold_intersection = NULL;
14332             SV** use_list;
14333 
14334             /* Our calculated list will be for Unicode rules.  For locale
14335              * matching, we have to keep a separate list that is consulted at
14336              * runtime only when the locale indicates Unicode rules.  For
14337              * non-locale, we just use to the general list */
14338             if (LOC) {
14339                 use_list = &only_utf8_locale_list;
14340             }
14341             else {
14342                 use_list = &cp_list;
14343             }
14344 
14345             /* Only the characters in this class that participate in folds need
14346              * be checked.  Get the intersection of this class and all the
14347              * possible characters that are foldable.  This can quickly narrow
14348              * down a large class */
14349             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14350                                   &fold_intersection);
14351 
14352             /* The folds for all the Latin1 characters are hard-coded into this
14353              * program, but we have to go out to disk to get the others. */
14354             if (invlist_highest(cp_foldable_list) >= 256) {
14355 
14356                 /* This is a hash that for a particular fold gives all
14357                  * characters that are involved in it */
14358                 if (! PL_utf8_foldclosures) {
14359 
14360                     /* If the folds haven't been read in, call a fold function
14361                      * to force that */
14362                     if (! PL_utf8_tofold) {
14363                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14364 
14365                         /* This string is just a short named one above \xff */
14366                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14367                         assert(PL_utf8_tofold); /* Verify that worked */
14368                     }
14369                     PL_utf8_foldclosures
14370                                       = _swash_inversion_hash(PL_utf8_tofold);
14371                 }
14372             }
14373 
14374             /* Now look at the foldable characters in this class individually */
14375             invlist_iterinit(fold_intersection);
14376             while (invlist_iternext(fold_intersection, &start, &end)) {
14377                 UV j;
14378 
14379                 /* Look at every character in the range */
14380                 for (j = start; j <= end; j++) {
14381                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14382                     STRLEN foldlen;
14383                     SV** listp;
14384 
14385                     if (j < 256) {
14386 
14387                         /* We have the latin1 folding rules hard-coded here so
14388                          * that an innocent-looking character class, like
14389                          * /[ks]/i won't have to go out to disk to find the
14390                          * possible matches.  XXX It would be better to
14391                          * generate these via regen, in case a new version of
14392                          * the Unicode standard adds new mappings, though that
14393                          * is not really likely, and may be caught by the
14394                          * default: case of the switch below. */
14395 
14396                         if (IS_IN_SOME_FOLD_L1(j)) {
14397 
14398                             /* ASCII is always matched; non-ASCII is matched
14399                              * only under Unicode rules (which could happen
14400                              * under /l if the locale is a UTF-8 one */
14401                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14402                                 *use_list = add_cp_to_invlist(*use_list,
14403                                                             PL_fold_latin1[j]);
14404                             }
14405                             else {
14406                                 depends_list =
14407                                  add_cp_to_invlist(depends_list,
14408                                                    PL_fold_latin1[j]);
14409                             }
14410                         }
14411 
14412                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14413                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14414                         {
14415                             /* Certain Latin1 characters have matches outside
14416                             * Latin1.  To get here, <j> is one of those
14417                             * characters.   None of these matches is valid for
14418                             * ASCII characters under /aa, which is why the 'if'
14419                             * just above excludes those.  These matches only
14420                             * happen when the target string is utf8.  The code
14421                             * below adds the single fold closures for <j> to the
14422                             * inversion list. */
14423 
14424                             switch (j) {
14425                                 case 'k':
14426                                 case 'K':
14427                                   *use_list =
14428                                      add_cp_to_invlist(*use_list, KELVIN_SIGN);
14429                                     break;
14430                                 case 's':
14431                                 case 'S':
14432                                   *use_list = add_cp_to_invlist(*use_list,
14433                                                     LATIN_SMALL_LETTER_LONG_S);
14434                                     break;
14435                                 case MICRO_SIGN:
14436                                   *use_list = add_cp_to_invlist(*use_list,
14437                                                       GREEK_CAPITAL_LETTER_MU);
14438                                   *use_list = add_cp_to_invlist(*use_list,
14439                                                         GREEK_SMALL_LETTER_MU);
14440                                     break;
14441                                 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14442                                 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14443                                   *use_list =
14444                                    add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14445                                     break;
14446                                 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14447                                   *use_list = add_cp_to_invlist(*use_list,
14448                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14449                                     break;
14450                                 case LATIN_SMALL_LETTER_SHARP_S:
14451                                   *use_list = add_cp_to_invlist(*use_list,
14452                                                  LATIN_CAPITAL_LETTER_SHARP_S);
14453                                     break;
14454                                 case 'F': case 'f':
14455                                 case 'I': case 'i':
14456                                 case 'L': case 'l':
14457                                 case 'T': case 't':
14458                                 case 'A': case 'a':
14459                                 case 'H': case 'h':
14460                                 case 'J': case 'j':
14461                                 case 'N': case 'n':
14462                                 case 'W': case 'w':
14463                                 case 'Y': case 'y':
14464                                     /* These all are targets of multi-character
14465                                      * folds from code points that require UTF8
14466                                      * to express, so they can't match unless
14467                                      * the target string is in UTF-8, so no
14468                                      * action here is necessary, as regexec.c
14469                                      * properly handles the general case for
14470                                      * UTF-8 matching and multi-char folds */
14471                                     break;
14472                                 default:
14473                                     /* Use deprecated warning to increase the
14474                                     * chances of this being output */
14475                                     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14476                                     break;
14477                             }
14478                         }
14479                         continue;
14480                     }
14481 
14482                     /* Here is an above Latin1 character.  We don't have the
14483                      * rules hard-coded for it.  First, get its fold.  This is
14484                      * the simple fold, as the multi-character folds have been
14485                      * handled earlier and separated out */
14486                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14487                                                         (ASCII_FOLD_RESTRICTED)
14488                                                         ? FOLD_FLAGS_NOMIX_ASCII
14489                                                         : 0);
14490 
14491                     /* Single character fold of above Latin1.  Add everything in
14492                     * its fold closure to the list that this node should match.
14493                     * The fold closures data structure is a hash with the keys
14494                     * being the UTF-8 of every character that is folded to, like
14495                     * 'k', and the values each an array of all code points that
14496                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14497                     * Multi-character folds are not included */
14498                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14499                                         (char *) foldbuf, foldlen, FALSE)))
14500                     {
14501                         AV* list = (AV*) *listp;
14502                         IV k;
14503                         for (k = 0; k <= av_tindex(list); k++) {
14504                             SV** c_p = av_fetch(list, k, FALSE);
14505                             UV c;
14506                             if (c_p == NULL) {
14507                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14508                             }
14509                             c = SvUV(*c_p);
14510 
14511                             /* /aa doesn't allow folds between ASCII and non- */
14512                             if ((ASCII_FOLD_RESTRICTED
14513                                 && (isASCII(c) != isASCII(j))))
14514                             {
14515                                 continue;
14516                             }
14517 
14518                             /* Folds under /l which cross the 255/256 boundary
14519                              * are added to a separate list.  (These are valid
14520                              * only when the locale is UTF-8.) */
14521                             if (c < 256 && LOC) {
14522                                 *use_list = add_cp_to_invlist(*use_list, c);
14523                                 continue;
14524                             }
14525 
14526                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14527                             {
14528                                 cp_list = add_cp_to_invlist(cp_list, c);
14529                             }
14530                             else {
14531                                 /* Similarly folds involving non-ascii Latin1
14532                                 * characters under /d are added to their list */
14533                                 depends_list = add_cp_to_invlist(depends_list,
14534                                                                  c);
14535                             }
14536                         }
14537                     }
14538                 }
14539             }
14540             SvREFCNT_dec_NN(fold_intersection);
14541         }
14542 
14543         /* Now that we have finished adding all the folds, there is no reason
14544          * to keep the foldable list separate */
14545         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14546 	SvREFCNT_dec_NN(cp_foldable_list);
14547     }
14548 
14549     /* And combine the result (if any) with any inversion list from posix
14550      * classes.  The lists are kept separate up to now because we don't want to
14551      * fold the classes (folding of those is automatically handled by the swash
14552      * fetching code) */
14553     if (posixes || nposixes) {
14554         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14555             /* Under /a and /aa, nothing above ASCII matches these */
14556             _invlist_intersection(posixes,
14557                                   PL_XPosix_ptrs[_CC_ASCII],
14558                                   &posixes);
14559         }
14560         if (nposixes) {
14561             if (DEPENDS_SEMANTICS) {
14562                 /* Under /d, everything in the upper half of the Latin1 range
14563                  * matches these complements */
14564                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14565             }
14566             else if (AT_LEAST_ASCII_RESTRICTED) {
14567                 /* Under /a and /aa, everything above ASCII matches these
14568                  * complements */
14569                 _invlist_union_complement_2nd(nposixes,
14570                                               PL_XPosix_ptrs[_CC_ASCII],
14571                                               &nposixes);
14572             }
14573             if (posixes) {
14574                 _invlist_union(posixes, nposixes, &posixes);
14575                 SvREFCNT_dec_NN(nposixes);
14576             }
14577             else {
14578                 posixes = nposixes;
14579             }
14580         }
14581         if (! DEPENDS_SEMANTICS) {
14582             if (cp_list) {
14583                 _invlist_union(cp_list, posixes, &cp_list);
14584                 SvREFCNT_dec_NN(posixes);
14585             }
14586             else {
14587                 cp_list = posixes;
14588             }
14589         }
14590         else {
14591             /* Under /d, we put into a separate list the Latin1 things that
14592              * match only when the target string is utf8 */
14593             SV* nonascii_but_latin1_properties = NULL;
14594             _invlist_intersection(posixes, PL_UpperLatin1,
14595                                   &nonascii_but_latin1_properties);
14596             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14597                               &posixes);
14598             if (cp_list) {
14599                 _invlist_union(cp_list, posixes, &cp_list);
14600                 SvREFCNT_dec_NN(posixes);
14601             }
14602             else {
14603                 cp_list = posixes;
14604             }
14605 
14606             if (depends_list) {
14607                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14608                                &depends_list);
14609                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14610             }
14611             else {
14612                 depends_list = nonascii_but_latin1_properties;
14613             }
14614         }
14615     }
14616 
14617     /* And combine the result (if any) with any inversion list from properties.
14618      * The lists are kept separate up to now so that we can distinguish the two
14619      * in regards to matching above-Unicode.  A run-time warning is generated
14620      * if a Unicode property is matched against a non-Unicode code point. But,
14621      * we allow user-defined properties to match anything, without any warning,
14622      * and we also suppress the warning if there is a portion of the character
14623      * class that isn't a Unicode property, and which matches above Unicode, \W
14624      * or [\x{110000}] for example.
14625      * (Note that in this case, unlike the Posix one above, there is no
14626      * <depends_list>, because having a Unicode property forces Unicode
14627      * semantics */
14628     if (properties) {
14629         if (cp_list) {
14630 
14631             /* If it matters to the final outcome, see if a non-property
14632              * component of the class matches above Unicode.  If so, the
14633              * warning gets suppressed.  This is true even if just a single
14634              * such code point is specified, as though not strictly correct if
14635              * another such code point is matched against, the fact that they
14636              * are using above-Unicode code points indicates they should know
14637              * the issues involved */
14638             if (warn_super) {
14639                 warn_super = ! (invert
14640                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14641             }
14642 
14643             _invlist_union(properties, cp_list, &cp_list);
14644             SvREFCNT_dec_NN(properties);
14645         }
14646         else {
14647             cp_list = properties;
14648         }
14649 
14650         if (warn_super) {
14651             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14652         }
14653     }
14654 
14655     /* Here, we have calculated what code points should be in the character
14656      * class.
14657      *
14658      * Now we can see about various optimizations.  Fold calculation (which we
14659      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14660      * would invert to include K, which under /i would match k, which it
14661      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14662      * folded until runtime */
14663 
14664     /* If we didn't do folding, it's because some information isn't available
14665      * until runtime; set the run-time fold flag for these.  (We don't have to
14666      * worry about properties folding, as that is taken care of by the swash
14667      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14668      * locales, or the class matches at least one 0-255 range code point */
14669     if (LOC && FOLD) {
14670         if (only_utf8_locale_list) {
14671             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14672         }
14673         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14674                                the list */
14675             UV start, end;
14676             invlist_iterinit(cp_list);
14677             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14678                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14679             }
14680             invlist_iterfinish(cp_list);
14681         }
14682     }
14683 
14684     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14685      * at compile time.  Besides not inverting folded locale now, we can't
14686      * invert if there are things such as \w, which aren't known until runtime
14687      * */
14688     if (cp_list
14689         && invert
14690         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14691 	&& ! depends_list
14692 	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14693     {
14694         _invlist_invert(cp_list);
14695 
14696         /* Any swash can't be used as-is, because we've inverted things */
14697         if (swash) {
14698             SvREFCNT_dec_NN(swash);
14699             swash = NULL;
14700         }
14701 
14702 	/* Clear the invert flag since have just done it here */
14703 	invert = FALSE;
14704     }
14705 
14706     if (ret_invlist) {
14707         *ret_invlist = cp_list;
14708         SvREFCNT_dec(swash);
14709 
14710         /* Discard the generated node */
14711         if (SIZE_ONLY) {
14712             RExC_size = orig_size;
14713         }
14714         else {
14715             RExC_emit = orig_emit;
14716         }
14717         return orig_emit;
14718     }
14719 
14720     /* Some character classes are equivalent to other nodes.  Such nodes take
14721      * up less room and generally fewer operations to execute than ANYOF nodes.
14722      * Above, we checked for and optimized into some such equivalents for
14723      * certain common classes that are easy to test.  Getting to this point in
14724      * the code means that the class didn't get optimized there.  Since this
14725      * code is only executed in Pass 2, it is too late to save space--it has
14726      * been allocated in Pass 1, and currently isn't given back.  But turning
14727      * things into an EXACTish node can allow the optimizer to join it to any
14728      * adjacent such nodes.  And if the class is equivalent to things like /./,
14729      * expensive run-time swashes can be avoided.  Now that we have more
14730      * complete information, we can find things necessarily missed by the
14731      * earlier code.  I (khw) am not sure how much to look for here.  It would
14732      * be easy, but perhaps too slow, to check any candidates against all the
14733      * node types they could possibly match using _invlistEQ(). */
14734 
14735     if (cp_list
14736         && ! invert
14737         && ! depends_list
14738         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14739         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14740 
14741            /* We don't optimize if we are supposed to make sure all non-Unicode
14742             * code points raise a warning, as only ANYOF nodes have this check.
14743             * */
14744         && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14745     {
14746         UV start, end;
14747         U8 op = END;  /* The optimzation node-type */
14748         const char * cur_parse= RExC_parse;
14749 
14750         invlist_iterinit(cp_list);
14751         if (! invlist_iternext(cp_list, &start, &end)) {
14752 
14753             /* Here, the list is empty.  This happens, for example, when a
14754              * Unicode property is the only thing in the character class, and
14755              * it doesn't match anything.  (perluniprops.pod notes such
14756              * properties) */
14757             op = OPFAIL;
14758             *flagp |= HASWIDTH|SIMPLE;
14759         }
14760         else if (start == end) {    /* The range is a single code point */
14761             if (! invlist_iternext(cp_list, &start, &end)
14762 
14763                     /* Don't do this optimization if it would require changing
14764                      * the pattern to UTF-8 */
14765                 && (start < 256 || UTF))
14766             {
14767                 /* Here, the list contains a single code point.  Can optimize
14768                  * into an EXACTish node */
14769 
14770                 value = start;
14771 
14772                 if (! FOLD) {
14773                     op = EXACT;
14774                 }
14775                 else if (LOC) {
14776 
14777                     /* A locale node under folding with one code point can be
14778                      * an EXACTFL, as its fold won't be calculated until
14779                      * runtime */
14780                     op = EXACTFL;
14781                 }
14782                 else {
14783 
14784                     /* Here, we are generally folding, but there is only one
14785                      * code point to match.  If we have to, we use an EXACT
14786                      * node, but it would be better for joining with adjacent
14787                      * nodes in the optimization pass if we used the same
14788                      * EXACTFish node that any such are likely to be.  We can
14789                      * do this iff the code point doesn't participate in any
14790                      * folds.  For example, an EXACTF of a colon is the same as
14791                      * an EXACT one, since nothing folds to or from a colon. */
14792                     if (value < 256) {
14793                         if (IS_IN_SOME_FOLD_L1(value)) {
14794                             op = EXACT;
14795                         }
14796                     }
14797                     else {
14798                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14799                             op = EXACT;
14800                         }
14801                     }
14802 
14803                     /* If we haven't found the node type, above, it means we
14804                      * can use the prevailing one */
14805                     if (op == END) {
14806                         op = compute_EXACTish(pRExC_state);
14807                     }
14808                 }
14809             }
14810         }
14811         else if (start == 0) {
14812             if (end == UV_MAX) {
14813                 op = SANY;
14814                 *flagp |= HASWIDTH|SIMPLE;
14815                 RExC_naughty++;
14816             }
14817             else if (end == '\n' - 1
14818                     && invlist_iternext(cp_list, &start, &end)
14819                     && start == '\n' + 1 && end == UV_MAX)
14820             {
14821                 op = REG_ANY;
14822                 *flagp |= HASWIDTH|SIMPLE;
14823                 RExC_naughty++;
14824             }
14825         }
14826         invlist_iterfinish(cp_list);
14827 
14828         if (op != END) {
14829             RExC_parse = (char *)orig_parse;
14830             RExC_emit = (regnode *)orig_emit;
14831 
14832             ret = reg_node(pRExC_state, op);
14833 
14834             RExC_parse = (char *)cur_parse;
14835 
14836             if (PL_regkind[op] == EXACT) {
14837                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14838                                            TRUE /* downgradable to EXACT */
14839                                           );
14840             }
14841 
14842             SvREFCNT_dec_NN(cp_list);
14843             return ret;
14844         }
14845     }
14846 
14847     /* Here, <cp_list> contains all the code points we can determine at
14848      * compile time that match under all conditions.  Go through it, and
14849      * for things that belong in the bitmap, put them there, and delete from
14850      * <cp_list>.  While we are at it, see if everything above 255 is in the
14851      * list, and if so, set a flag to speed up execution */
14852 
14853     populate_ANYOF_from_invlist(ret, &cp_list);
14854 
14855     if (invert) {
14856         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14857     }
14858 
14859     /* Here, the bitmap has been populated with all the Latin1 code points that
14860      * always match.  Can now add to the overall list those that match only
14861      * when the target string is UTF-8 (<depends_list>). */
14862     if (depends_list) {
14863 	if (cp_list) {
14864 	    _invlist_union(cp_list, depends_list, &cp_list);
14865 	    SvREFCNT_dec_NN(depends_list);
14866 	}
14867 	else {
14868 	    cp_list = depends_list;
14869 	}
14870         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14871     }
14872 
14873     /* If there is a swash and more than one element, we can't use the swash in
14874      * the optimization below. */
14875     if (swash && element_count > 1) {
14876 	SvREFCNT_dec_NN(swash);
14877 	swash = NULL;
14878     }
14879 
14880     set_ANYOF_arg(pRExC_state, ret, cp_list,
14881                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14882                    ? listsv : NULL,
14883                   only_utf8_locale_list,
14884                   swash, has_user_defined_property);
14885 
14886     *flagp |= HASWIDTH|SIMPLE;
14887 
14888     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14889         RExC_contains_locale = 1;
14890     }
14891 
14892     return ret;
14893 }
14894 
14895 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14896 
14897 STATIC void
14898 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14899                 regnode* const node,
14900                 SV* const cp_list,
14901                 SV* const runtime_defns,
14902                 SV* const only_utf8_locale_list,
14903                 SV* const swash,
14904                 const bool has_user_defined_property)
14905 {
14906     /* Sets the arg field of an ANYOF-type node 'node', using information about
14907      * the node passed-in.  If there is nothing outside the node's bitmap, the
14908      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14909      * the count returned by add_data(), having allocated and stored an array,
14910      * av, that that count references, as follows:
14911      *  av[0] stores the character class description in its textual form.
14912      *        This is used later (regexec.c:Perl_regclass_swash()) to
14913      *        initialize the appropriate swash, and is also useful for dumping
14914      *        the regnode.  This is set to &PL_sv_undef if the textual
14915      *        description is not needed at run-time (as happens if the other
14916      *        elements completely define the class)
14917      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14918      *        computed from av[0].  But if no further computation need be done,
14919      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14920      *  av[2] stores the inversion list of code points that match only if the
14921      *        current locale is UTF-8
14922      *  av[3] stores the cp_list inversion list for use in addition or instead
14923      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14924      *        (Otherwise everything needed is already in av[0] and av[1])
14925      *  av[4] is set if any component of the class is from a user-defined
14926      *        property; used only if av[3] exists */
14927 
14928     UV n;
14929 
14930     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14931 
14932     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14933         assert(! (ANYOF_FLAGS(node)
14934                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14935 	ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14936     }
14937     else {
14938 	AV * const av = newAV();
14939 	SV *rv;
14940 
14941         assert(ANYOF_FLAGS(node)
14942                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14943 
14944 	av_store(av, 0, (runtime_defns)
14945 			? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14946 	if (swash) {
14947 	    av_store(av, 1, swash);
14948 	    SvREFCNT_dec_NN(cp_list);
14949 	}
14950 	else {
14951 	    av_store(av, 1, &PL_sv_undef);
14952 	    if (cp_list) {
14953 		av_store(av, 3, cp_list);
14954 		av_store(av, 4, newSVuv(has_user_defined_property));
14955 	    }
14956 	}
14957 
14958         if (only_utf8_locale_list) {
14959 	    av_store(av, 2, only_utf8_locale_list);
14960         }
14961         else {
14962 	    av_store(av, 2, &PL_sv_undef);
14963         }
14964 
14965 	rv = newRV_noinc(MUTABLE_SV(av));
14966 	n = add_data(pRExC_state, STR_WITH_LEN("s"));
14967 	RExC_rxi->data->data[n] = (void*)rv;
14968 	ARG_SET(node, n);
14969     }
14970 }
14971 
14972 
14973 /* reg_skipcomment()
14974 
14975    Absorbs an /x style # comments from the input stream.
14976    Returns true if there is more text remaining in the stream.
14977    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14978    terminates the pattern without including a newline.
14979 
14980    Note its the callers responsibility to ensure that we are
14981    actually in /x mode
14982 
14983 */
14984 
14985 STATIC bool
14986 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14987 {
14988     bool ended = 0;
14989 
14990     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14991 
14992     while (RExC_parse < RExC_end)
14993         if (*RExC_parse++ == '\n') {
14994             ended = 1;
14995             break;
14996         }
14997     if (!ended) {
14998         /* we ran off the end of the pattern without ending
14999            the comment, so we have to add an \n when wrapping */
15000         RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15001         return 0;
15002     } else
15003         return 1;
15004 }
15005 
15006 /* nextchar()
15007 
15008    Advances the parse position, and optionally absorbs
15009    "whitespace" from the inputstream.
15010 
15011    Without /x "whitespace" means (?#...) style comments only,
15012    with /x this means (?#...) and # comments and whitespace proper.
15013 
15014    Returns the RExC_parse point from BEFORE the scan occurs.
15015 
15016    This is the /x friendly way of saying RExC_parse++.
15017 */
15018 
15019 STATIC char*
15020 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15021 {
15022     char* const retval = RExC_parse++;
15023 
15024     PERL_ARGS_ASSERT_NEXTCHAR;
15025 
15026     for (;;) {
15027 	if (RExC_end - RExC_parse >= 3
15028 	    && *RExC_parse == '('
15029 	    && RExC_parse[1] == '?'
15030 	    && RExC_parse[2] == '#')
15031 	{
15032 	    while (*RExC_parse != ')') {
15033 		if (RExC_parse == RExC_end)
15034 		    FAIL("Sequence (?#... not terminated");
15035 		RExC_parse++;
15036 	    }
15037 	    RExC_parse++;
15038 	    continue;
15039 	}
15040 	if (RExC_flags & RXf_PMf_EXTENDED) {
15041 	    if (isSPACE(*RExC_parse)) {
15042 		RExC_parse++;
15043 		continue;
15044 	    }
15045 	    else if (*RExC_parse == '#') {
15046 	        if ( reg_skipcomment( pRExC_state ) )
15047 	            continue;
15048 	    }
15049 	}
15050 	return retval;
15051     }
15052 }
15053 
15054 /*
15055 - reg_node - emit a node
15056 */
15057 STATIC regnode *			/* Location. */
15058 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15059 {
15060     dVAR;
15061     regnode *ptr;
15062     regnode * const ret = RExC_emit;
15063     GET_RE_DEBUG_FLAGS_DECL;
15064 
15065     PERL_ARGS_ASSERT_REG_NODE;
15066 
15067     if (SIZE_ONLY) {
15068 	SIZE_ALIGN(RExC_size);
15069 	RExC_size += 1;
15070 	return(ret);
15071     }
15072     if (RExC_emit >= RExC_emit_bound)
15073         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15074 		   op, RExC_emit, RExC_emit_bound);
15075 
15076     NODE_ALIGN_FILL(ret);
15077     ptr = ret;
15078     FILL_ADVANCE_NODE(ptr, op);
15079 #ifdef RE_TRACK_PATTERN_OFFSETS
15080     if (RExC_offsets) {         /* MJD */
15081 	MJD_OFFSET_DEBUG(
15082               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15083               "reg_node", __LINE__,
15084               PL_reg_name[op],
15085               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15086 		? "Overwriting end of array!\n" : "OK",
15087               (UV)(RExC_emit - RExC_emit_start),
15088               (UV)(RExC_parse - RExC_start),
15089               (UV)RExC_offsets[0]));
15090 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15091     }
15092 #endif
15093     RExC_emit = ptr;
15094     return(ret);
15095 }
15096 
15097 /*
15098 - reganode - emit a node with an argument
15099 */
15100 STATIC regnode *			/* Location. */
15101 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15102 {
15103     dVAR;
15104     regnode *ptr;
15105     regnode * const ret = RExC_emit;
15106     GET_RE_DEBUG_FLAGS_DECL;
15107 
15108     PERL_ARGS_ASSERT_REGANODE;
15109 
15110     if (SIZE_ONLY) {
15111 	SIZE_ALIGN(RExC_size);
15112 	RExC_size += 2;
15113 	/*
15114 	   We can't do this:
15115 
15116 	   assert(2==regarglen[op]+1);
15117 
15118 	   Anything larger than this has to allocate the extra amount.
15119 	   If we changed this to be:
15120 
15121 	   RExC_size += (1 + regarglen[op]);
15122 
15123 	   then it wouldn't matter. Its not clear what side effect
15124 	   might come from that so its not done so far.
15125 	   -- dmq
15126 	*/
15127 	return(ret);
15128     }
15129     if (RExC_emit >= RExC_emit_bound)
15130         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15131 		   op, RExC_emit, RExC_emit_bound);
15132 
15133     NODE_ALIGN_FILL(ret);
15134     ptr = ret;
15135     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15136 #ifdef RE_TRACK_PATTERN_OFFSETS
15137     if (RExC_offsets) {         /* MJD */
15138 	MJD_OFFSET_DEBUG(
15139               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15140               "reganode",
15141 	      __LINE__,
15142 	      PL_reg_name[op],
15143               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15144               "Overwriting end of array!\n" : "OK",
15145               (UV)(RExC_emit - RExC_emit_start),
15146               (UV)(RExC_parse - RExC_start),
15147               (UV)RExC_offsets[0]));
15148 	Set_Cur_Node_Offset;
15149     }
15150 #endif
15151     RExC_emit = ptr;
15152     return(ret);
15153 }
15154 
15155 /*
15156 - reguni - emit (if appropriate) a Unicode character
15157 */
15158 PERL_STATIC_INLINE STRLEN
15159 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15160 {
15161     dVAR;
15162 
15163     PERL_ARGS_ASSERT_REGUNI;
15164 
15165     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15166 }
15167 
15168 /*
15169 - reginsert - insert an operator in front of already-emitted operand
15170 *
15171 * Means relocating the operand.
15172 */
15173 STATIC void
15174 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15175 {
15176     dVAR;
15177     regnode *src;
15178     regnode *dst;
15179     regnode *place;
15180     const int offset = regarglen[(U8)op];
15181     const int size = NODE_STEP_REGNODE + offset;
15182     GET_RE_DEBUG_FLAGS_DECL;
15183 
15184     PERL_ARGS_ASSERT_REGINSERT;
15185     PERL_UNUSED_ARG(depth);
15186 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15187     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15188     if (SIZE_ONLY) {
15189 	RExC_size += size;
15190 	return;
15191     }
15192 
15193     src = RExC_emit;
15194     RExC_emit += size;
15195     dst = RExC_emit;
15196     if (RExC_open_parens) {
15197         int paren;
15198         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15199         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15200             if ( RExC_open_parens[paren] >= opnd ) {
15201                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15202                 RExC_open_parens[paren] += size;
15203             } else {
15204                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15205             }
15206             if ( RExC_close_parens[paren] >= opnd ) {
15207                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15208                 RExC_close_parens[paren] += size;
15209             } else {
15210                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15211             }
15212         }
15213     }
15214 
15215     while (src > opnd) {
15216 	StructCopy(--src, --dst, regnode);
15217 #ifdef RE_TRACK_PATTERN_OFFSETS
15218         if (RExC_offsets) {     /* MJD 20010112 */
15219 	    MJD_OFFSET_DEBUG(
15220                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15221                   "reg_insert",
15222 		  __LINE__,
15223 		  PL_reg_name[op],
15224                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15225 		    ? "Overwriting end of array!\n" : "OK",
15226                   (UV)(src - RExC_emit_start),
15227                   (UV)(dst - RExC_emit_start),
15228                   (UV)RExC_offsets[0]));
15229 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15230 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15231         }
15232 #endif
15233     }
15234 
15235 
15236     place = opnd;		/* Op node, where operand used to be. */
15237 #ifdef RE_TRACK_PATTERN_OFFSETS
15238     if (RExC_offsets) {         /* MJD */
15239 	MJD_OFFSET_DEBUG(
15240               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15241               "reginsert",
15242 	      __LINE__,
15243 	      PL_reg_name[op],
15244               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15245               ? "Overwriting end of array!\n" : "OK",
15246               (UV)(place - RExC_emit_start),
15247               (UV)(RExC_parse - RExC_start),
15248               (UV)RExC_offsets[0]));
15249 	Set_Node_Offset(place, RExC_parse);
15250 	Set_Node_Length(place, 1);
15251     }
15252 #endif
15253     src = NEXTOPER(place);
15254     FILL_ADVANCE_NODE(place, op);
15255     Zero(src, offset, regnode);
15256 }
15257 
15258 /*
15259 - regtail - set the next-pointer at the end of a node chain of p to val.
15260 - SEE ALSO: regtail_study
15261 */
15262 /* TODO: All three parms should be const */
15263 STATIC void
15264 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15265                 const regnode *val,U32 depth)
15266 {
15267     dVAR;
15268     regnode *scan;
15269     GET_RE_DEBUG_FLAGS_DECL;
15270 
15271     PERL_ARGS_ASSERT_REGTAIL;
15272 #ifndef DEBUGGING
15273     PERL_UNUSED_ARG(depth);
15274 #endif
15275 
15276     if (SIZE_ONLY)
15277 	return;
15278 
15279     /* Find last node. */
15280     scan = p;
15281     for (;;) {
15282 	regnode * const temp = regnext(scan);
15283         DEBUG_PARSE_r({
15284             SV * const mysv=sv_newmortal();
15285             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15286             regprop(RExC_rx, mysv, scan, NULL);
15287             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15288                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15289                     (temp == NULL ? "->" : ""),
15290                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15291             );
15292         });
15293         if (temp == NULL)
15294             break;
15295         scan = temp;
15296     }
15297 
15298     if (reg_off_by_arg[OP(scan)]) {
15299         ARG_SET(scan, val - scan);
15300     }
15301     else {
15302         NEXT_OFF(scan) = val - scan;
15303     }
15304 }
15305 
15306 #ifdef DEBUGGING
15307 /*
15308 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15309 - Look for optimizable sequences at the same time.
15310 - currently only looks for EXACT chains.
15311 
15312 This is experimental code. The idea is to use this routine to perform
15313 in place optimizations on branches and groups as they are constructed,
15314 with the long term intention of removing optimization from study_chunk so
15315 that it is purely analytical.
15316 
15317 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15318 to control which is which.
15319 
15320 */
15321 /* TODO: All four parms should be const */
15322 
15323 STATIC U8
15324 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15325                       const regnode *val,U32 depth)
15326 {
15327     dVAR;
15328     regnode *scan;
15329     U8 exact = PSEUDO;
15330 #ifdef EXPERIMENTAL_INPLACESCAN
15331     I32 min = 0;
15332 #endif
15333     GET_RE_DEBUG_FLAGS_DECL;
15334 
15335     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15336 
15337 
15338     if (SIZE_ONLY)
15339         return exact;
15340 
15341     /* Find last node. */
15342 
15343     scan = p;
15344     for (;;) {
15345         regnode * const temp = regnext(scan);
15346 #ifdef EXPERIMENTAL_INPLACESCAN
15347         if (PL_regkind[OP(scan)] == EXACT) {
15348 	    bool unfolded_multi_char;	/* Unexamined in this routine */
15349             if (join_exact(pRExC_state, scan, &min,
15350                            &unfolded_multi_char, 1, val, depth+1))
15351                 return EXACT;
15352 	}
15353 #endif
15354         if ( exact ) {
15355             switch (OP(scan)) {
15356                 case EXACT:
15357                 case EXACTF:
15358                 case EXACTFA_NO_TRIE:
15359                 case EXACTFA:
15360                 case EXACTFU:
15361                 case EXACTFU_SS:
15362                 case EXACTFL:
15363                         if( exact == PSEUDO )
15364                             exact= OP(scan);
15365                         else if ( exact != OP(scan) )
15366                             exact= 0;
15367                 case NOTHING:
15368                     break;
15369                 default:
15370                     exact= 0;
15371             }
15372         }
15373         DEBUG_PARSE_r({
15374             SV * const mysv=sv_newmortal();
15375             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15376             regprop(RExC_rx, mysv, scan, NULL);
15377             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15378                 SvPV_nolen_const(mysv),
15379                 REG_NODE_NUM(scan),
15380                 PL_reg_name[exact]);
15381         });
15382 	if (temp == NULL)
15383 	    break;
15384 	scan = temp;
15385     }
15386     DEBUG_PARSE_r({
15387         SV * const mysv_val=sv_newmortal();
15388         DEBUG_PARSE_MSG("");
15389         regprop(RExC_rx, mysv_val, val, NULL);
15390         PerlIO_printf(Perl_debug_log,
15391                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15392 		      SvPV_nolen_const(mysv_val),
15393 		      (IV)REG_NODE_NUM(val),
15394 		      (IV)(val - scan)
15395         );
15396     });
15397     if (reg_off_by_arg[OP(scan)]) {
15398 	ARG_SET(scan, val - scan);
15399     }
15400     else {
15401 	NEXT_OFF(scan) = val - scan;
15402     }
15403 
15404     return exact;
15405 }
15406 #endif
15407 
15408 /*
15409  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15410  */
15411 #ifdef DEBUGGING
15412 
15413 static void
15414 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15415 {
15416     int bit;
15417     int set=0;
15418 
15419     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15420 
15421     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15422         if (flags & (1<<bit)) {
15423             if (!set++ && lead)
15424                 PerlIO_printf(Perl_debug_log, "%s",lead);
15425             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15426         }
15427     }
15428     if (lead)  {
15429         if (set)
15430             PerlIO_printf(Perl_debug_log, "\n");
15431         else
15432             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15433     }
15434 }
15435 
15436 static void
15437 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15438 {
15439     int bit;
15440     int set=0;
15441     regex_charset cs;
15442 
15443     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15444 
15445     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15446         if (flags & (1<<bit)) {
15447 	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
15448 		continue;
15449 	    }
15450             if (!set++ && lead)
15451                 PerlIO_printf(Perl_debug_log, "%s",lead);
15452             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15453         }
15454     }
15455     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15456             if (!set++ && lead) {
15457                 PerlIO_printf(Perl_debug_log, "%s",lead);
15458             }
15459             switch (cs) {
15460                 case REGEX_UNICODE_CHARSET:
15461                     PerlIO_printf(Perl_debug_log, "UNICODE");
15462                     break;
15463                 case REGEX_LOCALE_CHARSET:
15464                     PerlIO_printf(Perl_debug_log, "LOCALE");
15465                     break;
15466                 case REGEX_ASCII_RESTRICTED_CHARSET:
15467                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15468                     break;
15469                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15470                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15471                     break;
15472                 default:
15473                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15474                     break;
15475             }
15476     }
15477     if (lead)  {
15478         if (set)
15479             PerlIO_printf(Perl_debug_log, "\n");
15480         else
15481             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15482     }
15483 }
15484 #endif
15485 
15486 void
15487 Perl_regdump(pTHX_ const regexp *r)
15488 {
15489 #ifdef DEBUGGING
15490     dVAR;
15491     SV * const sv = sv_newmortal();
15492     SV *dsv= sv_newmortal();
15493     RXi_GET_DECL(r,ri);
15494     GET_RE_DEBUG_FLAGS_DECL;
15495 
15496     PERL_ARGS_ASSERT_REGDUMP;
15497 
15498     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15499 
15500     /* Header fields of interest. */
15501     if (r->anchored_substr) {
15502 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15503 	    RE_SV_DUMPLEN(r->anchored_substr), 30);
15504 	PerlIO_printf(Perl_debug_log,
15505 		      "anchored %s%s at %"IVdf" ",
15506 		      s, RE_SV_TAIL(r->anchored_substr),
15507 		      (IV)r->anchored_offset);
15508     } else if (r->anchored_utf8) {
15509 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15510 	    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15511 	PerlIO_printf(Perl_debug_log,
15512 		      "anchored utf8 %s%s at %"IVdf" ",
15513 		      s, RE_SV_TAIL(r->anchored_utf8),
15514 		      (IV)r->anchored_offset);
15515     }
15516     if (r->float_substr) {
15517 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15518 	    RE_SV_DUMPLEN(r->float_substr), 30);
15519 	PerlIO_printf(Perl_debug_log,
15520 		      "floating %s%s at %"IVdf"..%"UVuf" ",
15521 		      s, RE_SV_TAIL(r->float_substr),
15522 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
15523     } else if (r->float_utf8) {
15524 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15525 	    RE_SV_DUMPLEN(r->float_utf8), 30);
15526 	PerlIO_printf(Perl_debug_log,
15527 		      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15528 		      s, RE_SV_TAIL(r->float_utf8),
15529 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
15530     }
15531     if (r->check_substr || r->check_utf8)
15532 	PerlIO_printf(Perl_debug_log,
15533 		      (const char *)
15534 		      (r->check_substr == r->float_substr
15535 		       && r->check_utf8 == r->float_utf8
15536 		       ? "(checking floating" : "(checking anchored"));
15537     if (r->intflags & PREGf_NOSCAN)
15538 	PerlIO_printf(Perl_debug_log, " noscan");
15539     if (r->extflags & RXf_CHECK_ALL)
15540 	PerlIO_printf(Perl_debug_log, " isall");
15541     if (r->check_substr || r->check_utf8)
15542 	PerlIO_printf(Perl_debug_log, ") ");
15543 
15544     if (ri->regstclass) {
15545 	regprop(r, sv, ri->regstclass, NULL);
15546 	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15547     }
15548     if (r->intflags & PREGf_ANCH) {
15549 	PerlIO_printf(Perl_debug_log, "anchored");
15550         if (r->intflags & PREGf_ANCH_BOL)
15551 	    PerlIO_printf(Perl_debug_log, "(BOL)");
15552         if (r->intflags & PREGf_ANCH_MBOL)
15553 	    PerlIO_printf(Perl_debug_log, "(MBOL)");
15554         if (r->intflags & PREGf_ANCH_SBOL)
15555 	    PerlIO_printf(Perl_debug_log, "(SBOL)");
15556         if (r->intflags & PREGf_ANCH_GPOS)
15557 	    PerlIO_printf(Perl_debug_log, "(GPOS)");
15558 	PerlIO_putc(Perl_debug_log, ' ');
15559     }
15560     if (r->intflags & PREGf_GPOS_SEEN)
15561 	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15562     if (r->intflags & PREGf_SKIP)
15563 	PerlIO_printf(Perl_debug_log, "plus ");
15564     if (r->intflags & PREGf_IMPLICIT)
15565 	PerlIO_printf(Perl_debug_log, "implicit ");
15566     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15567     if (r->extflags & RXf_EVAL_SEEN)
15568 	PerlIO_printf(Perl_debug_log, "with eval ");
15569     PerlIO_printf(Perl_debug_log, "\n");
15570     DEBUG_FLAGS_r({
15571         regdump_extflags("r->extflags: ",r->extflags);
15572         regdump_intflags("r->intflags: ",r->intflags);
15573     });
15574 #else
15575     PERL_ARGS_ASSERT_REGDUMP;
15576     PERL_UNUSED_CONTEXT;
15577     PERL_UNUSED_ARG(r);
15578 #endif	/* DEBUGGING */
15579 }
15580 
15581 /*
15582 - regprop - printable representation of opcode, with run time support
15583 */
15584 
15585 void
15586 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15587 {
15588 #ifdef DEBUGGING
15589     dVAR;
15590     int k;
15591 
15592     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15593     static const char * const anyofs[] = {
15594 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15595     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15596     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15597     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15598     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15599     || _CC_VERTSPACE != 16
15600   #error Need to adjust order of anyofs[]
15601 #endif
15602         "\\w",
15603         "\\W",
15604         "\\d",
15605         "\\D",
15606         "[:alpha:]",
15607         "[:^alpha:]",
15608         "[:lower:]",
15609         "[:^lower:]",
15610         "[:upper:]",
15611         "[:^upper:]",
15612         "[:punct:]",
15613         "[:^punct:]",
15614         "[:print:]",
15615         "[:^print:]",
15616         "[:alnum:]",
15617         "[:^alnum:]",
15618         "[:graph:]",
15619         "[:^graph:]",
15620         "[:cased:]",
15621         "[:^cased:]",
15622         "\\s",
15623         "\\S",
15624         "[:blank:]",
15625         "[:^blank:]",
15626         "[:xdigit:]",
15627         "[:^xdigit:]",
15628         "[:space:]",
15629         "[:^space:]",
15630         "[:cntrl:]",
15631         "[:^cntrl:]",
15632         "[:ascii:]",
15633         "[:^ascii:]",
15634         "\\v",
15635         "\\V"
15636     };
15637     RXi_GET_DECL(prog,progi);
15638     GET_RE_DEBUG_FLAGS_DECL;
15639 
15640     PERL_ARGS_ASSERT_REGPROP;
15641 
15642     sv_setpvs(sv, "");
15643 
15644     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
15645 	/* It would be nice to FAIL() here, but this may be called from
15646 	   regexec.c, and it would be hard to supply pRExC_state. */
15647 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15648                                               (int)OP(o), (int)REGNODE_MAX);
15649     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15650 
15651     k = PL_regkind[OP(o)];
15652 
15653     if (k == EXACT) {
15654 	sv_catpvs(sv, " ");
15655 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15656 	 * is a crude hack but it may be the best for now since
15657 	 * we have no flag "this EXACTish node was UTF-8"
15658 	 * --jhi */
15659 	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15660 		  PERL_PV_ESCAPE_UNI_DETECT |
15661 		  PERL_PV_ESCAPE_NONASCII   |
15662 		  PERL_PV_PRETTY_ELLIPSES   |
15663 		  PERL_PV_PRETTY_LTGT       |
15664 		  PERL_PV_PRETTY_NOCLEAR
15665 		  );
15666     } else if (k == TRIE) {
15667 	/* print the details of the trie in dumpuntil instead, as
15668 	 * progi->data isn't available here */
15669         const char op = OP(o);
15670         const U32 n = ARG(o);
15671         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15672                (reg_ac_data *)progi->data->data[n] :
15673                NULL;
15674         const reg_trie_data * const trie
15675 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15676 
15677         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15678         DEBUG_TRIE_COMPILE_r(
15679           Perl_sv_catpvf(aTHX_ sv,
15680             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15681             (UV)trie->startstate,
15682             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15683             (UV)trie->wordcount,
15684             (UV)trie->minlen,
15685             (UV)trie->maxlen,
15686             (UV)TRIE_CHARCOUNT(trie),
15687             (UV)trie->uniquecharcount
15688           );
15689         );
15690         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15691             sv_catpvs(sv, "[");
15692             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15693                                                    ? ANYOF_BITMAP(o)
15694                                                    : TRIE_BITMAP(trie));
15695             sv_catpvs(sv, "]");
15696         }
15697 
15698     } else if (k == CURLY) {
15699 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15700 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15701 	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15702     }
15703     else if (k == WHILEM && o->flags)			/* Ordinal/of */
15704 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15705     else if (k == REF || k == OPEN || k == CLOSE
15706              || k == GROUPP || OP(o)==ACCEPT)
15707     {
15708 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
15709 	if ( RXp_PAREN_NAMES(prog) ) {
15710             if ( k != REF || (OP(o) < NREF)) {
15711 	        AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15712 	        SV **name= av_fetch(list, ARG(o), 0 );
15713 	        if (name)
15714 	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15715             }
15716             else {
15717                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15718                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15719                 I32 *nums=(I32*)SvPVX(sv_dat);
15720                 SV **name= av_fetch(list, nums[0], 0 );
15721                 I32 n;
15722                 if (name) {
15723                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15724                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15725 			   	    (n ? "," : ""), (IV)nums[n]);
15726                     }
15727                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15728                 }
15729             }
15730         }
15731         if ( k == REF && reginfo) {
15732             U32 n = ARG(o);  /* which paren pair */
15733             I32 ln = prog->offs[n].start;
15734             if (prog->lastparen < n || ln == -1)
15735                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15736             else if (ln == prog->offs[n].end)
15737                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15738             else {
15739                 const char *s = reginfo->strbeg + ln;
15740                 Perl_sv_catpvf(aTHX_ sv, ": ");
15741                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15742                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15743             }
15744         }
15745     } else if (k == GOSUB)
15746         /* Paren and offset */
15747 	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15748     else if (k == VERB) {
15749         if (!o->flags)
15750             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15751 			   SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15752     } else if (k == LOGICAL)
15753         /* 2: embedded, otherwise 1 */
15754 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15755     else if (k == ANYOF) {
15756 	const U8 flags = ANYOF_FLAGS(o);
15757 	int do_sep = 0;
15758 
15759 
15760 	if (flags & ANYOF_LOCALE_FLAGS)
15761 	    sv_catpvs(sv, "{loc}");
15762 	if (flags & ANYOF_LOC_FOLD)
15763 	    sv_catpvs(sv, "{i}");
15764 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15765 	if (flags & ANYOF_INVERT)
15766 	    sv_catpvs(sv, "^");
15767 
15768 	/* output what the standard cp 0-255 bitmap matches */
15769         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15770 
15771         /* output any special charclass tests (used entirely under use
15772          * locale) * */
15773 	if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15774             int i;
15775 	    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15776 		if (ANYOF_POSIXL_TEST(o,i)) {
15777 		    sv_catpv(sv, anyofs[i]);
15778 		    do_sep = 1;
15779 		}
15780             }
15781         }
15782 
15783 	if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15784                       |ANYOF_UTF8
15785                       |ANYOF_NONBITMAP_NON_UTF8
15786                       |ANYOF_LOC_FOLD)))
15787         {
15788             if (do_sep) {
15789                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15790                 if (flags & ANYOF_INVERT)
15791                     /*make sure the invert info is in each */
15792                     sv_catpvs(sv, "^");
15793             }
15794 
15795             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15796                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15797             }
15798 
15799             /* output information about the unicode matching */
15800             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15801                 sv_catpvs(sv, "{unicode_all}");
15802             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15803                 SV *lv; /* Set if there is something outside the bit map. */
15804                 bool byte_output = FALSE;   /* If something in the bitmap has
15805                                                been output */
15806                 SV *only_utf8_locale;
15807 
15808                 /* Get the stuff that wasn't in the bitmap */
15809                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15810                                                     &lv, &only_utf8_locale);
15811                 if (lv && lv != &PL_sv_undef) {
15812                     char *s = savesvpv(lv);
15813                     char * const origs = s;
15814 
15815                     while (*s && *s != '\n')
15816                         s++;
15817 
15818                     if (*s == '\n') {
15819                         const char * const t = ++s;
15820 
15821                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15822                             sv_catpvs(sv, "{outside bitmap}");
15823                         }
15824                         else {
15825                             sv_catpvs(sv, "{utf8}");
15826                         }
15827 
15828                         if (byte_output) {
15829                             sv_catpvs(sv, " ");
15830                         }
15831 
15832                         while (*s) {
15833                             if (*s == '\n') {
15834 
15835                                 /* Truncate very long output */
15836                                 if (s - origs > 256) {
15837                                     Perl_sv_catpvf(aTHX_ sv,
15838                                                 "%.*s...",
15839                                                 (int) (s - origs - 1),
15840                                                 t);
15841                                     goto out_dump;
15842                                 }
15843                                 *s = ' ';
15844                             }
15845                             else if (*s == '\t') {
15846                                 *s = '-';
15847                             }
15848                             s++;
15849                         }
15850                         if (s[-1] == ' ')
15851                             s[-1] = 0;
15852 
15853                         sv_catpv(sv, t);
15854                     }
15855 
15856                 out_dump:
15857 
15858                     Safefree(origs);
15859                     SvREFCNT_dec_NN(lv);
15860                 }
15861 
15862                 if ((flags & ANYOF_LOC_FOLD)
15863                      && only_utf8_locale
15864                      && only_utf8_locale != &PL_sv_undef)
15865                 {
15866                     UV start, end;
15867                     int max_entries = 256;
15868 
15869                     sv_catpvs(sv, "{utf8 locale}");
15870                     invlist_iterinit(only_utf8_locale);
15871                     while (invlist_iternext(only_utf8_locale,
15872                                             &start, &end)) {
15873                         put_range(sv, start, end);
15874                         max_entries --;
15875                         if (max_entries < 0) {
15876                             sv_catpvs(sv, "...");
15877                             break;
15878                         }
15879                     }
15880                     invlist_iterfinish(only_utf8_locale);
15881                 }
15882             }
15883 	}
15884 
15885 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15886     }
15887     else if (k == POSIXD || k == NPOSIXD) {
15888         U8 index = FLAGS(o) * 2;
15889         if (index < C_ARRAY_LENGTH(anyofs)) {
15890             if (*anyofs[index] != '[')  {
15891                 sv_catpv(sv, "[");
15892             }
15893             sv_catpv(sv, anyofs[index]);
15894             if (*anyofs[index] != '[')  {
15895                 sv_catpv(sv, "]");
15896             }
15897         }
15898         else {
15899             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15900         }
15901     }
15902     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15903 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15904 #else
15905     PERL_UNUSED_CONTEXT;
15906     PERL_UNUSED_ARG(sv);
15907     PERL_UNUSED_ARG(o);
15908     PERL_UNUSED_ARG(prog);
15909     PERL_UNUSED_ARG(reginfo);
15910 #endif	/* DEBUGGING */
15911 }
15912 
15913 
15914 
15915 SV *
15916 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15917 {				/* Assume that RE_INTUIT is set */
15918     dVAR;
15919     struct regexp *const prog = ReANY(r);
15920     GET_RE_DEBUG_FLAGS_DECL;
15921 
15922     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15923     PERL_UNUSED_CONTEXT;
15924 
15925     DEBUG_COMPILE_r(
15926 	{
15927 	    const char * const s = SvPV_nolen_const(prog->check_substr
15928 		      ? prog->check_substr : prog->check_utf8);
15929 
15930 	    if (!PL_colorset) reginitcolors();
15931 	    PerlIO_printf(Perl_debug_log,
15932 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15933 		      PL_colors[4],
15934 		      prog->check_substr ? "" : "utf8 ",
15935 		      PL_colors[5],PL_colors[0],
15936 		      s,
15937 		      PL_colors[1],
15938 		      (strlen(s) > 60 ? "..." : ""));
15939 	} );
15940 
15941     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15942 }
15943 
15944 /*
15945    pregfree()
15946 
15947    handles refcounting and freeing the perl core regexp structure. When
15948    it is necessary to actually free the structure the first thing it
15949    does is call the 'free' method of the regexp_engine associated to
15950    the regexp, allowing the handling of the void *pprivate; member
15951    first. (This routine is not overridable by extensions, which is why
15952    the extensions free is called first.)
15953 
15954    See regdupe and regdupe_internal if you change anything here.
15955 */
15956 #ifndef PERL_IN_XSUB_RE
15957 void
15958 Perl_pregfree(pTHX_ REGEXP *r)
15959 {
15960     SvREFCNT_dec(r);
15961 }
15962 
15963 void
15964 Perl_pregfree2(pTHX_ REGEXP *rx)
15965 {
15966     dVAR;
15967     struct regexp *const r = ReANY(rx);
15968     GET_RE_DEBUG_FLAGS_DECL;
15969 
15970     PERL_ARGS_ASSERT_PREGFREE2;
15971 
15972     if (r->mother_re) {
15973         ReREFCNT_dec(r->mother_re);
15974     } else {
15975         CALLREGFREE_PVT(rx); /* free the private data */
15976         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15977 	Safefree(r->xpv_len_u.xpvlenu_pv);
15978     }
15979     if (r->substrs) {
15980         SvREFCNT_dec(r->anchored_substr);
15981         SvREFCNT_dec(r->anchored_utf8);
15982         SvREFCNT_dec(r->float_substr);
15983         SvREFCNT_dec(r->float_utf8);
15984 	Safefree(r->substrs);
15985     }
15986     RX_MATCH_COPY_FREE(rx);
15987 #ifdef PERL_ANY_COW
15988     SvREFCNT_dec(r->saved_copy);
15989 #endif
15990     Safefree(r->offs);
15991     SvREFCNT_dec(r->qr_anoncv);
15992     rx->sv_u.svu_rx = 0;
15993 }
15994 
15995 /*  reg_temp_copy()
15996 
15997     This is a hacky workaround to the structural issue of match results
15998     being stored in the regexp structure which is in turn stored in
15999     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16000     could be PL_curpm in multiple contexts, and could require multiple
16001     result sets being associated with the pattern simultaneously, such
16002     as when doing a recursive match with (??{$qr})
16003 
16004     The solution is to make a lightweight copy of the regexp structure
16005     when a qr// is returned from the code executed by (??{$qr}) this
16006     lightweight copy doesn't actually own any of its data except for
16007     the starp/end and the actual regexp structure itself.
16008 
16009 */
16010 
16011 
16012 REGEXP *
16013 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16014 {
16015     struct regexp *ret;
16016     struct regexp *const r = ReANY(rx);
16017     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16018 
16019     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16020 
16021     if (!ret_x)
16022 	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16023     else {
16024 	SvOK_off((SV *)ret_x);
16025 	if (islv) {
16026 	    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16027 	       to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16028 	       made both spots point to the same regexp body.) */
16029 	    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16030 	    assert(!SvPVX(ret_x));
16031 	    ret_x->sv_u.svu_rx = temp->sv_any;
16032 	    temp->sv_any = NULL;
16033 	    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16034 	    SvREFCNT_dec_NN(temp);
16035 	    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16036 	       ing below will not set it. */
16037 	    SvCUR_set(ret_x, SvCUR(rx));
16038 	}
16039     }
16040     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16041        sv_force_normal(sv) is called.  */
16042     SvFAKE_on(ret_x);
16043     ret = ReANY(ret_x);
16044 
16045     SvFLAGS(ret_x) |= SvUTF8(rx);
16046     /* We share the same string buffer as the original regexp, on which we
16047        hold a reference count, incremented when mother_re is set below.
16048        The string pointer is copied here, being part of the regexp struct.
16049      */
16050     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16051 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16052     if (r->offs) {
16053         const I32 npar = r->nparens+1;
16054         Newx(ret->offs, npar, regexp_paren_pair);
16055         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16056     }
16057     if (r->substrs) {
16058         Newx(ret->substrs, 1, struct reg_substr_data);
16059 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16060 
16061 	SvREFCNT_inc_void(ret->anchored_substr);
16062 	SvREFCNT_inc_void(ret->anchored_utf8);
16063 	SvREFCNT_inc_void(ret->float_substr);
16064 	SvREFCNT_inc_void(ret->float_utf8);
16065 
16066 	/* check_substr and check_utf8, if non-NULL, point to either their
16067 	   anchored or float namesakes, and don't hold a second reference.  */
16068     }
16069     RX_MATCH_COPIED_off(ret_x);
16070 #ifdef PERL_ANY_COW
16071     ret->saved_copy = NULL;
16072 #endif
16073     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16074     SvREFCNT_inc_void(ret->qr_anoncv);
16075 
16076     return ret_x;
16077 }
16078 #endif
16079 
16080 /* regfree_internal()
16081 
16082    Free the private data in a regexp. This is overloadable by
16083    extensions. Perl takes care of the regexp structure in pregfree(),
16084    this covers the *pprivate pointer which technically perl doesn't
16085    know about, however of course we have to handle the
16086    regexp_internal structure when no extension is in use.
16087 
16088    Note this is called before freeing anything in the regexp
16089    structure.
16090  */
16091 
16092 void
16093 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16094 {
16095     dVAR;
16096     struct regexp *const r = ReANY(rx);
16097     RXi_GET_DECL(r,ri);
16098     GET_RE_DEBUG_FLAGS_DECL;
16099 
16100     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16101 
16102     DEBUG_COMPILE_r({
16103 	if (!PL_colorset)
16104 	    reginitcolors();
16105 	{
16106 	    SV *dsv= sv_newmortal();
16107             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16108                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16109             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16110                 PL_colors[4],PL_colors[5],s);
16111         }
16112     });
16113 #ifdef RE_TRACK_PATTERN_OFFSETS
16114     if (ri->u.offsets)
16115         Safefree(ri->u.offsets);             /* 20010421 MJD */
16116 #endif
16117     if (ri->code_blocks) {
16118 	int n;
16119 	for (n = 0; n < ri->num_code_blocks; n++)
16120 	    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16121 	Safefree(ri->code_blocks);
16122     }
16123 
16124     if (ri->data) {
16125 	int n = ri->data->count;
16126 
16127 	while (--n >= 0) {
16128           /* If you add a ->what type here, update the comment in regcomp.h */
16129 	    switch (ri->data->what[n]) {
16130 	    case 'a':
16131 	    case 'r':
16132 	    case 's':
16133 	    case 'S':
16134 	    case 'u':
16135 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16136 		break;
16137 	    case 'f':
16138 		Safefree(ri->data->data[n]);
16139 		break;
16140 	    case 'l':
16141 	    case 'L':
16142 	        break;
16143             case 'T':
16144                 { /* Aho Corasick add-on structure for a trie node.
16145                      Used in stclass optimization only */
16146                     U32 refcount;
16147                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16148                     OP_REFCNT_LOCK;
16149                     refcount = --aho->refcount;
16150                     OP_REFCNT_UNLOCK;
16151                     if ( !refcount ) {
16152                         PerlMemShared_free(aho->states);
16153                         PerlMemShared_free(aho->fail);
16154 			 /* do this last!!!! */
16155                         PerlMemShared_free(ri->data->data[n]);
16156                         PerlMemShared_free(ri->regstclass);
16157                     }
16158                 }
16159                 break;
16160 	    case 't':
16161 	        {
16162 	            /* trie structure. */
16163 	            U32 refcount;
16164 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16165                     OP_REFCNT_LOCK;
16166                     refcount = --trie->refcount;
16167                     OP_REFCNT_UNLOCK;
16168                     if ( !refcount ) {
16169                         PerlMemShared_free(trie->charmap);
16170                         PerlMemShared_free(trie->states);
16171                         PerlMemShared_free(trie->trans);
16172                         if (trie->bitmap)
16173                             PerlMemShared_free(trie->bitmap);
16174                         if (trie->jump)
16175                             PerlMemShared_free(trie->jump);
16176 			PerlMemShared_free(trie->wordinfo);
16177                         /* do this last!!!! */
16178                         PerlMemShared_free(ri->data->data[n]);
16179 		    }
16180 		}
16181 		break;
16182 	    default:
16183 		Perl_croak(aTHX_ "panic: regfree data code '%c'",
16184                                                     ri->data->what[n]);
16185 	    }
16186 	}
16187 	Safefree(ri->data->what);
16188 	Safefree(ri->data);
16189     }
16190 
16191     Safefree(ri);
16192 }
16193 
16194 #define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16195 #define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16196 #define SAVEPVN(p,n)	((p) ? savepvn(p,n) : NULL)
16197 
16198 /*
16199    re_dup - duplicate a regexp.
16200 
16201    This routine is expected to clone a given regexp structure. It is only
16202    compiled under USE_ITHREADS.
16203 
16204    After all of the core data stored in struct regexp is duplicated
16205    the regexp_engine.dupe method is used to copy any private data
16206    stored in the *pprivate pointer. This allows extensions to handle
16207    any duplication it needs to do.
16208 
16209    See pregfree() and regfree_internal() if you change anything here.
16210 */
16211 #if defined(USE_ITHREADS)
16212 #ifndef PERL_IN_XSUB_RE
16213 void
16214 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16215 {
16216     dVAR;
16217     I32 npar;
16218     const struct regexp *r = ReANY(sstr);
16219     struct regexp *ret = ReANY(dstr);
16220 
16221     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16222 
16223     npar = r->nparens+1;
16224     Newx(ret->offs, npar, regexp_paren_pair);
16225     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16226 
16227     if (ret->substrs) {
16228 	/* Do it this way to avoid reading from *r after the StructCopy().
16229 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16230 	   cache, it doesn't matter.  */
16231 	const bool anchored = r->check_substr
16232 	    ? r->check_substr == r->anchored_substr
16233 	    : r->check_utf8 == r->anchored_utf8;
16234         Newx(ret->substrs, 1, struct reg_substr_data);
16235 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16236 
16237 	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16238 	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16239 	ret->float_substr = sv_dup_inc(ret->float_substr, param);
16240 	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16241 
16242 	/* check_substr and check_utf8, if non-NULL, point to either their
16243 	   anchored or float namesakes, and don't hold a second reference.  */
16244 
16245 	if (ret->check_substr) {
16246 	    if (anchored) {
16247 		assert(r->check_utf8 == r->anchored_utf8);
16248 		ret->check_substr = ret->anchored_substr;
16249 		ret->check_utf8 = ret->anchored_utf8;
16250 	    } else {
16251 		assert(r->check_substr == r->float_substr);
16252 		assert(r->check_utf8 == r->float_utf8);
16253 		ret->check_substr = ret->float_substr;
16254 		ret->check_utf8 = ret->float_utf8;
16255 	    }
16256 	} else if (ret->check_utf8) {
16257 	    if (anchored) {
16258 		ret->check_utf8 = ret->anchored_utf8;
16259 	    } else {
16260 		ret->check_utf8 = ret->float_utf8;
16261 	    }
16262 	}
16263     }
16264 
16265     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16266     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16267 
16268     if (ret->pprivate)
16269 	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16270 
16271     if (RX_MATCH_COPIED(dstr))
16272 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16273     else
16274 	ret->subbeg = NULL;
16275 #ifdef PERL_ANY_COW
16276     ret->saved_copy = NULL;
16277 #endif
16278 
16279     /* Whether mother_re be set or no, we need to copy the string.  We
16280        cannot refrain from copying it when the storage points directly to
16281        our mother regexp, because that's
16282 	       1: a buffer in a different thread
16283 	       2: something we no longer hold a reference on
16284 	       so we need to copy it locally.  */
16285     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16286     ret->mother_re   = NULL;
16287 }
16288 #endif /* PERL_IN_XSUB_RE */
16289 
16290 /*
16291    regdupe_internal()
16292 
16293    This is the internal complement to regdupe() which is used to copy
16294    the structure pointed to by the *pprivate pointer in the regexp.
16295    This is the core version of the extension overridable cloning hook.
16296    The regexp structure being duplicated will be copied by perl prior
16297    to this and will be provided as the regexp *r argument, however
16298    with the /old/ structures pprivate pointer value. Thus this routine
16299    may override any copying normally done by perl.
16300 
16301    It returns a pointer to the new regexp_internal structure.
16302 */
16303 
16304 void *
16305 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16306 {
16307     dVAR;
16308     struct regexp *const r = ReANY(rx);
16309     regexp_internal *reti;
16310     int len;
16311     RXi_GET_DECL(r,ri);
16312 
16313     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16314 
16315     len = ProgLen(ri);
16316 
16317     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16318           char, regexp_internal);
16319     Copy(ri->program, reti->program, len+1, regnode);
16320 
16321     reti->num_code_blocks = ri->num_code_blocks;
16322     if (ri->code_blocks) {
16323 	int n;
16324 	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16325 		struct reg_code_block);
16326 	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16327 		struct reg_code_block);
16328 	for (n = 0; n < ri->num_code_blocks; n++)
16329 	     reti->code_blocks[n].src_regex = (REGEXP*)
16330 		    sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16331     }
16332     else
16333 	reti->code_blocks = NULL;
16334 
16335     reti->regstclass = NULL;
16336 
16337     if (ri->data) {
16338 	struct reg_data *d;
16339         const int count = ri->data->count;
16340 	int i;
16341 
16342 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16343 		char, struct reg_data);
16344 	Newx(d->what, count, U8);
16345 
16346 	d->count = count;
16347 	for (i = 0; i < count; i++) {
16348 	    d->what[i] = ri->data->what[i];
16349 	    switch (d->what[i]) {
16350 	        /* see also regcomp.h and regfree_internal() */
16351 	    case 'a': /* actually an AV, but the dup function is identical.  */
16352 	    case 'r':
16353 	    case 's':
16354 	    case 'S':
16355 	    case 'u': /* actually an HV, but the dup function is identical.  */
16356 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16357 		break;
16358 	    case 'f':
16359 		/* This is cheating. */
16360 		Newx(d->data[i], 1, regnode_ssc);
16361 		StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16362 		reti->regstclass = (regnode*)d->data[i];
16363 		break;
16364 	    case 'T':
16365 		/* Trie stclasses are readonly and can thus be shared
16366 		 * without duplication. We free the stclass in pregfree
16367 		 * when the corresponding reg_ac_data struct is freed.
16368 		 */
16369 		reti->regstclass= ri->regstclass;
16370 		/* Fall through */
16371 	    case 't':
16372 		OP_REFCNT_LOCK;
16373 		((reg_trie_data*)ri->data->data[i])->refcount++;
16374 		OP_REFCNT_UNLOCK;
16375 		/* Fall through */
16376 	    case 'l':
16377 	    case 'L':
16378 		d->data[i] = ri->data->data[i];
16379 		break;
16380             default:
16381 		Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16382                                                            ri->data->what[i]);
16383 	    }
16384 	}
16385 
16386 	reti->data = d;
16387     }
16388     else
16389 	reti->data = NULL;
16390 
16391     reti->name_list_idx = ri->name_list_idx;
16392 
16393 #ifdef RE_TRACK_PATTERN_OFFSETS
16394     if (ri->u.offsets) {
16395         Newx(reti->u.offsets, 2*len+1, U32);
16396         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16397     }
16398 #else
16399     SetProgLen(reti,len);
16400 #endif
16401 
16402     return (void*)reti;
16403 }
16404 
16405 #endif    /* USE_ITHREADS */
16406 
16407 #ifndef PERL_IN_XSUB_RE
16408 
16409 /*
16410  - regnext - dig the "next" pointer out of a node
16411  */
16412 regnode *
16413 Perl_regnext(pTHX_ regnode *p)
16414 {
16415     dVAR;
16416     I32 offset;
16417 
16418     if (!p)
16419 	return(NULL);
16420 
16421     if (OP(p) > REGNODE_MAX) {		/* regnode.type is unsigned */
16422 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16423                                                 (int)OP(p), (int)REGNODE_MAX);
16424     }
16425 
16426     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16427     if (offset == 0)
16428 	return(NULL);
16429 
16430     return(p+offset);
16431 }
16432 #endif
16433 
16434 STATIC void
16435 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16436 {
16437     va_list args;
16438     STRLEN l1 = strlen(pat1);
16439     STRLEN l2 = strlen(pat2);
16440     char buf[512];
16441     SV *msv;
16442     const char *message;
16443 
16444     PERL_ARGS_ASSERT_RE_CROAK2;
16445 
16446     if (l1 > 510)
16447 	l1 = 510;
16448     if (l1 + l2 > 510)
16449 	l2 = 510 - l1;
16450     Copy(pat1, buf, l1 , char);
16451     Copy(pat2, buf + l1, l2 , char);
16452     buf[l1 + l2] = '\n';
16453     buf[l1 + l2 + 1] = '\0';
16454     va_start(args, pat2);
16455     msv = vmess(buf, &args);
16456     va_end(args);
16457     message = SvPV_const(msv,l1);
16458     if (l1 > 512)
16459 	l1 = 512;
16460     Copy(message, buf, l1 , char);
16461     /* l1-1 to avoid \n */
16462     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16463 }
16464 
16465 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16466 
16467 #ifndef PERL_IN_XSUB_RE
16468 void
16469 Perl_save_re_context(pTHX)
16470 {
16471     dVAR;
16472     I32 nparens = -1;
16473     I32 i;
16474 
16475     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16476 
16477     if (PL_curpm) {
16478 	const REGEXP * const rx = PM_GETRE(PL_curpm);
16479 	if (rx)
16480             nparens = RX_NPARENS(rx);
16481     }
16482 
16483     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
16484      * that PL_curpm will be null, but that utf8.pm and the modules it
16485      * loads will only use $1..$3.
16486      * The t/porting/re_context.t test file checks this assumption.
16487      */
16488     if (nparens == -1)
16489         nparens = 3;
16490 
16491     for (i = 1; i <= nparens; i++) {
16492         char digits[TYPE_CHARS(long)];
16493         const STRLEN len = my_snprintf(digits, sizeof(digits),
16494                                        "%lu", (long)i);
16495         GV *const *const gvp
16496             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16497 
16498         if (gvp) {
16499             GV * const gv = *gvp;
16500             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16501                 save_scalar(gv);
16502         }
16503     }
16504 }
16505 #endif
16506 
16507 #ifdef DEBUGGING
16508 
16509 STATIC void
16510 S_put_byte(pTHX_ SV *sv, int c)
16511 {
16512     PERL_ARGS_ASSERT_PUT_BYTE;
16513 
16514     if (!isPRINT(c)) {
16515         switch (c) {
16516             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16517             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16518             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16519             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16520             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16521 
16522             default:
16523                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16524                 break;
16525         }
16526     }
16527     else {
16528 	const char string = c;
16529 	if (c == '-' || c == ']' || c == '\\' || c == '^')
16530 	    sv_catpvs(sv, "\\");
16531 	sv_catpvn(sv, &string, 1);
16532     }
16533 }
16534 
16535 STATIC void
16536 S_put_range(pTHX_ SV *sv, UV start, UV end)
16537 {
16538 
16539     /* Appends to 'sv' a displayable version of the range of code points from
16540      * 'start' to 'end' */
16541 
16542     assert(start <= end);
16543 
16544     PERL_ARGS_ASSERT_PUT_RANGE;
16545 
16546     if (end - start < 3) {  /* Individual chars in short ranges */
16547         for (; start <= end; start++)
16548             put_byte(sv, start);
16549     }
16550     else if (   end > 255
16551              || ! isALPHANUMERIC(start)
16552              || ! isALPHANUMERIC(end)
16553              || isDIGIT(start) != isDIGIT(end)
16554              || isUPPER(start) != isUPPER(end)
16555              || isLOWER(start) != isLOWER(end)
16556 
16557                 /* This final test should get optimized out except on EBCDIC
16558                  * platforms, where it causes ranges that cross discontinuities
16559                  * like i/j to be shown as hex instead of the misleading,
16560                  * e.g. H-K (since that range includes more than H, I, J, K).
16561                  * */
16562              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16563     {
16564         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16565                        start,
16566                        (end < 256) ? end : 255);
16567     }
16568     else { /* Here, the ends of the range are both digits, or both uppercase,
16569               or both lowercase; and there's no discontinuity in the range
16570               (which could happen on EBCDIC platforms) */
16571         put_byte(sv, start);
16572         sv_catpvs(sv, "-");
16573         put_byte(sv, end);
16574     }
16575 }
16576 
16577 STATIC bool
16578 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16579 {
16580     /* Appends to 'sv' a displayable version of the innards of the bracketed
16581      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16582      * output anything */
16583 
16584     int i;
16585     bool has_output_anything = FALSE;
16586 
16587     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16588 
16589     for (i = 0; i < 256; i++) {
16590         if (BITMAP_TEST((U8 *) bitmap,i)) {
16591 
16592             /* The character at index i should be output.  Find the next
16593              * character that should NOT be output */
16594             int j;
16595             for (j = i + 1; j < 256; j++) {
16596                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16597                     break;
16598                 }
16599             }
16600 
16601             /* Everything between them is a single range that should be output
16602              * */
16603             put_range(sv, i, j - 1);
16604             has_output_anything = TRUE;
16605             i = j;
16606         }
16607     }
16608 
16609     return has_output_anything;
16610 }
16611 
16612 #define CLEAR_OPTSTART \
16613     if (optstart) STMT_START {                                               \
16614         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16615                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16616 	optstart=NULL;                                                       \
16617     } STMT_END
16618 
16619 #define DUMPUNTIL(b,e)                                                       \
16620                     CLEAR_OPTSTART;                                          \
16621                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16622 
16623 STATIC const regnode *
16624 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16625 	    const regnode *last, const regnode *plast,
16626 	    SV* sv, I32 indent, U32 depth)
16627 {
16628     dVAR;
16629     U8 op = PSEUDO;	/* Arbitrary non-END op. */
16630     const regnode *next;
16631     const regnode *optstart= NULL;
16632 
16633     RXi_GET_DECL(r,ri);
16634     GET_RE_DEBUG_FLAGS_DECL;
16635 
16636     PERL_ARGS_ASSERT_DUMPUNTIL;
16637 
16638 #ifdef DEBUG_DUMPUNTIL
16639     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16640         last ? last-start : 0,plast ? plast-start : 0);
16641 #endif
16642 
16643     if (plast && plast < last)
16644         last= plast;
16645 
16646     while (PL_regkind[op] != END && (!last || node < last)) {
16647 	/* While that wasn't END last time... */
16648 	NODE_ALIGN(node);
16649 	op = OP(node);
16650 	if (op == CLOSE || op == WHILEM)
16651 	    indent--;
16652 	next = regnext((regnode *)node);
16653 
16654 	/* Where, what. */
16655 	if (OP(node) == OPTIMIZED) {
16656 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16657 	        optstart = node;
16658 	    else
16659 		goto after_print;
16660 	} else
16661 	    CLEAR_OPTSTART;
16662 
16663 	regprop(r, sv, node, NULL);
16664 	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16665 		      (int)(2*indent + 1), "", SvPVX_const(sv));
16666 
16667         if (OP(node) != OPTIMIZED) {
16668             if (next == NULL)		/* Next ptr. */
16669                 PerlIO_printf(Perl_debug_log, " (0)");
16670             else if (PL_regkind[(U8)op] == BRANCH
16671                      && PL_regkind[OP(next)] != BRANCH )
16672                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16673             else
16674                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16675             (void)PerlIO_putc(Perl_debug_log, '\n');
16676         }
16677 
16678       after_print:
16679 	if (PL_regkind[(U8)op] == BRANCHJ) {
16680 	    assert(next);
16681 	    {
16682                 const regnode *nnode = (OP(next) == LONGJMP
16683                                        ? regnext((regnode *)next)
16684                                        : next);
16685                 if (last && nnode > last)
16686                     nnode = last;
16687                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16688 	    }
16689 	}
16690 	else if (PL_regkind[(U8)op] == BRANCH) {
16691 	    assert(next);
16692 	    DUMPUNTIL(NEXTOPER(node), next);
16693 	}
16694 	else if ( PL_regkind[(U8)op]  == TRIE ) {
16695 	    const regnode *this_trie = node;
16696 	    const char op = OP(node);
16697             const U32 n = ARG(node);
16698 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
16699                (reg_ac_data *)ri->data->data[n] :
16700                NULL;
16701 	    const reg_trie_data * const trie =
16702 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16703 #ifdef DEBUGGING
16704 	    AV *const trie_words
16705                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16706 #endif
16707 	    const regnode *nextbranch= NULL;
16708 	    I32 word_idx;
16709             sv_setpvs(sv, "");
16710 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16711 		SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16712 
16713                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16714                    (int)(2*(indent+3)), "",
16715                     elem_ptr
16716                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16717                                 SvCUR(*elem_ptr), 60,
16718                                 PL_colors[0], PL_colors[1],
16719                                 (SvUTF8(*elem_ptr)
16720                                  ? PERL_PV_ESCAPE_UNI
16721                                  : 0)
16722                                 | PERL_PV_PRETTY_ELLIPSES
16723                                 | PERL_PV_PRETTY_LTGT
16724                             )
16725                     : "???"
16726                 );
16727                 if (trie->jump) {
16728                     U16 dist= trie->jump[word_idx+1];
16729 		    PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16730                                (UV)((dist ? this_trie + dist : next) - start));
16731                     if (dist) {
16732                         if (!nextbranch)
16733                             nextbranch= this_trie + trie->jump[0];
16734 			DUMPUNTIL(this_trie + dist, nextbranch);
16735                     }
16736                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16737                         nextbranch= regnext((regnode *)nextbranch);
16738                 } else {
16739                     PerlIO_printf(Perl_debug_log, "\n");
16740 		}
16741 	    }
16742 	    if (last && next > last)
16743 	        node= last;
16744 	    else
16745 	        node= next;
16746 	}
16747 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16748 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16749                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16750 	}
16751 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16752 	    assert(next);
16753 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16754 	}
16755 	else if ( op == PLUS || op == STAR) {
16756 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16757 	}
16758 	else if (PL_regkind[(U8)op] == ANYOF) {
16759 	    /* arglen 1 + class block */
16760 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16761                           ? ANYOF_POSIXL_SKIP
16762                           : ANYOF_SKIP);
16763 	    node = NEXTOPER(node);
16764 	}
16765 	else if (PL_regkind[(U8)op] == EXACT) {
16766             /* Literal string, where present. */
16767 	    node += NODE_SZ_STR(node) - 1;
16768 	    node = NEXTOPER(node);
16769 	}
16770 	else {
16771 	    node = NEXTOPER(node);
16772 	    node += regarglen[(U8)op];
16773 	}
16774 	if (op == CURLYX || op == OPEN)
16775 	    indent++;
16776     }
16777     CLEAR_OPTSTART;
16778 #ifdef DEBUG_DUMPUNTIL
16779     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16780 #endif
16781     return node;
16782 }
16783 
16784 #endif	/* DEBUGGING */
16785 
16786 /*
16787  * Local variables:
16788  * c-indentation-style: bsd
16789  * c-basic-offset: 4
16790  * indent-tabs-mode: nil
16791  * End:
16792  *
16793  * ex: set ts=8 sts=4 sw=4 et:
16794  */
16795