1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEXEC_C
75 #include "perl.h"
76 
77 #ifdef PERL_IN_XSUB_RE
78 #  include "re_comp.h"
79 #else
80 #  include "regcomp.h"
81 #endif
82 
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
85 
86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
88 
89 static const char utf8_locale_required[] =
90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
91 
92 #ifdef DEBUGGING
93 /* At least one required character in the target string is expressible only in
94  * UTF-8. */
95 static const char non_utf8_target_but_utf8_required[]
96                 = "Can't match, because target string needs to be in UTF-8\n";
97 #endif
98 
99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
101     goto target;                                                         \
102 } STMT_END
103 
104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105 
106 #ifndef STATIC
107 #define	STATIC	static
108 #endif
109 
110 /*
111  * Forwards.
112  */
113 
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 
116 #define HOPc(pos,off) \
117 	(char *)(reginfo->is_utf8_target \
118 	    ? reghop3((U8*)pos, off, \
119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
120 	    : (U8*)(pos + off))
121 
122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
123 #define HOPBACK3(pos, off, lim) \
124 	(reginfo->is_utf8_target                          \
125 	    ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
126 	    : (pos - off >= lim)	                         \
127 		? (U8*)pos - off		                 \
128 		: NULL)
129 
130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
131 
132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
134 
135 /* lim must be +ve. Returns NULL on overshoot */
136 #define HOPMAYBE3(pos,off,lim) \
137 	(reginfo->is_utf8_target                        \
138 	    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
139 	    : ((U8*)pos + off <= lim)                   \
140 		? (U8*)pos + off                        \
141 		: NULL)
142 
143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
144  * off must be >=0; args should be vars rather than expressions */
145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
149 
150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
152     : (U8*)(pos + off))
153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
154 
155 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
157 
158 /* for use after a quantifier and before an EXACT-like node -- japhy */
159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
160  *
161  * NOTE that *nothing* that affects backtracking should be in here, specifically
162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
163  * node that is in between two EXACT like nodes when ascertaining what the required
164  * "follow" character is. This should probably be moved to regex compile time
165  * although it may be done at run time beause of the REF possibility - more
166  * investigation required. -- demerphq
167 */
168 #define JUMPABLE(rn) (                                                             \
169     OP(rn) == OPEN ||                                                              \
170     (OP(rn) == CLOSE &&                                                            \
171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
172     OP(rn) == EVAL ||                                                              \
173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
175     OP(rn) == KEEPS ||                                                             \
176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
177 )
178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
179 
180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
181 
182 /*
183   Search for mandatory following text node; for lookahead, the text must
184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
185 */
186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
187     while (JUMPABLE(rn)) { \
188 	const OPCODE type = OP(rn); \
189 	if (type == SUSPEND || PL_regkind[type] == CURLY) \
190 	    rn = NEXTOPER(NEXTOPER(rn)); \
191 	else if (type == PLUS) \
192 	    rn = NEXTOPER(rn); \
193 	else if (type == IFMATCH) \
194 	    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
195 	else rn += NEXT_OFF(rn); \
196     } \
197 } STMT_END
198 
199 #define SLAB_FIRST(s) (&(s)->states[0])
200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
201 
202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
204 static regmatch_state * S_push_slab(pTHX);
205 
206 #define REGCP_PAREN_ELEMS 3
207 #define REGCP_OTHER_ELEMS 3
208 #define REGCP_FRAME_ELEMS 1
209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
210  * are needed for the regexp context stack bookkeeping. */
211 
212 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen _pDEPTH)213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
214 {
215     const int retval = PL_savestack_ix;
216     const int paren_elems_to_push =
217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
220     I32 p;
221     GET_RE_DEBUG_FLAGS_DECL;
222 
223     PERL_ARGS_ASSERT_REGCPPUSH;
224 
225     if (paren_elems_to_push < 0)
226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
227                    (int)paren_elems_to_push, (int)maxopenparen,
228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
229 
230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
231 	Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
232 		   " out of range (%lu-%ld)",
233 		   total_elems,
234                    (unsigned long)maxopenparen,
235                    (long)parenfloor);
236 
237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
238 
239     DEBUG_BUFFERS_r(
240 	if ((int)maxopenparen > (int)parenfloor)
241             Perl_re_exec_indentf( aTHX_
242 		"rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
243 		depth,
244                 PTR2UV(rex),
245 		PTR2UV(rex->offs)
246 	    );
247     );
248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
250 	SSPUSHIV(rex->offs[p].end);
251 	SSPUSHIV(rex->offs[p].start);
252 	SSPUSHINT(rex->offs[p].start_tmp);
253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
254 	    "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
255 	    depth,
256             (UV)p,
257 	    (IV)rex->offs[p].start,
258 	    (IV)rex->offs[p].start_tmp,
259 	    (IV)rex->offs[p].end
260 	));
261     }
262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
263     SSPUSHINT(maxopenparen);
264     SSPUSHINT(rex->lastparen);
265     SSPUSHINT(rex->lastcloseparen);
266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
267 
268     return retval;
269 }
270 
271 /* These are needed since we do not localize EVAL nodes: */
272 #define REGCP_SET(cp)                                           \
273     DEBUG_STATE_r(                                              \
274         Perl_re_exec_indentf( aTHX_                             \
275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
276             depth, (IV)PL_savestack_ix                          \
277         )                                                       \
278     );                                                          \
279     cp = PL_savestack_ix
280 
281 #define REGCP_UNWIND(cp)                                        \
282     DEBUG_STATE_r(                                              \
283         if (cp != PL_savestack_ix)                              \
284             Perl_re_exec_indentf( aTHX_                         \
285                 "Clearing an EVAL scope, savestack=%"           \
286                 IVdf "..%" IVdf "\n",                           \
287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
288             )                                                   \
289     );                                                          \
290     regcpblow(cp)
291 
292 /* set the start and end positions of capture ix */
293 #define CLOSE_CAPTURE(ix, s, e)                                            \
294     rex->offs[ix].start = s;                                               \
295     rex->offs[ix].end = e;                                                 \
296     if (ix > rex->lastparen)                                               \
297         rex->lastparen = ix;                                               \
298     rex->lastcloseparen = ix;                                              \
299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
301         depth,                                                             \
302         PTR2UV(rex),                                                       \
303         PTR2UV(rex->offs),                                                 \
304         (UV)ix,                                                            \
305         (IV)rex->offs[ix].start,                                           \
306         (IV)rex->offs[ix].end,                                             \
307         (UV)rex->lastparen                                                 \
308     ))
309 
310 #define UNWIND_PAREN(lp, lcp)               \
311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
313         depth,                              \
314         PTR2UV(rex),                        \
315         PTR2UV(rex->offs),                  \
316         (UV)(lp),                           \
317         (UV)(rex->lastparen),               \
318         (UV)(lcp)                           \
319     ));                                     \
320     for (n = rex->lastparen; n > lp; n--)   \
321         rex->offs[n].end = -1;              \
322     rex->lastparen = n;                     \
323     rex->lastcloseparen = lcp;
324 
325 
326 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p _pDEPTH)327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
328 {
329     UV i;
330     U32 paren;
331     GET_RE_DEBUG_FLAGS_DECL;
332 
333     PERL_ARGS_ASSERT_REGCPPOP;
334 
335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
336     i = SSPOPUV;
337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
339     rex->lastcloseparen = SSPOPINT;
340     rex->lastparen = SSPOPINT;
341     *maxopenparen_p = SSPOPINT;
342 
343     i -= REGCP_OTHER_ELEMS;
344     /* Now restore the parentheses context. */
345     DEBUG_BUFFERS_r(
346 	if (i || rex->lastparen + 1 <= rex->nparens)
347             Perl_re_exec_indentf( aTHX_
348 		"rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
349 		depth,
350                 PTR2UV(rex),
351 		PTR2UV(rex->offs)
352 	    );
353     );
354     paren = *maxopenparen_p;
355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
356 	SSize_t tmps;
357 	rex->offs[paren].start_tmp = SSPOPINT;
358 	rex->offs[paren].start = SSPOPIV;
359 	tmps = SSPOPIV;
360 	if (paren <= rex->lastparen)
361 	    rex->offs[paren].end = tmps;
362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
363 	    "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
364 	    depth,
365             (UV)paren,
366 	    (IV)rex->offs[paren].start,
367 	    (IV)rex->offs[paren].start_tmp,
368 	    (IV)rex->offs[paren].end,
369 	    (paren > rex->lastparen ? "(skipped)" : ""));
370 	);
371 	paren--;
372     }
373 #if 1
374     /* It would seem that the similar code in regtry()
375      * already takes care of this, and in fact it is in
376      * a better location to since this code can #if 0-ed out
377      * but the code in regtry() is needed or otherwise tests
378      * requiring null fields (pat.t#187 and split.t#{13,14}
379      * (as of patchlevel 7877)  will fail.  Then again,
380      * this code seems to be necessary or otherwise
381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
382      * --jhi updated by dapm */
383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
384 	if (i > *maxopenparen_p)
385 	    rex->offs[i].start = -1;
386 	rex->offs[i].end = -1;
387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
388 	    "    \\%" UVuf ": %s   ..-1 undeffing\n",
389 	    depth,
390             (UV)i,
391 	    (i > *maxopenparen_p) ? "-1" : "  "
392 	));
393     }
394 #endif
395 }
396 
397 /* restore the parens and associated vars at savestack position ix,
398  * but without popping the stack */
399 
400 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p _pDEPTH)401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
402 {
403     I32 tmpix = PL_savestack_ix;
404     PERL_ARGS_ASSERT_REGCP_RESTORE;
405 
406     PL_savestack_ix = ix;
407     regcppop(rex, maxopenparen_p);
408     PL_savestack_ix = tmpix;
409 }
410 
411 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
412 
413 #ifndef PERL_IN_XSUB_RE
414 
415 bool
Perl_isFOO_lc(pTHX_ const U8 classnum,const U8 character)416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
417 {
418     /* Returns a boolean as to whether or not 'character' is a member of the
419      * Posix character class given by 'classnum' that should be equivalent to a
420      * value in the typedef '_char_class_number'.
421      *
422      * Ideally this could be replaced by a just an array of function pointers
423      * to the C library functions that implement the macros this calls.
424      * However, to compile, the precise function signatures are required, and
425      * these may vary from platform to to platform.  To avoid having to figure
426      * out what those all are on each platform, I (khw) am using this method,
427      * which adds an extra layer of function call overhead (unless the C
428      * optimizer strips it away).  But we don't particularly care about
429      * performance with locales anyway. */
430 
431     switch ((_char_class_number) classnum) {
432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
437                                         || isUPPER_LC(character);
438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
448         default:    /* VERTSPACE should never occur in locales */
449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
450     }
451 
452     NOT_REACHED; /* NOTREACHED */
453     return FALSE;
454 }
455 
456 #endif
457 
458 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(const char * s1,const char * s2,I32 len)459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
460 {
461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
462      * folded.  Works on all folds representable without UTF-8, except for
463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
464      * check that the strings each have at least 'len' characters.
465      *
466      * There is almost an identical API function where s2 need not be folded:
467      * Perl_foldEQ_latin1() */
468 
469     const U8 *a = (const U8 *)s1;
470     const U8 *b = (const U8 *)s2;
471 
472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
473 
474     assert(len >= 0);
475 
476     while (len--) {
477         assert(! isUPPER_L1(*b));
478         if (toLOWER_L1(*a) != *b) {
479             return 0;
480         }
481         a++, b++;
482     }
483     return 1;
484 }
485 
486 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
488 {
489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
490      * 'character' is a member of the Posix character class given by 'classnum'
491      * that should be equivalent to a value in the typedef
492      * '_char_class_number'.
493      *
494      * This just calls isFOO_lc on the code point for the character if it is in
495      * the range 0-255.  Outside that range, all characters use Unicode
496      * rules, ignoring any locale.  So use the Unicode function if this class
497      * requires an inversion list, and use the Unicode macro otherwise. */
498 
499     dVAR;
500 
501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
502 
503     if (UTF8_IS_INVARIANT(*character)) {
504         return isFOO_lc(classnum, *character);
505     }
506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
507         return isFOO_lc(classnum,
508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
509     }
510 
511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
512 
513     switch ((_char_class_number) classnum) {
514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
518         default:
519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
520                                         utf8_to_uvchr_buf(character, e, NULL));
521     }
522 
523     return FALSE; /* Things like CNTRL are always below 256 */
524 }
525 
526 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
528 {
529     /* Returns the position of the first byte in the sequence between 's' and
530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
531      * */
532 
533     PERL_ARGS_ASSERT_FIND_SPAN_END;
534 
535     assert(send >= s);
536 
537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
540     {
541         PERL_UINTMAX_T span_word;
542 
543         /* Process per-byte until reach word boundary.  XXX This loop could be
544          * eliminated if we knew that this platform had fast unaligned reads */
545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
546             if (*s != span_byte) {
547                 return s;
548             }
549             s++;
550         }
551 
552         /* Create a word filled with the bytes we are spanning */
553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
554 
555         /* Process per-word as long as we have at least a full word left */
556         do {
557 
558             /* Keep going if the whole word is composed of 'span_byte's */
559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
560                 s += PERL_WORDSIZE;
561                 continue;
562             }
563 
564             /* Here, at least one byte in the word isn't 'span_byte'. */
565 
566 #ifdef EBCDIC
567 
568             break;
569 
570 #else
571 
572             /* This xor leaves 1 bits only in those non-matching bytes */
573             span_word ^= * (PERL_UINTMAX_T *) s;
574 
575             /* Make sure the upper bit of each non-matching byte is set.  This
576              * makes each such byte look like an ASCII platform variant byte */
577             span_word |= span_word << 1;
578             span_word |= span_word << 2;
579             span_word |= span_word << 4;
580 
581             /* That reduces the problem to what this function solves */
582             return s + _variant_byte_number(span_word);
583 
584 #endif
585 
586         } while (s + PERL_WORDSIZE <= send);
587     }
588 
589     /* Process the straggler bytes beyond the final word boundary */
590     while (s < send) {
591         if (*s != span_byte) {
592             return s;
593         }
594         s++;
595     }
596 
597     return s;
598 }
599 
600 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
602 {
603     /* Returns the position of the first byte in the sequence between 's'
604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
605      * returns 'send' if none found.  It uses word-level operations instead of
606      * byte to speed up the process */
607 
608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
609 
610     assert(send >= s);
611     assert((byte & mask) == byte);
612 
613 #ifndef EBCDIC
614 
615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
618     {
619         PERL_UINTMAX_T word, mask_word;
620 
621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
622             if (((*s) & mask) == byte) {
623                 return s;
624             }
625             s++;
626         }
627 
628         word      = PERL_COUNT_MULTIPLIER * byte;
629         mask_word = PERL_COUNT_MULTIPLIER * mask;
630 
631         do {
632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
633 
634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
635              * it, xoring with 'word' will leave each of the 8 bits in such
636              * bytes be 0, and no byte containing any other bit pattern will be
637              * 0. */
638             masked ^= word;
639 
640             /* This causes the most significant bit to be set to 1 for any
641              * bytes in the word that aren't completely 0 */
642             masked |= masked << 1;
643             masked |= masked << 2;
644             masked |= masked << 4;
645 
646             /* The msbits are the same as what marks a byte as variant, so we
647              * can use this mask.  If all msbits are 1, the word doesn't
648              * contain 'byte' */
649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
650                 s += PERL_WORDSIZE;
651                 continue;
652             }
653 
654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
655              * and any that are, are 0.  Complement and re-AND to swap that */
656             masked = ~ masked;
657             masked &= PERL_VARIANTS_WORD_MASK;
658 
659             /* This reduces the problem to that solved by this function */
660             s += _variant_byte_number(masked);
661             return s;
662 
663         } while (s + PERL_WORDSIZE <= send);
664     }
665 
666 #endif
667 
668     while (s < send) {
669         if (((*s) & mask) == byte) {
670             return s;
671         }
672         s++;
673     }
674 
675     return s;
676 }
677 
678 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
680 {
681     /* Returns the position of the first byte in the sequence between 's' and
682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
683      * 'span_byte' should have been ANDed with 'mask' in the call of this
684      * function.  Returns 'send' if none found.  Works like find_span_end(),
685      * except for the AND */
686 
687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
688 
689     assert(send >= s);
690     assert((span_byte & mask) == span_byte);
691 
692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
695     {
696         PERL_UINTMAX_T span_word, mask_word;
697 
698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
699             if (((*s) & mask) != span_byte) {
700                 return s;
701             }
702             s++;
703         }
704 
705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
706         mask_word = PERL_COUNT_MULTIPLIER * mask;
707 
708         do {
709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
710 
711             if (masked == span_word) {
712                 s += PERL_WORDSIZE;
713                 continue;
714             }
715 
716 #ifdef EBCDIC
717 
718             break;
719 
720 #else
721 
722             masked ^= span_word;
723             masked |= masked << 1;
724             masked |= masked << 2;
725             masked |= masked << 4;
726             return s + _variant_byte_number(masked);
727 
728 #endif
729 
730         } while (s + PERL_WORDSIZE <= send);
731     }
732 
733     while (s < send) {
734         if (((*s) & mask) != span_byte) {
735             return s;
736         }
737         s++;
738     }
739 
740     return s;
741 }
742 
743 /*
744  * pregexec and friends
745  */
746 
747 #ifndef PERL_IN_XSUB_RE
748 /*
749  - pregexec - match a regexp against a string
750  */
751 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
753 	 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
754 /* stringarg: the point in the string at which to begin matching */
755 /* strend:    pointer to null at end of string */
756 /* strbeg:    real beginning of string */
757 /* minend:    end of match must be >= minend bytes after stringarg. */
758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
759  *            itself is accessed via the pointers above */
760 /* nosave:    For optimizations. */
761 {
762     PERL_ARGS_ASSERT_PREGEXEC;
763 
764     return
765 	regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
766 		      nosave ? 0 : REXEC_COPY_STR);
767 }
768 #endif
769 
770 
771 
772 /* re_intuit_start():
773  *
774  * Based on some optimiser hints, try to find the earliest position in the
775  * string where the regex could match.
776  *
777  *   rx:     the regex to match against
778  *   sv:     the SV being matched: only used for utf8 flag; the string
779  *           itself is accessed via the pointers below. Note that on
780  *           something like an overloaded SV, SvPOK(sv) may be false
781  *           and the string pointers may point to something unrelated to
782  *           the SV itself.
783  *   strbeg: real beginning of string
784  *   strpos: the point in the string at which to begin matching
785  *   strend: pointer to the byte following the last char of the string
786  *   flags   currently unused; set to 0
787  *   data:   currently unused; set to NULL
788  *
789  * The basic idea of re_intuit_start() is to use some known information
790  * about the pattern, namely:
791  *
792  *   a) the longest known anchored substring (i.e. one that's at a
793  *      constant offset from the beginning of the pattern; but not
794  *      necessarily at a fixed offset from the beginning of the
795  *      string);
796  *   b) the longest floating substring (i.e. one that's not at a constant
797  *      offset from the beginning of the pattern);
798  *   c) Whether the pattern is anchored to the string; either
799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
800  *      or anchored to pos(): /\G/;
801  *   d) A start class: a real or synthetic character class which
802  *      represents which characters are legal at the start of the pattern;
803  *
804  * to either quickly reject the match, or to find the earliest position
805  * within the string at which the pattern might match, thus avoiding
806  * running the full NFA engine at those earlier locations, only to
807  * eventually fail and retry further along.
808  *
809  * Returns NULL if the pattern can't match, or returns the address within
810  * the string which is the earliest place the match could occur.
811  *
812  * The longest of the anchored and floating substrings is called 'check'
813  * and is checked first. The other is called 'other' and is checked
814  * second. The 'other' substring may not be present.  For example,
815  *
816  *    /(abc|xyz)ABC\d{0,3}DEFG/
817  *
818  * will have
819  *
820  *   check substr (float)    = "DEFG", offset 6..9 chars
821  *   other substr (anchored) = "ABC",  offset 3..3 chars
822  *   stclass = [ax]
823  *
824  * Be aware that during the course of this function, sometimes 'anchored'
825  * refers to a substring being anchored relative to the start of the
826  * pattern, and sometimes to the pattern itself being anchored relative to
827  * the string. For example:
828  *
829  *   /\dabc/:   "abc" is anchored to the pattern;
830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
833  *                    but the pattern is anchored to the string.
834  */
835 
836 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)837 Perl_re_intuit_start(pTHX_
838                     REGEXP * const rx,
839                     SV *sv,
840                     const char * const strbeg,
841                     char *strpos,
842                     char *strend,
843                     const U32 flags,
844                     re_scream_pos_data *data)
845 {
846     struct regexp *const prog = ReANY(rx);
847     SSize_t start_shift = prog->check_offset_min;
848     /* Should be nonnegative! */
849     SSize_t end_shift   = 0;
850     /* current lowest pos in string where the regex can start matching */
851     char *rx_origin = strpos;
852     SV *check;
853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
854     U8   other_ix = 1 - prog->substrs->check_ix;
855     bool ml_anch = 0;
856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
857     char *check_at = NULL;		/* check substr found at this pos */
858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
859     RXi_GET_DECL(prog,progi);
860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
861     regmatch_info *const reginfo = &reginfo_buf;
862     GET_RE_DEBUG_FLAGS_DECL;
863 
864     PERL_ARGS_ASSERT_RE_INTUIT_START;
865     PERL_UNUSED_ARG(flags);
866     PERL_UNUSED_ARG(data);
867 
868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
869                 "Intuit: trying to determine minimum start position...\n"));
870 
871     /* for now, assume that all substr offsets are positive. If at some point
872      * in the future someone wants to do clever things with lookbehind and
873      * -ve offsets, they'll need to fix up any code in this function
874      * which uses these offsets. See the thread beginning
875      * <20140113145929.GF27210@iabyn.com>
876      */
877     assert(prog->substrs->data[0].min_offset >= 0);
878     assert(prog->substrs->data[0].max_offset >= 0);
879     assert(prog->substrs->data[1].min_offset >= 0);
880     assert(prog->substrs->data[1].max_offset >= 0);
881     assert(prog->substrs->data[2].min_offset >= 0);
882     assert(prog->substrs->data[2].max_offset >= 0);
883 
884     /* for now, assume that if both present, that the floating substring
885      * doesn't start before the anchored substring.
886      * If you break this assumption (e.g. doing better optimisations
887      * with lookahead/behind), then you'll need to audit the code in this
888      * function carefully first
889      */
890     assert(
891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
892               && (prog->float_utf8    || prog->float_substr))
893            || (prog->float_min_offset >= prog->anchored_offset));
894 
895     /* byte rather than char calculation for efficiency. It fails
896      * to quickly reject some cases that can't match, but will reject
897      * them later after doing full char arithmetic */
898     if (prog->minlen > strend - strpos) {
899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
900 			      "  String too short...\n"));
901 	goto fail;
902     }
903 
904     RXp_MATCH_UTF8_set(prog, utf8_target);
905     reginfo->is_utf8_target = cBOOL(utf8_target);
906     reginfo->info_aux = NULL;
907     reginfo->strbeg = strbeg;
908     reginfo->strend = strend;
909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
910     reginfo->intuit = 1;
911     /* not actually used within intuit, but zero for safety anyway */
912     reginfo->poscache_maxiter = 0;
913 
914     if (utf8_target) {
915         if ((!prog->anchored_utf8 && prog->anchored_substr)
916                 || (!prog->float_utf8 && prog->float_substr))
917 	    to_utf8_substr(prog);
918 	check = prog->check_utf8;
919     } else {
920 	if (!prog->check_substr && prog->check_utf8) {
921 	    if (! to_byte_substr(prog)) {
922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
923             }
924         }
925 	check = prog->check_substr;
926     }
927 
928     /* dump the various substring data */
929     DEBUG_OPTIMISE_MORE_r({
930         int i;
931         for (i=0; i<=2; i++) {
932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
933                                   : prog->substrs->data[i].substr);
934             if (!sv)
935                 continue;
936 
937             Perl_re_printf( aTHX_
938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
939                 " useful=%" IVdf " utf8=%d [%s]\n",
940                 i,
941                 (IV)prog->substrs->data[i].min_offset,
942                 (IV)prog->substrs->data[i].max_offset,
943                 (IV)prog->substrs->data[i].end_shift,
944                 BmUSEFUL(sv),
945                 utf8_target ? 1 : 0,
946                 SvPEEK(sv));
947         }
948     });
949 
950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
951 
952         /* ml_anch: check after \n?
953          *
954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
955          * with /.*.../, these flags will have been added by the
956          * compiler:
957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
959          */
960 	ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
961                    && !(prog->intflags & PREGf_IMPLICIT);
962 
963 	if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
964             /* we are only allowed to match at BOS or \G */
965 
966             /* trivially reject if there's a BOS anchor and we're not at BOS.
967              *
968              * Note that we don't try to do a similar quick reject for
969              * \G, since generally the caller will have calculated strpos
970              * based on pos() and gofs, so the string is already correctly
971              * anchored by definition; and handling the exceptions would
972              * be too fiddly (e.g. REXEC_IGNOREPOS).
973              */
974             if (   strpos != strbeg
975                 && (prog->intflags & PREGf_ANCH_SBOL))
976             {
977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
978                                 "  Not at start...\n"));
979 	        goto fail;
980 	    }
981 
982             /* in the presence of an anchor, the anchored (relative to the
983              * start of the regex) substr must also be anchored relative
984              * to strpos. So quickly reject if substr isn't found there.
985              * This works for \G too, because the caller will already have
986              * subtracted gofs from pos, and gofs is the offset from the
987              * \G to the start of the regex. For example, in /.abc\Gdef/,
988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
989              * caller will have set strpos=pos()-4; we look for the substr
990              * at position pos()-4+1, which lines up with the "a" */
991 
992 	    if (prog->check_offset_min == prog->check_offset_max) {
993 	        /* Substring at constant offset from beg-of-str... */
994 	        SSize_t slen = SvCUR(check);
995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
996 
997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
999                     (IV)prog->check_offset_min));
1000 
1001 	        if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006 		    if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026 
1027                 check_at = s;
1028                 goto success_at_start;
1029 	    }
1030 	}
1031     }
1032 
1033     end_shift = prog->check_end_shift;
1034 
1035 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037 	Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038 		   (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040 
1041   restart:
1042 
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064 
1065 
1066     /* first, look for the 'check' substring */
1067 
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071 
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083 
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090 
1091 
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108 
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111 			      "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114 
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130 
1131 	check_at = fbm_instr( start_point, end_point,
1132 		      check, multiline ? FBMrf_MULTILINE : 0);
1133 
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140 
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143 
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155 
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162 
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172 
1173 
1174     /* now look for the 'other' substring if defined */
1175 
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179 	/* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184 
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192 
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203 
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241 
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245 
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272 
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)	/* These positions already checked */
1275             s = other_last;
1276 
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282 
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308 
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317 
1318 
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327 
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365 
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383 
1384   postprocess_substr_matches:
1385 
1386     /* handle the extra constraint of /^.../m if present */
1387 
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390 
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393 
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406 
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416 
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421 
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433 
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438 
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452 
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463 
1464   success_at_start:
1465 
1466 
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470 
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473 
1474         /* XXX this value could be pre-computed */
1475         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476 		    ?  (reginfo->is_utf8_pat
1477                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : STR_LEN(progi->regstclass))
1479 		    : 1);
1480 	char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484 
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492 
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498 	 *   minlen == 0 is possible if regstclass is \b or \B,
1499 	 *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504 	 *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508 
1509 	if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512 	    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513 	    endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517 
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523 
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526 	if (!s) {
1527 	    if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529 				"  Could not match STCLASS...\n") );
1530 		goto fail;
1531 	    }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536 		goto fail;
1537 
1538 	    /* Contradict one of substrings */
1539 	    if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562 	    else {
1563                 /* float-only */
1564 
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569 
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580 
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585 
1586                 rx_origin = rx_max_float;
1587             }
1588 
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591 
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593 
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609 	}
1610 
1611         /* Success !!! */
1612 
1613 	if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615 			"  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625 
1626     /* Decide whether using the substrings helped */
1627 
1628     if (rx_origin != strpos) {
1629 	/* Fixed substring is found far enough so that the match
1630 	   cannot start at strpos. */
1631 
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633 	++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640 	if (!(prog->intflags & PREGf_NAUGHTY)
1641 	    && (utf8_target ? (
1642 		prog->check_utf8		/* Could be deleted already */
1643 		&& --BmUSEFUL(prog->check_utf8) < 0
1644 		&& (prog->check_utf8 == prog->float_utf8)
1645 	    ) : (
1646 		prog->check_substr		/* Could be deleted already */
1647 		&& --BmUSEFUL(prog->check_substr) < 0
1648 		&& (prog->check_substr == prog->float_substr)
1649 	    )))
1650 	{
1651 	    /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653 	    /* XXX Does the destruction order has to change with utf8_target? */
1654 	    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655 	    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656 	    prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1657 	    prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1658 	    check = NULL;			/* abort */
1659 	    /* XXXX This is a remnant of the old implementation.  It
1660 	            looks wasteful, since now INTUIT can use many
1661 	            other heuristics. */
1662 	    prog->extflags &= ~RXf_USE_INTUIT;
1663 	}
1664     }
1665 
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669 
1670     return rx_origin;
1671 
1672   fail_finish:				/* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1674 	BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677 			  PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680 
1681 
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701 
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781 
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785 
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8)                                    \
1791                   ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
1792                   : 1);                                     \
1793         }                                                   \
1794     } STMT_END
1795 
1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1800         }                                                   \
1801     } STMT_END
1802 
1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1804     if (COND) {                                                \
1805         FBC_CHECK_AND_TRY                                      \
1806         s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
1807         previous_occurrence_end = s;                           \
1808     }                                                          \
1809     else {                                                     \
1810         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1811     }
1812 
1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1814     if (utf8_target) {                                         \
1815 	REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1816     }                                                          \
1817     else {                                                     \
1818 	REXEC_FBC_CLASS_SCAN(0, COND);                         \
1819     }
1820 
1821 /* We keep track of where the next character should start after an occurrence
1822  * of the one we're looking for.  Knowing that, we can see right away if the
1823  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1824  * don't accept the 2nd and succeeding adjacent occurrences */
1825 #define FBC_CHECK_AND_TRY                                           \
1826         if (   (   doevery                                          \
1827                 || s != previous_occurrence_end)                    \
1828             && (   reginfo->intuit                                  \
1829                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1830         {                                                           \
1831             goto got_it;                                            \
1832         }
1833 
1834 
1835 /* This differs from the above macros in that it calls a function which returns
1836  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1837  * there is no such occurrence. */
1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1839     while (s < strend) {                                    \
1840         s = (f);                                            \
1841         if (s >= strend) {                                  \
1842             break;                                          \
1843         }                                                   \
1844                                                             \
1845         FBC_CHECK_AND_TRY                                   \
1846         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1847         previous_occurrence_end = s;                        \
1848     }
1849 
1850 /* This differs from the above macros in that it is passed a single byte that
1851  * is known to begin the next occurrence of the thing being looked for in 's'.
1852  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1853  * at that position. */
1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1855     while (s < strend) {                                    \
1856         s = (char *) memchr(s, byte, strend -s);            \
1857         if (s == NULL) {                                    \
1858             s = (char *) strend;                            \
1859             break;                                          \
1860         }                                                   \
1861                                                             \
1862         if (COND) {                                         \
1863             FBC_CHECK_AND_TRY                               \
1864             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1865             previous_occurrence_end = s;                    \
1866         }                                                   \
1867         else {                                              \
1868             s += UTF8SKIP(s);                               \
1869         }                                                   \
1870     }
1871 
1872 /* The three macros below are slightly different versions of the same logic.
1873  *
1874  * The first is for /a and /aa when the target string is UTF-8.  This can only
1875  * match ascii, but it must advance based on UTF-8.   The other two handle the
1876  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1877  * for the boundary (or non-boundary) between a word and non-word character.
1878  * The utf8 and non-utf8 cases have the same logic, but the details must be
1879  * different.  Find the "wordness" of the character just prior to this one, and
1880  * compare it with the wordness of this one.  If they differ, we have a
1881  * boundary.  At the beginning of the string, pretend that the previous
1882  * character was a new-line.
1883  *
1884  * All these macros uncleanly have side-effects with each other and outside
1885  * variables.  So far it's been too much trouble to clean-up
1886  *
1887  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1888  *               a word character or not.
1889  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1890  *               word/non-word
1891  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1892  *
1893  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1894  * are looking for a boundary or for a non-boundary.  If we are looking for a
1895  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1896  * see if this tentative match actually works, and if so, to quit the loop
1897  * here.  And vice-versa if we are looking for a non-boundary.
1898  *
1899  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1900  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1901  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1902  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1903  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1904  * complement.  But in that branch we complement tmp, meaning that at the
1905  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1906  * which means at the top of the loop in the next iteration, it is
1907  * TEST_NON_UTF8(s-1) */
1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1909     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1910     tmp = TEST_NON_UTF8(tmp);                                                  \
1911     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1912         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1913             tmp = !tmp;                                                        \
1914             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1915         }                                                                      \
1916         else {                                                                 \
1917             IF_FAIL;                                                           \
1918         }                                                                      \
1919     );                                                                         \
1920 
1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1922  * TEST_UTF8 is a macro that for the same input code points returns identically
1923  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1925     if (s == reginfo->strbeg) {                                                \
1926         tmp = '\n';                                                            \
1927     }                                                                          \
1928     else { /* Back-up to the start of the previous character */                \
1929         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1930         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1931                                                        0, UTF8_ALLOW_DEFAULT); \
1932     }                                                                          \
1933     tmp = TEST_UV(tmp);                                                        \
1934     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1935         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1936             tmp = !tmp;                                                        \
1937             IF_SUCCESS;                                                        \
1938         }                                                                      \
1939         else {                                                                 \
1940             IF_FAIL;                                                           \
1941         }                                                                      \
1942     );
1943 
1944 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1945  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946  * macros below */
1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1948     if (utf8_target) {                                                         \
1949         UTF8_CODE                                                              \
1950     }                                                                          \
1951     else {  /* Not utf8 */                                                     \
1952 	tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1953 	tmp = TEST_NON_UTF8(tmp);                                              \
1954 	REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1955 	    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1956 		IF_SUCCESS;                                                    \
1957 		tmp = !tmp;                                                    \
1958 	    }                                                                  \
1959 	    else {                                                             \
1960 		IF_FAIL;                                                       \
1961 	    }                                                                  \
1962 	);                                                                     \
1963     }                                                                          \
1964     /* Here, things have been set up by the previous code so that tmp is the   \
1965      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1966      * utf8ness of the target).  We also have to check if this matches against \
1967      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1968      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1969      * string */                                                               \
1970     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1971         IF_SUCCESS;                                                            \
1972     }                                                                          \
1973     else {                                                                     \
1974         IF_FAIL;                                                               \
1975     }
1976 
1977 /* This is the macro to use when we want to see if something that looks like it
1978  * could match, actually does, and if so exits the loop.  It needs to be used
1979  * only for bounds checking macros, as it allows for matching beyond the end of
1980  * string (which should be zero length without having to look at the string
1981  * contents) */
1982 #define REXEC_FBC_TRYIT                                                     \
1983     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
1984         goto got_it
1985 
1986 /* The only difference between the BOUND and NBOUND cases is that
1987  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1989  * with the other one being empty (PLACEHOLDER is defined as empty).
1990  *
1991  * The TEST_FOO parameters are for operating on different forms of input, but
1992  * all should be ones that return identically for the same underlying code
1993  * points */
1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1995     FBC_BOUND_COMMON(                                                          \
1996           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1997           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998 
1999 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2000     FBC_BOUND_COMMON(                                                          \
2001             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2002             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003 
2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2005     FBC_BOUND_COMMON(                                                          \
2006           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2007           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008 
2009 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2010     FBC_BOUND_COMMON(                                                          \
2011             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2012             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013 
2014 #ifdef DEBUGGING
2015 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2017   IV cp_out = _invlist_search(invlist, cp_in);
2018   assert(cp_out >= 0);
2019   return cp_out;
2020 }
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022 	invmap[S_get_break_val_cp_checked(invlist, cp)]
2023 #else
2024 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025 	invmap[_invlist_search(invlist, cp)]
2026 #endif
2027 
2028 /* Takes a pointer to an inversion list, a pointer to its corresponding
2029  * inversion map, and a code point, and returns the code point's value
2030  * according to the two arrays.  It assumes that all code points have a value.
2031  * This is used as the base macro for macros for particular properties */
2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2033 	_generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2034 
2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036  * of a code point, returning the value for the first code point in the string.
2037  * And it takes the particular macro name that finds the desired value given a
2038  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2040              (__ASSERT_(pos < strend)                                          \
2041                  /* Note assumes is valid UTF-8 */                             \
2042              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043 
2044 /* Returns the GCB value for the input code point */
2045 #define getGCB_VAL_CP(cp)                                                      \
2046           _generic_GET_BREAK_VAL_CP(                                           \
2047                                     PL_GCB_invlist,                            \
2048                                     _Perl_GCB_invmap,                          \
2049                                     (cp))
2050 
2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2052  * bounded by pos and strend */
2053 #define getGCB_VAL_UTF8(pos, strend)                                           \
2054     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2055 
2056 /* Returns the LB value for the input code point */
2057 #define getLB_VAL_CP(cp)                                                       \
2058           _generic_GET_BREAK_VAL_CP(                                           \
2059                                     PL_LB_invlist,                             \
2060                                     _Perl_LB_invmap,                           \
2061                                     (cp))
2062 
2063 /* Returns the LB value for the first code point in the UTF-8 encoded string
2064  * bounded by pos and strend */
2065 #define getLB_VAL_UTF8(pos, strend)                                            \
2066     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067 
2068 
2069 /* Returns the SB value for the input code point */
2070 #define getSB_VAL_CP(cp)                                                       \
2071           _generic_GET_BREAK_VAL_CP(                                           \
2072                                     PL_SB_invlist,                             \
2073                                     _Perl_SB_invmap,                     \
2074                                     (cp))
2075 
2076 /* Returns the SB value for the first code point in the UTF-8 encoded string
2077  * bounded by pos and strend */
2078 #define getSB_VAL_UTF8(pos, strend)                                            \
2079     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080 
2081 /* Returns the WB value for the input code point */
2082 #define getWB_VAL_CP(cp)                                                       \
2083           _generic_GET_BREAK_VAL_CP(                                           \
2084                                     PL_WB_invlist,                             \
2085                                     _Perl_WB_invmap,                         \
2086                                     (cp))
2087 
2088 /* Returns the WB value for the first code point in the UTF-8 encoded string
2089  * bounded by pos and strend */
2090 #define getWB_VAL_UTF8(pos, strend)                                            \
2091     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092 
2093 /* We know what class REx starts with.  Try to find this position... */
2094 /* if reginfo->intuit, its a dryrun */
2095 /* annoyingly all the vars in this routine have different names from their counterparts
2096    in regmatch. /grrr */
2097 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2099     const char *strend, regmatch_info *reginfo)
2100 {
2101     dVAR;
2102 
2103     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2104     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2105 
2106     char *pat_string;   /* The pattern's exactish string */
2107     char *pat_end;	    /* ptr to end char of pat_string */
2108     re_fold_t folder;	/* Function for computing non-utf8 folds */
2109     const U8 *fold_array;   /* array for folding ords < 256 */
2110     STRLEN ln;
2111     STRLEN lnc;
2112     U8 c1;
2113     U8 c2;
2114     char *e = NULL;
2115 
2116     /* In some cases we accept only the first occurence of 'x' in a sequence of
2117      * them.  This variable points to just beyond the end of the previous
2118      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2119      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2120      * hop back.) */
2121     char * previous_occurrence_end = 0;
2122 
2123     I32 tmp;            /* Scratch variable */
2124     const bool utf8_target = reginfo->is_utf8_target;
2125     UV utf8_fold_flags = 0;
2126     const bool is_utf8_pat = reginfo->is_utf8_pat;
2127     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2128                                    with a result inverts that result, as 0^1 =
2129                                    1 and 1^1 = 0 */
2130     _char_class_number classnum;
2131 
2132     RXi_GET_DECL(prog,progi);
2133 
2134     PERL_ARGS_ASSERT_FIND_BYCLASS;
2135 
2136     /* We know what class it must start with. */
2137     switch (OP(c)) {
2138     case ANYOFPOSIXL:
2139     case ANYOFL:
2140         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2141 
2142         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2143             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2144         }
2145 
2146         /* FALLTHROUGH */
2147     case ANYOFD:
2148     case ANYOF:
2149         if (utf8_target) {
2150             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2151                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2152         }
2153         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2154             /* We know that s is in the bitmap range since the target isn't
2155              * UTF-8, so what happens for out-of-range values is not relevant,
2156              * so exclude that from the flags */
2157             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2158         }
2159         else {
2160             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2161         }
2162         break;
2163 
2164     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2165         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2166          * use 0 */
2167         REXEC_FBC_FIND_NEXT_SCAN(0,
2168          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2169                                    (U8) ARG(c), FLAGS(c)));
2170         break;
2171 
2172     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2173                      */
2174         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2175          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2176                                    (U8) ARG(c), FLAGS(c)));
2177         break;
2178 
2179     case ANYOFH:
2180         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2181             U8 first_byte = FLAGS(c);
2182 
2183             if (first_byte) {   /* We know what the first byte of any matched
2184                                    string should be */
2185                 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2186                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2187             }
2188             else {
2189                 REXEC_FBC_CLASS_SCAN(TRUE,
2190                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2191             }
2192         }
2193         break;
2194 
2195     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2196         assert(! is_utf8_pat);
2197 	/* FALLTHROUGH */
2198     case EXACTFAA:
2199         if (is_utf8_pat) {
2200             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2201                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2202             goto do_exactf_utf8;
2203         }
2204         else if (utf8_target) {
2205 
2206             /* Here, and elsewhere in this file, the reason we can't consider a
2207              * non-UTF-8 pattern already folded in the presence of a UTF-8
2208              * target is because any MICRO SIGN in the pattern won't be folded.
2209              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2210              * can consider a non-UTF-8 pattern folded when matching a
2211              * non-UTF-8 target */
2212             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2213             goto do_exactf_utf8;
2214         }
2215 
2216         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2217          * which these functions don't handle anyway */
2218         fold_array = PL_fold_latin1;
2219         folder = foldEQ_latin1_s2_folded;
2220         goto do_exactf_non_utf8;
2221 
2222     case EXACTF:   /* This node only generated for non-utf8 patterns */
2223         assert(! is_utf8_pat);
2224         if (utf8_target) {
2225             goto do_exactf_utf8;
2226         }
2227         fold_array = PL_fold;
2228         folder = foldEQ;
2229         goto do_exactf_non_utf8;
2230 
2231     case EXACTFL:
2232         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2233         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2234             utf8_fold_flags = FOLDEQ_LOCALE;
2235             goto do_exactf_utf8;
2236         }
2237         fold_array = PL_fold_locale;
2238         folder = foldEQ_locale;
2239         goto do_exactf_non_utf8;
2240 
2241     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2242                            full functionality normally not done except for
2243                            UTF-8 */
2244         assert(! is_utf8_pat);
2245         goto do_exactf_utf8;
2246 
2247     case EXACTFLU8:
2248             if (! utf8_target) {    /* All code points in this node require
2249                                        UTF-8 to express.  */
2250                 break;
2251             }
2252             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2253                                              | FOLDEQ_S2_FOLDS_SANE;
2254             goto do_exactf_utf8;
2255 
2256     case EXACTFU_ONLY8:
2257         if (! utf8_target) {
2258             break;
2259         }
2260         assert(is_utf8_pat);
2261         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2262         goto do_exactf_utf8;
2263 
2264     case EXACTFU:
2265         if (is_utf8_pat || utf8_target) {
2266             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2267             goto do_exactf_utf8;
2268         }
2269 
2270         /* Any 'ss' in the pattern should have been replaced by regcomp,
2271          * so we don't have to worry here about this single special case
2272          * in the Latin1 range */
2273         fold_array = PL_fold_latin1;
2274         folder = foldEQ_latin1_s2_folded;
2275 
2276         /* FALLTHROUGH */
2277 
2278       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2279                            are no glitches with fold-length differences
2280                            between the target string and pattern */
2281 
2282         /* The idea in the non-utf8 EXACTF* cases is to first find the
2283          * first character of the EXACTF* node and then, if necessary,
2284          * case-insensitively compare the full text of the node.  c1 is the
2285          * first character.  c2 is its fold.  This logic will not work for
2286          * Unicode semantics and the german sharp ss, which hence should
2287          * not be compiled into a node that gets here. */
2288         pat_string = STRING(c);
2289         ln  = STR_LEN(c);	/* length to match in octets/bytes */
2290 
2291         /* We know that we have to match at least 'ln' bytes (which is the
2292          * same as characters, since not utf8).  If we have to match 3
2293          * characters, and there are only 2 availabe, we know without
2294          * trying that it will fail; so don't start a match past the
2295          * required minimum number from the far end */
2296         e = HOP3c(strend, -((SSize_t)ln), s);
2297         if (e < s)
2298             break;
2299 
2300         c1 = *pat_string;
2301         c2 = fold_array[c1];
2302         if (c1 == c2) { /* If char and fold are the same */
2303             while (s <= e) {
2304                 s = (char *) memchr(s, c1, e + 1 - s);
2305                 if (s == NULL) {
2306                     break;
2307                 }
2308 
2309                 /* Check that the rest of the node matches */
2310                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2311                     && (reginfo->intuit || regtry(reginfo, &s)) )
2312                 {
2313                     goto got_it;
2314                 }
2315                 s++;
2316             }
2317         }
2318         else {
2319             U8 bits_differing = c1 ^ c2;
2320 
2321             /* If the folds differ in one bit position only, we can mask to
2322              * match either of them, and can use this faster find method.  Both
2323              * ASCII and EBCDIC tend to have their case folds differ in only
2324              * one position, so this is very likely */
2325             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2326                 bits_differing = ~ bits_differing;
2327                 while (s <= e) {
2328                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2329                                         (c1 & bits_differing), bits_differing);
2330                     if (s > e) {
2331                         break;
2332                     }
2333 
2334                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2335                         && (reginfo->intuit || regtry(reginfo, &s)) )
2336                     {
2337                         goto got_it;
2338                     }
2339                     s++;
2340                 }
2341             }
2342             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2343                        should actually happen only in EXACTFL nodes */
2344                 while (s <= e) {
2345                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2346                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2347                         && (reginfo->intuit || regtry(reginfo, &s)) )
2348                     {
2349                         goto got_it;
2350                     }
2351                     s++;
2352                 }
2353             }
2354         }
2355         break;
2356 
2357       do_exactf_utf8:
2358       {
2359         unsigned expansion;
2360 
2361         /* If one of the operands is in utf8, we can't use the simpler folding
2362          * above, due to the fact that many different characters can have the
2363          * same fold, or portion of a fold, or different- length fold */
2364         pat_string = STRING(c);
2365         ln  = STR_LEN(c);	/* length to match in octets/bytes */
2366         pat_end = pat_string + ln;
2367         lnc = is_utf8_pat       /* length to match in characters */
2368                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2369                 : ln;
2370 
2371         /* We have 'lnc' characters to match in the pattern, but because of
2372          * multi-character folding, each character in the target can match
2373          * up to 3 characters (Unicode guarantees it will never exceed
2374          * this) if it is utf8-encoded; and up to 2 if not (based on the
2375          * fact that the Latin 1 folds are already determined, and the
2376          * only multi-char fold in that range is the sharp-s folding to
2377          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2378          * string character.  Adjust lnc accordingly, rounding up, so that
2379          * if we need to match at least 4+1/3 chars, that really is 5. */
2380         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2381         lnc = (lnc + expansion - 1) / expansion;
2382 
2383         /* As in the non-UTF8 case, if we have to match 3 characters, and
2384          * only 2 are left, it's guaranteed to fail, so don't start a
2385          * match that would require us to go beyond the end of the string
2386          */
2387         e = HOP3c(strend, -((SSize_t)lnc), s);
2388 
2389         /* XXX Note that we could recalculate e to stop the loop earlier,
2390          * as the worst case expansion above will rarely be met, and as we
2391          * go along we would usually find that e moves further to the left.
2392          * This would happen only after we reached the point in the loop
2393          * where if there were no expansion we should fail.  Unclear if
2394          * worth the expense */
2395 
2396         while (s <= e) {
2397             char *my_strend= (char *)strend;
2398             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2399                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2400                 && (reginfo->intuit || regtry(reginfo, &s)) )
2401             {
2402                 goto got_it;
2403             }
2404             s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2405         }
2406         break;
2407     }
2408 
2409     case BOUNDL:
2410         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2411         if (FLAGS(c) != TRADITIONAL_BOUND) {
2412             if (! IN_UTF8_CTYPE_LOCALE) {
2413                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2414                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2415             }
2416             goto do_boundu;
2417         }
2418 
2419         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2420         break;
2421 
2422     case NBOUNDL:
2423         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2424         if (FLAGS(c) != TRADITIONAL_BOUND) {
2425             if (! IN_UTF8_CTYPE_LOCALE) {
2426                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2427                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2428             }
2429             goto do_nboundu;
2430         }
2431 
2432         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2433         break;
2434 
2435     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2436                    meaning */
2437         assert(FLAGS(c) == TRADITIONAL_BOUND);
2438 
2439         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2440         break;
2441 
2442     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2443                    meaning */
2444         assert(FLAGS(c) == TRADITIONAL_BOUND);
2445 
2446         FBC_BOUND_A(isWORDCHAR_A);
2447         break;
2448 
2449     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2450                    meaning */
2451         assert(FLAGS(c) == TRADITIONAL_BOUND);
2452 
2453         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2454         break;
2455 
2456     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2457                    meaning */
2458         assert(FLAGS(c) == TRADITIONAL_BOUND);
2459 
2460         FBC_NBOUND_A(isWORDCHAR_A);
2461         break;
2462 
2463     case NBOUNDU:
2464         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2465             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2466             break;
2467         }
2468 
2469       do_nboundu:
2470 
2471         to_complement = 1;
2472         /* FALLTHROUGH */
2473 
2474     case BOUNDU:
2475       do_boundu:
2476         switch((bound_type) FLAGS(c)) {
2477             case TRADITIONAL_BOUND:
2478                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2479                 break;
2480             case GCB_BOUND:
2481                 if (s == reginfo->strbeg) {
2482                     if (reginfo->intuit || regtry(reginfo, &s))
2483                     {
2484                         goto got_it;
2485                     }
2486 
2487                     /* Didn't match.  Try at the next position (if there is one) */
2488                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2489                     if (UNLIKELY(s >= reginfo->strend)) {
2490                         break;
2491                     }
2492                 }
2493 
2494                 if (utf8_target) {
2495                     GCB_enum before = getGCB_VAL_UTF8(
2496                                                reghop3((U8*)s, -1,
2497                                                        (U8*)(reginfo->strbeg)),
2498                                                (U8*) reginfo->strend);
2499                     while (s < strend) {
2500                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2501                                                         (U8*) reginfo->strend);
2502                         if (   (to_complement ^ isGCB(before,
2503                                                       after,
2504                                                       (U8*) reginfo->strbeg,
2505                                                       (U8*) s,
2506                                                       utf8_target))
2507                             && (reginfo->intuit || regtry(reginfo, &s)))
2508                         {
2509                             goto got_it;
2510                         }
2511                         before = after;
2512                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2513                     }
2514                 }
2515                 else {  /* Not utf8.  Everything is a GCB except between CR and
2516                            LF */
2517                     while (s < strend) {
2518                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2519                                               || UCHARAT(s) != '\n'))
2520                             && (reginfo->intuit || regtry(reginfo, &s)))
2521                         {
2522                             goto got_it;
2523                         }
2524                         s++;
2525                     }
2526                 }
2527 
2528                 /* And, since this is a bound, it can match after the final
2529                  * character in the string */
2530                 if (   reginfo->intuit
2531                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2532                 {
2533                     goto got_it;
2534                 }
2535                 break;
2536 
2537             case LB_BOUND:
2538                 if (s == reginfo->strbeg) {
2539                     if (reginfo->intuit || regtry(reginfo, &s)) {
2540                         goto got_it;
2541                     }
2542                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2543                     if (UNLIKELY(s >= reginfo->strend)) {
2544                         break;
2545                     }
2546                 }
2547 
2548                 if (utf8_target) {
2549                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2550                                                                -1,
2551                                                                (U8*)(reginfo->strbeg)),
2552                                                        (U8*) reginfo->strend);
2553                     while (s < strend) {
2554                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2555                         if (to_complement ^ isLB(before,
2556                                                  after,
2557                                                  (U8*) reginfo->strbeg,
2558                                                  (U8*) s,
2559                                                  (U8*) reginfo->strend,
2560                                                  utf8_target)
2561                             && (reginfo->intuit || regtry(reginfo, &s)))
2562                         {
2563                             goto got_it;
2564                         }
2565                         before = after;
2566                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2567                     }
2568                 }
2569                 else {  /* Not utf8. */
2570                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2571                     while (s < strend) {
2572                         LB_enum after = getLB_VAL_CP((U8) *s);
2573                         if (to_complement ^ isLB(before,
2574                                                  after,
2575                                                  (U8*) reginfo->strbeg,
2576                                                  (U8*) s,
2577                                                  (U8*) reginfo->strend,
2578                                                  utf8_target)
2579                             && (reginfo->intuit || regtry(reginfo, &s)))
2580                         {
2581                             goto got_it;
2582                         }
2583                         before = after;
2584                         s++;
2585                     }
2586                 }
2587 
2588                 if (   reginfo->intuit
2589                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2590                 {
2591                     goto got_it;
2592                 }
2593 
2594                 break;
2595 
2596             case SB_BOUND:
2597                 if (s == reginfo->strbeg) {
2598                     if (reginfo->intuit || regtry(reginfo, &s)) {
2599                         goto got_it;
2600                     }
2601                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2602                     if (UNLIKELY(s >= reginfo->strend)) {
2603                         break;
2604                     }
2605                 }
2606 
2607                 if (utf8_target) {
2608                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2609                                                         -1,
2610                                                         (U8*)(reginfo->strbeg)),
2611                                                       (U8*) reginfo->strend);
2612                     while (s < strend) {
2613                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2614                                                          (U8*) reginfo->strend);
2615                         if ((to_complement ^ isSB(before,
2616                                                   after,
2617                                                   (U8*) reginfo->strbeg,
2618                                                   (U8*) s,
2619                                                   (U8*) reginfo->strend,
2620                                                   utf8_target))
2621                             && (reginfo->intuit || regtry(reginfo, &s)))
2622                         {
2623                             goto got_it;
2624                         }
2625                         before = after;
2626                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2627                     }
2628                 }
2629                 else {  /* Not utf8. */
2630                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2631                     while (s < strend) {
2632                         SB_enum after = getSB_VAL_CP((U8) *s);
2633                         if ((to_complement ^ isSB(before,
2634                                                   after,
2635                                                   (U8*) reginfo->strbeg,
2636                                                   (U8*) s,
2637                                                   (U8*) reginfo->strend,
2638                                                   utf8_target))
2639                             && (reginfo->intuit || regtry(reginfo, &s)))
2640                         {
2641                             goto got_it;
2642                         }
2643                         before = after;
2644                         s++;
2645                     }
2646                 }
2647 
2648                 /* Here are at the final position in the target string.  The SB
2649                  * value is always true here, so matches, depending on other
2650                  * constraints */
2651                 if (   reginfo->intuit
2652                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2653                 {
2654                     goto got_it;
2655                 }
2656 
2657                 break;
2658 
2659             case WB_BOUND:
2660                 if (s == reginfo->strbeg) {
2661                     if (reginfo->intuit || regtry(reginfo, &s)) {
2662                         goto got_it;
2663                     }
2664                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2665                     if (UNLIKELY(s >= reginfo->strend)) {
2666                         break;
2667                     }
2668                 }
2669 
2670                 if (utf8_target) {
2671                     /* We are at a boundary between char_sub_0 and char_sub_1.
2672                      * We also keep track of the value for char_sub_-1 as we
2673                      * loop through the line.   Context may be needed to make a
2674                      * determination, and if so, this can save having to
2675                      * recalculate it */
2676                     WB_enum previous = WB_UNKNOWN;
2677                     WB_enum before = getWB_VAL_UTF8(
2678                                               reghop3((U8*)s,
2679                                                       -1,
2680                                                       (U8*)(reginfo->strbeg)),
2681                                               (U8*) reginfo->strend);
2682                     while (s < strend) {
2683                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2684                                                         (U8*) reginfo->strend);
2685                         if ((to_complement ^ isWB(previous,
2686                                                   before,
2687                                                   after,
2688                                                   (U8*) reginfo->strbeg,
2689                                                   (U8*) s,
2690                                                   (U8*) reginfo->strend,
2691                                                   utf8_target))
2692                             && (reginfo->intuit || regtry(reginfo, &s)))
2693                         {
2694                             goto got_it;
2695                         }
2696                         previous = before;
2697                         before = after;
2698                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2699                     }
2700                 }
2701                 else {  /* Not utf8. */
2702                     WB_enum previous = WB_UNKNOWN;
2703                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2704                     while (s < strend) {
2705                         WB_enum after = getWB_VAL_CP((U8) *s);
2706                         if ((to_complement ^ isWB(previous,
2707                                                   before,
2708                                                   after,
2709                                                   (U8*) reginfo->strbeg,
2710                                                   (U8*) s,
2711                                                   (U8*) reginfo->strend,
2712                                                   utf8_target))
2713                             && (reginfo->intuit || regtry(reginfo, &s)))
2714                         {
2715                             goto got_it;
2716                         }
2717                         previous = before;
2718                         before = after;
2719                         s++;
2720                     }
2721                 }
2722 
2723                 if (   reginfo->intuit
2724                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2725                 {
2726                     goto got_it;
2727                 }
2728         }
2729         break;
2730 
2731     case LNBREAK:
2732         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2733                         is_LNBREAK_latin1_safe(s, strend)
2734         );
2735         break;
2736 
2737     /* The argument to all the POSIX node types is the class number to pass to
2738      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2739 
2740     case NPOSIXL:
2741         to_complement = 1;
2742         /* FALLTHROUGH */
2743 
2744     case POSIXL:
2745         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2746         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2747                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2748         break;
2749 
2750     case NPOSIXD:
2751         to_complement = 1;
2752         /* FALLTHROUGH */
2753 
2754     case POSIXD:
2755         if (utf8_target) {
2756             goto posix_utf8;
2757         }
2758         goto posixa;
2759 
2760     case NPOSIXA:
2761         if (utf8_target) {
2762             /* The complement of something that matches only ASCII matches all
2763              * non-ASCII, plus everything in ASCII that isn't in the class. */
2764             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2765                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2766             break;
2767         }
2768 
2769         to_complement = 1;
2770         goto posixa;
2771 
2772     case POSIXA:
2773         /* Don't need to worry about utf8, as it can match only a single
2774          * byte invariant character.  But we do anyway for performance reasons,
2775          * as otherwise we would have to examine all the continuation
2776          * characters */
2777         if (utf8_target) {
2778             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2779             break;
2780         }
2781 
2782       posixa:
2783         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2784                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2785         break;
2786 
2787     case NPOSIXU:
2788         to_complement = 1;
2789         /* FALLTHROUGH */
2790 
2791     case POSIXU:
2792         if (! utf8_target) {
2793             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2794                                  to_complement ^ cBOOL(_generic_isCC(*s,
2795                                                                     FLAGS(c))));
2796         }
2797         else {
2798 
2799           posix_utf8:
2800             classnum = (_char_class_number) FLAGS(c);
2801             switch (classnum) {
2802                 default:
2803                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2804                         to_complement ^ cBOOL(_invlist_contains_cp(
2805                                               PL_XPosix_ptrs[classnum],
2806                                               utf8_to_uvchr_buf((U8 *) s,
2807                                                                 (U8 *) strend,
2808                                                                 NULL))));
2809                     break;
2810                 case _CC_ENUM_SPACE:
2811                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2812                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2813                     break;
2814 
2815                 case _CC_ENUM_BLANK:
2816                     REXEC_FBC_CLASS_SCAN(1,
2817                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2818                     break;
2819 
2820                 case _CC_ENUM_XDIGIT:
2821                     REXEC_FBC_CLASS_SCAN(1,
2822                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2823                     break;
2824 
2825                 case _CC_ENUM_VERTSPACE:
2826                     REXEC_FBC_CLASS_SCAN(1,
2827                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2828                     break;
2829 
2830                 case _CC_ENUM_CNTRL:
2831                     REXEC_FBC_CLASS_SCAN(1,
2832                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2833                     break;
2834             }
2835         }
2836         break;
2837 
2838     case AHOCORASICKC:
2839     case AHOCORASICK:
2840         {
2841             DECL_TRIE_TYPE(c);
2842             /* what trie are we using right now */
2843             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2844             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2845             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2846 
2847             const char *last_start = strend - trie->minlen;
2848 #ifdef DEBUGGING
2849             const char *real_start = s;
2850 #endif
2851             STRLEN maxlen = trie->maxlen;
2852             SV *sv_points;
2853             U8 **points; /* map of where we were in the input string
2854                             when reading a given char. For ASCII this
2855                             is unnecessary overhead as the relationship
2856                             is always 1:1, but for Unicode, especially
2857                             case folded Unicode this is not true. */
2858             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2859             U8 *bitmap=NULL;
2860 
2861 
2862             GET_RE_DEBUG_FLAGS_DECL;
2863 
2864             /* We can't just allocate points here. We need to wrap it in
2865              * an SV so it gets freed properly if there is a croak while
2866              * running the match */
2867             ENTER;
2868             SAVETMPS;
2869             sv_points=newSV(maxlen * sizeof(U8 *));
2870             SvCUR_set(sv_points,
2871                 maxlen * sizeof(U8 *));
2872             SvPOK_on(sv_points);
2873             sv_2mortal(sv_points);
2874             points=(U8**)SvPV_nolen(sv_points );
2875             if ( trie_type != trie_utf8_fold
2876                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2877             {
2878                 if (trie->bitmap)
2879                     bitmap=(U8*)trie->bitmap;
2880                 else
2881                     bitmap=(U8*)ANYOF_BITMAP(c);
2882             }
2883             /* this is the Aho-Corasick algorithm modified a touch
2884                to include special handling for long "unknown char" sequences.
2885                The basic idea being that we use AC as long as we are dealing
2886                with a possible matching char, when we encounter an unknown char
2887                (and we have not encountered an accepting state) we scan forward
2888                until we find a legal starting char.
2889                AC matching is basically that of trie matching, except that when
2890                we encounter a failing transition, we fall back to the current
2891                states "fail state", and try the current char again, a process
2892                we repeat until we reach the root state, state 1, or a legal
2893                transition. If we fail on the root state then we can either
2894                terminate if we have reached an accepting state previously, or
2895                restart the entire process from the beginning if we have not.
2896 
2897              */
2898             while (s <= last_start) {
2899                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2900                 U8 *uc = (U8*)s;
2901                 U16 charid = 0;
2902                 U32 base = 1;
2903                 U32 state = 1;
2904                 UV uvc = 0;
2905                 STRLEN len = 0;
2906                 STRLEN foldlen = 0;
2907                 U8 *uscan = (U8*)NULL;
2908                 U8 *leftmost = NULL;
2909 #ifdef DEBUGGING
2910                 U32 accepted_word= 0;
2911 #endif
2912                 U32 pointpos = 0;
2913 
2914                 while ( state && uc <= (U8*)strend ) {
2915                     int failed=0;
2916                     U32 word = aho->states[ state ].wordnum;
2917 
2918                     if( state==1 ) {
2919                         if ( bitmap ) {
2920                             DEBUG_TRIE_EXECUTE_r(
2921                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2922                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2923                                         (char *)uc, utf8_target, 0 );
2924                                     Perl_re_printf( aTHX_
2925                                         " Scanning for legal start char...\n");
2926                                 }
2927                             );
2928                             if (utf8_target) {
2929                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2930                                     uc += UTF8SKIP(uc);
2931                                 }
2932                             } else {
2933                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2934                                     uc++;
2935                                 }
2936                             }
2937                             s= (char *)uc;
2938                         }
2939                         if (uc >(U8*)last_start) break;
2940                     }
2941 
2942                     if ( word ) {
2943                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2944                         if (!leftmost || lpos < leftmost) {
2945                             DEBUG_r(accepted_word=word);
2946                             leftmost= lpos;
2947                         }
2948                         if (base==0) break;
2949 
2950                     }
2951                     points[pointpos++ % maxlen]= uc;
2952                     if (foldlen || uc < (U8*)strend) {
2953                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2954                                              (U8 *) strend, uscan, len, uvc,
2955                                              charid, foldlen, foldbuf,
2956                                              uniflags);
2957                         DEBUG_TRIE_EXECUTE_r({
2958                             dump_exec_pos( (char *)uc, c, strend,
2959                                         real_start, s, utf8_target, 0);
2960                             Perl_re_printf( aTHX_
2961                                 " Charid:%3u CP:%4" UVxf " ",
2962                                  charid, uvc);
2963                         });
2964                     }
2965                     else {
2966                         len = 0;
2967                         charid = 0;
2968                     }
2969 
2970 
2971                     do {
2972 #ifdef DEBUGGING
2973                         word = aho->states[ state ].wordnum;
2974 #endif
2975                         base = aho->states[ state ].trans.base;
2976 
2977                         DEBUG_TRIE_EXECUTE_r({
2978                             if (failed)
2979                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2980                                     s,   utf8_target, 0 );
2981                             Perl_re_printf( aTHX_
2982                                 "%sState: %4" UVxf ", word=%" UVxf,
2983                                 failed ? " Fail transition to " : "",
2984                                 (UV)state, (UV)word);
2985                         });
2986                         if ( base ) {
2987                             U32 tmp;
2988                             I32 offset;
2989                             if (charid &&
2990                                  ( ((offset = base + charid
2991                                     - 1 - trie->uniquecharcount)) >= 0)
2992                                  && ((U32)offset < trie->lasttrans)
2993                                  && trie->trans[offset].check == state
2994                                  && (tmp=trie->trans[offset].next))
2995                             {
2996                                 DEBUG_TRIE_EXECUTE_r(
2997                                     Perl_re_printf( aTHX_ " - legal\n"));
2998                                 state = tmp;
2999                                 break;
3000                             }
3001                             else {
3002                                 DEBUG_TRIE_EXECUTE_r(
3003                                     Perl_re_printf( aTHX_ " - fail\n"));
3004                                 failed = 1;
3005                                 state = aho->fail[state];
3006                             }
3007                         }
3008                         else {
3009                             /* we must be accepting here */
3010                             DEBUG_TRIE_EXECUTE_r(
3011                                     Perl_re_printf( aTHX_ " - accepting\n"));
3012                             failed = 1;
3013                             break;
3014                         }
3015                     } while(state);
3016                     uc += len;
3017                     if (failed) {
3018                         if (leftmost)
3019                             break;
3020                         if (!state) state = 1;
3021                     }
3022                 }
3023                 if ( aho->states[ state ].wordnum ) {
3024                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3025                     if (!leftmost || lpos < leftmost) {
3026                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3027                         leftmost = lpos;
3028                     }
3029                 }
3030                 if (leftmost) {
3031                     s = (char*)leftmost;
3032                     DEBUG_TRIE_EXECUTE_r({
3033                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3034                             (UV)accepted_word, (IV)(s - real_start)
3035                         );
3036                     });
3037                     if (reginfo->intuit || regtry(reginfo, &s)) {
3038                         FREETMPS;
3039                         LEAVE;
3040                         goto got_it;
3041                     }
3042                     if (s < reginfo->strend) {
3043                         s = HOPc(s,1);
3044                     }
3045                     DEBUG_TRIE_EXECUTE_r({
3046                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3047                     });
3048                 } else {
3049                     DEBUG_TRIE_EXECUTE_r(
3050                         Perl_re_printf( aTHX_ "No match.\n"));
3051                     break;
3052                 }
3053             }
3054             FREETMPS;
3055             LEAVE;
3056         }
3057         break;
3058     default:
3059         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3060     }
3061     return 0;
3062   got_it:
3063     return s;
3064 }
3065 
3066 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3067  * flags have same meanings as with regexec_flags() */
3068 
3069 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3070 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3071                             char *strbeg,
3072                             char *strend,
3073                             SV *sv,
3074                             U32 flags,
3075                             bool utf8_target)
3076 {
3077     struct regexp *const prog = ReANY(rx);
3078 
3079     if (flags & REXEC_COPY_STR) {
3080 #ifdef PERL_ANY_COW
3081         if (SvCANCOW(sv)) {
3082             DEBUG_C(Perl_re_printf( aTHX_
3083                               "Copy on write: regexp capture, type %d\n",
3084                                     (int) SvTYPE(sv)));
3085             /* Create a new COW SV to share the match string and store
3086              * in saved_copy, unless the current COW SV in saved_copy
3087              * is valid and suitable for our purpose */
3088             if ((   prog->saved_copy
3089                  && SvIsCOW(prog->saved_copy)
3090                  && SvPOKp(prog->saved_copy)
3091                  && SvIsCOW(sv)
3092                  && SvPOKp(sv)
3093                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3094             {
3095                 /* just reuse saved_copy SV */
3096                 if (RXp_MATCH_COPIED(prog)) {
3097                     Safefree(prog->subbeg);
3098                     RXp_MATCH_COPIED_off(prog);
3099                 }
3100             }
3101             else {
3102                 /* create new COW SV to share string */
3103                 RXp_MATCH_COPY_FREE(prog);
3104                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3105             }
3106             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3107             assert (SvPOKp(prog->saved_copy));
3108             prog->sublen  = strend - strbeg;
3109             prog->suboffset = 0;
3110             prog->subcoffset = 0;
3111         } else
3112 #endif
3113         {
3114             SSize_t min = 0;
3115             SSize_t max = strend - strbeg;
3116             SSize_t sublen;
3117 
3118             if (    (flags & REXEC_COPY_SKIP_POST)
3119                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3120                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3121             ) { /* don't copy $' part of string */
3122                 U32 n = 0;
3123                 max = -1;
3124                 /* calculate the right-most part of the string covered
3125                  * by a capture. Due to lookahead, this may be to
3126                  * the right of $&, so we have to scan all captures */
3127                 while (n <= prog->lastparen) {
3128                     if (prog->offs[n].end > max)
3129                         max = prog->offs[n].end;
3130                     n++;
3131                 }
3132                 if (max == -1)
3133                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3134                             ? prog->offs[0].start
3135                             : 0;
3136                 assert(max >= 0 && max <= strend - strbeg);
3137             }
3138 
3139             if (    (flags & REXEC_COPY_SKIP_PRE)
3140                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3141                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3142             ) { /* don't copy $` part of string */
3143                 U32 n = 0;
3144                 min = max;
3145                 /* calculate the left-most part of the string covered
3146                  * by a capture. Due to lookbehind, this may be to
3147                  * the left of $&, so we have to scan all captures */
3148                 while (min && n <= prog->lastparen) {
3149                     if (   prog->offs[n].start != -1
3150                         && prog->offs[n].start < min)
3151                     {
3152                         min = prog->offs[n].start;
3153                     }
3154                     n++;
3155                 }
3156                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3157                     && min >  prog->offs[0].end
3158                 )
3159                     min = prog->offs[0].end;
3160 
3161             }
3162 
3163             assert(min >= 0 && min <= max && min <= strend - strbeg);
3164             sublen = max - min;
3165 
3166             if (RXp_MATCH_COPIED(prog)) {
3167                 if (sublen > prog->sublen)
3168                     prog->subbeg =
3169                             (char*)saferealloc(prog->subbeg, sublen+1);
3170             }
3171             else
3172                 prog->subbeg = (char*)safemalloc(sublen+1);
3173             Copy(strbeg + min, prog->subbeg, sublen, char);
3174             prog->subbeg[sublen] = '\0';
3175             prog->suboffset = min;
3176             prog->sublen = sublen;
3177             RXp_MATCH_COPIED_on(prog);
3178         }
3179         prog->subcoffset = prog->suboffset;
3180         if (prog->suboffset && utf8_target) {
3181             /* Convert byte offset to chars.
3182              * XXX ideally should only compute this if @-/@+
3183              * has been seen, a la PL_sawampersand ??? */
3184 
3185             /* If there's a direct correspondence between the
3186              * string which we're matching and the original SV,
3187              * then we can use the utf8 len cache associated with
3188              * the SV. In particular, it means that under //g,
3189              * sv_pos_b2u() will use the previously cached
3190              * position to speed up working out the new length of
3191              * subcoffset, rather than counting from the start of
3192              * the string each time. This stops
3193              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3194              * from going quadratic */
3195             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3196                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3197                                                 SV_GMAGIC|SV_CONST_RETURN);
3198             else
3199                 prog->subcoffset = utf8_length((U8*)strbeg,
3200                                     (U8*)(strbeg+prog->suboffset));
3201         }
3202     }
3203     else {
3204         RXp_MATCH_COPY_FREE(prog);
3205         prog->subbeg = strbeg;
3206         prog->suboffset = 0;
3207         prog->subcoffset = 0;
3208         prog->sublen = strend - strbeg;
3209     }
3210 }
3211 
3212 
3213 
3214 
3215 /*
3216  - regexec_flags - match a regexp against a string
3217  */
3218 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3219 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3220 	      char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3221 /* stringarg: the point in the string at which to begin matching */
3222 /* strend:    pointer to null at end of string */
3223 /* strbeg:    real beginning of string */
3224 /* minend:    end of match must be >= minend bytes after stringarg. */
3225 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3226  *            itself is accessed via the pointers above */
3227 /* data:      May be used for some additional optimizations.
3228               Currently unused. */
3229 /* flags:     For optimizations. See REXEC_* in regexp.h */
3230 
3231 {
3232     struct regexp *const prog = ReANY(rx);
3233     char *s;
3234     regnode *c;
3235     char *startpos;
3236     SSize_t minlen;		/* must match at least this many chars */
3237     SSize_t dontbother = 0;	/* how many characters not to try at end */
3238     const bool utf8_target = cBOOL(DO_UTF8(sv));
3239     I32 multiline;
3240     RXi_GET_DECL(prog,progi);
3241     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3242     regmatch_info *const reginfo = &reginfo_buf;
3243     regexp_paren_pair *swap = NULL;
3244     I32 oldsave;
3245     GET_RE_DEBUG_FLAGS_DECL;
3246 
3247     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3248     PERL_UNUSED_ARG(data);
3249 
3250     /* Be paranoid... */
3251     if (prog == NULL) {
3252 	Perl_croak(aTHX_ "NULL regexp parameter");
3253     }
3254 
3255     DEBUG_EXECUTE_r(
3256         debug_start_match(rx, utf8_target, stringarg, strend,
3257         "Matching");
3258     );
3259 
3260     startpos = stringarg;
3261 
3262     /* set these early as they may be used by the HOP macros below */
3263     reginfo->strbeg = strbeg;
3264     reginfo->strend = strend;
3265     reginfo->is_utf8_target = cBOOL(utf8_target);
3266 
3267     if (prog->intflags & PREGf_GPOS_SEEN) {
3268         MAGIC *mg;
3269 
3270         /* set reginfo->ganch, the position where \G can match */
3271 
3272         reginfo->ganch =
3273             (flags & REXEC_IGNOREPOS)
3274             ? stringarg /* use start pos rather than pos() */
3275             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3276               /* Defined pos(): */
3277             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3278             : strbeg; /* pos() not defined; use start of string */
3279 
3280         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3281             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3282 
3283         /* in the presence of \G, we may need to start looking earlier in
3284          * the string than the suggested start point of stringarg:
3285          * if prog->gofs is set, then that's a known, fixed minimum
3286          * offset, such as
3287          * /..\G/:   gofs = 2
3288          * /ab|c\G/: gofs = 1
3289          * or if the minimum offset isn't known, then we have to go back
3290          * to the start of the string, e.g. /w+\G/
3291          */
3292 
3293         if (prog->intflags & PREGf_ANCH_GPOS) {
3294             if (prog->gofs) {
3295                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3296                 if (!startpos ||
3297                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3298                 {
3299                     DEBUG_r(Perl_re_printf( aTHX_
3300                             "fail: ganch-gofs before earliest possible start\n"));
3301                     return 0;
3302                 }
3303             }
3304             else
3305                 startpos = reginfo->ganch;
3306         }
3307         else if (prog->gofs) {
3308             startpos = HOPBACKc(startpos, prog->gofs);
3309             if (!startpos)
3310                 startpos = strbeg;
3311         }
3312         else if (prog->intflags & PREGf_GPOS_FLOAT)
3313             startpos = strbeg;
3314     }
3315 
3316     minlen = prog->minlen;
3317     if ((startpos + minlen) > strend || startpos < strbeg) {
3318         DEBUG_r(Perl_re_printf( aTHX_
3319                     "Regex match can't succeed, so not even tried\n"));
3320         return 0;
3321     }
3322 
3323     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3324      * which will call destuctors to reset PL_regmatch_state, free higher
3325      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3326      * regmatch_info_aux_eval */
3327 
3328     oldsave = PL_savestack_ix;
3329 
3330     s = startpos;
3331 
3332     if ((prog->extflags & RXf_USE_INTUIT)
3333         && !(flags & REXEC_CHECKED))
3334     {
3335 	s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3336                                     flags, NULL);
3337 	if (!s)
3338 	    return 0;
3339 
3340 	if (prog->extflags & RXf_CHECK_ALL) {
3341             /* we can match based purely on the result of INTUIT.
3342              * Set up captures etc just for $& and $-[0]
3343              * (an intuit-only match wont have $1,$2,..) */
3344             assert(!prog->nparens);
3345 
3346             /* s/// doesn't like it if $& is earlier than where we asked it to
3347              * start searching (which can happen on something like /.\G/) */
3348             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3349                     && (s < stringarg))
3350             {
3351                 /* this should only be possible under \G */
3352                 assert(prog->intflags & PREGf_GPOS_SEEN);
3353                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3354                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3355                 goto phooey;
3356             }
3357 
3358             /* match via INTUIT shouldn't have any captures.
3359              * Let @-, @+, $^N know */
3360             prog->lastparen = prog->lastcloseparen = 0;
3361             RXp_MATCH_UTF8_set(prog, utf8_target);
3362             prog->offs[0].start = s - strbeg;
3363             prog->offs[0].end = utf8_target
3364                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3365                 : s - strbeg + prog->minlenret;
3366             if ( !(flags & REXEC_NOT_FIRST) )
3367                 S_reg_set_capture_string(aTHX_ rx,
3368                                         strbeg, strend,
3369                                         sv, flags, utf8_target);
3370 
3371 	    return 1;
3372         }
3373     }
3374 
3375     multiline = prog->extflags & RXf_PMf_MULTILINE;
3376 
3377     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3378         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3379 			      "String too short [regexec_flags]...\n"));
3380 	goto phooey;
3381     }
3382 
3383     /* Check validity of program. */
3384     if (UCHARAT(progi->program) != REG_MAGIC) {
3385 	Perl_croak(aTHX_ "corrupted regexp program");
3386     }
3387 
3388     RXp_MATCH_TAINTED_off(prog);
3389     RXp_MATCH_UTF8_set(prog, utf8_target);
3390 
3391     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3392     reginfo->intuit = 0;
3393     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3394     reginfo->warned = FALSE;
3395     reginfo->sv = sv;
3396     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3397     /* see how far we have to get to not match where we matched before */
3398     reginfo->till = stringarg + minend;
3399 
3400     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3401         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3402            S_cleanup_regmatch_info_aux has executed (registered by
3403            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3404            magic belonging to this SV.
3405            Not newSVsv, either, as it does not COW.
3406         */
3407         reginfo->sv = newSV(0);
3408         SvSetSV_nosteal(reginfo->sv, sv);
3409         SAVEFREESV(reginfo->sv);
3410     }
3411 
3412     /* reserve next 2 or 3 slots in PL_regmatch_state:
3413      * slot N+0: may currently be in use: skip it
3414      * slot N+1: use for regmatch_info_aux struct
3415      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3416      * slot N+3: ready for use by regmatch()
3417      */
3418 
3419     {
3420         regmatch_state *old_regmatch_state;
3421         regmatch_slab  *old_regmatch_slab;
3422         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3423 
3424         /* on first ever match, allocate first slab */
3425         if (!PL_regmatch_slab) {
3426             Newx(PL_regmatch_slab, 1, regmatch_slab);
3427             PL_regmatch_slab->prev = NULL;
3428             PL_regmatch_slab->next = NULL;
3429             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3430         }
3431 
3432         old_regmatch_state = PL_regmatch_state;
3433         old_regmatch_slab  = PL_regmatch_slab;
3434 
3435         for (i=0; i <= max; i++) {
3436             if (i == 1)
3437                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3438             else if (i ==2)
3439                 reginfo->info_aux_eval =
3440                 reginfo->info_aux->info_aux_eval =
3441                             &(PL_regmatch_state->u.info_aux_eval);
3442 
3443             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3444                 PL_regmatch_state = S_push_slab(aTHX);
3445         }
3446 
3447         /* note initial PL_regmatch_state position; at end of match we'll
3448          * pop back to there and free any higher slabs */
3449 
3450         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3451         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3452         reginfo->info_aux->poscache = NULL;
3453 
3454         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3455 
3456         if ((prog->extflags & RXf_EVAL_SEEN))
3457             S_setup_eval_state(aTHX_ reginfo);
3458         else
3459             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3460     }
3461 
3462     /* If there is a "must appear" string, look for it. */
3463 
3464     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3465         /* We have to be careful. If the previous successful match
3466            was from this regex we don't want a subsequent partially
3467            successful match to clobber the old results.
3468            So when we detect this possibility we add a swap buffer
3469            to the re, and switch the buffer each match. If we fail,
3470            we switch it back; otherwise we leave it swapped.
3471         */
3472         swap = prog->offs;
3473         /* avoid leak if we die, or clean up anyway if match completes */
3474         SAVEFREEPV(swap);
3475         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3476         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3477 	    "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3478 	    0,
3479             PTR2UV(prog),
3480 	    PTR2UV(swap),
3481 	    PTR2UV(prog->offs)
3482 	));
3483     }
3484 
3485     if (prog->recurse_locinput)
3486         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3487 
3488     /* Simplest case: anchored match need be tried only once, or with
3489      * MBOL, only at the beginning of each line.
3490      *
3491      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3492      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3493      * match at the start of the string then it won't match anywhere else
3494      * either; while with /.*.../, if it doesn't match at the beginning,
3495      * the earliest it could match is at the start of the next line */
3496 
3497     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3498         char *end;
3499 
3500 	if (regtry(reginfo, &s))
3501 	    goto got_it;
3502 
3503         if (!(prog->intflags & PREGf_ANCH_MBOL))
3504             goto phooey;
3505 
3506         /* didn't match at start, try at other newline positions */
3507 
3508         if (minlen)
3509             dontbother = minlen - 1;
3510         end = HOP3c(strend, -dontbother, strbeg) - 1;
3511 
3512         /* skip to next newline */
3513 
3514         while (s <= end) { /* note it could be possible to match at the end of the string */
3515             /* NB: newlines are the same in unicode as they are in latin */
3516             if (*s++ != '\n')
3517                 continue;
3518             if (prog->check_substr || prog->check_utf8) {
3519             /* note that with PREGf_IMPLICIT, intuit can only fail
3520              * or return the start position, so it's of limited utility.
3521              * Nevertheless, I made the decision that the potential for
3522              * quick fail was still worth it - DAPM */
3523                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3524                 if (!s)
3525                     goto phooey;
3526             }
3527             if (regtry(reginfo, &s))
3528                 goto got_it;
3529         }
3530         goto phooey;
3531     } /* end anchored search */
3532 
3533     if (prog->intflags & PREGf_ANCH_GPOS)
3534     {
3535         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3536         assert(prog->intflags & PREGf_GPOS_SEEN);
3537         /* For anchored \G, the only position it can match from is
3538          * (ganch-gofs); we already set startpos to this above; if intuit
3539          * moved us on from there, we can't possibly succeed */
3540         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3541 	if (s == startpos && regtry(reginfo, &s))
3542 	    goto got_it;
3543 	goto phooey;
3544     }
3545 
3546     /* Messy cases:  unanchored match. */
3547     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3548 	/* we have /x+whatever/ */
3549 	/* it must be a one character string (XXXX Except is_utf8_pat?) */
3550 	char ch;
3551 #ifdef DEBUGGING
3552 	int did_match = 0;
3553 #endif
3554 	if (utf8_target) {
3555             if (! prog->anchored_utf8) {
3556                 to_utf8_substr(prog);
3557             }
3558             ch = SvPVX_const(prog->anchored_utf8)[0];
3559 	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3560 		if (*s == ch) {
3561 		    DEBUG_EXECUTE_r( did_match = 1 );
3562 		    if (regtry(reginfo, &s)) goto got_it;
3563 		    s += UTF8_SAFE_SKIP(s, strend);
3564 		    while (s < strend && *s == ch)
3565 			s += UTF8SKIP(s);
3566 		}
3567 	    );
3568 
3569 	}
3570 	else {
3571             if (! prog->anchored_substr) {
3572                 if (! to_byte_substr(prog)) {
3573                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3574                 }
3575             }
3576             ch = SvPVX_const(prog->anchored_substr)[0];
3577 	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3578 		if (*s == ch) {
3579 		    DEBUG_EXECUTE_r( did_match = 1 );
3580 		    if (regtry(reginfo, &s)) goto got_it;
3581 		    s++;
3582 		    while (s < strend && *s == ch)
3583 			s++;
3584 		}
3585 	    );
3586 	}
3587 	DEBUG_EXECUTE_r(if (!did_match)
3588                 Perl_re_printf( aTHX_
3589                                   "Did not find anchored character...\n")
3590                );
3591     }
3592     else if (prog->anchored_substr != NULL
3593 	      || prog->anchored_utf8 != NULL
3594 	      || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3595 		  && prog->float_max_offset < strend - s)) {
3596 	SV *must;
3597 	SSize_t back_max;
3598 	SSize_t back_min;
3599 	char *last;
3600 	char *last1;		/* Last position checked before */
3601 #ifdef DEBUGGING
3602 	int did_match = 0;
3603 #endif
3604 	if (prog->anchored_substr || prog->anchored_utf8) {
3605 	    if (utf8_target) {
3606                 if (! prog->anchored_utf8) {
3607                     to_utf8_substr(prog);
3608                 }
3609                 must = prog->anchored_utf8;
3610             }
3611             else {
3612                 if (! prog->anchored_substr) {
3613                     if (! to_byte_substr(prog)) {
3614                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3615                     }
3616                 }
3617                 must = prog->anchored_substr;
3618             }
3619 	    back_max = back_min = prog->anchored_offset;
3620 	} else {
3621 	    if (utf8_target) {
3622                 if (! prog->float_utf8) {
3623                     to_utf8_substr(prog);
3624                 }
3625                 must = prog->float_utf8;
3626             }
3627             else {
3628                 if (! prog->float_substr) {
3629                     if (! to_byte_substr(prog)) {
3630                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3631                     }
3632                 }
3633                 must = prog->float_substr;
3634             }
3635 	    back_max = prog->float_max_offset;
3636 	    back_min = prog->float_min_offset;
3637 	}
3638 
3639         if (back_min<0) {
3640 	    last = strend;
3641 	} else {
3642             last = HOP3c(strend,	/* Cannot start after this */
3643         	  -(SSize_t)(CHR_SVLEN(must)
3644         		 - (SvTAIL(must) != 0) + back_min), strbeg);
3645         }
3646 	if (s > reginfo->strbeg)
3647 	    last1 = HOPc(s, -1);
3648 	else
3649 	    last1 = s - 1;	/* bogus */
3650 
3651 	/* XXXX check_substr already used to find "s", can optimize if
3652 	   check_substr==must. */
3653 	dontbother = 0;
3654 	strend = HOPc(strend, -dontbother);
3655 	while ( (s <= last) &&
3656 		(s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3657 				  (unsigned char*)strend, must,
3658 				  multiline ? FBMrf_MULTILINE : 0)) ) {
3659 	    DEBUG_EXECUTE_r( did_match = 1 );
3660 	    if (HOPc(s, -back_max) > last1) {
3661 		last1 = HOPc(s, -back_min);
3662 		s = HOPc(s, -back_max);
3663 	    }
3664 	    else {
3665 		char * const t = (last1 >= reginfo->strbeg)
3666                                     ? HOPc(last1, 1) : last1 + 1;
3667 
3668 		last1 = HOPc(s, -back_min);
3669 		s = t;
3670 	    }
3671 	    if (utf8_target) {
3672 		while (s <= last1) {
3673 		    if (regtry(reginfo, &s))
3674 			goto got_it;
3675                     if (s >= last1) {
3676                         s++; /* to break out of outer loop */
3677                         break;
3678                     }
3679                     s += UTF8SKIP(s);
3680 		}
3681 	    }
3682 	    else {
3683 		while (s <= last1) {
3684 		    if (regtry(reginfo, &s))
3685 			goto got_it;
3686 		    s++;
3687 		}
3688 	    }
3689 	}
3690 	DEBUG_EXECUTE_r(if (!did_match) {
3691             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3692                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3693             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3694 			      ((must == prog->anchored_substr || must == prog->anchored_utf8)
3695 			       ? "anchored" : "floating"),
3696                 quoted, RE_SV_TAIL(must));
3697         });
3698 	goto phooey;
3699     }
3700     else if ( (c = progi->regstclass) ) {
3701 	if (minlen) {
3702 	    const OPCODE op = OP(progi->regstclass);
3703 	    /* don't bother with what can't match */
3704 	    if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3705 	        strend = HOPc(strend, -(minlen - 1));
3706 	}
3707 	DEBUG_EXECUTE_r({
3708 	    SV * const prop = sv_newmortal();
3709             regprop(prog, prop, c, reginfo, NULL);
3710 	    {
3711 		RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3712 		    s,strend-s,PL_dump_re_max_len);
3713                 Perl_re_printf( aTHX_
3714 		    "Matching stclass %.*s against %s (%d bytes)\n",
3715 		    (int)SvCUR(prop), SvPVX_const(prop),
3716 		     quoted, (int)(strend - s));
3717 	    }
3718 	});
3719         if (find_byclass(prog, c, s, strend, reginfo))
3720 	    goto got_it;
3721         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3722     }
3723     else {
3724 	dontbother = 0;
3725 	if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3726 	    /* Trim the end. */
3727 	    char *last= NULL;
3728 	    SV* float_real;
3729 	    STRLEN len;
3730 	    const char *little;
3731 
3732 	    if (utf8_target) {
3733                 if (! prog->float_utf8) {
3734                     to_utf8_substr(prog);
3735                 }
3736                 float_real = prog->float_utf8;
3737             }
3738             else {
3739                 if (! prog->float_substr) {
3740                     if (! to_byte_substr(prog)) {
3741                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3742                     }
3743                 }
3744                 float_real = prog->float_substr;
3745             }
3746 
3747             little = SvPV_const(float_real, len);
3748 	    if (SvTAIL(float_real)) {
3749                     /* This means that float_real contains an artificial \n on
3750                      * the end due to the presence of something like this:
3751                      * /foo$/ where we can match both "foo" and "foo\n" at the
3752                      * end of the string.  So we have to compare the end of the
3753                      * string first against the float_real without the \n and
3754                      * then against the full float_real with the string.  We
3755                      * have to watch out for cases where the string might be
3756                      * smaller than the float_real or the float_real without
3757                      * the \n. */
3758 		    char *checkpos= strend - len;
3759 		    DEBUG_OPTIMISE_r(
3760                         Perl_re_printf( aTHX_
3761 			    "%sChecking for float_real.%s\n",
3762 			    PL_colors[4], PL_colors[5]));
3763 		    if (checkpos + 1 < strbeg) {
3764                         /* can't match, even if we remove the trailing \n
3765                          * string is too short to match */
3766 			DEBUG_EXECUTE_r(
3767                             Perl_re_printf( aTHX_
3768 				"%sString shorter than required trailing substring, cannot match.%s\n",
3769 				PL_colors[4], PL_colors[5]));
3770 			goto phooey;
3771 		    } else if (memEQ(checkpos + 1, little, len - 1)) {
3772                         /* can match, the end of the string matches without the
3773                          * "\n" */
3774 			last = checkpos + 1;
3775 		    } else if (checkpos < strbeg) {
3776                         /* cant match, string is too short when the "\n" is
3777                          * included */
3778 			DEBUG_EXECUTE_r(
3779                             Perl_re_printf( aTHX_
3780 				"%sString does not contain required trailing substring, cannot match.%s\n",
3781 				PL_colors[4], PL_colors[5]));
3782 			goto phooey;
3783 		    } else if (!multiline) {
3784                         /* non multiline match, so compare with the "\n" at the
3785                          * end of the string */
3786 			if (memEQ(checkpos, little, len)) {
3787 			    last= checkpos;
3788 			} else {
3789 			    DEBUG_EXECUTE_r(
3790                                 Perl_re_printf( aTHX_
3791 				    "%sString does not contain required trailing substring, cannot match.%s\n",
3792 				    PL_colors[4], PL_colors[5]));
3793 			    goto phooey;
3794 			}
3795 		    } else {
3796                         /* multiline match, so we have to search for a place
3797                          * where the full string is located */
3798 			goto find_last;
3799 		    }
3800 	    } else {
3801 		  find_last:
3802 		    if (len)
3803 			last = rninstr(s, strend, little, little + len);
3804 		    else
3805 			last = strend;	/* matching "$" */
3806 	    }
3807 	    if (!last) {
3808                 /* at one point this block contained a comment which was
3809                  * probably incorrect, which said that this was a "should not
3810                  * happen" case.  Even if it was true when it was written I am
3811                  * pretty sure it is not anymore, so I have removed the comment
3812                  * and replaced it with this one. Yves */
3813 		DEBUG_EXECUTE_r(
3814                     Perl_re_printf( aTHX_
3815 			"%sString does not contain required substring, cannot match.%s\n",
3816                         PL_colors[4], PL_colors[5]
3817 	            ));
3818 		goto phooey;
3819 	    }
3820 	    dontbother = strend - last + prog->float_min_offset;
3821 	}
3822 	if (minlen && (dontbother < minlen))
3823 	    dontbother = minlen - 1;
3824 	strend -= dontbother; 		   /* this one's always in bytes! */
3825 	/* We don't know much -- general case. */
3826 	if (utf8_target) {
3827 	    for (;;) {
3828 		if (regtry(reginfo, &s))
3829 		    goto got_it;
3830 		if (s >= strend)
3831 		    break;
3832 		s += UTF8SKIP(s);
3833 	    };
3834 	}
3835 	else {
3836 	    do {
3837 		if (regtry(reginfo, &s))
3838 		    goto got_it;
3839 	    } while (s++ < strend);
3840 	}
3841     }
3842 
3843     /* Failure. */
3844     goto phooey;
3845 
3846   got_it:
3847     /* s/// doesn't like it if $& is earlier than where we asked it to
3848      * start searching (which can happen on something like /.\G/) */
3849     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3850             && (prog->offs[0].start < stringarg - strbeg))
3851     {
3852         /* this should only be possible under \G */
3853         assert(prog->intflags & PREGf_GPOS_SEEN);
3854         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3855             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3856         goto phooey;
3857     }
3858 
3859     /* clean up; this will trigger destructors that will free all slabs
3860      * above the current one, and cleanup the regmatch_info_aux
3861      * and regmatch_info_aux_eval sructs */
3862 
3863     LEAVE_SCOPE(oldsave);
3864 
3865     if (RXp_PAREN_NAMES(prog))
3866         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3867 
3868     /* make sure $`, $&, $', and $digit will work later */
3869     if ( !(flags & REXEC_NOT_FIRST) )
3870         S_reg_set_capture_string(aTHX_ rx,
3871                                     strbeg, reginfo->strend,
3872                                     sv, flags, utf8_target);
3873 
3874     return 1;
3875 
3876   phooey:
3877     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3878 			  PL_colors[4], PL_colors[5]));
3879 
3880     if (swap) {
3881         /* we failed :-( roll it back.
3882          * Since the swap buffer will be freed on scope exit which follows
3883          * shortly, restore the old captures by copying 'swap's original
3884          * data to the new offs buffer
3885          */
3886         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3887 	    "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3888 	    0,
3889             PTR2UV(prog),
3890 	    PTR2UV(prog->offs),
3891 	    PTR2UV(swap)
3892 	));
3893 
3894         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3895     }
3896 
3897     /* clean up; this will trigger destructors that will free all slabs
3898      * above the current one, and cleanup the regmatch_info_aux
3899      * and regmatch_info_aux_eval sructs */
3900 
3901     LEAVE_SCOPE(oldsave);
3902 
3903     return 0;
3904 }
3905 
3906 
3907 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3908  * Do inc before dec, in case old and new rex are the same */
3909 #define SET_reg_curpm(Re2)                          \
3910     if (reginfo->info_aux_eval) {                   \
3911 	(void)ReREFCNT_inc(Re2);		    \
3912 	ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
3913 	PM_SETRE((PL_reg_curpm), (Re2));	    \
3914     }
3915 
3916 
3917 /*
3918  - regtry - try match at specific point
3919  */
3920 STATIC bool			/* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)3921 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3922 {
3923     CHECKPOINT lastcp;
3924     REGEXP *const rx = reginfo->prog;
3925     regexp *const prog = ReANY(rx);
3926     SSize_t result;
3927 #ifdef DEBUGGING
3928     U32 depth = 0; /* used by REGCP_SET */
3929 #endif
3930     RXi_GET_DECL(prog,progi);
3931     GET_RE_DEBUG_FLAGS_DECL;
3932 
3933     PERL_ARGS_ASSERT_REGTRY;
3934 
3935     reginfo->cutpoint=NULL;
3936 
3937     prog->offs[0].start = *startposp - reginfo->strbeg;
3938     prog->lastparen = 0;
3939     prog->lastcloseparen = 0;
3940 
3941     /* XXXX What this code is doing here?!!!  There should be no need
3942        to do this again and again, prog->lastparen should take care of
3943        this!  --ilya*/
3944 
3945     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3946      * Actually, the code in regcppop() (which Ilya may be meaning by
3947      * prog->lastparen), is not needed at all by the test suite
3948      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3949      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3950      * Meanwhile, this code *is* needed for the
3951      * above-mentioned test suite tests to succeed.  The common theme
3952      * on those tests seems to be returning null fields from matches.
3953      * --jhi updated by dapm */
3954 
3955     /* After encountering a variant of the issue mentioned above I think
3956      * the point Ilya was making is that if we properly unwind whenever
3957      * we set lastparen to a smaller value then we should not need to do
3958      * this every time, only when needed. So if we have tests that fail if
3959      * we remove this, then it suggests somewhere else we are improperly
3960      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3961      * places it is called, and related regcp() routines. - Yves */
3962 #if 1
3963     if (prog->nparens) {
3964 	regexp_paren_pair *pp = prog->offs;
3965 	I32 i;
3966 	for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3967 	    ++pp;
3968 	    pp->start = -1;
3969 	    pp->end = -1;
3970 	}
3971     }
3972 #endif
3973     REGCP_SET(lastcp);
3974     result = regmatch(reginfo, *startposp, progi->program + 1);
3975     if (result != -1) {
3976 	prog->offs[0].end = result;
3977 	return 1;
3978     }
3979     if (reginfo->cutpoint)
3980         *startposp= reginfo->cutpoint;
3981     REGCP_UNWIND(lastcp);
3982     return 0;
3983 }
3984 
3985 /* this is used to determine how far from the left messages like
3986    'failed...' are printed in regexec.c. It should be set such that
3987    messages are inline with the regop output that created them.
3988 */
3989 #define REPORT_CODE_OFF 29
3990 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3991 #ifdef DEBUGGING
3992 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)3993 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3994 {
3995     va_list ap;
3996     int result;
3997     PerlIO *f= Perl_debug_log;
3998     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3999     va_start(ap, depth);
4000     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4001     result = PerlIO_vprintf(f, fmt, ap);
4002     va_end(ap);
4003     return result;
4004 }
4005 #endif /* DEBUGGING */
4006 
4007 /* grab a new slab and return the first slot in it */
4008 
4009 STATIC regmatch_state *
S_push_slab(pTHX)4010 S_push_slab(pTHX)
4011 {
4012     regmatch_slab *s = PL_regmatch_slab->next;
4013     if (!s) {
4014 	Newx(s, 1, regmatch_slab);
4015 	s->prev = PL_regmatch_slab;
4016 	s->next = NULL;
4017 	PL_regmatch_slab->next = s;
4018     }
4019     PL_regmatch_slab = s;
4020     return SLAB_FIRST(s);
4021 }
4022 
4023 #ifdef DEBUGGING
4024 
4025 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4026 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4027     const char *start, const char *end, const char *blurb)
4028 {
4029     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4030 
4031     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4032 
4033     if (!PL_colorset)
4034             reginitcolors();
4035     {
4036         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4037             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4038 
4039         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4040             start, end - start, PL_dump_re_max_len);
4041 
4042         Perl_re_printf( aTHX_
4043             "%s%s REx%s %s against %s\n",
4044 		       PL_colors[4], blurb, PL_colors[5], s0, s1);
4045 
4046         if (utf8_target||utf8_pat)
4047             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4048                 utf8_pat ? "pattern" : "",
4049                 utf8_pat && utf8_target ? " and " : "",
4050                 utf8_target ? "string" : ""
4051             );
4052     }
4053 }
4054 
4055 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4056 S_dump_exec_pos(pTHX_ const char *locinput,
4057                       const regnode *scan,
4058                       const char *loc_regeol,
4059                       const char *loc_bostr,
4060                       const char *loc_reg_starttry,
4061                       const bool utf8_target,
4062                       const U32 depth
4063                 )
4064 {
4065     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4066     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4067     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4068     /* The part of the string before starttry has one color
4069        (pref0_len chars), between starttry and current
4070        position another one (pref_len - pref0_len chars),
4071        after the current position the third one.
4072        We assume that pref0_len <= pref_len, otherwise we
4073        decrease pref0_len.  */
4074     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4075 	? (5 + taill) - l : locinput - loc_bostr;
4076     int pref0_len;
4077 
4078     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4079 
4080     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4081 	pref_len++;
4082     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4083     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4084 	l = ( loc_regeol - locinput > (5 + taill) - pref_len
4085 	      ? (5 + taill) - pref_len : loc_regeol - locinput);
4086     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4087 	l--;
4088     if (pref0_len < 0)
4089 	pref0_len = 0;
4090     if (pref0_len > pref_len)
4091 	pref0_len = pref_len;
4092     {
4093 	const int is_uni = utf8_target ? 1 : 0;
4094 
4095 	RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4096 	    (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4097 
4098 	RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4099 		    (locinput - pref_len + pref0_len),
4100 		    pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4101 
4102 	RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4103 		    locinput, loc_regeol - locinput, 10, 0, 1);
4104 
4105 	const STRLEN tlen=len0+len1+len2;
4106         Perl_re_printf( aTHX_
4107                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4108 		    (IV)(locinput - loc_bostr),
4109 		    len0, s0,
4110 		    len1, s1,
4111 		    (docolor ? "" : "> <"),
4112 		    len2, s2,
4113 		    (int)(tlen > 19 ? 0 :  19 - tlen),
4114                     "",
4115                     depth);
4116     }
4117 }
4118 
4119 #endif
4120 
4121 /* reg_check_named_buff_matched()
4122  * Checks to see if a named buffer has matched. The data array of
4123  * buffer numbers corresponding to the buffer is expected to reside
4124  * in the regexp->data->data array in the slot stored in the ARG() of
4125  * node involved. Note that this routine doesn't actually care about the
4126  * name, that information is not preserved from compilation to execution.
4127  * Returns the index of the leftmost defined buffer with the given name
4128  * or 0 if non of the buffers matched.
4129  */
4130 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4131 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4132 {
4133     I32 n;
4134     RXi_GET_DECL(rex,rexi);
4135     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4136     I32 *nums=(I32*)SvPVX(sv_dat);
4137 
4138     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4139 
4140     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4141         if ((I32)rex->lastparen >= nums[n] &&
4142             rex->offs[nums[n]].end != -1)
4143         {
4144             return nums[n];
4145         }
4146     }
4147     return 0;
4148 }
4149 
4150 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4151 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4152 #define CHRTEST_NOT_A_CP_1 -999
4153 #define CHRTEST_NOT_A_CP_2 -998
4154 
4155 static bool
S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node,int * c1p,U8 * c1_utf8,int * c2p,U8 * c2_utf8,regmatch_info * reginfo)4156 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4157         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4158 {
4159     /* This function determines if there are zero, one, two, or more characters
4160      * that match the first character of the passed-in EXACTish node
4161      * <text_node>, and if there are one or two, it returns them in the
4162      * passed-in pointers.
4163      *
4164      * If it determines that no possible character in the target string can
4165      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4166      * the first character in <text_node> requires UTF-8 to represent, and the
4167      * target string isn't in UTF-8.)
4168      *
4169      * If there are more than two characters that could match the beginning of
4170      * <text_node>, or if more context is required to determine a match or not,
4171      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4172      *
4173      * The motiviation behind this function is to allow the caller to set up
4174      * tight loops for matching.  If <text_node> is of type EXACT, there is
4175      * only one possible character that can match its first character, and so
4176      * the situation is quite simple.  But things get much more complicated if
4177      * folding is involved.  It may be that the first character of an EXACTFish
4178      * node doesn't participate in any possible fold, e.g., punctuation, so it
4179      * can be matched only by itself.  The vast majority of characters that are
4180      * in folds match just two things, their lower and upper-case equivalents.
4181      * But not all are like that; some have multiple possible matches, or match
4182      * sequences of more than one character.  This function sorts all that out.
4183      *
4184      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4185      * loop of trying to match A*, we know we can't exit where the thing
4186      * following it isn't a B.  And something can't be a B unless it is the
4187      * beginning of B.  By putting a quick test for that beginning in a tight
4188      * loop, we can rule out things that can't possibly be B without having to
4189      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4190      * character, we can make a tight loop matching A*, using the outputs of
4191      * this function.
4192      *
4193      * If the target string to match isn't in UTF-8, and there aren't
4194      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4195      * the one or two possible octets (which are characters in this situation)
4196      * that can match.  In all cases, if there is only one character that can
4197      * match, *<c1p> and *<c2p> will be identical.
4198      *
4199      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4200      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4201      * can match the beginning of <text_node>.  They should be declared with at
4202      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4203      * undefined what these contain.)  If one or both of the buffers are
4204      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4205      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4206      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4207      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4208      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4209 
4210     const bool utf8_target = reginfo->is_utf8_target;
4211 
4212     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4213     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4214     bool use_chrtest_void = FALSE;
4215     const bool is_utf8_pat = reginfo->is_utf8_pat;
4216 
4217     /* Used when we have both utf8 input and utf8 output, to avoid converting
4218      * to/from code points */
4219     bool utf8_has_been_setup = FALSE;
4220 
4221     dVAR;
4222 
4223     U8 *pat = (U8*)STRING(text_node);
4224     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4225 
4226     if (   OP(text_node) == EXACT
4227         || OP(text_node) == EXACT_ONLY8
4228         || OP(text_node) == EXACTL)
4229     {
4230 
4231         /* In an exact node, only one thing can be matched, that first
4232          * character.  If both the pat and the target are UTF-8, we can just
4233          * copy the input to the output, avoiding finding the code point of
4234          * that character */
4235         if (!is_utf8_pat) {
4236             assert(OP(text_node) != EXACT_ONLY8);
4237             c2 = c1 = *pat;
4238         }
4239         else if (utf8_target) {
4240             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4241             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4242             utf8_has_been_setup = TRUE;
4243         }
4244         else if (OP(text_node) == EXACT_ONLY8) {
4245             return FALSE;   /* Can only match UTF-8 target */
4246         }
4247         else {
4248             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4249         }
4250     }
4251     else { /* an EXACTFish node */
4252         U8 *pat_end = pat + STR_LEN(text_node);
4253 
4254         /* An EXACTFL node has at least some characters unfolded, because what
4255          * they match is not known until now.  So, now is the time to fold
4256          * the first few of them, as many as are needed to determine 'c1' and
4257          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4258          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4259          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4260          * need to fold as many characters as a single character can fold to,
4261          * so that later we can check if the first ones are such a multi-char
4262          * fold.  But, in such a pattern only locale-problematic characters
4263          * aren't folded, so we can skip this completely if the first character
4264          * in the node isn't one of the tricky ones */
4265         if (OP(text_node) == EXACTFL) {
4266 
4267             if (! is_utf8_pat) {
4268                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4269                 {
4270                     folded[0] = folded[1] = 's';
4271                     pat = folded;
4272                     pat_end = folded + 2;
4273                 }
4274             }
4275             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4276                 U8 *s = pat;
4277                 U8 *d = folded;
4278                 int i;
4279 
4280                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4281                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4282                         *(d++) = (U8) toFOLD_LC(*s);
4283                         s++;
4284                     }
4285                     else {
4286                         STRLEN len;
4287                         _toFOLD_utf8_flags(s,
4288                                            pat_end,
4289                                            d,
4290                                            &len,
4291                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4292                         d += len;
4293                         s += UTF8SKIP(s);
4294                     }
4295                 }
4296 
4297                 pat = folded;
4298                 pat_end = d;
4299             }
4300         }
4301 
4302         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4303              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4304         {
4305             /* Multi-character folds require more context to sort out.  Also
4306              * PL_utf8_foldclosures used below doesn't handle them, so have to
4307              * be handled outside this routine */
4308             use_chrtest_void = TRUE;
4309         }
4310         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4311             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4312 
4313             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4314                 && OP(text_node) == EXACTFL
4315                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4316                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4317                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4318             {   /* Hard-coded Turkish locale rules for these 4 characters
4319                    override normal rules */
4320                 if (c1 == 'i') {
4321                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4322                 }
4323                 else if (c1 == 'I') {
4324                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4325                 }
4326                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4327                     c2 = 'i';
4328                 }
4329                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4330                     c2 = 'I';
4331                 }
4332             }
4333             else if (c1 > 255) {
4334                 const unsigned int * remaining_folds;
4335                 unsigned int first_fold;
4336 
4337                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4338                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4339                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4340                                                        &remaining_folds);
4341                 if (folds_count == 0) {
4342                     c2 = c1;    /* there is only a single character that could
4343                                    match */
4344                 }
4345                 else if (folds_count != 1) {
4346                     /* If there aren't exactly two folds to this (itself and
4347                      * another), it is outside the scope of this function */
4348                     use_chrtest_void = TRUE;
4349                 }
4350                 else {  /* There are two.  We already have one, get the other */
4351                     c2 = first_fold;
4352 
4353                     /* Folds that cross the 255/256 boundary are forbidden if
4354                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4355                      * ASCIII.  The only other match to c1 is c2, and since c1
4356                      * is above 255, c2 better be as well under these
4357                      * circumstances.  If it isn't, it means the only legal
4358                      * match of c1 is itself. */
4359                     if (    c2 < 256
4360                         && (   (   OP(text_node) == EXACTFL
4361                                 && ! IN_UTF8_CTYPE_LOCALE)
4362                             || ((     OP(text_node) == EXACTFAA
4363                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4364                                 && (isASCII(c1) || isASCII(c2)))))
4365                     {
4366                         c2 = c1;
4367                     }
4368                 }
4369             }
4370             else /* Here, c1 is <= 255 */
4371                 if (   utf8_target
4372                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4373                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4374                     && (   (   OP(text_node) != EXACTFAA
4375                             && OP(text_node) != EXACTFAA_NO_TRIE)
4376                         ||   ! isASCII(c1)))
4377             {
4378                 /* Here, there could be something above Latin1 in the target
4379                  * which folds to this character in the pattern.  All such
4380                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4381                  * than two characters involved in their folds, so are outside
4382                  * the scope of this function */
4383                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4384                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4385                 }
4386                 else {
4387                     use_chrtest_void = TRUE;
4388                 }
4389             }
4390             else { /* Here nothing above Latin1 can fold to the pattern
4391                       character */
4392                 switch (OP(text_node)) {
4393 
4394                     case EXACTFL:   /* /l rules */
4395                         c2 = PL_fold_locale[c1];
4396                         break;
4397 
4398                     case EXACTF:   /* This node only generated for non-utf8
4399                                     patterns */
4400                         assert(! is_utf8_pat);
4401                         if (! utf8_target) {    /* /d rules */
4402                             c2 = PL_fold[c1];
4403                             break;
4404                         }
4405                         /* FALLTHROUGH */
4406                         /* /u rules for all these.  This happens to work for
4407                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4408                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4409                                                 non-utf8 patterns */
4410                         assert(! is_utf8_pat);
4411                         /* FALLTHROUGH */
4412                     case EXACTFAA:
4413                     case EXACTFUP:
4414                     case EXACTFU:
4415                         c2 = PL_fold_latin1[c1];
4416                         break;
4417                     case EXACTFU_ONLY8:
4418                         return FALSE;
4419                         NOT_REACHED; /* NOTREACHED */
4420 
4421                     default:
4422                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4423                         NOT_REACHED; /* NOTREACHED */
4424                 }
4425             }
4426         }
4427     }
4428 
4429     /* Here have figured things out.  Set up the returns */
4430     if (use_chrtest_void) {
4431         *c2p = *c1p = CHRTEST_VOID;
4432     }
4433     else if (utf8_target) {
4434         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4435             uvchr_to_utf8(c1_utf8, c1);
4436             uvchr_to_utf8(c2_utf8, c2);
4437         }
4438 
4439         /* Invariants are stored in both the utf8 and byte outputs; Use
4440          * negative numbers otherwise for the byte ones.  Make sure that the
4441          * byte ones are the same iff the utf8 ones are the same */
4442         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4443         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4444                 ? *c2_utf8
4445                 : (c1 == c2)
4446                   ? CHRTEST_NOT_A_CP_1
4447                   : CHRTEST_NOT_A_CP_2;
4448     }
4449     else if (c1 > 255) {
4450        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4451                            can represent */
4452            return FALSE;
4453        }
4454 
4455        *c1p = *c2p = c2;    /* c2 is the only representable value */
4456     }
4457     else {  /* c1 is representable; see about c2 */
4458        *c1p = c1;
4459        *c2p = (c2 < 256) ? c2 : c1;
4460     }
4461 
4462     return TRUE;
4463 }
4464 
4465 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)4466 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4467 {
4468     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4469      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4470 
4471     PERL_ARGS_ASSERT_ISGCB;
4472 
4473     switch (GCB_table[before][after]) {
4474         case GCB_BREAKABLE:
4475             return TRUE;
4476 
4477         case GCB_NOBREAK:
4478             return FALSE;
4479 
4480         case GCB_RI_then_RI:
4481             {
4482                 int RI_count = 1;
4483                 U8 * temp_pos = (U8 *) curpos;
4484 
4485                 /* Do not break within emoji flag sequences. That is, do not
4486                  * break between regional indicator (RI) symbols if there is an
4487                  * odd number of RI characters before the break point.
4488                  *  GB12   sot (RI RI)* RI × RI
4489                  *  GB13 [^RI] (RI RI)* RI × RI */
4490 
4491                 while (backup_one_GCB(strbeg,
4492                                     &temp_pos,
4493                                     utf8_target) == GCB_Regional_Indicator)
4494                 {
4495                     RI_count++;
4496                 }
4497 
4498                 return RI_count % 2 != 1;
4499             }
4500 
4501         case GCB_EX_then_EM:
4502 
4503             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4504             {
4505                 U8 * temp_pos = (U8 *) curpos;
4506                 GCB_enum prev;
4507 
4508                 do {
4509                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4510                 }
4511                 while (prev == GCB_Extend);
4512 
4513                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4514             }
4515 
4516         case GCB_Maybe_Emoji_NonBreak:
4517 
4518             {
4519 
4520             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4521               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4522               */
4523                 U8 * temp_pos = (U8 *) curpos;
4524                 GCB_enum prev;
4525 
4526                 do {
4527                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4528                 }
4529                 while (prev == GCB_Extend);
4530 
4531                 return prev != GCB_XPG_XX;
4532             }
4533 
4534         default:
4535             break;
4536     }
4537 
4538 #ifdef DEBUGGING
4539     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4540                                   before, after, GCB_table[before][after]);
4541     assert(0);
4542 #endif
4543     return TRUE;
4544 }
4545 
4546 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)4547 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4548 {
4549     dVAR;
4550     GCB_enum gcb;
4551 
4552     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4553 
4554     if (*curpos < strbeg) {
4555         return GCB_EDGE;
4556     }
4557 
4558     if (utf8_target) {
4559         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4560         U8 * prev_prev_char_pos;
4561 
4562         if (! prev_char_pos) {
4563             return GCB_EDGE;
4564         }
4565 
4566         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4567             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4568             *curpos = prev_char_pos;
4569             prev_char_pos = prev_prev_char_pos;
4570         }
4571         else {
4572             *curpos = (U8 *) strbeg;
4573             return GCB_EDGE;
4574         }
4575     }
4576     else {
4577         if (*curpos - 2 < strbeg) {
4578             *curpos = (U8 *) strbeg;
4579             return GCB_EDGE;
4580         }
4581         (*curpos)--;
4582         gcb = getGCB_VAL_CP(*(*curpos - 1));
4583     }
4584 
4585     return gcb;
4586 }
4587 
4588 /* Combining marks attach to most classes that precede them, but this defines
4589  * the exceptions (from TR14) */
4590 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4591                                      || prev == LB_Mandatory_Break      \
4592                                      || prev == LB_Carriage_Return      \
4593                                      || prev == LB_Line_Feed            \
4594                                      || prev == LB_Next_Line            \
4595                                      || prev == LB_Space                \
4596                                      || prev == LB_ZWSpace))
4597 
4598 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)4599 S_isLB(pTHX_ LB_enum before,
4600              LB_enum after,
4601              const U8 * const strbeg,
4602              const U8 * const curpos,
4603              const U8 * const strend,
4604              const bool utf8_target)
4605 {
4606     U8 * temp_pos = (U8 *) curpos;
4607     LB_enum prev = before;
4608 
4609     /* Is the boundary between 'before' and 'after' line-breakable?
4610      * Most of this is just a table lookup of a generated table from Unicode
4611      * rules.  But some rules require context to decide, and so have to be
4612      * implemented in code */
4613 
4614     PERL_ARGS_ASSERT_ISLB;
4615 
4616     /* Rule numbers in the comments below are as of Unicode 9.0 */
4617 
4618   redo:
4619     before = prev;
4620     switch (LB_table[before][after]) {
4621         case LB_BREAKABLE:
4622             return TRUE;
4623 
4624         case LB_NOBREAK:
4625         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4626             return FALSE;
4627 
4628         case LB_SP_foo + LB_BREAKABLE:
4629         case LB_SP_foo + LB_NOBREAK:
4630         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4631 
4632             /* When we have something following a SP, we have to look at the
4633              * context in order to know what to do.
4634              *
4635              * SP SP should not reach here because LB7: Do not break before
4636              * spaces.  (For two spaces in a row there is nothing that
4637              * overrides that) */
4638             assert(after != LB_Space);
4639 
4640             /* Here we have a space followed by a non-space.  Mostly this is a
4641              * case of LB18: "Break after spaces".  But there are complications
4642              * as the handling of spaces is somewhat tricky.  They are in a
4643              * number of rules, which have to be applied in priority order, but
4644              * something earlier in the string can cause a rule to be skipped
4645              * and a lower priority rule invoked.  A prime example is LB7 which
4646              * says don't break before a space.  But rule LB8 (lower priority)
4647              * says that the first break opportunity after a ZW is after any
4648              * span of spaces immediately after it.  If a ZW comes before a SP
4649              * in the input, rule LB8 applies, and not LB7.  Other such rules
4650              * involve combining marks which are rules 9 and 10, but they may
4651              * override higher priority rules if they come earlier in the
4652              * string.  Since we're doing random access into the middle of the
4653              * string, we have to look for rules that should get applied based
4654              * on both string position and priority.  Combining marks do not
4655              * attach to either ZW nor SP, so we don't have to consider them
4656              * until later.
4657              *
4658              * To check for LB8, we have to find the first non-space character
4659              * before this span of spaces */
4660             do {
4661                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4662             }
4663             while (prev == LB_Space);
4664 
4665             /* LB8 Break before any character following a zero-width space,
4666              * even if one or more spaces intervene.
4667              *      ZW SP* ÷
4668              * So if we have a ZW just before this span, and to get here this
4669              * is the final space in the span. */
4670             if (prev == LB_ZWSpace) {
4671                 return TRUE;
4672             }
4673 
4674             /* Here, not ZW SP+.  There are several rules that have higher
4675              * priority than LB18 and can be resolved now, as they don't depend
4676              * on anything earlier in the string (except ZW, which we have
4677              * already handled).  One of these rules is LB11 Do not break
4678              * before Word joiner, but we have specially encoded that in the
4679              * lookup table so it is caught by the single test below which
4680              * catches the other ones. */
4681             if (LB_table[LB_Space][after] - LB_SP_foo
4682                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4683             {
4684                 return FALSE;
4685             }
4686 
4687             /* If we get here, we have to XXX consider combining marks. */
4688             if (prev == LB_Combining_Mark) {
4689 
4690                 /* What happens with these depends on the character they
4691                  * follow.  */
4692                 do {
4693                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4694                 }
4695                 while (prev == LB_Combining_Mark);
4696 
4697                 /* Most times these attach to and inherit the characteristics
4698                  * of that character, but not always, and when not, they are to
4699                  * be treated as AL by rule LB10. */
4700                 if (! LB_CM_ATTACHES_TO(prev)) {
4701                     prev = LB_Alphabetic;
4702                 }
4703             }
4704 
4705             /* Here, we have the character preceding the span of spaces all set
4706              * up.  We follow LB18: "Break after spaces" unless the table shows
4707              * that is overriden */
4708             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4709 
4710         case LB_CM_ZWJ_foo:
4711 
4712             /* We don't know how to treat the CM except by looking at the first
4713              * non-CM character preceding it.  ZWJ is treated as CM */
4714             do {
4715                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4716             }
4717             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4718 
4719             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4720              * attatches to it, then it inherits the behavior of 'prev'.  If it
4721              * doesn't attach, it is to be treated as an AL */
4722             if (! LB_CM_ATTACHES_TO(prev)) {
4723                 prev = LB_Alphabetic;
4724             }
4725 
4726             goto redo;
4727 
4728         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4729         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4730 
4731             /* LB21a Don't break after Hebrew + Hyphen.
4732              * HL (HY | BA) × */
4733 
4734             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4735                                                           == LB_Hebrew_Letter)
4736             {
4737                 return FALSE;
4738             }
4739 
4740             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4741 
4742         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4743         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4744 
4745             /* LB25a (PR | PO) × ( OP | HY )? NU */
4746             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4747                 return FALSE;
4748             }
4749 
4750             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4751                                                                 == LB_BREAKABLE;
4752 
4753         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4754         case LB_SY_or_IS_then_various + LB_NOBREAK:
4755         {
4756             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4757 
4758             LB_enum temp = prev;
4759             do {
4760                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4761             }
4762             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4763             if (temp == LB_Numeric) {
4764                 return FALSE;
4765             }
4766 
4767             return LB_table[prev][after] - LB_SY_or_IS_then_various
4768                                                                == LB_BREAKABLE;
4769         }
4770 
4771         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4772         case LB_various_then_PO_or_PR + LB_NOBREAK:
4773         {
4774             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4775 
4776             LB_enum temp = prev;
4777             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4778             {
4779                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4780             }
4781             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4782                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4783             }
4784             if (temp == LB_Numeric) {
4785                 return FALSE;
4786             }
4787             return LB_various_then_PO_or_PR;
4788         }
4789 
4790         case LB_RI_then_RI + LB_NOBREAK:
4791         case LB_RI_then_RI + LB_BREAKABLE:
4792             {
4793                 int RI_count = 1;
4794 
4795                 /* LB30a Break between two regional indicator symbols if and
4796                  * only if there are an even number of regional indicators
4797                  * preceding the position of the break.
4798                  *
4799                  *    sot (RI RI)* RI × RI
4800                  *  [^RI] (RI RI)* RI × RI */
4801 
4802                 while (backup_one_LB(strbeg,
4803                                      &temp_pos,
4804                                      utf8_target) == LB_Regional_Indicator)
4805                 {
4806                     RI_count++;
4807                 }
4808 
4809                 return RI_count % 2 == 0;
4810             }
4811 
4812         default:
4813             break;
4814     }
4815 
4816 #ifdef DEBUGGING
4817     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4818                                   before, after, LB_table[before][after]);
4819     assert(0);
4820 #endif
4821     return TRUE;
4822 }
4823 
4824 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)4825 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4826 {
4827     dVAR;
4828 
4829     LB_enum lb;
4830 
4831     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4832 
4833     if (*curpos >= strend) {
4834         return LB_EDGE;
4835     }
4836 
4837     if (utf8_target) {
4838         *curpos += UTF8SKIP(*curpos);
4839         if (*curpos >= strend) {
4840             return LB_EDGE;
4841         }
4842         lb = getLB_VAL_UTF8(*curpos, strend);
4843     }
4844     else {
4845         (*curpos)++;
4846         if (*curpos >= strend) {
4847             return LB_EDGE;
4848         }
4849         lb = getLB_VAL_CP(**curpos);
4850     }
4851 
4852     return lb;
4853 }
4854 
4855 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)4856 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4857 {
4858     dVAR;
4859     LB_enum lb;
4860 
4861     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4862 
4863     if (*curpos < strbeg) {
4864         return LB_EDGE;
4865     }
4866 
4867     if (utf8_target) {
4868         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4869         U8 * prev_prev_char_pos;
4870 
4871         if (! prev_char_pos) {
4872             return LB_EDGE;
4873         }
4874 
4875         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4876             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4877             *curpos = prev_char_pos;
4878             prev_char_pos = prev_prev_char_pos;
4879         }
4880         else {
4881             *curpos = (U8 *) strbeg;
4882             return LB_EDGE;
4883         }
4884     }
4885     else {
4886         if (*curpos - 2 < strbeg) {
4887             *curpos = (U8 *) strbeg;
4888             return LB_EDGE;
4889         }
4890         (*curpos)--;
4891         lb = getLB_VAL_CP(*(*curpos - 1));
4892     }
4893 
4894     return lb;
4895 }
4896 
4897 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)4898 S_isSB(pTHX_ SB_enum before,
4899              SB_enum after,
4900              const U8 * const strbeg,
4901              const U8 * const curpos,
4902              const U8 * const strend,
4903              const bool utf8_target)
4904 {
4905     /* returns a boolean indicating if there is a Sentence Boundary Break
4906      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4907 
4908     U8 * lpos = (U8 *) curpos;
4909     bool has_para_sep = FALSE;
4910     bool has_sp = FALSE;
4911 
4912     PERL_ARGS_ASSERT_ISSB;
4913 
4914     /* Break at the start and end of text.
4915         SB1.  sot  ÷
4916         SB2.  ÷  eot
4917       But unstated in Unicode is don't break if the text is empty */
4918     if (before == SB_EDGE || after == SB_EDGE) {
4919         return before != after;
4920     }
4921 
4922     /* SB 3: Do not break within CRLF. */
4923     if (before == SB_CR && after == SB_LF) {
4924         return FALSE;
4925     }
4926 
4927     /* Break after paragraph separators.  CR and LF are considered
4928      * so because Unicode views text as like word processing text where there
4929      * are no newlines except between paragraphs, and the word processor takes
4930      * care of wrapping without there being hard line-breaks in the text *./
4931        SB4.  Sep | CR | LF  ÷ */
4932     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4933         return TRUE;
4934     }
4935 
4936     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4937      * (See Section 6.2, Replacing Ignore Rules.)
4938         SB5.  X (Extend | Format)*  →  X */
4939     if (after == SB_Extend || after == SB_Format) {
4940 
4941         /* Implied is that the these characters attach to everything
4942          * immediately prior to them except for those separator-type
4943          * characters.  And the rules earlier have already handled the case
4944          * when one of those immediately precedes the extend char */
4945         return FALSE;
4946     }
4947 
4948     if (before == SB_Extend || before == SB_Format) {
4949         U8 * temp_pos = lpos;
4950         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4951         if (   backup != SB_EDGE
4952             && backup != SB_Sep
4953             && backup != SB_CR
4954             && backup != SB_LF)
4955         {
4956             before = backup;
4957             lpos = temp_pos;
4958         }
4959 
4960         /* Here, both 'before' and 'backup' are these types; implied is that we
4961          * don't break between them */
4962         if (backup == SB_Extend || backup == SB_Format) {
4963             return FALSE;
4964         }
4965     }
4966 
4967     /* Do not break after ambiguous terminators like period, if they are
4968      * immediately followed by a number or lowercase letter, if they are
4969      * between uppercase letters, if the first following letter (optionally
4970      * after certain punctuation) is lowercase, or if they are followed by
4971      * "continuation" punctuation such as comma, colon, or semicolon. For
4972      * example, a period may be an abbreviation or numeric period, and thus may
4973      * not mark the end of a sentence.
4974 
4975      * SB6. ATerm  ×  Numeric */
4976     if (before == SB_ATerm && after == SB_Numeric) {
4977         return FALSE;
4978     }
4979 
4980     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4981     if (before == SB_ATerm && after == SB_Upper) {
4982         U8 * temp_pos = lpos;
4983         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4984         if (backup == SB_Upper || backup == SB_Lower) {
4985             return FALSE;
4986         }
4987     }
4988 
4989     /* The remaining rules that aren't the final one, all require an STerm or
4990      * an ATerm after having backed up over some Close* Sp*, and in one case an
4991      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4992      * So do that backup now, setting flags if either Sp or a paragraph
4993      * separator are found */
4994 
4995     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4996         has_para_sep = TRUE;
4997         before = backup_one_SB(strbeg, &lpos, utf8_target);
4998     }
4999 
5000     if (before == SB_Sp) {
5001         has_sp = TRUE;
5002         do {
5003             before = backup_one_SB(strbeg, &lpos, utf8_target);
5004         }
5005         while (before == SB_Sp);
5006     }
5007 
5008     while (before == SB_Close) {
5009         before = backup_one_SB(strbeg, &lpos, utf8_target);
5010     }
5011 
5012     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5013      * most cases an STerm */
5014     if (before == SB_STerm || before == SB_ATerm) {
5015 
5016         /* So, here the lhs matches
5017          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5018          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5019          * The rules that apply here are:
5020          *
5021          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5022                                            | LF | STerm | ATerm) )* Lower
5023            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5024            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5025            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5026            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5027          */
5028 
5029         /* And all but SB11 forbid having seen a paragraph separator */
5030         if (! has_para_sep) {
5031             if (before == SB_ATerm) {          /* SB8 */
5032                 U8 * rpos = (U8 *) curpos;
5033                 SB_enum later = after;
5034 
5035                 while (    later != SB_OLetter
5036                         && later != SB_Upper
5037                         && later != SB_Lower
5038                         && later != SB_Sep
5039                         && later != SB_CR
5040                         && later != SB_LF
5041                         && later != SB_STerm
5042                         && later != SB_ATerm
5043                         && later != SB_EDGE)
5044                 {
5045                     later = advance_one_SB(&rpos, strend, utf8_target);
5046                 }
5047                 if (later == SB_Lower) {
5048                     return FALSE;
5049                 }
5050             }
5051 
5052             if (   after == SB_SContinue    /* SB8a */
5053                 || after == SB_STerm
5054                 || after == SB_ATerm)
5055             {
5056                 return FALSE;
5057             }
5058 
5059             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5060                 if (   after == SB_Close
5061                     || after == SB_Sp
5062                     || after == SB_Sep
5063                     || after == SB_CR
5064                     || after == SB_LF)
5065                 {
5066                     return FALSE;
5067                 }
5068             }
5069 
5070             /* SB10.  This and SB9 could probably be combined some way, but khw
5071              * has decided to follow the Unicode rule book precisely for
5072              * simplified maintenance */
5073             if (   after == SB_Sp
5074                 || after == SB_Sep
5075                 || after == SB_CR
5076                 || after == SB_LF)
5077             {
5078                 return FALSE;
5079             }
5080         }
5081 
5082         /* SB11.  */
5083         return TRUE;
5084     }
5085 
5086     /* Otherwise, do not break.
5087     SB12.  Any  ×  Any */
5088 
5089     return FALSE;
5090 }
5091 
5092 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5093 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5094 {
5095     dVAR;
5096     SB_enum sb;
5097 
5098     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5099 
5100     if (*curpos >= strend) {
5101         return SB_EDGE;
5102     }
5103 
5104     if (utf8_target) {
5105         do {
5106             *curpos += UTF8SKIP(*curpos);
5107             if (*curpos >= strend) {
5108                 return SB_EDGE;
5109             }
5110             sb = getSB_VAL_UTF8(*curpos, strend);
5111         } while (sb == SB_Extend || sb == SB_Format);
5112     }
5113     else {
5114         do {
5115             (*curpos)++;
5116             if (*curpos >= strend) {
5117                 return SB_EDGE;
5118             }
5119             sb = getSB_VAL_CP(**curpos);
5120         } while (sb == SB_Extend || sb == SB_Format);
5121     }
5122 
5123     return sb;
5124 }
5125 
5126 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5127 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5128 {
5129     dVAR;
5130     SB_enum sb;
5131 
5132     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5133 
5134     if (*curpos < strbeg) {
5135         return SB_EDGE;
5136     }
5137 
5138     if (utf8_target) {
5139         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5140         if (! prev_char_pos) {
5141             return SB_EDGE;
5142         }
5143 
5144         /* Back up over Extend and Format.  curpos is always just to the right
5145          * of the characater whose value we are getting */
5146         do {
5147             U8 * prev_prev_char_pos;
5148             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5149                                                                       strbeg)))
5150             {
5151                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5152                 *curpos = prev_char_pos;
5153                 prev_char_pos = prev_prev_char_pos;
5154             }
5155             else {
5156                 *curpos = (U8 *) strbeg;
5157                 return SB_EDGE;
5158             }
5159         } while (sb == SB_Extend || sb == SB_Format);
5160     }
5161     else {
5162         do {
5163             if (*curpos - 2 < strbeg) {
5164                 *curpos = (U8 *) strbeg;
5165                 return SB_EDGE;
5166             }
5167             (*curpos)--;
5168             sb = getSB_VAL_CP(*(*curpos - 1));
5169         } while (sb == SB_Extend || sb == SB_Format);
5170     }
5171 
5172     return sb;
5173 }
5174 
5175 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5176 S_isWB(pTHX_ WB_enum previous,
5177              WB_enum before,
5178              WB_enum after,
5179              const U8 * const strbeg,
5180              const U8 * const curpos,
5181              const U8 * const strend,
5182              const bool utf8_target)
5183 {
5184     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5185      *  a Unicode word break, using their published algorithm, but tailored for
5186      *  Perl by treating spans of white space as one unit.  Context may be
5187      *  needed to make this determination.  If the value for the character
5188      *  before 'before' is known, it is passed as 'previous'; otherwise that
5189      *  should be set to WB_UNKNOWN.  The other input parameters give the
5190      *  boundaries and current position in the matching of the string.  That
5191      *  is, 'curpos' marks the position where the character whose wb value is
5192      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5193 
5194     U8 * before_pos = (U8 *) curpos;
5195     U8 * after_pos = (U8 *) curpos;
5196     WB_enum prev = before;
5197     WB_enum next;
5198 
5199     PERL_ARGS_ASSERT_ISWB;
5200 
5201     /* Rule numbers in the comments below are as of Unicode 9.0 */
5202 
5203   redo:
5204     before = prev;
5205     switch (WB_table[before][after]) {
5206         case WB_BREAKABLE:
5207             return TRUE;
5208 
5209         case WB_NOBREAK:
5210             return FALSE;
5211 
5212         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5213             next = advance_one_WB(&after_pos, strend, utf8_target,
5214                                  FALSE /* Don't skip Extend nor Format */ );
5215             /* A space immediately preceeding an Extend or Format is attached
5216              * to by them, and hence gets separated from previous spaces.
5217              * Otherwise don't break between horizontal white space */
5218             return next == WB_Extend || next == WB_Format;
5219 
5220         /* WB4 Ignore Format and Extend characters, except when they appear at
5221          * the beginning of a region of text.  This code currently isn't
5222          * general purpose, but it works as the rules are currently and likely
5223          * to be laid out.  The reason it works is that when 'they appear at
5224          * the beginning of a region of text', the rule is to break before
5225          * them, just like any other character.  Therefore, the default rule
5226          * applies and we don't have to look in more depth.  Should this ever
5227          * change, we would have to have 2 'case' statements, like in the rules
5228          * below, and backup a single character (not spacing over the extend
5229          * ones) and then see if that is one of the region-end characters and
5230          * go from there */
5231         case WB_Ex_or_FO_or_ZWJ_then_foo:
5232             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5233             goto redo;
5234 
5235         case WB_DQ_then_HL + WB_BREAKABLE:
5236         case WB_DQ_then_HL + WB_NOBREAK:
5237 
5238             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5239 
5240             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5241                                                             == WB_Hebrew_Letter)
5242             {
5243                 return FALSE;
5244             }
5245 
5246              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5247 
5248         case WB_HL_then_DQ + WB_BREAKABLE:
5249         case WB_HL_then_DQ + WB_NOBREAK:
5250 
5251             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5252 
5253             if (advance_one_WB(&after_pos, strend, utf8_target,
5254                                        TRUE /* Do skip Extend and Format */ )
5255                                                             == WB_Hebrew_Letter)
5256             {
5257                 return FALSE;
5258             }
5259 
5260             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5261 
5262         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5263         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5264 
5265             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5266              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5267 
5268             next = advance_one_WB(&after_pos, strend, utf8_target,
5269                                        TRUE /* Do skip Extend and Format */ );
5270 
5271             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5272             {
5273                 return FALSE;
5274             }
5275 
5276             return WB_table[before][after]
5277                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5278 
5279         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5280         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5281 
5282             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5283              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5284 
5285             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5286             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5287             {
5288                 return FALSE;
5289             }
5290 
5291             return WB_table[before][after]
5292                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5293 
5294         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5295         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5296 
5297             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5298              * */
5299 
5300             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5301                                                             == WB_Numeric)
5302             {
5303                 return FALSE;
5304             }
5305 
5306             return WB_table[before][after]
5307                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5308 
5309         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5310         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5311 
5312             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5313 
5314             if (advance_one_WB(&after_pos, strend, utf8_target,
5315                                        TRUE /* Do skip Extend and Format */ )
5316                                                             == WB_Numeric)
5317             {
5318                 return FALSE;
5319             }
5320 
5321             return WB_table[before][after]
5322                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5323 
5324         case WB_RI_then_RI + WB_NOBREAK:
5325         case WB_RI_then_RI + WB_BREAKABLE:
5326             {
5327                 int RI_count = 1;
5328 
5329                 /* Do not break within emoji flag sequences. That is, do not
5330                  * break between regional indicator (RI) symbols if there is an
5331                  * odd number of RI characters before the potential break
5332                  * point.
5333                  *
5334                  * WB15   sot (RI RI)* RI × RI
5335                  * WB16 [^RI] (RI RI)* RI × RI */
5336 
5337                 while (backup_one_WB(&previous,
5338                                      strbeg,
5339                                      &before_pos,
5340                                      utf8_target) == WB_Regional_Indicator)
5341                 {
5342                     RI_count++;
5343                 }
5344 
5345                 return RI_count % 2 != 1;
5346             }
5347 
5348         default:
5349             break;
5350     }
5351 
5352 #ifdef DEBUGGING
5353     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5354                                   before, after, WB_table[before][after]);
5355     assert(0);
5356 #endif
5357     return TRUE;
5358 }
5359 
5360 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)5361 S_advance_one_WB(pTHX_ U8 ** curpos,
5362                        const U8 * const strend,
5363                        const bool utf8_target,
5364                        const bool skip_Extend_Format)
5365 {
5366     dVAR;
5367     WB_enum wb;
5368 
5369     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5370 
5371     if (*curpos >= strend) {
5372         return WB_EDGE;
5373     }
5374 
5375     if (utf8_target) {
5376 
5377         /* Advance over Extend and Format */
5378         do {
5379             *curpos += UTF8SKIP(*curpos);
5380             if (*curpos >= strend) {
5381                 return WB_EDGE;
5382             }
5383             wb = getWB_VAL_UTF8(*curpos, strend);
5384         } while (    skip_Extend_Format
5385                  && (wb == WB_Extend || wb == WB_Format));
5386     }
5387     else {
5388         do {
5389             (*curpos)++;
5390             if (*curpos >= strend) {
5391                 return WB_EDGE;
5392             }
5393             wb = getWB_VAL_CP(**curpos);
5394         } while (    skip_Extend_Format
5395                  && (wb == WB_Extend || wb == WB_Format));
5396     }
5397 
5398     return wb;
5399 }
5400 
5401 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5402 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5403 {
5404     dVAR;
5405     WB_enum wb;
5406 
5407     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5408 
5409     /* If we know what the previous character's break value is, don't have
5410         * to look it up */
5411     if (*previous != WB_UNKNOWN) {
5412         wb = *previous;
5413 
5414         /* But we need to move backwards by one */
5415         if (utf8_target) {
5416             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5417             if (! *curpos) {
5418                 *previous = WB_EDGE;
5419                 *curpos = (U8 *) strbeg;
5420             }
5421             else {
5422                 *previous = WB_UNKNOWN;
5423             }
5424         }
5425         else {
5426             (*curpos)--;
5427             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5428         }
5429 
5430         /* And we always back up over these three types */
5431         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5432             return wb;
5433         }
5434     }
5435 
5436     if (*curpos < strbeg) {
5437         return WB_EDGE;
5438     }
5439 
5440     if (utf8_target) {
5441         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5442         if (! prev_char_pos) {
5443             return WB_EDGE;
5444         }
5445 
5446         /* Back up over Extend and Format.  curpos is always just to the right
5447          * of the characater whose value we are getting */
5448         do {
5449             U8 * prev_prev_char_pos;
5450             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5451                                                    -1,
5452                                                    strbeg)))
5453             {
5454                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5455                 *curpos = prev_char_pos;
5456                 prev_char_pos = prev_prev_char_pos;
5457             }
5458             else {
5459                 *curpos = (U8 *) strbeg;
5460                 return WB_EDGE;
5461             }
5462         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5463     }
5464     else {
5465         do {
5466             if (*curpos - 2 < strbeg) {
5467                 *curpos = (U8 *) strbeg;
5468                 return WB_EDGE;
5469             }
5470             (*curpos)--;
5471             wb = getWB_VAL_CP(*(*curpos - 1));
5472         } while (wb == WB_Extend || wb == WB_Format);
5473     }
5474 
5475     return wb;
5476 }
5477 
5478 /* Macros for regmatch(), using its internal variables */
5479 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5480 #define NEXTCHR_IS_EOS (nextchr < 0)
5481 
5482 #define SET_nextchr \
5483     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5484 
5485 #define SET_locinput(p) \
5486     locinput = (p);  \
5487     SET_nextchr
5488 
5489 #define sayYES goto yes
5490 #define sayNO goto no
5491 #define sayNO_SILENT goto no_silent
5492 
5493 /* we dont use STMT_START/END here because it leads to
5494    "unreachable code" warnings, which are bogus, but distracting. */
5495 #define CACHEsayNO \
5496     if (ST.cache_mask) \
5497        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5498     sayNO
5499 
5500 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5501 (                                                           \
5502     (   ( st )                                         ) && \
5503     (   ( st )->u.eval.close_paren                     ) && \
5504     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5505 )
5506 
5507 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5508 (                                                           \
5509     (   ( st )                                         ) && \
5510     (   ( st )->u.eval.close_paren                     ) && \
5511     (   ( expr )                                       ) && \
5512     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5513 )
5514 
5515 
5516 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5517     (st)->u.eval.close_paren = ( (expr) + 1 )
5518 
5519 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5520     (st)->u.eval.close_paren = 0
5521 
5522 /* push a new state then goto it */
5523 
5524 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5525     pushinput = input; \
5526     pusheol = eol; \
5527     pushsr0 = sr0; \
5528     scan = node; \
5529     st->resume_state = state; \
5530     goto push_state;
5531 
5532 /* push a new state with success backtracking, then goto it */
5533 
5534 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5535     pushinput = input; \
5536     pusheol = eol;     \
5537     pushsr0 = sr0; \
5538     scan = node; \
5539     st->resume_state = state; \
5540     goto push_yes_state;
5541 
5542 #define DEBUG_STATE_pp(pp)                                  \
5543     DEBUG_STATE_r({                                         \
5544         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5545         Perl_re_printf( aTHX_                               \
5546             "%*s" pp " %s%s%s%s%s\n",                       \
5547             INDENT_CHARS(depth), "",                        \
5548             PL_reg_name[st->resume_state],                  \
5549             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5550             ((st==yes_state) ? "Y" : ""),                   \
5551             ((st==mark_state) ? "M" : ""),                  \
5552             ((st==yes_state||st==mark_state) ? "]" : "")    \
5553         );                                                  \
5554     });
5555 
5556 /*
5557 
5558 regmatch() - main matching routine
5559 
5560 This is basically one big switch statement in a loop. We execute an op,
5561 set 'next' to point the next op, and continue. If we come to a point which
5562 we may need to backtrack to on failure such as (A|B|C), we push a
5563 backtrack state onto the backtrack stack. On failure, we pop the top
5564 state, and re-enter the loop at the state indicated. If there are no more
5565 states to pop, we return failure.
5566 
5567 Sometimes we also need to backtrack on success; for example /A+/, where
5568 after successfully matching one A, we need to go back and try to
5569 match another one; similarly for lookahead assertions: if the assertion
5570 completes successfully, we backtrack to the state just before the assertion
5571 and then carry on.  In these cases, the pushed state is marked as
5572 'backtrack on success too'. This marking is in fact done by a chain of
5573 pointers, each pointing to the previous 'yes' state. On success, we pop to
5574 the nearest yes state, discarding any intermediate failure-only states.
5575 Sometimes a yes state is pushed just to force some cleanup code to be
5576 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5577 it to free the inner regex.
5578 
5579 Note that failure backtracking rewinds the cursor position, while
5580 success backtracking leaves it alone.
5581 
5582 A pattern is complete when the END op is executed, while a subpattern
5583 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5584 ops trigger the "pop to last yes state if any, otherwise return true"
5585 behaviour.
5586 
5587 A common convention in this function is to use A and B to refer to the two
5588 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5589 the subpattern to be matched possibly multiple times, while B is the entire
5590 rest of the pattern. Variable and state names reflect this convention.
5591 
5592 The states in the main switch are the union of ops and failure/success of
5593 substates associated with with that op.  For example, IFMATCH is the op
5594 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5595 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5596 successfully matched A and IFMATCH_A_fail is a state saying that we have
5597 just failed to match A. Resume states always come in pairs. The backtrack
5598 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5599 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5600 on success or failure.
5601 
5602 The struct that holds a backtracking state is actually a big union, with
5603 one variant for each major type of op. The variable st points to the
5604 top-most backtrack struct. To make the code clearer, within each
5605 block of code we #define ST to alias the relevant union.
5606 
5607 Here's a concrete example of a (vastly oversimplified) IFMATCH
5608 implementation:
5609 
5610     switch (state) {
5611     ....
5612 
5613 #define ST st->u.ifmatch
5614 
5615     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5616 	ST.foo = ...; // some state we wish to save
5617 	...
5618 	// push a yes backtrack state with a resume value of
5619 	// IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5620 	// first node of A:
5621 	PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5622 	// NOTREACHED
5623 
5624     case IFMATCH_A: // we have successfully executed A; now continue with B
5625 	next = B;
5626 	bar = ST.foo; // do something with the preserved value
5627 	break;
5628 
5629     case IFMATCH_A_fail: // A failed, so the assertion failed
5630 	...;   // do some housekeeping, then ...
5631 	sayNO; // propagate the failure
5632 
5633 #undef ST
5634 
5635     ...
5636     }
5637 
5638 For any old-timers reading this who are familiar with the old recursive
5639 approach, the code above is equivalent to:
5640 
5641     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5642     {
5643 	int foo = ...
5644 	...
5645 	if (regmatch(A)) {
5646 	    next = B;
5647 	    bar = foo;
5648 	    break;
5649 	}
5650 	...;   // do some housekeeping, then ...
5651 	sayNO; // propagate the failure
5652     }
5653 
5654 The topmost backtrack state, pointed to by st, is usually free. If you
5655 want to claim it, populate any ST.foo fields in it with values you wish to
5656 save, then do one of
5657 
5658 	PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5659 	PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5660 
5661 which sets that backtrack state's resume value to 'resume_state', pushes a
5662 new free entry to the top of the backtrack stack, then goes to 'node'.
5663 On backtracking, the free slot is popped, and the saved state becomes the
5664 new free state. An ST.foo field in this new top state can be temporarily
5665 accessed to retrieve values, but once the main loop is re-entered, it
5666 becomes available for reuse.
5667 
5668 Note that the depth of the backtrack stack constantly increases during the
5669 left-to-right execution of the pattern, rather than going up and down with
5670 the pattern nesting. For example the stack is at its maximum at Z at the
5671 end of the pattern, rather than at X in the following:
5672 
5673     /(((X)+)+)+....(Y)+....Z/
5674 
5675 The only exceptions to this are lookahead/behind assertions and the cut,
5676 (?>A), which pop all the backtrack states associated with A before
5677 continuing.
5678 
5679 Backtrack state structs are allocated in slabs of about 4K in size.
5680 PL_regmatch_state and st always point to the currently active state,
5681 and PL_regmatch_slab points to the slab currently containing
5682 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5683 allocated, and is never freed until interpreter destruction. When the slab
5684 is full, a new one is allocated and chained to the end. At exit from
5685 regmatch(), slabs allocated since entry are freed.
5686 
5687 In order to work with variable length lookbehinds, an upper limit is placed on
5688 lookbehinds which is set to where the match position is at the end of where the
5689 lookbehind would get to.  Nothing in the lookbehind should match above that,
5690 except we should be able to look beyond if for things like \b, which need the
5691 next character in the string to be able to determine if this is a boundary or
5692 not.  We also can't match the end of string/line unless we are also at the end
5693 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5694 that match a width, we have to add a condition that they are within the legal
5695 bounds of our window into the string.
5696 
5697 */
5698 
5699 /* returns -1 on failure, $+[0] on success */
5700 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)5701 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5702 {
5703     dVAR;
5704     const bool utf8_target = reginfo->is_utf8_target;
5705     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5706     REGEXP *rex_sv = reginfo->prog;
5707     regexp *rex = ReANY(rex_sv);
5708     RXi_GET_DECL(rex,rexi);
5709     /* the current state. This is a cached copy of PL_regmatch_state */
5710     regmatch_state *st;
5711     /* cache heavy used fields of st in registers */
5712     regnode *scan;
5713     regnode *next;
5714     U32 n = 0;	/* general value; init to avoid compiler warning */
5715     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5716     SSize_t endref = 0; /* offset of end of backref when ln is start */
5717     char *locinput = startpos;
5718     char *loceol = reginfo->strend;
5719     char *pushinput; /* where to continue after a PUSH */
5720     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5721     U8   *pushsr0;   /* save starting pos of script run */
5722     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5723 
5724     bool result = 0;	    /* return value of S_regmatch */
5725     U32 depth = 0;            /* depth of backtrack stack */
5726     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5727     const U32 max_nochange_depth =
5728         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5729         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5730     regmatch_state *yes_state = NULL; /* state to pop to on success of
5731 							    subpattern */
5732     /* mark_state piggy backs on the yes_state logic so that when we unwind
5733        the stack on success we can update the mark_state as we go */
5734     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5735     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5736     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5737     U32 state_num;
5738     bool no_final = 0;      /* prevent failure from backtracking? */
5739     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5740     char *startpoint = locinput;
5741     SV *popmark = NULL;     /* are we looking for a mark? */
5742     SV *sv_commit = NULL;   /* last mark name seen in failure */
5743     SV *sv_yes_mark = NULL; /* last mark name we have seen
5744                                during a successful match */
5745     U32 lastopen = 0;       /* last open we saw */
5746     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5747     SV* const oreplsv = GvSVn(PL_replgv);
5748     /* these three flags are set by various ops to signal information to
5749      * the very next op. They have a useful lifetime of exactly one loop
5750      * iteration, and are not preserved or restored by state pushes/pops
5751      */
5752     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
5753     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
5754     int logical = 0;	    /* the following EVAL is:
5755 				0: (?{...})
5756 				1: (?(?{...})X|Y)
5757 				2: (??{...})
5758 			       or the following IFMATCH/UNLESSM is:
5759 			        false: plain (?=foo)
5760 				true:  used as a condition: (?(?=foo))
5761 			    */
5762     PAD* last_pad = NULL;
5763     dMULTICALL;
5764     U8 gimme = G_SCALAR;
5765     CV *caller_cv = NULL;	/* who called us */
5766     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
5767     U32 maxopenparen = 0;       /* max '(' index seen so far */
5768     int to_complement;  /* Invert the result? */
5769     _char_class_number classnum;
5770     bool is_utf8_pat = reginfo->is_utf8_pat;
5771     bool match = FALSE;
5772     I32 orig_savestack_ix = PL_savestack_ix;
5773     U8 * script_run_begin = NULL;
5774 
5775 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5776 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5777 #  define SOLARIS_BAD_OPTIMIZER
5778     const U32 *pl_charclass_dup = PL_charclass;
5779 #  define PL_charclass pl_charclass_dup
5780 #endif
5781 
5782 #ifdef DEBUGGING
5783     GET_RE_DEBUG_FLAGS_DECL;
5784 #endif
5785 
5786     /* protect against undef(*^R) */
5787     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5788 
5789     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5790     multicall_oldcatch = 0;
5791     PERL_UNUSED_VAR(multicall_cop);
5792 
5793     PERL_ARGS_ASSERT_REGMATCH;
5794 
5795     st = PL_regmatch_state;
5796 
5797     /* Note that nextchr is a byte even in UTF */
5798     SET_nextchr;
5799     scan = prog;
5800 
5801     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5802             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5803             Perl_re_printf( aTHX_ "regmatch start\n" );
5804     }));
5805 
5806     while (scan != NULL) {
5807 	next = scan + NEXT_OFF(scan);
5808 	if (next == scan)
5809 	    next = NULL;
5810 	state_num = OP(scan);
5811 
5812       reenter_switch:
5813         DEBUG_EXECUTE_r(
5814             if (state_num <= REGNODE_MAX) {
5815                 SV * const prop = sv_newmortal();
5816                 regnode *rnext = regnext(scan);
5817 
5818                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5819                 regprop(rex, prop, scan, reginfo, NULL);
5820                 Perl_re_printf( aTHX_
5821                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5822                     INDENT_CHARS(depth), "",
5823                     (IV)(scan - rexi->program),
5824                     SvPVX_const(prop),
5825                     (PL_regkind[OP(scan)] == END || !rnext) ?
5826                         0 : (IV)(rnext - rexi->program));
5827             }
5828         );
5829 
5830         to_complement = 0;
5831 
5832         SET_nextchr;
5833         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5834 
5835 	switch (state_num) {
5836 	case SBOL: /*  /^../ and /\A../  */
5837 	    if (locinput == reginfo->strbeg)
5838 		break;
5839 	    sayNO;
5840 
5841 	case MBOL: /*  /^../m  */
5842 	    if (locinput == reginfo->strbeg ||
5843 		(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5844 	    {
5845 		break;
5846 	    }
5847 	    sayNO;
5848 
5849 	case GPOS: /*  \G  */
5850 	    if (locinput == reginfo->ganch)
5851 		break;
5852 	    sayNO;
5853 
5854 	case KEEPS: /*   \K  */
5855 	    /* update the startpoint */
5856 	    st->u.keeper.val = rex->offs[0].start;
5857 	    rex->offs[0].start = locinput - reginfo->strbeg;
5858 	    PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5859                             script_run_begin);
5860 	    NOT_REACHED; /* NOTREACHED */
5861 
5862 	case KEEPS_next_fail:
5863 	    /* rollback the start point change */
5864 	    rex->offs[0].start = st->u.keeper.val;
5865 	    sayNO_SILENT;
5866 	    NOT_REACHED; /* NOTREACHED */
5867 
5868 	case MEOL: /* /..$/m  */
5869 	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
5870 		sayNO;
5871 	    break;
5872 
5873 	case SEOL: /* /..$/  */
5874 	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
5875 		sayNO;
5876 	    if (reginfo->strend - locinput > 1)
5877 		sayNO;
5878 	    break;
5879 
5880 	case EOS: /*  \z  */
5881 	    if (!NEXTCHR_IS_EOS)
5882 		sayNO;
5883 	    break;
5884 
5885 	case SANY: /*  /./s  */
5886 	    if (NEXTCHR_IS_EOS || locinput >= loceol)
5887 		sayNO;
5888             goto increment_locinput;
5889 
5890 	case REG_ANY: /*  /./  */
5891 	    if (   NEXTCHR_IS_EOS
5892                 || locinput >= loceol
5893                 || nextchr == '\n')
5894             {
5895 		sayNO;
5896             }
5897             goto increment_locinput;
5898 
5899 
5900 #undef  ST
5901 #define ST st->u.trie
5902         case TRIEC: /* (ab|cd) with known charclass */
5903             /* In this case the charclass data is available inline so
5904                we can fail fast without a lot of extra overhead.
5905              */
5906             if ( !   NEXTCHR_IS_EOS
5907                 &&   locinput < loceol
5908                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5909             {
5910                 DEBUG_EXECUTE_r(
5911                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5912                               depth, PL_colors[4], PL_colors[5])
5913                 );
5914                 sayNO_SILENT;
5915                 NOT_REACHED; /* NOTREACHED */
5916             }
5917             /* FALLTHROUGH */
5918 	case TRIE:  /* (ab|cd)  */
5919 	    /* the basic plan of execution of the trie is:
5920 	     * At the beginning, run though all the states, and
5921 	     * find the longest-matching word. Also remember the position
5922 	     * of the shortest matching word. For example, this pattern:
5923 	     *    1  2 3 4    5
5924 	     *    ab|a|x|abcd|abc
5925 	     * when matched against the string "abcde", will generate
5926 	     * accept states for all words except 3, with the longest
5927 	     * matching word being 4, and the shortest being 2 (with
5928 	     * the position being after char 1 of the string).
5929 	     *
5930 	     * Then for each matching word, in word order (i.e. 1,2,4,5),
5931 	     * we run the remainder of the pattern; on each try setting
5932 	     * the current position to the character following the word,
5933 	     * returning to try the next word on failure.
5934 	     *
5935 	     * We avoid having to build a list of words at runtime by
5936 	     * using a compile-time structure, wordinfo[].prev, which
5937 	     * gives, for each word, the previous accepting word (if any).
5938 	     * In the case above it would contain the mappings 1->2, 2->0,
5939 	     * 3->0, 4->5, 5->1.  We can use this table to generate, from
5940 	     * the longest word (4 above), a list of all words, by
5941 	     * following the list of prev pointers; this gives us the
5942 	     * unordered list 4,5,1,2. Then given the current word we have
5943 	     * just tried, we can go through the list and find the
5944 	     * next-biggest word to try (so if we just failed on word 2,
5945 	     * the next in the list is 4).
5946 	     *
5947 	     * Since at runtime we don't record the matching position in
5948 	     * the string for each word, we have to work that out for
5949 	     * each word we're about to process. The wordinfo table holds
5950 	     * the character length of each word; given that we recorded
5951 	     * at the start: the position of the shortest word and its
5952 	     * length in chars, we just need to move the pointer the
5953 	     * difference between the two char lengths. Depending on
5954 	     * Unicode status and folding, that's cheap or expensive.
5955 	     *
5956 	     * This algorithm is optimised for the case where are only a
5957 	     * small number of accept states, i.e. 0,1, or maybe 2.
5958 	     * With lots of accepts states, and having to try all of them,
5959 	     * it becomes quadratic on number of accept states to find all
5960 	     * the next words.
5961 	     */
5962 
5963 	    {
5964                 /* what type of TRIE am I? (utf8 makes this contextual) */
5965                 DECL_TRIE_TYPE(scan);
5966 
5967                 /* what trie are we using right now */
5968 		reg_trie_data * const trie
5969         	    = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5970 		HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5971                 U32 state = trie->startstate;
5972 
5973                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5974                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5975                     if (utf8_target
5976                         && ! NEXTCHR_IS_EOS
5977                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5978                         && scan->flags == EXACTL)
5979                     {
5980                         /* We only output for EXACTL, as we let the folder
5981                          * output this message for EXACTFLU8 to avoid
5982                          * duplication */
5983                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5984                                                                reginfo->strend);
5985                     }
5986                 }
5987                 if (   trie->bitmap
5988                     && (     NEXTCHR_IS_EOS
5989                         ||   locinput >= loceol
5990                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
5991                 {
5992         	    if (trie->states[ state ].wordnum) {
5993         	         DEBUG_EXECUTE_r(
5994                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5995                                           depth, PL_colors[4], PL_colors[5])
5996                         );
5997 			if (!trie->jump)
5998 			    break;
5999         	    } else {
6000         	        DEBUG_EXECUTE_r(
6001                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6002                                           depth, PL_colors[4], PL_colors[5])
6003                         );
6004         	        sayNO_SILENT;
6005         	   }
6006                 }
6007 
6008             {
6009 		U8 *uc = ( U8* )locinput;
6010 
6011 		STRLEN len = 0;
6012 		STRLEN foldlen = 0;
6013 		U8 *uscan = (U8*)NULL;
6014 		U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6015 		U32 charcount = 0; /* how many input chars we have matched */
6016 		U32 accepted = 0; /* have we seen any accepting states? */
6017 
6018 		ST.jump = trie->jump;
6019 		ST.me = scan;
6020 		ST.firstpos = NULL;
6021 		ST.longfold = FALSE; /* char longer if folded => it's harder */
6022 		ST.nextword = 0;
6023 
6024 		/* fully traverse the TRIE; note the position of the
6025 		   shortest accept state and the wordnum of the longest
6026 		   accept state */
6027 
6028 		while ( state && uc <= (U8*)(loceol) ) {
6029                     U32 base = trie->states[ state ].trans.base;
6030                     UV uvc = 0;
6031                     U16 charid = 0;
6032 		    U16 wordnum;
6033                     wordnum = trie->states[ state ].wordnum;
6034 
6035 		    if (wordnum) { /* it's an accept state */
6036 			if (!accepted) {
6037 			    accepted = 1;
6038 			    /* record first match position */
6039 			    if (ST.longfold) {
6040 				ST.firstpos = (U8*)locinput;
6041 				ST.firstchars = 0;
6042 			    }
6043 			    else {
6044 				ST.firstpos = uc;
6045 				ST.firstchars = charcount;
6046 			    }
6047 			}
6048 			if (!ST.nextword || wordnum < ST.nextword)
6049 			    ST.nextword = wordnum;
6050 			ST.topword = wordnum;
6051 		    }
6052 
6053 		    DEBUG_TRIE_EXECUTE_r({
6054                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6055                                 /* HERE */
6056                                 PerlIO_printf( Perl_debug_log,
6057                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6058                                     INDENT_CHARS(depth), "", PL_colors[4],
6059 			            (UV)state, (accepted ? 'Y' : 'N'));
6060 		    });
6061 
6062 		    /* read a char and goto next state */
6063 		    if ( base && (foldlen || uc < (U8*)(loceol))) {
6064 			I32 offset;
6065 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6066                                              (U8 *) loceol, uscan,
6067                                              len, uvc, charid, foldlen,
6068                                              foldbuf, uniflags);
6069 			charcount++;
6070 			if (foldlen>0)
6071 			    ST.longfold = TRUE;
6072 			if (charid &&
6073 			     ( ((offset =
6074 			      base + charid - 1 - trie->uniquecharcount)) >= 0)
6075 
6076 			     && ((U32)offset < trie->lasttrans)
6077 			     && trie->trans[offset].check == state)
6078 			{
6079 			    state = trie->trans[offset].next;
6080 			}
6081 			else {
6082 			    state = 0;
6083 			}
6084 			uc += len;
6085 
6086 		    }
6087 		    else {
6088 			state = 0;
6089 		    }
6090 		    DEBUG_TRIE_EXECUTE_r(
6091                         Perl_re_printf( aTHX_
6092 		            "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6093 		            charid, uvc, (UV)state, PL_colors[5] );
6094 		    );
6095 		}
6096 		if (!accepted)
6097 		   sayNO;
6098 
6099 		/* calculate total number of accept states */
6100 		{
6101 		    U16 w = ST.topword;
6102 		    accepted = 0;
6103 		    while (w) {
6104 			w = trie->wordinfo[w].prev;
6105 			accepted++;
6106 		    }
6107 		    ST.accepted = accepted;
6108 		}
6109 
6110 		DEBUG_EXECUTE_r(
6111                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6112                         depth,
6113 			PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6114 		);
6115 		goto trie_first_try; /* jump into the fail handler */
6116 	    }}
6117 	    NOT_REACHED; /* NOTREACHED */
6118 
6119 	case TRIE_next_fail: /* we failed - try next alternative */
6120         {
6121             U8 *uc;
6122             if ( ST.jump ) {
6123                 /* undo any captures done in the tail part of a branch,
6124                  * e.g.
6125                  *    /(?:X(.)(.)|Y(.)).../
6126                  * where the trie just matches X then calls out to do the
6127                  * rest of the branch */
6128                 REGCP_UNWIND(ST.cp);
6129                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6130 	    }
6131 	    if (!--ST.accepted) {
6132 	        DEBUG_EXECUTE_r({
6133                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6134                         depth,
6135 			PL_colors[4],
6136 			PL_colors[5] );
6137 		});
6138 		sayNO_SILENT;
6139 	    }
6140 	    {
6141 		/* Find next-highest word to process.  Note that this code
6142 		 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6143 		U16 min = 0;
6144 		U16 word;
6145 		U16 const nextword = ST.nextword;
6146 		reg_trie_wordinfo * const wordinfo
6147 		    = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6148 		for (word=ST.topword; word; word=wordinfo[word].prev) {
6149 		    if (word > nextword && (!min || word < min))
6150 			min = word;
6151 		}
6152 		ST.nextword = min;
6153 	    }
6154 
6155           trie_first_try:
6156             if (do_cutgroup) {
6157                 do_cutgroup = 0;
6158                 no_final = 0;
6159             }
6160 
6161             if ( ST.jump ) {
6162                 ST.lastparen = rex->lastparen;
6163                 ST.lastcloseparen = rex->lastcloseparen;
6164 	        REGCP_SET(ST.cp);
6165             }
6166 
6167 	    /* find start char of end of current word */
6168 	    {
6169 		U32 chars; /* how many chars to skip */
6170 		reg_trie_data * const trie
6171 		    = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6172 
6173 		assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6174 			    >=  ST.firstchars);
6175 		chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6176 			    - ST.firstchars;
6177 		uc = ST.firstpos;
6178 
6179 		if (ST.longfold) {
6180 		    /* the hard option - fold each char in turn and find
6181 		     * its folded length (which may be different */
6182 		    U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6183 		    STRLEN foldlen;
6184 		    STRLEN len;
6185 		    UV uvc;
6186 		    U8 *uscan;
6187 
6188 		    while (chars) {
6189 			if (utf8_target) {
6190                             /* XXX This assumes the length is well-formed, as
6191                              * does the UTF8SKIP below */
6192 			    uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6193 						    uniflags);
6194 			    uc += len;
6195 			}
6196 			else {
6197 			    uvc = *uc;
6198 			    uc++;
6199 			}
6200 			uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6201 			uscan = foldbuf;
6202 			while (foldlen) {
6203 			    if (!--chars)
6204 				break;
6205 			    uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6206                                                  uniflags);
6207 			    uscan += len;
6208 			    foldlen -= len;
6209 			}
6210 		    }
6211 		}
6212 		else {
6213 		    if (utf8_target)
6214 			while (chars--)
6215 			    uc += UTF8SKIP(uc);
6216 		    else
6217 			uc += chars;
6218 		}
6219 	    }
6220 
6221 	    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6222 			    ? ST.jump[ST.nextword]
6223 			    : NEXT_OFF(ST.me));
6224 
6225 	    DEBUG_EXECUTE_r({
6226                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6227                     depth,
6228 		    PL_colors[4],
6229 		    ST.nextword,
6230 		    PL_colors[5]
6231 		    );
6232 	    });
6233 
6234 	    if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6235 		PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6236                                 script_run_begin);
6237 		NOT_REACHED; /* NOTREACHED */
6238 	    }
6239 	    /* only one choice left - just continue */
6240 	    DEBUG_EXECUTE_r({
6241 		AV *const trie_words
6242 		    = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6243 		SV ** const tmp = trie_words
6244                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6245 		SV *sv= tmp ? sv_newmortal() : NULL;
6246 
6247                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6248                     depth, PL_colors[4],
6249 		    ST.nextword,
6250 		    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6251 			    PL_colors[0], PL_colors[1],
6252 			    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6253 			)
6254 		    : "not compiled under -Dr",
6255 		    PL_colors[5] );
6256 	    });
6257 
6258 	    locinput = (char*)uc;
6259 	    continue; /* execute rest of RE */
6260             /* NOTREACHED */
6261         }
6262 #undef  ST
6263 
6264 	case EXACTL:             /*  /abc/l       */
6265             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6266 
6267             /* Complete checking would involve going through every character
6268              * matched by the string to see if any is above latin1.  But the
6269              * comparision otherwise might very well be a fast assembly
6270              * language routine, and I (khw) don't think slowing things down
6271              * just to check for this warning is worth it.  So this just checks
6272              * the first character */
6273             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6274                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6275             }
6276             goto do_exact;
6277 	case EXACT_ONLY8:
6278             if (! utf8_target) {
6279                 sayNO;
6280             }
6281             /* FALLTHROUGH */
6282 	case EXACT: {            /*  /abc/        */
6283 	    char *s;
6284           do_exact:
6285 	    s = STRING(scan);
6286 	    ln = STR_LEN(scan);
6287 	    if (utf8_target != is_utf8_pat) {
6288 		/* The target and the pattern have differing utf8ness. */
6289 		char *l = locinput;
6290 		const char * const e = s + ln;
6291 
6292 		if (utf8_target) {
6293                     /* The target is utf8, the pattern is not utf8.
6294                      * Above-Latin1 code points can't match the pattern;
6295                      * invariants match exactly, and the other Latin1 ones need
6296                      * to be downgraded to a single byte in order to do the
6297                      * comparison.  (If we could be confident that the target
6298                      * is not malformed, this could be refactored to have fewer
6299                      * tests by just assuming that if the first bytes match, it
6300                      * is an invariant, but there are tests in the test suite
6301                      * dealing with (??{...}) which violate this) */
6302 		    while (s < e) {
6303 			if (   l >= loceol
6304                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6305                         {
6306                             sayNO;
6307                         }
6308                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6309 			    if (*l != *s) {
6310                                 sayNO;
6311                             }
6312                             l++;
6313                         }
6314                         else {
6315                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6316                             {
6317                                 sayNO;
6318                             }
6319                             l += 2;
6320                         }
6321 			s++;
6322 		    }
6323 		}
6324 		else {
6325 		    /* The target is not utf8, the pattern is utf8. */
6326 		    while (s < e) {
6327                         if (   l >= loceol
6328                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6329                         {
6330                             sayNO;
6331                         }
6332                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6333 			    if (*s != *l) {
6334                                 sayNO;
6335                             }
6336                             s++;
6337                         }
6338                         else {
6339                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6340                             {
6341                                 sayNO;
6342                             }
6343                             s += 2;
6344                         }
6345 			l++;
6346 		    }
6347 		}
6348 		locinput = l;
6349 	    }
6350             else {
6351                 /* The target and the pattern have the same utf8ness. */
6352                 /* Inline the first character, for speed. */
6353                 if (   loceol - locinput < ln
6354                     || UCHARAT(s) != nextchr
6355                     || (ln > 1 && memNE(s, locinput, ln)))
6356                 {
6357                     sayNO;
6358                 }
6359                 locinput += ln;
6360             }
6361 	    break;
6362 	    }
6363 
6364 	case EXACTFL:            /*  /abc/il      */
6365           {
6366 	    re_fold_t folder;
6367 	    const U8 * fold_array;
6368 	    const char * s;
6369 	    U32 fold_utf8_flags;
6370 
6371             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6372             folder = foldEQ_locale;
6373             fold_array = PL_fold_locale;
6374 	    fold_utf8_flags = FOLDEQ_LOCALE;
6375 	    goto do_exactf;
6376 
6377         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6378                                       is effectively /u; hence to match, target
6379                                       must be UTF-8. */
6380             if (! utf8_target) {
6381                 sayNO;
6382             }
6383             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6384                                              | FOLDEQ_S2_FOLDS_SANE;
6385 	    folder = foldEQ_latin1_s2_folded;
6386 	    fold_array = PL_fold_latin1;
6387 	    goto do_exactf;
6388 
6389         case EXACTFU_ONLY8:      /* /abc/iu with something in /abc/ > 255 */
6390             if (! utf8_target) {
6391                 sayNO;
6392             }
6393 	    assert(is_utf8_pat);
6394 	    fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6395 	    goto do_exactf;
6396 
6397         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6398                                     'foo' so can't take shortcuts. */
6399             assert(! is_utf8_pat);
6400             folder = foldEQ_latin1;
6401 	    fold_array = PL_fold_latin1;
6402 	    fold_utf8_flags = 0;
6403 	    goto do_exactf;
6404 
6405 	case EXACTFU:            /*  /abc/iu      */
6406             folder = foldEQ_latin1_s2_folded;
6407 	    fold_array = PL_fold_latin1;
6408 	    fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6409 	    goto do_exactf;
6410 
6411         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6412                                    patterns */
6413             assert(! is_utf8_pat);
6414             /* FALLTHROUGH */
6415 	case EXACTFAA:            /*  /abc/iaa     */
6416             folder = foldEQ_latin1_s2_folded;
6417 	    fold_array = PL_fold_latin1;
6418 	    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6419             if (is_utf8_pat || ! utf8_target) {
6420 
6421                 /* The possible presence of a MICRO SIGN in the pattern forbids
6422                  * us to view a non-UTF-8 pattern as folded when there is a
6423                  * UTF-8 target */
6424                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6425                                   |FOLDEQ_S2_FOLDS_SANE;
6426             }
6427 	    goto do_exactf;
6428 
6429 
6430         case EXACTF:             /*  /abc/i    This node only generated for
6431                                                non-utf8 patterns */
6432             assert(! is_utf8_pat);
6433 	    folder = foldEQ;
6434 	    fold_array = PL_fold;
6435 	    fold_utf8_flags = 0;
6436 
6437 	  do_exactf:
6438 	    s = STRING(scan);
6439 	    ln = STR_LEN(scan);
6440 
6441 	    if (   utf8_target
6442                 || is_utf8_pat
6443                 || state_num == EXACTFUP
6444                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6445             {
6446 	      /* Either target or the pattern are utf8, or has the issue where
6447 	       * the fold lengths may differ. */
6448 		const char * const l = locinput;
6449 		char *e = loceol;
6450 
6451 		if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6452                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6453 		{
6454 		    sayNO;
6455 		}
6456 		locinput = e;
6457 		break;
6458 	    }
6459 
6460 	    /* Neither the target nor the pattern are utf8 */
6461 	    if (UCHARAT(s) != nextchr
6462                 && !NEXTCHR_IS_EOS
6463 		&& UCHARAT(s) != fold_array[nextchr])
6464 	    {
6465 		sayNO;
6466 	    }
6467 	    if (loceol - locinput < ln)
6468 		sayNO;
6469 	    if (ln > 1 && ! folder(locinput, s, ln))
6470 		sayNO;
6471 	    locinput += ln;
6472 	    break;
6473 	}
6474 
6475 	case NBOUNDL: /*  /\B/l  */
6476             to_complement = 1;
6477             /* FALLTHROUGH */
6478 
6479 	case BOUNDL:  /*  /\b/l  */
6480         {
6481             bool b1, b2;
6482             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6483 
6484             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6485                 if (! IN_UTF8_CTYPE_LOCALE) {
6486                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6487                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6488                 }
6489                 goto boundu;
6490             }
6491 
6492 	    if (utf8_target) {
6493 		if (locinput == reginfo->strbeg)
6494 		    b1 = isWORDCHAR_LC('\n');
6495 		else {
6496                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6497                                                         (U8*)(reginfo->strbeg)),
6498                                                  (U8*)(reginfo->strend));
6499 		}
6500                 b2 = (NEXTCHR_IS_EOS)
6501                     ? isWORDCHAR_LC('\n')
6502                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6503                                               (U8*) reginfo->strend);
6504 	    }
6505 	    else { /* Here the string isn't utf8 */
6506 		b1 = (locinput == reginfo->strbeg)
6507                      ? isWORDCHAR_LC('\n')
6508                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6509                 b2 = (NEXTCHR_IS_EOS)
6510                     ? isWORDCHAR_LC('\n')
6511                     : isWORDCHAR_LC(nextchr);
6512 	    }
6513             if (to_complement ^ (b1 == b2)) {
6514                 sayNO;
6515             }
6516 	    break;
6517         }
6518 
6519 	case NBOUND:  /*  /\B/   */
6520             to_complement = 1;
6521             /* FALLTHROUGH */
6522 
6523 	case BOUND:   /*  /\b/   */
6524 	    if (utf8_target) {
6525                 goto bound_utf8;
6526             }
6527             goto bound_ascii_match_only;
6528 
6529 	case NBOUNDA: /*  /\B/a  */
6530             to_complement = 1;
6531             /* FALLTHROUGH */
6532 
6533 	case BOUNDA:  /*  /\b/a  */
6534         {
6535             bool b1, b2;
6536 
6537           bound_ascii_match_only:
6538             /* Here the string isn't utf8, or is utf8 and only ascii characters
6539              * are to match \w.  In the latter case looking at the byte just
6540              * prior to the current one may be just the final byte of a
6541              * multi-byte character.  This is ok.  There are two cases:
6542              * 1) it is a single byte character, and then the test is doing
6543              *    just what it's supposed to.
6544              * 2) it is a multi-byte character, in which case the final byte is
6545              *    never mistakable for ASCII, and so the test will say it is
6546              *    not a word character, which is the correct answer. */
6547             b1 = (locinput == reginfo->strbeg)
6548                  ? isWORDCHAR_A('\n')
6549                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6550             b2 = (NEXTCHR_IS_EOS)
6551                 ? isWORDCHAR_A('\n')
6552                 : isWORDCHAR_A(nextchr);
6553             if (to_complement ^ (b1 == b2)) {
6554                 sayNO;
6555             }
6556 	    break;
6557         }
6558 
6559 	case NBOUNDU: /*  /\B/u  */
6560             to_complement = 1;
6561             /* FALLTHROUGH */
6562 
6563 	case BOUNDU:  /*  /\b/u  */
6564 
6565           boundu:
6566             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6567                 match = FALSE;
6568             }
6569             else if (utf8_target) {
6570               bound_utf8:
6571                 switch((bound_type) FLAGS(scan)) {
6572                     case TRADITIONAL_BOUND:
6573                     {
6574                         bool b1, b2;
6575                         b1 = (locinput == reginfo->strbeg)
6576                              ? 0 /* isWORDCHAR_L1('\n') */
6577                              : isWORDCHAR_utf8_safe(
6578                                                reghop3((U8*)locinput,
6579                                                        -1,
6580                                                        (U8*)(reginfo->strbeg)),
6581                                                     (U8*) reginfo->strend);
6582                         b2 = (NEXTCHR_IS_EOS)
6583                             ? 0 /* isWORDCHAR_L1('\n') */
6584                             : isWORDCHAR_utf8_safe((U8*)locinput,
6585                                                    (U8*) reginfo->strend);
6586                         match = cBOOL(b1 != b2);
6587                         break;
6588                     }
6589                     case GCB_BOUND:
6590                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6591                             match = TRUE; /* GCB always matches at begin and
6592                                              end */
6593                         }
6594                         else {
6595                             /* Find the gcb values of previous and current
6596                              * chars, then see if is a break point */
6597                             match = isGCB(getGCB_VAL_UTF8(
6598                                                 reghop3((U8*)locinput,
6599                                                         -1,
6600                                                         (U8*)(reginfo->strbeg)),
6601                                                 (U8*) reginfo->strend),
6602                                           getGCB_VAL_UTF8((U8*) locinput,
6603                                                         (U8*) reginfo->strend),
6604                                           (U8*) reginfo->strbeg,
6605                                           (U8*) locinput,
6606                                           utf8_target);
6607                         }
6608                         break;
6609 
6610                     case LB_BOUND:
6611                         if (locinput == reginfo->strbeg) {
6612                             match = FALSE;
6613                         }
6614                         else if (NEXTCHR_IS_EOS) {
6615                             match = TRUE;
6616                         }
6617                         else {
6618                             match = isLB(getLB_VAL_UTF8(
6619                                                 reghop3((U8*)locinput,
6620                                                         -1,
6621                                                         (U8*)(reginfo->strbeg)),
6622                                                 (U8*) reginfo->strend),
6623                                           getLB_VAL_UTF8((U8*) locinput,
6624                                                         (U8*) reginfo->strend),
6625                                           (U8*) reginfo->strbeg,
6626                                           (U8*) locinput,
6627                                           (U8*) reginfo->strend,
6628                                           utf8_target);
6629                         }
6630                         break;
6631 
6632                     case SB_BOUND: /* Always matches at begin and end */
6633                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6634                             match = TRUE;
6635                         }
6636                         else {
6637                             match = isSB(getSB_VAL_UTF8(
6638                                                 reghop3((U8*)locinput,
6639                                                         -1,
6640                                                         (U8*)(reginfo->strbeg)),
6641                                                 (U8*) reginfo->strend),
6642                                           getSB_VAL_UTF8((U8*) locinput,
6643                                                         (U8*) reginfo->strend),
6644                                           (U8*) reginfo->strbeg,
6645                                           (U8*) locinput,
6646                                           (U8*) reginfo->strend,
6647                                           utf8_target);
6648                         }
6649                         break;
6650 
6651                     case WB_BOUND:
6652                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6653                             match = TRUE;
6654                         }
6655                         else {
6656                             match = isWB(WB_UNKNOWN,
6657                                          getWB_VAL_UTF8(
6658                                                 reghop3((U8*)locinput,
6659                                                         -1,
6660                                                         (U8*)(reginfo->strbeg)),
6661                                                 (U8*) reginfo->strend),
6662                                           getWB_VAL_UTF8((U8*) locinput,
6663                                                         (U8*) reginfo->strend),
6664                                           (U8*) reginfo->strbeg,
6665                                           (U8*) locinput,
6666                                           (U8*) reginfo->strend,
6667                                           utf8_target);
6668                         }
6669                         break;
6670                 }
6671 	    }
6672 	    else {  /* Not utf8 target */
6673                 switch((bound_type) FLAGS(scan)) {
6674                     case TRADITIONAL_BOUND:
6675                     {
6676                         bool b1, b2;
6677                         b1 = (locinput == reginfo->strbeg)
6678                             ? 0 /* isWORDCHAR_L1('\n') */
6679                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6680                         b2 = (NEXTCHR_IS_EOS)
6681                             ? 0 /* isWORDCHAR_L1('\n') */
6682                             : isWORDCHAR_L1(nextchr);
6683                         match = cBOOL(b1 != b2);
6684                         break;
6685                     }
6686 
6687                     case GCB_BOUND:
6688                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6689                             match = TRUE; /* GCB always matches at begin and
6690                                              end */
6691                         }
6692                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6693                                    range */
6694                             match =    UCHARAT(locinput - 1) != '\r'
6695                                     || UCHARAT(locinput) != '\n';
6696                         }
6697                         break;
6698 
6699                     case LB_BOUND:
6700                         if (locinput == reginfo->strbeg) {
6701                             match = FALSE;
6702                         }
6703                         else if (NEXTCHR_IS_EOS) {
6704                             match = TRUE;
6705                         }
6706                         else {
6707                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6708                                          getLB_VAL_CP(UCHARAT(locinput)),
6709                                          (U8*) reginfo->strbeg,
6710                                          (U8*) locinput,
6711                                          (U8*) reginfo->strend,
6712                                          utf8_target);
6713                         }
6714                         break;
6715 
6716                     case SB_BOUND: /* Always matches at begin and end */
6717                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6718                             match = TRUE;
6719                         }
6720                         else {
6721                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6722                                          getSB_VAL_CP(UCHARAT(locinput)),
6723                                          (U8*) reginfo->strbeg,
6724                                          (U8*) locinput,
6725                                          (U8*) reginfo->strend,
6726                                          utf8_target);
6727                         }
6728                         break;
6729 
6730                     case WB_BOUND:
6731                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6732                             match = TRUE;
6733                         }
6734                         else {
6735                             match = isWB(WB_UNKNOWN,
6736                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6737                                          getWB_VAL_CP(UCHARAT(locinput)),
6738                                          (U8*) reginfo->strbeg,
6739                                          (U8*) locinput,
6740                                          (U8*) reginfo->strend,
6741                                          utf8_target);
6742                         }
6743                         break;
6744                 }
6745 	    }
6746 
6747             if (to_complement ^ ! match) {
6748                 sayNO;
6749             }
6750 	    break;
6751 
6752         case ANYOFPOSIXL:
6753 	case ANYOFL:  /*  /[abc]/l      */
6754             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6755 
6756             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6757             {
6758               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6759             }
6760             /* FALLTHROUGH */
6761 	case ANYOFD:  /*   /[abc]/d       */
6762 	case ANYOF:  /*   /[abc]/       */
6763             if (NEXTCHR_IS_EOS || locinput >= loceol)
6764                 sayNO;
6765 	    if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6766 	        && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6767             {
6768                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6769 		    sayNO;
6770                 }
6771 		locinput++;
6772             }
6773             else {
6774 	        if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6775                                                                    utf8_target))
6776                 {
6777 		    sayNO;
6778                 }
6779                 goto increment_locinput;
6780             }
6781 	    break;
6782 
6783         case ANYOFM:
6784             if (   NEXTCHR_IS_EOS
6785                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6786                 || locinput >= loceol)
6787             {
6788                 sayNO;
6789             }
6790             locinput++; /* ANYOFM is always single byte */
6791             break;
6792 
6793         case NANYOFM:
6794             if (   NEXTCHR_IS_EOS
6795                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6796                 || locinput >= loceol)
6797             {
6798                 sayNO;
6799             }
6800             goto increment_locinput;
6801             break;
6802 
6803         case ANYOFH:
6804             if (   ! utf8_target
6805                 ||   NEXTCHR_IS_EOS
6806                 ||  (   ANYOF_FLAGS(scan) != 0
6807                      && ANYOF_FLAGS(scan) != (U8) *locinput)
6808 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6809                                                                    utf8_target))
6810             {
6811                 sayNO;
6812             }
6813             goto increment_locinput;
6814             break;
6815 
6816         /* The argument (FLAGS) to all the POSIX node types is the class number
6817          * */
6818 
6819         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6820             to_complement = 1;
6821             /* FALLTHROUGH */
6822 
6823         case POSIXL:    /* \w or [:punct:] etc. under /l */
6824             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6825             if (NEXTCHR_IS_EOS || locinput >= loceol)
6826                 sayNO;
6827 
6828             /* Use isFOO_lc() for characters within Latin1.  (Note that
6829              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6830              * wouldn't be invariant) */
6831             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6832                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6833                     sayNO;
6834                 }
6835 
6836                 locinput++;
6837                 break;
6838             }
6839 
6840             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6841                 /* An above Latin-1 code point, or malformed */
6842                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6843                                                        reginfo->strend);
6844                 goto utf8_posix_above_latin1;
6845             }
6846 
6847             /* Here is a UTF-8 variant code point below 256 and the target is
6848              * UTF-8 */
6849             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6850                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6851                                             *(locinput + 1))))))
6852             {
6853                 sayNO;
6854             }
6855 
6856             goto increment_locinput;
6857 
6858         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6859             to_complement = 1;
6860             /* FALLTHROUGH */
6861 
6862         case POSIXD:    /* \w or [:punct:] etc. under /d */
6863             if (utf8_target) {
6864                 goto utf8_posix;
6865             }
6866             goto posixa;
6867 
6868         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6869 
6870             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6871                 sayNO;
6872             }
6873 
6874             /* All UTF-8 variants match */
6875             if (! UTF8_IS_INVARIANT(nextchr)) {
6876                 goto increment_locinput;
6877             }
6878 
6879             to_complement = 1;
6880             goto join_nposixa;
6881 
6882         case POSIXA:    /* \w or [:punct:] etc. under /a */
6883 
6884           posixa:
6885             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6886              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6887              * character is a single byte */
6888 
6889             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6890                 sayNO;
6891             }
6892 
6893           join_nposixa:
6894 
6895             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6896                                                                 FLAGS(scan)))))
6897             {
6898                 sayNO;
6899             }
6900 
6901             /* Here we are either not in utf8, or we matched a utf8-invariant,
6902              * so the next char is the next byte */
6903             locinput++;
6904             break;
6905 
6906         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6907             to_complement = 1;
6908             /* FALLTHROUGH */
6909 
6910         case POSIXU:    /* \w or [:punct:] etc. under /u */
6911           utf8_posix:
6912             if (NEXTCHR_IS_EOS || locinput >= loceol) {
6913                 sayNO;
6914             }
6915 
6916             /* Use _generic_isCC() for characters within Latin1.  (Note that
6917              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6918              * wouldn't be invariant) */
6919             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6920                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6921                                                            FLAGS(scan)))))
6922                 {
6923                     sayNO;
6924                 }
6925                 locinput++;
6926             }
6927             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6928                 if (! (to_complement
6929                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6930                                                                *(locinput + 1)),
6931                                              FLAGS(scan)))))
6932                 {
6933                     sayNO;
6934                 }
6935                 locinput += 2;
6936             }
6937             else {  /* Handle above Latin-1 code points */
6938               utf8_posix_above_latin1:
6939                 classnum = (_char_class_number) FLAGS(scan);
6940                 switch (classnum) {
6941                     default:
6942                         if (! (to_complement
6943                            ^ cBOOL(_invlist_contains_cp(
6944                                       PL_XPosix_ptrs[classnum],
6945                                       utf8_to_uvchr_buf((U8 *) locinput,
6946                                                         (U8 *) reginfo->strend,
6947                                                         NULL)))))
6948                         {
6949                             sayNO;
6950                         }
6951                         break;
6952                     case _CC_ENUM_SPACE:
6953                         if (! (to_complement
6954                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6955                         {
6956                             sayNO;
6957                         }
6958                         break;
6959                     case _CC_ENUM_BLANK:
6960                         if (! (to_complement
6961                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6962                         {
6963                             sayNO;
6964                         }
6965                         break;
6966                     case _CC_ENUM_XDIGIT:
6967                         if (! (to_complement
6968                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6969                         {
6970                             sayNO;
6971                         }
6972                         break;
6973                     case _CC_ENUM_VERTSPACE:
6974                         if (! (to_complement
6975                                         ^ cBOOL(is_VERTWS_high(locinput))))
6976                         {
6977                             sayNO;
6978                         }
6979                         break;
6980                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6981                     case _CC_ENUM_ASCII:
6982                         if (! to_complement) {
6983                             sayNO;
6984                         }
6985                         break;
6986                 }
6987                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
6988             }
6989             break;
6990 
6991 	case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6992 		       a Unicode extended Grapheme Cluster */
6993 	    if (NEXTCHR_IS_EOS || locinput >= loceol)
6994 		sayNO;
6995 	    if  (! utf8_target) {
6996 
6997 		/* Match either CR LF  or '.', as all the other possibilities
6998 		 * require utf8 */
6999 		locinput++;	    /* Match the . or CR */
7000 		if (nextchr == '\r' /* And if it was CR, and the next is LF,
7001 				       match the LF */
7002 		    && locinput <  loceol
7003 		    && UCHARAT(locinput) == '\n')
7004                 {
7005                     locinput++;
7006                 }
7007 	    }
7008 	    else {
7009 
7010                 /* Get the gcb type for the current character */
7011                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7012                                                        (U8*) reginfo->strend);
7013 
7014                 /* Then scan through the input until we get to the first
7015                  * character whose type is supposed to be a gcb with the
7016                  * current character.  (There is always a break at the
7017                  * end-of-input) */
7018                 locinput += UTF8SKIP(locinput);
7019                 while (locinput < loceol) {
7020                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7021                                                          (U8*) reginfo->strend);
7022                     if (isGCB(prev_gcb, cur_gcb,
7023                               (U8*) reginfo->strbeg, (U8*) locinput,
7024                               utf8_target))
7025                     {
7026                         break;
7027                     }
7028 
7029                     prev_gcb = cur_gcb;
7030                     locinput += UTF8SKIP(locinput);
7031                 }
7032 
7033 
7034 	    }
7035 	    break;
7036 
7037 	case NREFFL:  /*  /\g{name}/il  */
7038 	{   /* The capture buffer cases.  The ones beginning with N for the
7039 	       named buffers just convert to the equivalent numbered and
7040 	       pretend they were called as the corresponding numbered buffer
7041 	       op.  */
7042 	    /* don't initialize these in the declaration, it makes C++
7043 	       unhappy */
7044 	    const char *s;
7045 	    char type;
7046 	    re_fold_t folder;
7047 	    const U8 *fold_array;
7048 	    UV utf8_fold_flags;
7049 
7050             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7051 	    folder = foldEQ_locale;
7052 	    fold_array = PL_fold_locale;
7053 	    type = REFFL;
7054 	    utf8_fold_flags = FOLDEQ_LOCALE;
7055 	    goto do_nref;
7056 
7057 	case NREFFA:  /*  /\g{name}/iaa  */
7058 	    folder = foldEQ_latin1;
7059 	    fold_array = PL_fold_latin1;
7060 	    type = REFFA;
7061 	    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7062 	    goto do_nref;
7063 
7064 	case NREFFU:  /*  /\g{name}/iu  */
7065 	    folder = foldEQ_latin1;
7066 	    fold_array = PL_fold_latin1;
7067 	    type = REFFU;
7068 	    utf8_fold_flags = 0;
7069 	    goto do_nref;
7070 
7071 	case NREFF:  /*  /\g{name}/i  */
7072 	    folder = foldEQ;
7073 	    fold_array = PL_fold;
7074 	    type = REFF;
7075 	    utf8_fold_flags = 0;
7076 	    goto do_nref;
7077 
7078 	case NREF:  /*  /\g{name}/   */
7079 	    type = REF;
7080 	    folder = NULL;
7081 	    fold_array = NULL;
7082 	    utf8_fold_flags = 0;
7083 	  do_nref:
7084 
7085 	    /* For the named back references, find the corresponding buffer
7086 	     * number */
7087 	    n = reg_check_named_buff_matched(rex,scan);
7088 
7089             if ( ! n ) {
7090                 sayNO;
7091 	    }
7092 	    goto do_nref_ref_common;
7093 
7094 	case REFFL:  /*  /\1/il  */
7095             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7096 	    folder = foldEQ_locale;
7097 	    fold_array = PL_fold_locale;
7098 	    utf8_fold_flags = FOLDEQ_LOCALE;
7099 	    goto do_ref;
7100 
7101 	case REFFA:  /*  /\1/iaa  */
7102 	    folder = foldEQ_latin1;
7103 	    fold_array = PL_fold_latin1;
7104 	    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7105 	    goto do_ref;
7106 
7107 	case REFFU:  /*  /\1/iu  */
7108 	    folder = foldEQ_latin1;
7109 	    fold_array = PL_fold_latin1;
7110 	    utf8_fold_flags = 0;
7111 	    goto do_ref;
7112 
7113 	case REFF:  /*  /\1/i  */
7114 	    folder = foldEQ;
7115 	    fold_array = PL_fold;
7116 	    utf8_fold_flags = 0;
7117 	    goto do_ref;
7118 
7119         case REF:  /*  /\1/    */
7120 	    folder = NULL;
7121 	    fold_array = NULL;
7122 	    utf8_fold_flags = 0;
7123 
7124 	  do_ref:
7125 	    type = OP(scan);
7126 	    n = ARG(scan);  /* which paren pair */
7127 
7128 	  do_nref_ref_common:
7129 	    ln = rex->offs[n].start;
7130 	    endref = rex->offs[n].end;
7131 	    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7132 	    if (rex->lastparen < n || ln == -1 || endref == -1)
7133 		sayNO;			/* Do not match unless seen CLOSEn. */
7134 	    if (ln == endref)
7135 		break;
7136 
7137 	    s = reginfo->strbeg + ln;
7138 	    if (type != REF	/* REF can do byte comparison */
7139 		&& (utf8_target || type == REFFU || type == REFFL))
7140 	    {
7141 		char * limit = loceol;
7142 
7143 		/* This call case insensitively compares the entire buffer
7144 		    * at s, with the current input starting at locinput, but
7145                     * not going off the end given by loceol, and
7146                     * returns in <limit> upon success, how much of the
7147                     * current input was matched */
7148 		if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7149 				    locinput, &limit, 0, utf8_target, utf8_fold_flags))
7150 		{
7151 		    sayNO;
7152 		}
7153 		locinput = limit;
7154 		break;
7155 	    }
7156 
7157 	    /* Not utf8:  Inline the first character, for speed. */
7158 	    if ( ! NEXTCHR_IS_EOS
7159                 && locinput < loceol
7160                 && UCHARAT(s) != nextchr
7161                 && (   type == REF
7162                     || UCHARAT(s) != fold_array[nextchr]))
7163             {
7164 		sayNO;
7165             }
7166 	    ln = endref - ln;
7167 	    if (locinput + ln > loceol)
7168 		sayNO;
7169 	    if (ln > 1 && (type == REF
7170 			   ? memNE(s, locinput, ln)
7171 			   : ! folder(locinput, s, ln)))
7172 		sayNO;
7173 	    locinput += ln;
7174 	    break;
7175 	}
7176 
7177 	case NOTHING: /* null op; e.g. the 'nothing' following
7178                        * the '*' in m{(a+|b)*}' */
7179 	    break;
7180 	case TAIL: /* placeholder while compiling (A|B|C) */
7181 	    break;
7182 
7183 #undef  ST
7184 #define ST st->u.eval
7185 #define CUR_EVAL cur_eval->u.eval
7186 
7187 	{
7188 	    SV *ret;
7189 	    REGEXP *re_sv;
7190             regexp *re;
7191             regexp_internal *rei;
7192             regnode *startpoint;
7193             U32 arg;
7194 
7195 	case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7196             arg= (U32)ARG(scan);
7197             if (cur_eval && cur_eval->locinput == locinput) {
7198                 if ( ++nochange_depth > max_nochange_depth )
7199                     Perl_croak(aTHX_
7200                         "Pattern subroutine nesting without pos change"
7201                         " exceeded limit in regex");
7202             } else {
7203                 nochange_depth = 0;
7204             }
7205 	    re_sv = rex_sv;
7206             re = rex;
7207             rei = rexi;
7208             startpoint = scan + ARG2L(scan);
7209             EVAL_CLOSE_PAREN_SET( st, arg );
7210             /* Detect infinite recursion
7211              *
7212              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7213              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7214              * So we track the position in the string we are at each time
7215              * we recurse and if we try to enter the same routine twice from
7216              * the same position we throw an error.
7217              */
7218             if ( rex->recurse_locinput[arg] == locinput ) {
7219                 /* FIXME: we should show the regop that is failing as part
7220                  * of the error message. */
7221                 Perl_croak(aTHX_ "Infinite recursion in regex");
7222             } else {
7223                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7224                 rex->recurse_locinput[arg]= locinput;
7225 
7226                 DEBUG_r({
7227                     GET_RE_DEBUG_FLAGS_DECL;
7228                     DEBUG_STACK_r({
7229                         Perl_re_exec_indentf( aTHX_
7230                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7231                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7232                         );
7233                     });
7234                 });
7235             }
7236 
7237             /* Save all the positions seen so far. */
7238             ST.cp = regcppush(rex, 0, maxopenparen);
7239             REGCP_SET(ST.lastcp);
7240 
7241             /* and then jump to the code we share with EVAL */
7242             goto eval_recurse_doit;
7243             /* NOTREACHED */
7244 
7245         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7246             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7247 		if ( ++nochange_depth > max_nochange_depth )
7248                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7249             } else {
7250                 nochange_depth = 0;
7251             }
7252 	    {
7253 		/* execute the code in the {...} */
7254 
7255 		dSP;
7256 		IV before;
7257 		OP * const oop = PL_op;
7258 		COP * const ocurcop = PL_curcop;
7259 		OP *nop;
7260 		CV *newcv;
7261 
7262 		/* save *all* paren positions */
7263                 regcppush(rex, 0, maxopenparen);
7264                 REGCP_SET(ST.lastcp);
7265 
7266 		if (!caller_cv)
7267 		    caller_cv = find_runcv(NULL);
7268 
7269 		n = ARG(scan);
7270 
7271 		if (rexi->data->what[n] == 'r') { /* code from an external qr */
7272                     newcv = (ReANY(
7273                                     (REGEXP*)(rexi->data->data[n])
7274                             ))->qr_anoncv;
7275 		    nop = (OP*)rexi->data->data[n+1];
7276 		}
7277 		else if (rexi->data->what[n] == 'l') { /* literal code */
7278 		    newcv = caller_cv;
7279 		    nop = (OP*)rexi->data->data[n];
7280 		    assert(CvDEPTH(newcv));
7281 		}
7282 		else {
7283 		    /* literal with own CV */
7284 		    assert(rexi->data->what[n] == 'L');
7285 		    newcv = rex->qr_anoncv;
7286 		    nop = (OP*)rexi->data->data[n];
7287 		}
7288 
7289                 /* Some notes about MULTICALL and the context and save stacks.
7290                  *
7291                  * In something like
7292                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7293                  * since codeblocks don't introduce a new scope (so that
7294                  * local() etc accumulate), at the end of a successful
7295                  * match there will be a SAVEt_CLEARSV on the savestack
7296                  * for each of $x, $y, $z. If the three code blocks above
7297                  * happen to have come from different CVs (e.g. via
7298                  * embedded qr//s), then we must ensure that during any
7299                  * savestack unwinding, PL_comppad always points to the
7300                  * right pad at each moment. We achieve this by
7301                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7302                  * there is a change of pad.
7303                  * In theory whenever we call a code block, we should
7304                  * push a CXt_SUB context, then pop it on return from
7305                  * that code block. This causes a bit of an issue in that
7306                  * normally popping a context also clears the savestack
7307                  * back to cx->blk_oldsaveix, but here we specifically
7308                  * don't want to clear the save stack on exit from the
7309                  * code block.
7310                  * Also for efficiency we don't want to keep pushing and
7311                  * popping the single SUB context as we backtrack etc.
7312                  * So instead, we push a single context the first time
7313                  * we need, it, then hang onto it until the end of this
7314                  * function. Whenever we encounter a new code block, we
7315                  * update the CV etc if that's changed. During the times
7316                  * in this function where we're not executing a code
7317                  * block, having the SUB context still there is a bit
7318                  * naughty - but we hope that no-one notices.
7319                  * When the SUB context is initially pushed, we fake up
7320                  * cx->blk_oldsaveix to be as if we'd pushed this context
7321                  * on first entry to S_regmatch rather than at some random
7322                  * point during the regexe execution. That way if we
7323                  * croak, popping the context stack will ensure that
7324                  * *everything* SAVEd by this function is undone and then
7325                  * the context popped, rather than e.g., popping the
7326                  * context (and restoring the original PL_comppad) then
7327                  * popping more of the savestack and restoring a bad
7328                  * PL_comppad.
7329                  */
7330 
7331                 /* If this is the first EVAL, push a MULTICALL. On
7332                  * subsequent calls, if we're executing a different CV, or
7333                  * if PL_comppad has got messed up from backtracking
7334                  * through SAVECOMPPADs, then refresh the context.
7335                  */
7336 		if (newcv != last_pushed_cv || PL_comppad != last_pad)
7337 		{
7338                     U8 flags = (CXp_SUB_RE |
7339                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7340                     SAVECOMPPAD();
7341 		    if (last_pushed_cv) {
7342 			CHANGE_MULTICALL_FLAGS(newcv, flags);
7343 		    }
7344 		    else {
7345 			PUSH_MULTICALL_FLAGS(newcv, flags);
7346 		    }
7347                     /* see notes above */
7348                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7349 
7350 		    last_pushed_cv = newcv;
7351 		}
7352 		else {
7353                     /* these assignments are just to silence compiler
7354                      * warnings */
7355 		    multicall_cop = NULL;
7356 		}
7357 		last_pad = PL_comppad;
7358 
7359 		/* the initial nextstate you would normally execute
7360 		 * at the start of an eval (which would cause error
7361 		 * messages to come from the eval), may be optimised
7362 		 * away from the execution path in the regex code blocks;
7363 		 * so manually set PL_curcop to it initially */
7364 		{
7365 		    OP *o = cUNOPx(nop)->op_first;
7366 		    assert(o->op_type == OP_NULL);
7367 		    if (o->op_targ == OP_SCOPE) {
7368 			o = cUNOPo->op_first;
7369 		    }
7370 		    else {
7371 			assert(o->op_targ == OP_LEAVE);
7372 			o = cUNOPo->op_first;
7373 			assert(o->op_type == OP_ENTER);
7374 			o = OpSIBLING(o);
7375 		    }
7376 
7377 		    if (o->op_type != OP_STUB) {
7378 			assert(    o->op_type == OP_NEXTSTATE
7379 				|| o->op_type == OP_DBSTATE
7380 				|| (o->op_type == OP_NULL
7381 				    &&  (  o->op_targ == OP_NEXTSTATE
7382 					|| o->op_targ == OP_DBSTATE
7383 					)
7384 				    )
7385 			);
7386 			PL_curcop = (COP*)o;
7387 		    }
7388 		}
7389 		nop = nop->op_next;
7390 
7391                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7392 		    "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7393 
7394 		rex->offs[0].end = locinput - reginfo->strbeg;
7395                 if (reginfo->info_aux_eval->pos_magic)
7396                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7397                                   reginfo->sv, reginfo->strbeg,
7398                                   locinput - reginfo->strbeg);
7399 
7400                 if (sv_yes_mark) {
7401                     SV *sv_mrk = get_sv("REGMARK", 1);
7402                     sv_setsv(sv_mrk, sv_yes_mark);
7403                 }
7404 
7405 		/* we don't use MULTICALL here as we want to call the
7406 		 * first op of the block of interest, rather than the
7407 		 * first op of the sub. Also, we don't want to free
7408                  * the savestack frame */
7409 		before = (IV)(SP-PL_stack_base);
7410 		PL_op = nop;
7411 		CALLRUNOPS(aTHX);			/* Scalar context. */
7412 		SPAGAIN;
7413 		if ((IV)(SP-PL_stack_base) == before)
7414 		    ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7415 		else {
7416 		    ret = POPs;
7417 		    PUTBACK;
7418 		}
7419 
7420 		/* before restoring everything, evaluate the returned
7421 		 * value, so that 'uninit' warnings don't use the wrong
7422 		 * PL_op or pad. Also need to process any magic vars
7423 		 * (e.g. $1) *before* parentheses are restored */
7424 
7425 		PL_op = NULL;
7426 
7427                 re_sv = NULL;
7428 		if (logical == 0) {       /*   (?{})/   */
7429                     SV *replsv = save_scalar(PL_replgv);
7430                     sv_setsv(replsv, ret); /* $^R */
7431                     SvSETMAGIC(replsv);
7432                 }
7433 		else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7434 		    sw = cBOOL(SvTRUE_NN(ret));
7435 		    logical = 0;
7436 		}
7437 		else {                   /*  /(??{})  */
7438 		    /*  if its overloaded, let the regex compiler handle
7439 		     *  it; otherwise extract regex, or stringify  */
7440 		    if (SvGMAGICAL(ret))
7441 			ret = sv_mortalcopy(ret);
7442 		    if (!SvAMAGIC(ret)) {
7443 			SV *sv = ret;
7444 			if (SvROK(sv))
7445 			    sv = SvRV(sv);
7446 			if (SvTYPE(sv) == SVt_REGEXP)
7447 			    re_sv = (REGEXP*) sv;
7448 			else if (SvSMAGICAL(ret)) {
7449 			    MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7450 			    if (mg)
7451 				re_sv = (REGEXP *) mg->mg_obj;
7452 			}
7453 
7454 			/* force any undef warnings here */
7455 			if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7456 			    ret = sv_mortalcopy(ret);
7457 			    (void) SvPV_force_nolen(ret);
7458 			}
7459 		    }
7460 
7461 		}
7462 
7463 		/* *** Note that at this point we don't restore
7464 		 * PL_comppad, (or pop the CxSUB) on the assumption it may
7465 		 * be used again soon. This is safe as long as nothing
7466 		 * in the regexp code uses the pad ! */
7467 		PL_op = oop;
7468 		PL_curcop = ocurcop;
7469                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7470                 PL_curpm_under = PL_curpm;
7471                 PL_curpm = PL_reg_curpm;
7472 
7473 		if (logical != 2) {
7474                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7475                                     script_run_begin);
7476 		    /* NOTREACHED */
7477                 }
7478 	    }
7479 
7480 		/* only /(??{})/  from now on */
7481 		logical = 0;
7482 		{
7483 		    /* extract RE object from returned value; compiling if
7484 		     * necessary */
7485 
7486 		    if (re_sv) {
7487 			re_sv = reg_temp_copy(NULL, re_sv);
7488 		    }
7489 		    else {
7490 			U32 pm_flags = 0;
7491 
7492 			if (SvUTF8(ret) && IN_BYTES) {
7493 			    /* In use 'bytes': make a copy of the octet
7494 			     * sequence, but without the flag on */
7495 			    STRLEN len;
7496 			    const char *const p = SvPV(ret, len);
7497 			    ret = newSVpvn_flags(p, len, SVs_TEMP);
7498 			}
7499 			if (rex->intflags & PREGf_USE_RE_EVAL)
7500 			    pm_flags |= PMf_USE_RE_EVAL;
7501 
7502 			/* if we got here, it should be an engine which
7503 			 * supports compiling code blocks and stuff */
7504 			assert(rex->engine && rex->engine->op_comp);
7505                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7506 			re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7507 				    rex->engine, NULL, NULL,
7508                                     /* copy /msixn etc to inner pattern */
7509                                     ARG2L(scan),
7510                                     pm_flags);
7511 
7512 			if (!(SvFLAGS(ret)
7513 			      & (SVs_TEMP | SVs_GMG | SVf_ROK))
7514 			 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7515 			    /* This isn't a first class regexp. Instead, it's
7516 			       caching a regexp onto an existing, Perl visible
7517 			       scalar.  */
7518 			    sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7519 			}
7520 		    }
7521 		    SAVEFREESV(re_sv);
7522 		    re = ReANY(re_sv);
7523 		}
7524                 RXp_MATCH_COPIED_off(re);
7525                 re->subbeg = rex->subbeg;
7526                 re->sublen = rex->sublen;
7527                 re->suboffset = rex->suboffset;
7528                 re->subcoffset = rex->subcoffset;
7529                 re->lastparen = 0;
7530                 re->lastcloseparen = 0;
7531 		rei = RXi_GET(re);
7532                 DEBUG_EXECUTE_r(
7533                     debug_start_match(re_sv, utf8_target, locinput,
7534                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7535 		);
7536 		startpoint = rei->program + 1;
7537                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7538                                              * close_paren only for GOSUB */
7539                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7540                 /* Save all the seen positions so far. */
7541                 ST.cp = regcppush(rex, 0, maxopenparen);
7542                 REGCP_SET(ST.lastcp);
7543                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7544                 maxopenparen = 0;
7545                 /* run the pattern returned from (??{...}) */
7546 
7547               eval_recurse_doit: /* Share code with GOSUB below this line
7548                             * At this point we expect the stack context to be
7549                             * set up correctly */
7550 
7551                 /* invalidate the S-L poscache. We're now executing a
7552                  * different set of WHILEM ops (and their associated
7553                  * indexes) against the same string, so the bits in the
7554                  * cache are meaningless. Setting maxiter to zero forces
7555                  * the cache to be invalidated and zeroed before reuse.
7556 		 * XXX This is too dramatic a measure. Ideally we should
7557                  * save the old cache and restore when running the outer
7558                  * pattern again */
7559 		reginfo->poscache_maxiter = 0;
7560 
7561                 /* the new regexp might have a different is_utf8_pat than we do */
7562                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7563 
7564 		ST.prev_rex = rex_sv;
7565 		ST.prev_curlyx = cur_curlyx;
7566 		rex_sv = re_sv;
7567 		SET_reg_curpm(rex_sv);
7568 		rex = re;
7569 		rexi = rei;
7570 		cur_curlyx = NULL;
7571 		ST.B = next;
7572 		ST.prev_eval = cur_eval;
7573 		cur_eval = st;
7574 		/* now continue from first node in postoned RE */
7575 		PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7576                                     loceol, script_run_begin);
7577 		NOT_REACHED; /* NOTREACHED */
7578 	}
7579 
7580 	case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7581             /* note: this is called twice; first after popping B, then A */
7582             DEBUG_STACK_r({
7583                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7584                     depth, cur_eval, ST.prev_eval);
7585             });
7586 
7587 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7588             if ( cur_eval && CUR_EVAL.close_paren ) {\
7589                 DEBUG_STACK_r({ \
7590                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7591                         depth,    \
7592                         CUR_EVAL.close_paren - 1,\
7593                         cur_eval, \
7594                         VAL);     \
7595                 });               \
7596                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7597             }
7598 
7599             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7600 
7601 	    rex_sv = ST.prev_rex;
7602             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7603 	    SET_reg_curpm(rex_sv);
7604 	    rex = ReANY(rex_sv);
7605 	    rexi = RXi_GET(rex);
7606             {
7607                 /* preserve $^R across LEAVE's. See Bug 121070. */
7608                 SV *save_sv= GvSV(PL_replgv);
7609                 SV *replsv;
7610                 SvREFCNT_inc(save_sv);
7611                 regcpblow(ST.cp); /* LEAVE in disguise */
7612                 /* don't move this initialization up */
7613                 replsv = GvSV(PL_replgv);
7614                 sv_setsv(replsv, save_sv);
7615                 SvSETMAGIC(replsv);
7616                 SvREFCNT_dec(save_sv);
7617             }
7618 	    cur_eval = ST.prev_eval;
7619 	    cur_curlyx = ST.prev_curlyx;
7620 
7621 	    /* Invalidate cache. See "invalidate" comment above. */
7622 	    reginfo->poscache_maxiter = 0;
7623             if ( nochange_depth )
7624 	        nochange_depth--;
7625 
7626             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7627 	    sayYES;
7628 
7629 
7630 	case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7631 	    REGCP_UNWIND(ST.lastcp);
7632             sayNO;
7633 
7634 	case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7635 	    /* note: this is called twice; first after popping B, then A */
7636             DEBUG_STACK_r({
7637                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7638                     depth, cur_eval, ST.prev_eval);
7639             });
7640 
7641             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7642 
7643 	    rex_sv = ST.prev_rex;
7644             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7645 	    SET_reg_curpm(rex_sv);
7646 	    rex = ReANY(rex_sv);
7647 	    rexi = RXi_GET(rex);
7648 
7649 	    REGCP_UNWIND(ST.lastcp);
7650             regcppop(rex, &maxopenparen);
7651 	    cur_eval = ST.prev_eval;
7652 	    cur_curlyx = ST.prev_curlyx;
7653 
7654 	    /* Invalidate cache. See "invalidate" comment above. */
7655 	    reginfo->poscache_maxiter = 0;
7656 	    if ( nochange_depth )
7657 	        nochange_depth--;
7658 
7659             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7660             sayNO_SILENT;
7661 #undef ST
7662 
7663 	case OPEN: /*  (  */
7664 	    n = ARG(scan);  /* which paren pair */
7665 	    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7666 	    if (n > maxopenparen)
7667 		maxopenparen = n;
7668             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7669 		"OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7670                 depth,
7671 		PTR2UV(rex),
7672 		PTR2UV(rex->offs),
7673 		(UV)n,
7674 		(IV)rex->offs[n].start_tmp,
7675 		(UV)maxopenparen
7676 	    ));
7677             lastopen = n;
7678 	    break;
7679 
7680         case SROPEN: /*  (*SCRIPT_RUN:  */
7681             script_run_begin = (U8 *) locinput;
7682             break;
7683 
7684 
7685 	case CLOSE:  /*  )  */
7686 	    n = ARG(scan);  /* which paren pair */
7687 	    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7688                              locinput - reginfo->strbeg);
7689             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7690 	        goto fake_end;
7691 
7692 	    break;
7693 
7694         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7695 
7696             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7697             {
7698                 sayNO;
7699             }
7700 
7701             break;
7702 
7703 
7704         case ACCEPT:  /*  (*ACCEPT)  */
7705             if (scan->flags)
7706                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7707             if (ARG2L(scan)){
7708                 regnode *cursor;
7709                 for (cursor=scan;
7710                      cursor && OP(cursor)!=END;
7711                      cursor=regnext(cursor))
7712                 {
7713                     if ( OP(cursor)==CLOSE ){
7714                         n = ARG(cursor);
7715                         if ( n <= lastopen ) {
7716 			    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7717                                              locinput - reginfo->strbeg);
7718                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7719                                 break;
7720                         }
7721                     }
7722                 }
7723             }
7724 	    goto fake_end;
7725 	    /* NOTREACHED */
7726 
7727 	case GROUPP:  /*  (?(1))  */
7728 	    n = ARG(scan);  /* which paren pair */
7729 	    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7730 	    break;
7731 
7732 	case NGROUPP:  /*  (?(<name>))  */
7733 	    /* reg_check_named_buff_matched returns 0 for no match */
7734 	    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7735 	    break;
7736 
7737         case INSUBP:   /*  (?(R))  */
7738             n = ARG(scan);
7739             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7740              * of SCAN is already set up as matches a eval.close_paren */
7741             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7742             break;
7743 
7744         case DEFINEP:  /*  (?(DEFINE))  */
7745             sw = 0;
7746             break;
7747 
7748 	case IFTHEN:   /*  (?(cond)A|B)  */
7749 	    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7750 	    if (sw)
7751 		next = NEXTOPER(NEXTOPER(scan));
7752 	    else {
7753 		next = scan + ARG(scan);
7754 		if (OP(next) == IFTHEN) /* Fake one. */
7755 		    next = NEXTOPER(NEXTOPER(next));
7756 	    }
7757 	    break;
7758 
7759 	case LOGICAL:  /* modifier for EVAL and IFMATCH */
7760 	    logical = scan->flags;
7761 	    break;
7762 
7763 /*******************************************************************
7764 
7765 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7766 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7767 STAR/PLUS/CURLY/CURLYN are used instead.)
7768 
7769 A*B is compiled as <CURLYX><A><WHILEM><B>
7770 
7771 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7772 state, which contains the current count, initialised to -1. It also sets
7773 cur_curlyx to point to this state, with any previous value saved in the
7774 state block.
7775 
7776 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7777 since the pattern may possibly match zero times (i.e. it's a while {} loop
7778 rather than a do {} while loop).
7779 
7780 Each entry to WHILEM represents a successful match of A. The count in the
7781 CURLYX block is incremented, another WHILEM state is pushed, and execution
7782 passes to A or B depending on greediness and the current count.
7783 
7784 For example, if matching against the string a1a2a3b (where the aN are
7785 substrings that match /A/), then the match progresses as follows: (the
7786 pushed states are interspersed with the bits of strings matched so far):
7787 
7788     <CURLYX cnt=-1>
7789     <CURLYX cnt=0><WHILEM>
7790     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7791     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7792     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7793     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7794 
7795 (Contrast this with something like CURLYM, which maintains only a single
7796 backtrack state:
7797 
7798     <CURLYM cnt=0> a1
7799     a1 <CURLYM cnt=1> a2
7800     a1 a2 <CURLYM cnt=2> a3
7801     a1 a2 a3 <CURLYM cnt=3> b
7802 )
7803 
7804 Each WHILEM state block marks a point to backtrack to upon partial failure
7805 of A or B, and also contains some minor state data related to that
7806 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7807 overall state, such as the count, and pointers to the A and B ops.
7808 
7809 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7810 must always point to the *current* CURLYX block, the rules are:
7811 
7812 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7813 and set cur_curlyx to point the new block.
7814 
7815 When popping the CURLYX block after a successful or unsuccessful match,
7816 restore the previous cur_curlyx.
7817 
7818 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7819 to the outer one saved in the CURLYX block.
7820 
7821 When popping the WHILEM block after a successful or unsuccessful B match,
7822 restore the previous cur_curlyx.
7823 
7824 Here's an example for the pattern (AI* BI)*BO
7825 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7826 
7827 cur_
7828 curlyx backtrack stack
7829 ------ ---------------
7830 NULL
7831 CO     <CO prev=NULL> <WO>
7832 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7833 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7834 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7835 
7836 At this point the pattern succeeds, and we work back down the stack to
7837 clean up, restoring as we go:
7838 
7839 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7840 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7841 CO     <CO prev=NULL> <WO>
7842 NULL
7843 
7844 *******************************************************************/
7845 
7846 #define ST st->u.curlyx
7847 
7848 	case CURLYX:    /* start of /A*B/  (for complex A) */
7849 	{
7850 	    /* No need to save/restore up to this paren */
7851 	    I32 parenfloor = scan->flags;
7852 
7853 	    assert(next); /* keep Coverity happy */
7854 	    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7855 		next += ARG(next);
7856 
7857 	    /* XXXX Probably it is better to teach regpush to support
7858 	       parenfloor > maxopenparen ... */
7859 	    if (parenfloor > (I32)rex->lastparen)
7860 		parenfloor = rex->lastparen; /* Pessimization... */
7861 
7862 	    ST.prev_curlyx= cur_curlyx;
7863 	    cur_curlyx = st;
7864 	    ST.cp = PL_savestack_ix;
7865 
7866 	    /* these fields contain the state of the current curly.
7867 	     * they are accessed by subsequent WHILEMs */
7868 	    ST.parenfloor = parenfloor;
7869 	    ST.me = scan;
7870 	    ST.B = next;
7871 	    ST.minmod = minmod;
7872 	    minmod = 0;
7873 	    ST.count = -1;	/* this will be updated by WHILEM */
7874 	    ST.lastloc = NULL;  /* this will be updated by WHILEM */
7875 
7876 	    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
7877                                 script_run_begin);
7878 	    NOT_REACHED; /* NOTREACHED */
7879 	}
7880 
7881 	case CURLYX_end: /* just finished matching all of A*B */
7882 	    cur_curlyx = ST.prev_curlyx;
7883 	    sayYES;
7884 	    NOT_REACHED; /* NOTREACHED */
7885 
7886 	case CURLYX_end_fail: /* just failed to match all of A*B */
7887 	    regcpblow(ST.cp);
7888 	    cur_curlyx = ST.prev_curlyx;
7889 	    sayNO;
7890 	    NOT_REACHED; /* NOTREACHED */
7891 
7892 
7893 #undef ST
7894 #define ST st->u.whilem
7895 
7896 	case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7897 	{
7898 	    /* see the discussion above about CURLYX/WHILEM */
7899 	    I32 n;
7900 	    int min, max;
7901 	    regnode *A;
7902 
7903 	    assert(cur_curlyx); /* keep Coverity happy */
7904 
7905 	    min = ARG1(cur_curlyx->u.curlyx.me);
7906 	    max = ARG2(cur_curlyx->u.curlyx.me);
7907 	    A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7908 	    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7909 	    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7910 	    ST.cache_offset = 0;
7911 	    ST.cache_mask = 0;
7912 
7913 
7914             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7915                   depth, (long)n, min, max)
7916 	    );
7917 
7918 	    /* First just match a string of min A's. */
7919 
7920 	    if (n < min) {
7921                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7922 		cur_curlyx->u.curlyx.lastloc = locinput;
7923 		REGCP_SET(ST.lastcp);
7924 
7925 		PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
7926                                 script_run_begin);
7927 		NOT_REACHED; /* NOTREACHED */
7928 	    }
7929 
7930 	    /* If degenerate A matches "", assume A done. */
7931 
7932 	    if (locinput == cur_curlyx->u.curlyx.lastloc) {
7933                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7934                    depth)
7935 		);
7936 		goto do_whilem_B_max;
7937 	    }
7938 
7939 	    /* super-linear cache processing.
7940              *
7941              * The idea here is that for certain types of CURLYX/WHILEM -
7942              * principally those whose upper bound is infinity (and
7943              * excluding regexes that have things like \1 and other very
7944              * non-regular expresssiony things), then if a pattern like
7945              * /....A*.../ fails and we backtrack to the WHILEM, then we
7946              * make a note that this particular WHILEM op was at string
7947              * position 47 (say) when the rest of pattern failed. Then, if
7948              * we ever find ourselves back at that WHILEM, and at string
7949              * position 47 again, we can just fail immediately rather than
7950              * running the rest of the pattern again.
7951              *
7952              * This is very handy when patterns start to go
7953              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7954              * with a combinatorial explosion of backtracking.
7955              *
7956              * The cache is implemented as a bit array, with one bit per
7957              * string byte position per WHILEM op (up to 16) - so its
7958              * between 0.25 and 2x the string size.
7959              *
7960              * To avoid allocating a poscache buffer every time, we do an
7961              * initially countdown; only after we have  executed a WHILEM
7962              * op (string-length x #WHILEMs) times do we allocate the
7963              * cache.
7964              *
7965              * The top 4 bits of scan->flags byte say how many different
7966              * relevant CURLLYX/WHILEM op pairs there are, while the
7967              * bottom 4-bits is the identifying index number of this
7968              * WHILEM.
7969              */
7970 
7971 	    if (scan->flags) {
7972 
7973 		if (!reginfo->poscache_maxiter) {
7974 		    /* start the countdown: Postpone detection until we
7975 		     * know the match is not *that* much linear. */
7976 		    reginfo->poscache_maxiter
7977                         =    (reginfo->strend - reginfo->strbeg + 1)
7978                            * (scan->flags>>4);
7979 		    /* possible overflow for long strings and many CURLYX's */
7980 		    if (reginfo->poscache_maxiter < 0)
7981 			reginfo->poscache_maxiter = I32_MAX;
7982 		    reginfo->poscache_iter = reginfo->poscache_maxiter;
7983 		}
7984 
7985 		if (reginfo->poscache_iter-- == 0) {
7986 		    /* initialise cache */
7987 		    const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7988                     regmatch_info_aux *const aux = reginfo->info_aux;
7989 		    if (aux->poscache) {
7990 			if ((SSize_t)reginfo->poscache_size < size) {
7991 			    Renew(aux->poscache, size, char);
7992 			    reginfo->poscache_size = size;
7993 			}
7994 			Zero(aux->poscache, size, char);
7995 		    }
7996 		    else {
7997 			reginfo->poscache_size = size;
7998 			Newxz(aux->poscache, size, char);
7999 		    }
8000                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8001       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8002 			      PL_colors[4], PL_colors[5])
8003 		    );
8004 		}
8005 
8006 		if (reginfo->poscache_iter < 0) {
8007 		    /* have we already failed at this position? */
8008 		    SSize_t offset, mask;
8009 
8010                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8011 		    offset  = (scan->flags & 0xf) - 1
8012                                 +   (locinput - reginfo->strbeg)
8013                                   * (scan->flags>>4);
8014 		    mask    = 1 << (offset % 8);
8015 		    offset /= 8;
8016 		    if (reginfo->info_aux->poscache[offset] & mask) {
8017                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8018                             depth)
8019 			);
8020                         cur_curlyx->u.curlyx.count--;
8021 			sayNO; /* cache records failure */
8022 		    }
8023 		    ST.cache_offset = offset;
8024 		    ST.cache_mask   = mask;
8025 		}
8026 	    }
8027 
8028 	    /* Prefer B over A for minimal matching. */
8029 
8030 	    if (cur_curlyx->u.curlyx.minmod) {
8031 		ST.save_curlyx = cur_curlyx;
8032 		cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8033 		PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8034                                     locinput, loceol, script_run_begin);
8035 		NOT_REACHED; /* NOTREACHED */
8036 	    }
8037 
8038 	    /* Prefer A over B for maximal matching. */
8039 
8040 	    if (n < max) { /* More greed allowed? */
8041                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8042                             maxopenparen);
8043 		cur_curlyx->u.curlyx.lastloc = locinput;
8044 		REGCP_SET(ST.lastcp);
8045 		PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8046                                 script_run_begin);
8047 		NOT_REACHED; /* NOTREACHED */
8048 	    }
8049 	    goto do_whilem_B_max;
8050 	}
8051 	NOT_REACHED; /* NOTREACHED */
8052 
8053 	case WHILEM_B_min: /* just matched B in a minimal match */
8054 	case WHILEM_B_max: /* just matched B in a maximal match */
8055 	    cur_curlyx = ST.save_curlyx;
8056 	    sayYES;
8057 	    NOT_REACHED; /* NOTREACHED */
8058 
8059 	case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8060 	    cur_curlyx = ST.save_curlyx;
8061 	    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8062 	    cur_curlyx->u.curlyx.count--;
8063 	    CACHEsayNO;
8064 	    NOT_REACHED; /* NOTREACHED */
8065 
8066 	case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8067 	    /* FALLTHROUGH */
8068 	case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8069 	    REGCP_UNWIND(ST.lastcp);
8070             regcppop(rex, &maxopenparen);
8071 	    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8072 	    cur_curlyx->u.curlyx.count--;
8073 	    CACHEsayNO;
8074 	    NOT_REACHED; /* NOTREACHED */
8075 
8076 	case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8077 	    REGCP_UNWIND(ST.lastcp);
8078             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8079             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8080                 depth)
8081 	    );
8082 	  do_whilem_B_max:
8083 	    if (cur_curlyx->u.curlyx.count >= REG_INFTY
8084 		&& ckWARN(WARN_REGEXP)
8085 		&& !reginfo->warned)
8086 	    {
8087                 reginfo->warned	= TRUE;
8088 		Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8089 		     "Complex regular subexpression recursion limit (%d) "
8090 		     "exceeded",
8091 		     REG_INFTY - 1);
8092 	    }
8093 
8094 	    /* now try B */
8095 	    ST.save_curlyx = cur_curlyx;
8096 	    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8097 	    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8098                                 locinput, loceol, script_run_begin);
8099 	    NOT_REACHED; /* NOTREACHED */
8100 
8101 	case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8102 	    cur_curlyx = ST.save_curlyx;
8103 
8104 	    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8105 		/* Maximum greed exceeded */
8106 		if (cur_curlyx->u.curlyx.count >= REG_INFTY
8107 		    && ckWARN(WARN_REGEXP)
8108                     && !reginfo->warned)
8109 		{
8110                     reginfo->warned	= TRUE;
8111 		    Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8112 			"Complex regular subexpression recursion "
8113 			"limit (%d) exceeded",
8114 			REG_INFTY - 1);
8115 		}
8116 		cur_curlyx->u.curlyx.count--;
8117 		CACHEsayNO;
8118 	    }
8119 
8120             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8121 	    );
8122 	    /* Try grabbing another A and see if it helps. */
8123 	    cur_curlyx->u.curlyx.lastloc = locinput;
8124             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8125                             maxopenparen);
8126 	    REGCP_SET(ST.lastcp);
8127 	    PUSH_STATE_GOTO(WHILEM_A_min,
8128 		/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8129                 locinput, loceol, script_run_begin);
8130 	    NOT_REACHED; /* NOTREACHED */
8131 
8132 #undef  ST
8133 #define ST st->u.branch
8134 
8135 	case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
8136 	    next = scan + ARG(scan);
8137 	    if (next == scan)
8138 		next = NULL;
8139 	    scan = NEXTOPER(scan);
8140 	    /* FALLTHROUGH */
8141 
8142 	case BRANCH:	    /*  /(...|A|...)/ */
8143 	    scan = NEXTOPER(scan); /* scan now points to inner node */
8144 	    ST.lastparen = rex->lastparen;
8145 	    ST.lastcloseparen = rex->lastcloseparen;
8146 	    ST.next_branch = next;
8147 	    REGCP_SET(ST.cp);
8148 
8149 	    /* Now go into the branch */
8150 	    if (has_cutgroup) {
8151 	        PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8152                                     script_run_begin);
8153 	    } else {
8154 	        PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8155                                 script_run_begin);
8156 	    }
8157 	    NOT_REACHED; /* NOTREACHED */
8158 
8159         case CUTGROUP:  /*  /(*THEN)/  */
8160             sv_yes_mark = st->u.mark.mark_name = scan->flags
8161                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8162                 : NULL;
8163             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8164                             script_run_begin);
8165             NOT_REACHED; /* NOTREACHED */
8166 
8167         case CUTGROUP_next_fail:
8168             do_cutgroup = 1;
8169             no_final = 1;
8170             if (st->u.mark.mark_name)
8171                 sv_commit = st->u.mark.mark_name;
8172             sayNO;
8173             NOT_REACHED; /* NOTREACHED */
8174 
8175         case BRANCH_next:
8176             sayYES;
8177             NOT_REACHED; /* NOTREACHED */
8178 
8179 	case BRANCH_next_fail: /* that branch failed; try the next, if any */
8180 	    if (do_cutgroup) {
8181 	        do_cutgroup = 0;
8182 	        no_final = 0;
8183 	    }
8184 	    REGCP_UNWIND(ST.cp);
8185             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8186 	    scan = ST.next_branch;
8187 	    /* no more branches? */
8188 	    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8189 	        DEBUG_EXECUTE_r({
8190                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8191                         depth,
8192 			PL_colors[4],
8193 			PL_colors[5] );
8194 		});
8195 		sayNO_SILENT;
8196             }
8197 	    continue; /* execute next BRANCH[J] op */
8198             /* NOTREACHED */
8199 
8200 	case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8201 	    minmod = 1;
8202 	    break;
8203 
8204 #undef  ST
8205 #define ST st->u.curlym
8206 
8207 	case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
8208 
8209 	    /* This is an optimisation of CURLYX that enables us to push
8210 	     * only a single backtracking state, no matter how many matches
8211 	     * there are in {m,n}. It relies on the pattern being constant
8212 	     * length, with no parens to influence future backrefs
8213 	     */
8214 
8215 	    ST.me = scan;
8216 	    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8217 
8218 	    ST.lastparen      = rex->lastparen;
8219 	    ST.lastcloseparen = rex->lastcloseparen;
8220 
8221 	    /* if paren positive, emulate an OPEN/CLOSE around A */
8222 	    if (ST.me->flags) {
8223 		U32 paren = ST.me->flags;
8224 		if (paren > maxopenparen)
8225 		    maxopenparen = paren;
8226 		scan += NEXT_OFF(scan); /* Skip former OPEN. */
8227 	    }
8228 	    ST.A = scan;
8229 	    ST.B = next;
8230 	    ST.alen = 0;
8231 	    ST.count = 0;
8232 	    ST.minmod = minmod;
8233 	    minmod = 0;
8234 	    ST.c1 = CHRTEST_UNINIT;
8235 	    REGCP_SET(ST.cp);
8236 
8237 	    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8238 		goto curlym_do_B;
8239 
8240 	  curlym_do_A: /* execute the A in /A{m,n}B/  */
8241 	    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8242                                 script_run_begin);
8243 	    NOT_REACHED; /* NOTREACHED */
8244 
8245 	case CURLYM_A: /* we've just matched an A */
8246 	    ST.count++;
8247 	    /* after first match, determine A's length: u.curlym.alen */
8248 	    if (ST.count == 1) {
8249 		if (reginfo->is_utf8_target) {
8250 		    char *s = st->locinput;
8251 		    while (s < locinput) {
8252 			ST.alen++;
8253 			s += UTF8SKIP(s);
8254 		    }
8255 		}
8256 		else {
8257 		    ST.alen = locinput - st->locinput;
8258 		}
8259 		if (ST.alen == 0)
8260 		    ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8261 	    }
8262 	    DEBUG_EXECUTE_r(
8263                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8264                           depth, (IV) ST.count, (IV)ST.alen)
8265 	    );
8266 
8267             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8268 	        goto fake_end;
8269 
8270 	    {
8271 		I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8272 		if ( max == REG_INFTY || ST.count < max )
8273 		    goto curlym_do_A; /* try to match another A */
8274 	    }
8275 	    goto curlym_do_B; /* try to match B */
8276 
8277 	case CURLYM_A_fail: /* just failed to match an A */
8278 	    REGCP_UNWIND(ST.cp);
8279 
8280 
8281 	    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8282                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8283 		sayNO;
8284 
8285 	  curlym_do_B: /* execute the B in /A{m,n}B/  */
8286 	    if (ST.c1 == CHRTEST_UNINIT) {
8287 		/* calculate c1 and c2 for possible match of 1st char
8288 		 * following curly */
8289 		ST.c1 = ST.c2 = CHRTEST_VOID;
8290                 assert(ST.B);
8291 		if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8292 		    regnode *text_node = ST.B;
8293 		    if (! HAS_TEXT(text_node))
8294 			FIND_NEXT_IMPT(text_node);
8295 		    if (PL_regkind[OP(text_node)] == EXACT) {
8296                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8297                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8298                            reginfo))
8299                         {
8300                             sayNO;
8301                         }
8302 		    }
8303 		}
8304 	    }
8305 
8306 	    DEBUG_EXECUTE_r(
8307                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8308                     depth, (IV)ST.count)
8309 		);
8310 	    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8311                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8312 
8313                            /* (We can use memEQ and memNE in this file without
8314                             * having to worry about one being shorter than the
8315                             * other, since the first byte of each gives the
8316                             * length of the character) */
8317                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8318                                                               reginfo->strend))
8319                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8320                                                              reginfo->strend)))
8321                     {
8322                         /* simulate B failing */
8323                         DEBUG_OPTIMISE_r(
8324                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8325                                 depth,
8326                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8327                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8328                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8329                         );
8330                         state_num = CURLYM_B_fail;
8331                         goto reenter_switch;
8332                     }
8333                 }
8334                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8335                     /* simulate B failing */
8336                     DEBUG_OPTIMISE_r(
8337                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8338                             depth,
8339                             (int) nextchr, ST.c1, ST.c2)
8340                     );
8341                     state_num = CURLYM_B_fail;
8342                     goto reenter_switch;
8343                 }
8344             }
8345 
8346 	    if (ST.me->flags) {
8347 		/* emulate CLOSE: mark current A as captured */
8348 		U32 paren = (U32)ST.me->flags;
8349 		if (ST.count) {
8350                     CLOSE_CAPTURE(paren,
8351 			HOPc(locinput, -ST.alen) - reginfo->strbeg,
8352 		        locinput - reginfo->strbeg);
8353 		}
8354 		else
8355 		    rex->offs[paren].end = -1;
8356 
8357                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8358 		{
8359 		    if (ST.count)
8360 	                goto fake_end;
8361 	            else
8362 	                sayNO;
8363 	        }
8364 	    }
8365 
8366 	    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8367                             script_run_begin);
8368 	    NOT_REACHED; /* NOTREACHED */
8369 
8370 	case CURLYM_B_fail: /* just failed to match a B */
8371 	    REGCP_UNWIND(ST.cp);
8372             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8373 	    if (ST.minmod) {
8374 		I32 max = ARG2(ST.me);
8375 		if (max != REG_INFTY && ST.count == max)
8376 		    sayNO;
8377 		goto curlym_do_A; /* try to match a further A */
8378 	    }
8379 	    /* backtrack one A */
8380 	    if (ST.count == ARG1(ST.me) /* min */)
8381 		sayNO;
8382 	    ST.count--;
8383 	    SET_locinput(HOPc(locinput, -ST.alen));
8384 	    goto curlym_do_B; /* try to match B */
8385 
8386 #undef ST
8387 #define ST st->u.curly
8388 
8389 #define CURLY_SETPAREN(paren, success) \
8390     if (paren) { \
8391 	if (success) { \
8392             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8393 	                         locinput - reginfo->strbeg); \
8394 	} \
8395 	else { \
8396 	    rex->offs[paren].end = -1; \
8397 	    rex->lastparen      = ST.lastparen; \
8398 	    rex->lastcloseparen = ST.lastcloseparen; \
8399 	} \
8400     }
8401 
8402         case STAR:		/*  /A*B/ where A is width 1 char */
8403 	    ST.paren = 0;
8404 	    ST.min = 0;
8405 	    ST.max = REG_INFTY;
8406 	    scan = NEXTOPER(scan);
8407 	    goto repeat;
8408 
8409         case PLUS:		/*  /A+B/ where A is width 1 char */
8410 	    ST.paren = 0;
8411 	    ST.min = 1;
8412 	    ST.max = REG_INFTY;
8413 	    scan = NEXTOPER(scan);
8414 	    goto repeat;
8415 
8416 	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
8417             ST.paren = scan->flags;	/* Which paren to set */
8418             ST.lastparen      = rex->lastparen;
8419 	    ST.lastcloseparen = rex->lastcloseparen;
8420 	    if (ST.paren > maxopenparen)
8421 		maxopenparen = ST.paren;
8422 	    ST.min = ARG1(scan);  /* min to match */
8423 	    ST.max = ARG2(scan);  /* max to match */
8424             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8425 
8426             /* handle the single-char capture called as a GOSUB etc */
8427             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8428             {
8429                 char *li = locinput;
8430                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8431 		    sayNO;
8432                 SET_locinput(li);
8433                 goto fake_end;
8434 	    }
8435 
8436 	    goto repeat;
8437 
8438 	case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
8439 	    ST.paren = 0;
8440 	    ST.min = ARG1(scan);  /* min to match */
8441 	    ST.max = ARG2(scan);  /* max to match */
8442 	    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8443 	  repeat:
8444 	    /*
8445 	    * Lookahead to avoid useless match attempts
8446 	    * when we know what character comes next.
8447 	    *
8448 	    * Used to only do .*x and .*?x, but now it allows
8449 	    * for )'s, ('s and (?{ ... })'s to be in the way
8450 	    * of the quantifier and the EXACT-like node.  -- japhy
8451 	    */
8452 
8453 	    assert(ST.min <= ST.max);
8454             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8455                 ST.c1 = ST.c2 = CHRTEST_VOID;
8456             }
8457             else {
8458 		regnode *text_node = next;
8459 
8460 		if (! HAS_TEXT(text_node))
8461 		    FIND_NEXT_IMPT(text_node);
8462 
8463 		if (! HAS_TEXT(text_node))
8464 		    ST.c1 = ST.c2 = CHRTEST_VOID;
8465 		else {
8466 		    if ( PL_regkind[OP(text_node)] != EXACT ) {
8467 			ST.c1 = ST.c2 = CHRTEST_VOID;
8468 		    }
8469 		    else {
8470                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8471                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8472                            reginfo))
8473                         {
8474                             sayNO;
8475                         }
8476                     }
8477 		}
8478 	    }
8479 
8480 	    ST.A = scan;
8481 	    ST.B = next;
8482 	    if (minmod) {
8483                 char *li = locinput;
8484 		minmod = 0;
8485 		if (ST.min &&
8486                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8487                             < ST.min)
8488 		    sayNO;
8489                 SET_locinput(li);
8490 		ST.count = ST.min;
8491 		REGCP_SET(ST.cp);
8492 		if (ST.c1 == CHRTEST_VOID)
8493 		    goto curly_try_B_min;
8494 
8495 		ST.oldloc = locinput;
8496 
8497 		/* set ST.maxpos to the furthest point along the
8498 		 * string that could possibly match */
8499 		if  (ST.max == REG_INFTY) {
8500 		    ST.maxpos = loceol - 1;
8501 		    if (utf8_target)
8502 			while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8503 			    ST.maxpos--;
8504 		}
8505 		else if (utf8_target) {
8506 		    int m = ST.max - ST.min;
8507 		    for (ST.maxpos = locinput;
8508 			 m >0 && ST.maxpos <  loceol; m--)
8509 			ST.maxpos += UTF8SKIP(ST.maxpos);
8510 		}
8511 		else {
8512 		    ST.maxpos = locinput + ST.max - ST.min;
8513 		    if (ST.maxpos >=  loceol)
8514 			ST.maxpos =  loceol - 1;
8515 		}
8516 		goto curly_try_B_min_known;
8517 
8518 	    }
8519 	    else {
8520                 /* avoid taking address of locinput, so it can remain
8521                  * a register var */
8522                 char *li = locinput;
8523                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8524 		if (ST.count < ST.min)
8525 		    sayNO;
8526                 SET_locinput(li);
8527 		if ((ST.count > ST.min)
8528 		    && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8529 		{
8530 		    /* A{m,n} must come at the end of the string, there's
8531 		     * no point in backing off ... */
8532 		    ST.min = ST.count;
8533 		    /* ...except that $ and \Z can match before *and* after
8534 		       newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8535 		       We may back off by one in this case. */
8536 		    if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8537 			ST.min--;
8538 		}
8539 		REGCP_SET(ST.cp);
8540 		goto curly_try_B_max;
8541 	    }
8542 	    NOT_REACHED; /* NOTREACHED */
8543 
8544 	case CURLY_B_min_fail:
8545 	    /* failed to find B in a non-greedy match.
8546              * Handles both cases where c1,c2 valid or not */
8547 
8548 	    REGCP_UNWIND(ST.cp);
8549             if (ST.paren) {
8550                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8551             }
8552 
8553             if (ST.c1 == CHRTEST_VOID) {
8554                 /* failed -- move forward one */
8555                 char *li = locinput;
8556                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8557                     sayNO;
8558                 }
8559                 locinput = li;
8560                 ST.count++;
8561 		if (!(   ST.count <= ST.max
8562                         /* count overflow ? */
8563                      || (ST.max == REG_INFTY && ST.count > 0))
8564                 )
8565                     sayNO;
8566             }
8567             else {
8568 		int n;
8569                 /* Couldn't or didn't -- move forward. */
8570                 ST.oldloc = locinput;
8571                 if (utf8_target)
8572                     locinput += UTF8SKIP(locinput);
8573                 else
8574                     locinput++;
8575                 ST.count++;
8576 
8577               curly_try_B_min_known:
8578                 /* find the next place where 'B' could work, then call B */
8579 		if (utf8_target) {
8580 		    n = (ST.oldloc == locinput) ? 0 : 1;
8581 		    if (ST.c1 == ST.c2) {
8582 			/* set n to utf8_distance(oldloc, locinput) */
8583 			while (    locinput <= ST.maxpos
8584                                &&  locinput < loceol
8585                                &&  memNE(locinput, ST.c1_utf8,
8586                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8587                         {
8588 			    locinput += UTF8_SAFE_SKIP(locinput,
8589                                                        reginfo->strend);
8590 			    n++;
8591 			}
8592 		    }
8593 		    else {
8594 			/* set n to utf8_distance(oldloc, locinput) */
8595 			while (   locinput <= ST.maxpos
8596                                && locinput < loceol
8597                                && memNE(locinput, ST.c1_utf8,
8598                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8599                                && memNE(locinput, ST.c2_utf8,
8600                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8601                         {
8602 			    locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8603 			    n++;
8604 			}
8605 		    }
8606 		}
8607 		else {  /* Not utf8_target */
8608 		    if (ST.c1 == ST.c2) {
8609                         locinput = (char *) memchr(locinput,
8610                                                    ST.c1,
8611                                                    ST.maxpos + 1 - locinput);
8612                         if (! locinput) {
8613                             locinput = ST.maxpos + 1;
8614                         }
8615 		    }
8616                     else {
8617                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8618 
8619                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8620                             while (   locinput <= ST.maxpos
8621                                    && UCHARAT(locinput) != ST.c1
8622                                    && UCHARAT(locinput) != ST.c2)
8623                             {
8624                                 locinput++;
8625                             }
8626                         }
8627                         else {
8628                             /* If c1 and c2 only differ by a single bit, we can
8629                              * avoid a conditional each time through the loop,
8630                              * at the expense of a little preliminary setup and
8631                              * an extra mask each iteration.  By masking out
8632                              * that bit, we match exactly two characters, c1
8633                              * and c2, and so we don't have to test for both.
8634                              * On both ASCII and EBCDIC platforms, most of the
8635                              * ASCII-range and Latin1-range folded equivalents
8636                              * differ only in a single bit, so this is actually
8637                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8638                              * 0x61). */
8639                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8640                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8641                             while (   locinput <= ST.maxpos
8642                                    && (UCHARAT(locinput) & c1_c2_mask)
8643                                                                 != c1_masked)
8644                             {
8645                                 locinput++;
8646                             }
8647                         }
8648                     }
8649 		    n = locinput - ST.oldloc;
8650 		}
8651 		if (locinput > ST.maxpos)
8652 		    sayNO;
8653 		if (n) {
8654                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8655                      * at b; check that everything between oldloc and
8656                      * locinput matches */
8657                     char *li = ST.oldloc;
8658 		    ST.count += n;
8659                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8660 			sayNO;
8661                     assert(n == REG_INFTY || locinput == li);
8662 		}
8663 	    }
8664 
8665           curly_try_B_min:
8666             CURLY_SETPAREN(ST.paren, ST.count);
8667             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8668                             script_run_begin);
8669 	    NOT_REACHED; /* NOTREACHED */
8670 
8671 
8672           curly_try_B_max:
8673 	    /* a successful greedy match: now try to match B */
8674 	    {
8675 		bool could_match = locinput <  loceol;
8676 
8677 		/* If it could work, try it. */
8678                 if (ST.c1 != CHRTEST_VOID && could_match) {
8679                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8680                     {
8681                         could_match =  memEQ(locinput, ST.c1_utf8,
8682                                              UTF8_SAFE_SKIP(locinput,
8683                                                             reginfo->strend))
8684                                     || memEQ(locinput, ST.c2_utf8,
8685                                              UTF8_SAFE_SKIP(locinput,
8686                                                             reginfo->strend));
8687                     }
8688                     else {
8689                         could_match =   UCHARAT(locinput) == ST.c1
8690                                      || UCHARAT(locinput) == ST.c2;
8691                     }
8692                 }
8693                 if (ST.c1 == CHRTEST_VOID || could_match) {
8694 		    CURLY_SETPAREN(ST.paren, ST.count);
8695 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8696                                     script_run_begin);
8697 		    NOT_REACHED; /* NOTREACHED */
8698 		}
8699 	    }
8700 	    /* FALLTHROUGH */
8701 
8702 	case CURLY_B_max_fail:
8703 	    /* failed to find B in a greedy match */
8704 
8705 	    REGCP_UNWIND(ST.cp);
8706             if (ST.paren) {
8707                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8708             }
8709 	    /*  back up. */
8710 	    if (--ST.count < ST.min)
8711 		sayNO;
8712 	    locinput = HOPc(locinput, -1);
8713 	    goto curly_try_B_max;
8714 
8715 #undef ST
8716 
8717 	case END: /*  last op of main pattern  */
8718           fake_end:
8719 	    if (cur_eval) {
8720 		/* we've just finished A in /(??{A})B/; now continue with B */
8721                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8722 		st->u.eval.prev_rex = rex_sv;		/* inner */
8723 
8724                 /* Save *all* the positions. */
8725                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8726                 rex_sv = CUR_EVAL.prev_rex;
8727 		is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8728 		SET_reg_curpm(rex_sv);
8729 		rex = ReANY(rex_sv);
8730 		rexi = RXi_GET(rex);
8731 
8732                 st->u.eval.prev_curlyx = cur_curlyx;
8733                 cur_curlyx = CUR_EVAL.prev_curlyx;
8734 
8735 		REGCP_SET(st->u.eval.lastcp);
8736 
8737 		/* Restore parens of the outer rex without popping the
8738 		 * savestack */
8739                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8740 
8741 		st->u.eval.prev_eval = cur_eval;
8742                 cur_eval = CUR_EVAL.prev_eval;
8743 		DEBUG_EXECUTE_r(
8744                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8745                                       depth, cur_eval););
8746                 if ( nochange_depth )
8747 	            nochange_depth--;
8748 
8749                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8750 
8751                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8752                                     st->u.eval.prev_eval->u.eval.B,
8753                                     locinput, loceol, script_run_begin);
8754 	    }
8755 
8756 	    if (locinput < reginfo->till) {
8757                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8758                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8759 				      PL_colors[4],
8760 				      (long)(locinput - startpos),
8761 				      (long)(reginfo->till - startpos),
8762 				      PL_colors[5]));
8763 
8764 		sayNO_SILENT;		/* Cannot match: too short. */
8765 	    }
8766 	    sayYES;			/* Success! */
8767 
8768 	case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8769 	    DEBUG_EXECUTE_r(
8770             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8771                 depth, PL_colors[4], PL_colors[5]));
8772 	    sayYES;			/* Success! */
8773 
8774 #undef  ST
8775 #define ST st->u.ifmatch
8776 
8777 	case SUSPEND:	/* (?>A) */
8778 	    ST.wanted = 1;
8779 	    ST.start = locinput;
8780 	    ST.end = loceol;
8781             ST.count = 1;
8782 	    goto do_ifmatch;
8783 
8784 	case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8785 	    ST.wanted = 0;
8786 	    goto ifmatch_trivial_fail_test;
8787 
8788 	case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8789 	    ST.wanted = 1;
8790 	  ifmatch_trivial_fail_test:
8791             ST.count = scan->next_off + 1; /* next_off repurposed to be
8792                                               lookbehind count, requires
8793                                               non-zero flags */
8794 	    if (! scan->flags) {    /* 'flags' zero means lookahed */
8795 
8796                 /* Lookahead starts here and ends at the normal place */
8797 		ST.start = locinput;
8798 		ST.end = loceol;
8799             }
8800 	    else {
8801                 PERL_UINT_FAST8_T back_count = scan->flags;
8802 		char * s;
8803 
8804                 /* Lookbehind can look beyond the current position */
8805 		ST.end = loceol;
8806 
8807                 /* ... and starts at the first place in the input that is in
8808                  * the range of the possible start positions */
8809                 for (; ST.count > 0; ST.count--, back_count--) {
8810                     s = HOPBACKc(locinput, back_count);
8811                     if (s) {
8812                         ST.start = s;
8813                         goto do_ifmatch;
8814                     }
8815                 }
8816 
8817                 /* If the lookbehind doesn't start in the actual string, is a
8818                  * trivial match failure */
8819                 if (logical) {
8820                     logical = 0;
8821                     sw = 1 - cBOOL(ST.wanted);
8822                 }
8823                 else if (ST.wanted)
8824                     sayNO;
8825 
8826                 /* Here, we didn't want it to match, so is actually success */
8827                 next = scan + ARG(scan);
8828                 if (next == scan)
8829                     next = NULL;
8830                 break;
8831 	    }
8832 
8833 	  do_ifmatch:
8834 	    ST.me = scan;
8835 	    ST.logical = logical;
8836 	    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8837 
8838 	    /* execute body of (?...A) */
8839 	    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
8840                                 ST.end, script_run_begin);
8841 	    NOT_REACHED; /* NOTREACHED */
8842 
8843         {
8844             bool matched;
8845 
8846 	case IFMATCH_A_fail: /* body of (?...A) failed */
8847 	    if (! ST.logical && ST.count > 1) {
8848 
8849                 /* It isn't a real failure until we've tried all starting
8850                  * positions.  Move to the next starting position and retry */
8851                 ST.count--;
8852                 ST.start = HOPc(ST.start, 1);
8853                 scan = ST.me;
8854                 logical = ST.logical;
8855                 goto do_ifmatch;
8856             }
8857 
8858             /* Here, all starting positions have been tried. */
8859 	    matched = FALSE;
8860 	    goto ifmatch_done;
8861 
8862 	case IFMATCH_A: /* body of (?...A) succeeded */
8863 	    matched = TRUE;
8864           ifmatch_done:
8865             sw = matched == ST.wanted;
8866 	    if (! ST.logical && !sw) {
8867                 sayNO;
8868             }
8869 
8870 	    if (OP(ST.me) != SUSPEND) {
8871                 /* restore old position except for (?>...) */
8872 		locinput = st->locinput;
8873                 loceol = st->loceol;
8874                 script_run_begin = st->sr0;
8875 	    }
8876 	    scan = ST.me + ARG(ST.me);
8877 	    if (scan == ST.me)
8878 		scan = NULL;
8879 	    continue; /* execute B */
8880         }
8881 
8882 #undef ST
8883 
8884 	case LONGJMP: /*  alternative with many branches compiles to
8885                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8886 	    next = scan + ARG(scan);
8887 	    if (next == scan)
8888 		next = NULL;
8889 	    break;
8890 
8891 	case COMMIT:  /*  (*COMMIT)  */
8892 	    reginfo->cutpoint = loceol;
8893 	    /* FALLTHROUGH */
8894 
8895 	case PRUNE:   /*  (*PRUNE)   */
8896             if (scan->flags)
8897 	        sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8898 	    PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
8899                             script_run_begin);
8900 	    NOT_REACHED; /* NOTREACHED */
8901 
8902 	case COMMIT_next_fail:
8903 	    no_final = 1;
8904 	    /* FALLTHROUGH */
8905             sayNO;
8906             NOT_REACHED; /* NOTREACHED */
8907 
8908 	case OPFAIL:   /* (*FAIL)  */
8909             if (scan->flags)
8910                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8911             if (logical) {
8912                 /* deal with (?(?!)X|Y) properly,
8913                  * make sure we trigger the no branch
8914                  * of the trailing IFTHEN structure*/
8915                 sw= 0;
8916                 break;
8917             } else {
8918                 sayNO;
8919             }
8920 	    NOT_REACHED; /* NOTREACHED */
8921 
8922 #define ST st->u.mark
8923         case MARKPOINT: /*  (*MARK:foo)  */
8924             ST.prev_mark = mark_state;
8925             ST.mark_name = sv_commit = sv_yes_mark
8926                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8927             mark_state = st;
8928             ST.mark_loc = locinput;
8929             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
8930                                 script_run_begin);
8931             NOT_REACHED; /* NOTREACHED */
8932 
8933         case MARKPOINT_next:
8934             mark_state = ST.prev_mark;
8935             sayYES;
8936             NOT_REACHED; /* NOTREACHED */
8937 
8938         case MARKPOINT_next_fail:
8939             if (popmark && sv_eq(ST.mark_name,popmark))
8940             {
8941                 if (ST.mark_loc > startpoint)
8942 	            reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8943                 popmark = NULL; /* we found our mark */
8944                 sv_commit = ST.mark_name;
8945 
8946                 DEBUG_EXECUTE_r({
8947                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8948                             depth,
8949 		            PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8950 		});
8951             }
8952             mark_state = ST.prev_mark;
8953             sv_yes_mark = mark_state ?
8954                 mark_state->u.mark.mark_name : NULL;
8955             sayNO;
8956             NOT_REACHED; /* NOTREACHED */
8957 
8958         case SKIP:  /*  (*SKIP)  */
8959             if (!scan->flags) {
8960                 /* (*SKIP) : if we fail we cut here*/
8961                 ST.mark_name = NULL;
8962                 ST.mark_loc = locinput;
8963                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
8964                                 script_run_begin);
8965             } else {
8966                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8967                    otherwise do nothing.  Meaning we need to scan
8968                  */
8969                 regmatch_state *cur = mark_state;
8970                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8971 
8972                 while (cur) {
8973                     if ( sv_eq( cur->u.mark.mark_name,
8974                                 find ) )
8975                     {
8976                         ST.mark_name = find;
8977                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
8978                                          script_run_begin);
8979                     }
8980                     cur = cur->u.mark.prev_mark;
8981                 }
8982             }
8983             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8984             break;
8985 
8986 	case SKIP_next_fail:
8987 	    if (ST.mark_name) {
8988 	        /* (*CUT:NAME) - Set up to search for the name as we
8989 	           collapse the stack*/
8990 	        popmark = ST.mark_name;
8991 	    } else {
8992 	        /* (*CUT) - No name, we cut here.*/
8993 	        if (ST.mark_loc > startpoint)
8994 	            reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8995 	        /* but we set sv_commit to latest mark_name if there
8996 	           is one so they can test to see how things lead to this
8997 	           cut */
8998                 if (mark_state)
8999                     sv_commit=mark_state->u.mark.mark_name;
9000             }
9001             no_final = 1;
9002             sayNO;
9003             NOT_REACHED; /* NOTREACHED */
9004 #undef ST
9005 
9006         case LNBREAK: /* \R */
9007             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9008                 locinput += n;
9009             } else
9010                 sayNO;
9011             break;
9012 
9013 	default:
9014 	    PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9015 			  PTR2UV(scan), OP(scan));
9016 	    Perl_croak(aTHX_ "regexp memory corruption");
9017 
9018         /* this is a point to jump to in order to increment
9019          * locinput by one character */
9020           increment_locinput:
9021             assert(!NEXTCHR_IS_EOS);
9022             if (utf8_target) {
9023                 locinput += PL_utf8skip[nextchr];
9024                 /* locinput is allowed to go 1 char off the end (signifying
9025                  * EOS), but not 2+ */
9026                 if (locinput >  loceol)
9027                     sayNO;
9028             }
9029             else
9030                 locinput++;
9031             break;
9032 
9033 	} /* end switch */
9034 
9035         /* switch break jumps here */
9036 	scan = next; /* prepare to execute the next op and ... */
9037 	continue;    /* ... jump back to the top, reusing st */
9038         /* NOTREACHED */
9039 
9040       push_yes_state:
9041 	/* push a state that backtracks on success */
9042 	st->u.yes.prev_yes_state = yes_state;
9043 	yes_state = st;
9044 	/* FALLTHROUGH */
9045       push_state:
9046 	/* push a new regex state, then continue at scan  */
9047 	{
9048 	    regmatch_state *newst;
9049 
9050 	    DEBUG_STACK_r({
9051 	        regmatch_state *cur = st;
9052 	        regmatch_state *curyes = yes_state;
9053 	        U32 i;
9054 	        regmatch_slab *slab = PL_regmatch_slab;
9055                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9056                     if (cur < SLAB_FIRST(slab)) {
9057                 	slab = slab->prev;
9058                 	cur = SLAB_LAST(slab);
9059                     }
9060                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9061                         depth,
9062                         i ? "    " : "push",
9063                         depth - i, PL_reg_name[cur->resume_state],
9064                         (curyes == cur) ? "yes" : ""
9065                     );
9066                     if (curyes == cur)
9067 	                curyes = cur->u.yes.prev_yes_state;
9068                 }
9069             } else
9070                 DEBUG_STATE_pp("push")
9071             );
9072 	    depth++;
9073 	    st->locinput = locinput;
9074 	    st->loceol = loceol;
9075             st->sr0 = script_run_begin;
9076 	    newst = st+1;
9077 	    if (newst >  SLAB_LAST(PL_regmatch_slab))
9078 		newst = S_push_slab(aTHX);
9079 	    PL_regmatch_state = newst;
9080 
9081 	    locinput = pushinput;
9082             loceol = pusheol;
9083             script_run_begin = pushsr0;
9084 	    st = newst;
9085 	    continue;
9086             /* NOTREACHED */
9087 	}
9088     }
9089 #ifdef SOLARIS_BAD_OPTIMIZER
9090 #  undef PL_charclass
9091 #endif
9092 
9093     /*
9094     * We get here only if there's trouble -- normally "case END" is
9095     * the terminating point.
9096     */
9097     Perl_croak(aTHX_ "corrupted regexp pointers");
9098     NOT_REACHED; /* NOTREACHED */
9099 
9100   yes:
9101     if (yes_state) {
9102 	/* we have successfully completed a subexpression, but we must now
9103 	 * pop to the state marked by yes_state and continue from there */
9104 	assert(st != yes_state);
9105 #ifdef DEBUGGING
9106 	while (st != yes_state) {
9107 	    st--;
9108 	    if (st < SLAB_FIRST(PL_regmatch_slab)) {
9109 		PL_regmatch_slab = PL_regmatch_slab->prev;
9110 		st = SLAB_LAST(PL_regmatch_slab);
9111 	    }
9112 	    DEBUG_STATE_r({
9113 	        if (no_final) {
9114 	            DEBUG_STATE_pp("pop (no final)");
9115 	        } else {
9116 	            DEBUG_STATE_pp("pop (yes)");
9117 	        }
9118 	    });
9119 	    depth--;
9120 	}
9121 #else
9122 	while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9123 	    || yes_state > SLAB_LAST(PL_regmatch_slab))
9124 	{
9125 	    /* not in this slab, pop slab */
9126 	    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9127 	    PL_regmatch_slab = PL_regmatch_slab->prev;
9128 	    st = SLAB_LAST(PL_regmatch_slab);
9129 	}
9130 	depth -= (st - yes_state);
9131 #endif
9132 	st = yes_state;
9133 	yes_state = st->u.yes.prev_yes_state;
9134 	PL_regmatch_state = st;
9135 
9136         if (no_final) {
9137             locinput= st->locinput;
9138             loceol= st->loceol;
9139             script_run_begin = st->sr0;
9140         }
9141 	state_num = st->resume_state + no_final;
9142 	goto reenter_switch;
9143     }
9144 
9145     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9146 			  PL_colors[4], PL_colors[5]));
9147 
9148     if (reginfo->info_aux_eval) {
9149 	/* each successfully executed (?{...}) block does the equivalent of
9150 	 *   local $^R = do {...}
9151 	 * When popping the save stack, all these locals would be undone;
9152 	 * bypass this by setting the outermost saved $^R to the latest
9153 	 * value */
9154         /* I dont know if this is needed or works properly now.
9155          * see code related to PL_replgv elsewhere in this file.
9156          * Yves
9157          */
9158 	if (oreplsv != GvSV(PL_replgv)) {
9159 	    sv_setsv(oreplsv, GvSV(PL_replgv));
9160             SvSETMAGIC(oreplsv);
9161         }
9162     }
9163     result = 1;
9164     goto final_exit;
9165 
9166   no:
9167     DEBUG_EXECUTE_r(
9168         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9169             depth,
9170             PL_colors[4], PL_colors[5])
9171 	);
9172 
9173   no_silent:
9174     if (no_final) {
9175         if (yes_state) {
9176             goto yes;
9177         } else {
9178             goto final_exit;
9179         }
9180     }
9181     if (depth) {
9182 	/* there's a previous state to backtrack to */
9183 	st--;
9184 	if (st < SLAB_FIRST(PL_regmatch_slab)) {
9185 	    PL_regmatch_slab = PL_regmatch_slab->prev;
9186 	    st = SLAB_LAST(PL_regmatch_slab);
9187 	}
9188 	PL_regmatch_state = st;
9189 	locinput= st->locinput;
9190 	loceol= st->loceol;
9191         script_run_begin = st->sr0;
9192 
9193 	DEBUG_STATE_pp("pop");
9194 	depth--;
9195 	if (yes_state == st)
9196 	    yes_state = st->u.yes.prev_yes_state;
9197 
9198 	state_num = st->resume_state + 1; /* failure = success + 1 */
9199         PERL_ASYNC_CHECK();
9200 	goto reenter_switch;
9201     }
9202     result = 0;
9203 
9204   final_exit:
9205     if (rex->intflags & PREGf_VERBARG_SEEN) {
9206         SV *sv_err = get_sv("REGERROR", 1);
9207         SV *sv_mrk = get_sv("REGMARK", 1);
9208         if (result) {
9209             sv_commit = &PL_sv_no;
9210             if (!sv_yes_mark)
9211                 sv_yes_mark = &PL_sv_yes;
9212         } else {
9213             if (!sv_commit)
9214                 sv_commit = &PL_sv_yes;
9215             sv_yes_mark = &PL_sv_no;
9216         }
9217         assert(sv_err);
9218         assert(sv_mrk);
9219         sv_setsv(sv_err, sv_commit);
9220         sv_setsv(sv_mrk, sv_yes_mark);
9221     }
9222 
9223 
9224     if (last_pushed_cv) {
9225 	dSP;
9226         /* see "Some notes about MULTICALL" above */
9227 	POP_MULTICALL;
9228         PERL_UNUSED_VAR(SP);
9229     }
9230     else
9231         LEAVE_SCOPE(orig_savestack_ix);
9232 
9233     assert(!result ||  locinput - reginfo->strbeg >= 0);
9234     return result ?  locinput - reginfo->strbeg : -1;
9235 }
9236 
9237 /*
9238  - regrepeat - repeatedly match something simple, report how many
9239  *
9240  * What 'simple' means is a node which can be the operand of a quantifier like
9241  * '+', or {1,3}
9242  *
9243  * startposp - pointer to a pointer to the start position.  This is updated
9244  *             to point to the byte following the highest successful
9245  *             match.
9246  * p         - the regnode to be repeatedly matched against.
9247  * loceol    - pointer to the end position beyond which we aren't supposed to
9248  *             look.
9249  * reginfo   - struct holding match state, such as utf8_target
9250  * max       - maximum number of things to match.
9251  * depth     - (for debugging) backtracking depth.
9252  */
9253 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max _pDEPTH)9254 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9255             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9256 {
9257     dVAR;
9258     char *scan;     /* Pointer to current position in target string */
9259     I32 c;
9260     char *this_eol = loceol;   /* potentially adjusted version. */
9261     I32 hardcount = 0;  /* How many matches so far */
9262     bool utf8_target = reginfo->is_utf8_target;
9263     unsigned int to_complement = 0;  /* Invert the result? */
9264     UV utf8_flags = 0;
9265     _char_class_number classnum;
9266 
9267     PERL_ARGS_ASSERT_REGREPEAT;
9268 
9269     /* This routine is structured so that we switch on the input OP.  Each OP
9270      * case: statement contains a loop to repeatedly apply the OP, advancing
9271      * the input until it fails, or reaches the end of the input, or until it
9272      * reaches the upper limit of matches. */
9273 
9274     scan = *startposp;
9275     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9276                                max */
9277 	max = I32_MAX;
9278     else if (! utf8_target && this_eol - scan > max)
9279 	this_eol = scan + max;
9280 
9281     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9282      * to the maximum of how far we should go in it (leaving it set to the real
9283      * end, if the maximum permissible would take us beyond that).  This allows
9284      * us to make the loop exit condition that we haven't gone past <this_eol> to
9285      * also mean that we haven't exceeded the max permissible count, saving a
9286      * test each time through the loops.  But it assumes that the OP matches a
9287      * single byte, which is true for most of the OPs below when applied to a
9288      * non-UTF-8 target.  Those relatively few OPs that don't have this
9289      * characteristic will have to compensate.
9290      *
9291      * There is no adjustment for UTF-8 targets, as the number of bytes per
9292      * character varies.  OPs will have to test both that the count is less
9293      * than the max permissible (using <hardcount> to keep track), and that we
9294      * are still within the bounds of the string (using <this_eol>.  A few OPs
9295      * match a single byte no matter what the encoding.  They can omit the max
9296      * test if, for the UTF-8 case, they do the adjustment that was skipped
9297      * above.
9298      *
9299      * Thus, the code above sets things up for the common case; and exceptional
9300      * cases need extra work; the common case is to make sure <scan> doesn't
9301      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9302      * count doesn't exceed the maximum permissible */
9303 
9304     switch (OP(p)) {
9305     case REG_ANY:
9306 	if (utf8_target) {
9307 	    while (scan < this_eol && hardcount < max && *scan != '\n') {
9308 		scan += UTF8SKIP(scan);
9309 		hardcount++;
9310 	    }
9311 	} else {
9312             scan = (char *) memchr(scan, '\n', this_eol - scan);
9313             if (! scan) {
9314                 scan = this_eol;
9315             }
9316 	}
9317 	break;
9318     case SANY:
9319         if (utf8_target) {
9320 	    while (scan < this_eol && hardcount < max) {
9321 	        scan += UTF8SKIP(scan);
9322 		hardcount++;
9323 	    }
9324 	}
9325 	else
9326 	    scan = this_eol;
9327 	break;
9328     case EXACTL:
9329         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9330         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9331             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9332         }
9333         goto do_exact;
9334 
9335     case EXACT_ONLY8:
9336         if (! utf8_target) {
9337             break;
9338         }
9339         /* FALLTHROUGH */
9340     case EXACT:
9341       do_exact:
9342         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9343 
9344 	c = (U8)*STRING(p);
9345 
9346         /* Can use a simple find if the pattern char to match on is invariant
9347          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9348          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9349          * true iff it doesn't matter if the argument is in UTF-8 or not */
9350         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9351             if (utf8_target && this_eol - scan > max) {
9352                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9353                  * since here, to match at all, 1 char == 1 byte */
9354                 this_eol = scan + max;
9355             }
9356             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9357 	}
9358 	else if (reginfo->is_utf8_pat) {
9359             if (utf8_target) {
9360                 STRLEN scan_char_len;
9361 
9362                 /* When both target and pattern are UTF-8, we have to do
9363                  * string EQ */
9364                 while (hardcount < max
9365                        && scan < this_eol
9366                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9367                        && memEQ(scan, STRING(p), scan_char_len))
9368                 {
9369                     scan += scan_char_len;
9370                     hardcount++;
9371                 }
9372             }
9373             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9374 
9375                 /* Target isn't utf8; convert the character in the UTF-8
9376                  * pattern to non-UTF8, and do a simple find */
9377                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9378                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9379             } /* else pattern char is above Latin1, can't possibly match the
9380                  non-UTF-8 target */
9381         }
9382         else {
9383 
9384             /* Here, the string must be utf8; pattern isn't, and <c> is
9385              * different in utf8 than not, so can't compare them directly.
9386              * Outside the loop, find the two utf8 bytes that represent c, and
9387              * then look for those in sequence in the utf8 string */
9388 	    U8 high = UTF8_TWO_BYTE_HI(c);
9389 	    U8 low = UTF8_TWO_BYTE_LO(c);
9390 
9391 	    while (hardcount < max
9392 		    && scan + 1 < this_eol
9393 		    && UCHARAT(scan) == high
9394 		    && UCHARAT(scan + 1) == low)
9395 	    {
9396 		scan += 2;
9397 		hardcount++;
9398 	    }
9399 	}
9400 	break;
9401 
9402     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9403         assert(! reginfo->is_utf8_pat);
9404         /* FALLTHROUGH */
9405     case EXACTFAA:
9406         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9407         if (reginfo->is_utf8_pat || ! utf8_target) {
9408 
9409             /* The possible presence of a MICRO SIGN in the pattern forbids us
9410              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9411              * target.  */
9412             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9413         }
9414         goto do_exactf;
9415 
9416     case EXACTFL:
9417         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9418 	utf8_flags = FOLDEQ_LOCALE;
9419 	goto do_exactf;
9420 
9421     case EXACTF:   /* This node only generated for non-utf8 patterns */
9422         assert(! reginfo->is_utf8_pat);
9423         goto do_exactf;
9424 
9425     case EXACTFLU8:
9426         if (! utf8_target) {
9427             break;
9428         }
9429         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9430                                     | FOLDEQ_S2_FOLDS_SANE;
9431         goto do_exactf;
9432 
9433     case EXACTFU_ONLY8:
9434         if (! utf8_target) {
9435             break;
9436         }
9437 	assert(reginfo->is_utf8_pat);
9438 	utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9439         goto do_exactf;
9440 
9441     case EXACTFU:
9442         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9443         /* FALLTHROUGH */
9444 
9445     case EXACTFUP:
9446 
9447       do_exactf: {
9448         int c1, c2;
9449         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9450 
9451         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9452 
9453         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9454                                         reginfo))
9455         {
9456             if (c1 == CHRTEST_VOID) {
9457                 /* Use full Unicode fold matching */
9458                 char *tmpeol = loceol;
9459                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9460                 while (hardcount < max
9461                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9462                                              STRING(p), NULL, pat_len,
9463                                              reginfo->is_utf8_pat, utf8_flags))
9464                 {
9465                     scan = tmpeol;
9466                     tmpeol = loceol;
9467                     hardcount++;
9468                 }
9469             }
9470             else if (utf8_target) {
9471                 if (c1 == c2) {
9472                     while (scan < this_eol
9473                            && hardcount < max
9474                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9475                                                                   loceol)))
9476                     {
9477                         scan += UTF8SKIP(c1_utf8);
9478                         hardcount++;
9479                     }
9480                 }
9481                 else {
9482                     while (scan < this_eol
9483                            && hardcount < max
9484                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9485                                                                      loceol))
9486                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9487                                                                      loceol))))
9488                     {
9489                         scan += UTF8_SAFE_SKIP(scan, loceol);
9490                         hardcount++;
9491                     }
9492                 }
9493             }
9494             else if (c1 == c2) {
9495                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9496             }
9497             else {
9498                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9499                  * a conditional each time through the loop if the characters
9500                  * differ only in a single bit, as is the usual situation */
9501                 U8 c1_c2_bits_differing = c1 ^ c2;
9502 
9503                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9504                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9505 
9506                     scan = (char *) find_span_end_mask((U8 *) scan,
9507                                                        (U8 *) this_eol,
9508                                                        c1 & c1_c2_mask,
9509                                                        c1_c2_mask);
9510                 }
9511                 else {
9512                     while (    scan < this_eol
9513                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9514                     {
9515                         scan++;
9516                     }
9517                 }
9518             }
9519 	}
9520 	break;
9521     }
9522     case ANYOFPOSIXL:
9523     case ANYOFL:
9524         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9525 
9526         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9527             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9528         }
9529         /* FALLTHROUGH */
9530     case ANYOFD:
9531     case ANYOF:
9532 	if (utf8_target) {
9533 	    while (hardcount < max
9534                    && scan < this_eol
9535 		   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9536 	    {
9537 		scan += UTF8SKIP(scan);
9538 		hardcount++;
9539 	    }
9540 	}
9541         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9542 	    while (scan < this_eol
9543                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9544 		scan++;
9545         }
9546         else {
9547 	    while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9548 		scan++;
9549 	}
9550 	break;
9551 
9552     case ANYOFM:
9553         if (utf8_target && this_eol - scan > max) {
9554 
9555             /* We didn't adjust <this_eol> at the beginning of this routine
9556              * because is UTF-8, but it is actually ok to do so, since here, to
9557              * match, 1 char == 1 byte. */
9558             this_eol = scan + max;
9559         }
9560 
9561         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9562         break;
9563 
9564     case NANYOFM:
9565 	if (utf8_target) {
9566 	    while (     hardcount < max
9567                    &&   scan < this_eol
9568 		   &&  (*scan & FLAGS(p)) != ARG(p))
9569 	    {
9570 		scan += UTF8SKIP(scan);
9571 		hardcount++;
9572 	    }
9573 	}
9574         else {
9575             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9576 	}
9577         break;
9578 
9579     case ANYOFH:
9580         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9581             if (ANYOF_FLAGS(p)) {   /* If we know the first byte of what
9582                                        matches, we can avoid calling reginclass
9583                                      */
9584                 while (   hardcount < max
9585                        && scan < this_eol
9586                        && (U8) *scan == ANYOF_FLAGS(p)
9587                        && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9588                                                                   TRUE))
9589                 {
9590                     scan += UTF8SKIP(scan);
9591                     hardcount++;
9592                 }
9593             }
9594             else while (  hardcount < max
9595                         && scan < this_eol
9596                         && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9597             {
9598                 scan += UTF8SKIP(scan);
9599                 hardcount++;
9600             }
9601         }
9602         break;
9603 
9604     /* The argument (FLAGS) to all the POSIX node types is the class number */
9605 
9606     case NPOSIXL:
9607         to_complement = 1;
9608         /* FALLTHROUGH */
9609 
9610     case POSIXL:
9611         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9612 	if (! utf8_target) {
9613 	    while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9614                                                                    *scan)))
9615             {
9616 		scan++;
9617             }
9618 	} else {
9619 	    while (hardcount < max && scan < this_eol
9620                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9621                                                                   (U8 *) scan,
9622                                                                   (U8 *) this_eol)))
9623             {
9624                 scan += UTF8SKIP(scan);
9625 		hardcount++;
9626 	    }
9627 	}
9628 	break;
9629 
9630     case POSIXD:
9631         if (utf8_target) {
9632             goto utf8_posix;
9633         }
9634         /* FALLTHROUGH */
9635 
9636     case POSIXA:
9637         if (utf8_target && this_eol - scan > max) {
9638 
9639             /* We didn't adjust <this_eol> at the beginning of this routine
9640              * because is UTF-8, but it is actually ok to do so, since here, to
9641              * match, 1 char == 1 byte. */
9642             this_eol = scan + max;
9643         }
9644         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9645 	    scan++;
9646 	}
9647 	break;
9648 
9649     case NPOSIXD:
9650         if (utf8_target) {
9651             to_complement = 1;
9652             goto utf8_posix;
9653         }
9654         /* FALLTHROUGH */
9655 
9656     case NPOSIXA:
9657         if (! utf8_target) {
9658             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9659                 scan++;
9660             }
9661         }
9662         else {
9663 
9664             /* The complement of something that matches only ASCII matches all
9665              * non-ASCII, plus everything in ASCII that isn't in the class. */
9666 	    while (hardcount < max && scan < this_eol
9667                    && (   ! isASCII_utf8_safe(scan, loceol)
9668                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9669             {
9670                 scan += UTF8SKIP(scan);
9671 		hardcount++;
9672 	    }
9673         }
9674         break;
9675 
9676     case NPOSIXU:
9677         to_complement = 1;
9678         /* FALLTHROUGH */
9679 
9680     case POSIXU:
9681 	if (! utf8_target) {
9682             while (scan < this_eol && to_complement
9683                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9684             {
9685                 scan++;
9686             }
9687 	}
9688 	else {
9689           utf8_posix:
9690             classnum = (_char_class_number) FLAGS(p);
9691             switch (classnum) {
9692                 default:
9693                     while (   hardcount < max && scan < this_eol
9694                            && to_complement ^ cBOOL(_invlist_contains_cp(
9695                                               PL_XPosix_ptrs[classnum],
9696                                               utf8_to_uvchr_buf((U8 *) scan,
9697                                                                 (U8 *) this_eol,
9698                                                                 NULL))))
9699                     {
9700                         scan += UTF8SKIP(scan);
9701                         hardcount++;
9702                     }
9703                     break;
9704 
9705                     /* For the classes below, the knowledge of how to handle
9706                      * every code point is compiled in to Perl via a macro.
9707                      * This code is written for making the loops as tight as
9708                      * possible.  It could be refactored to save space instead.
9709                      * */
9710 
9711                 case _CC_ENUM_SPACE:
9712                     while (hardcount < max
9713                            && scan < this_eol
9714                            && (to_complement
9715                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9716                     {
9717                         scan += UTF8SKIP(scan);
9718                         hardcount++;
9719                     }
9720                     break;
9721                 case _CC_ENUM_BLANK:
9722                     while (hardcount < max
9723                            && scan < this_eol
9724                            && (to_complement
9725                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9726                     {
9727                         scan += UTF8SKIP(scan);
9728                         hardcount++;
9729                     }
9730                     break;
9731                 case _CC_ENUM_XDIGIT:
9732                     while (hardcount < max
9733                            && scan < this_eol
9734                            && (to_complement
9735                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
9736                     {
9737                         scan += UTF8SKIP(scan);
9738                         hardcount++;
9739                     }
9740                     break;
9741                 case _CC_ENUM_VERTSPACE:
9742                     while (hardcount < max
9743                            && scan < this_eol
9744                            && (to_complement
9745                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
9746                     {
9747                         scan += UTF8SKIP(scan);
9748                         hardcount++;
9749                     }
9750                     break;
9751                 case _CC_ENUM_CNTRL:
9752                     while (hardcount < max
9753                            && scan < this_eol
9754                            && (to_complement
9755                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
9756                     {
9757                         scan += UTF8SKIP(scan);
9758                         hardcount++;
9759                     }
9760                     break;
9761             }
9762 	}
9763         break;
9764 
9765     case LNBREAK:
9766         if (utf8_target) {
9767 	    while (hardcount < max && scan < this_eol &&
9768                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
9769 		scan += c;
9770 		hardcount++;
9771 	    }
9772 	} else {
9773             /* LNBREAK can match one or two latin chars, which is ok, but we
9774              * have to use hardcount in this situation, and throw away the
9775              * adjustment to <this_eol> done before the switch statement */
9776 	    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9777 		scan+=c;
9778 		hardcount++;
9779 	    }
9780 	}
9781 	break;
9782 
9783     case BOUNDL:
9784     case NBOUNDL:
9785         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9786         /* FALLTHROUGH */
9787     case BOUND:
9788     case BOUNDA:
9789     case BOUNDU:
9790     case EOS:
9791     case GPOS:
9792     case KEEPS:
9793     case NBOUND:
9794     case NBOUNDA:
9795     case NBOUNDU:
9796     case OPFAIL:
9797     case SBOL:
9798     case SEOL:
9799         /* These are all 0 width, so match right here or not at all. */
9800         break;
9801 
9802     default:
9803         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9804         NOT_REACHED; /* NOTREACHED */
9805 
9806     }
9807 
9808     if (hardcount)
9809 	c = hardcount;
9810     else
9811 	c = scan - *startposp;
9812     *startposp = scan;
9813 
9814     DEBUG_r({
9815 	GET_RE_DEBUG_FLAGS_DECL;
9816 	DEBUG_EXECUTE_r({
9817 	    SV * const prop = sv_newmortal();
9818             regprop(prog, prop, p, reginfo, NULL);
9819             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9820                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9821 	});
9822     });
9823 
9824     return(c);
9825 }
9826 
9827 /*
9828  - reginclass - determine if a character falls into a character class
9829 
9830   n is the ANYOF-type regnode
9831   p is the target string
9832   p_end points to one byte beyond the end of the target string
9833   utf8_target tells whether p is in UTF-8.
9834 
9835   Returns true if matched; false otherwise.
9836 
9837   Note that this can be a synthetic start class, a combination of various
9838   nodes, so things you think might be mutually exclusive, such as locale,
9839   aren't.  It can match both locale and non-locale
9840 
9841  */
9842 
9843 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)9844 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9845 {
9846     dVAR;
9847     const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
9848     bool match = FALSE;
9849     UV c = *p;
9850 
9851     PERL_ARGS_ASSERT_REGINCLASS;
9852 
9853     /* If c is not already the code point, get it.  Note that
9854      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9855     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9856         STRLEN c_len = 0;
9857         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9858 	c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9859 	if (c_len == (STRLEN)-1) {
9860             _force_out_malformed_utf8_message(p, p_end,
9861                                               utf8n_flags,
9862                                               1 /* 1 means die */ );
9863             NOT_REACHED; /* NOTREACHED */
9864         }
9865         if (     c > 255
9866             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9867             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9868         {
9869             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9870         }
9871     }
9872 
9873     /* If this character is potentially in the bitmap, check it */
9874     if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
9875 	if (ANYOF_BITMAP_TEST(n, c))
9876 	    match = TRUE;
9877 	else if ((flags
9878                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9879                   && OP(n) == ANYOFD
9880 		  && ! utf8_target
9881 		  && ! isASCII(c))
9882 	{
9883 	    match = TRUE;
9884 	}
9885 	else if (flags & ANYOF_LOCALE_FLAGS) {
9886 	    if (  (flags & ANYOFL_FOLD)
9887                 && c < sizeof(PL_fold_locale)
9888 		&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9889             {
9890                 match = TRUE;
9891             }
9892             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
9893                      && c <= U8_MAX  /* param to isFOO_lc() */
9894             ) {
9895 
9896                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9897                  * if the class includes the Posix character class given by
9898                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9899                  * complemented Posix class given by int(bit/2).  So we loop
9900                  * through the bits, each time changing whether we complement
9901                  * the result or not.  Suppose for the sake of illustration
9902                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9903                  * is set, it means there is a match for this ANYOF node if the
9904                  * character is in the class given by the expression (0 / 2 = 0
9905                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9906                  * and since 'to_complement' is 0, the result will stay TRUE,
9907                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9908                  * bit 1 is 1.  That means there is a match if the character
9909                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9910                  * but will on bit 1.  On the second iteration 'to_complement'
9911                  * will be 1, so the exclusive or will reverse things, so we
9912                  * are testing for \W.  On the third iteration, 'to_complement'
9913                  * will be 0, and we would be testing for \s; the fourth
9914                  * iteration would test for \S, etc.
9915                  *
9916                  * Note that this code assumes that all the classes are closed
9917                  * under folding.  For example, if a character matches \w, then
9918                  * its fold does too; and vice versa.  This should be true for
9919                  * any well-behaved locale for all the currently defined Posix
9920                  * classes, except for :lower: and :upper:, which are handled
9921                  * by the pseudo-class :cased: which matches if either of the
9922                  * other two does.  To get rid of this assumption, an outer
9923                  * loop could be used below to iterate over both the source
9924                  * character, and its fold (if different) */
9925 
9926                 int count = 0;
9927                 int to_complement = 0;
9928 
9929                 while (count < ANYOF_MAX) {
9930                     if (ANYOF_POSIXL_TEST(n, count)
9931                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9932                     {
9933                         match = TRUE;
9934                         break;
9935                     }
9936                     count++;
9937                     to_complement ^= 1;
9938                 }
9939 	    }
9940 	}
9941     }
9942 
9943 
9944     /* If the bitmap didn't (or couldn't) match, and something outside the
9945      * bitmap could match, try that. */
9946     if (!match) {
9947 	if (c >= NUM_ANYOF_CODE_POINTS
9948             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9949         {
9950 	    match = TRUE;	/* Everything above the bitmap matches */
9951 	}
9952             /* Here doesn't match everything above the bitmap.  If there is
9953              * some information available beyond the bitmap, we may find a
9954              * match in it.  If so, this is most likely because the code point
9955              * is outside the bitmap range.  But rarely, it could be because of
9956              * some other reason.  If so, various flags are set to indicate
9957              * this possibility.  On ANYOFD nodes, there may be matches that
9958              * happen only when the target string is UTF-8; or for other node
9959              * types, because runtime lookup is needed, regardless of the
9960              * UTF-8ness of the target string.  Finally, under /il, there may
9961              * be some matches only possible if the locale is a UTF-8 one. */
9962 	else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9963                  && (   c >= NUM_ANYOF_CODE_POINTS
9964                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9965                          && (   UNLIKELY(OP(n) != ANYOFD)
9966                              || (utf8_target && ! isASCII_uni(c)
9967 #                               if NUM_ANYOF_CODE_POINTS > 256
9968                                                                  && c < 256
9969 #                               endif
9970                                 )))
9971                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9972                          && IN_UTF8_CTYPE_LOCALE)))
9973         {
9974             SV* only_utf8_locale = NULL;
9975 	    SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
9976                                                    0, &only_utf8_locale, NULL);
9977 	    if (definition) {
9978                 U8 utf8_buffer[2];
9979 		U8 * utf8_p;
9980 		if (utf8_target) {
9981 		    utf8_p = (U8 *) p;
9982 		} else { /* Convert to utf8 */
9983 		    utf8_p = utf8_buffer;
9984                     append_utf8_from_native_byte(*p, &utf8_p);
9985 		    utf8_p = utf8_buffer;
9986 		}
9987 
9988                 /* Turkish locales have these hard-coded rules overriding
9989                  * normal ones */
9990                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
9991                     && isALPHA_FOLD_EQ(*p, 'i'))
9992                 {
9993                     if (*p == 'i') {
9994                         if (_invlist_contains_cp(definition,
9995                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
9996                         {
9997                             match = TRUE;
9998                         }
9999                     }
10000                     else if (*p == 'I') {
10001                         if (_invlist_contains_cp(definition,
10002                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10003                         {
10004                             match = TRUE;
10005                         }
10006                     }
10007                 }
10008                 else if (_invlist_contains_cp(definition, c)) {
10009 		    match = TRUE;
10010                 }
10011 	    }
10012             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10013                 match = _invlist_contains_cp(only_utf8_locale, c);
10014             }
10015 	}
10016 
10017         /* In a Turkic locale under folding, hard-code the I i case pair
10018          * matches */
10019         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10020             && ! match
10021             &&   (flags & ANYOFL_FOLD)
10022             &&   utf8_target)
10023         {
10024             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10025 		if (ANYOF_BITMAP_TEST(n, 'i')) {
10026                     match = TRUE;
10027                 }
10028             }
10029             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10030 		if (ANYOF_BITMAP_TEST(n, 'I')) {
10031                     match = TRUE;
10032                 }
10033             }
10034         }
10035 
10036         if (UNICODE_IS_SUPER(c)
10037             && (flags
10038                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10039             && OP(n) != ANYOFD
10040             && ckWARN_d(WARN_NON_UNICODE))
10041         {
10042             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10043                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10044         }
10045     }
10046 
10047 #if ANYOF_INVERT != 1
10048     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10049      * use it, don't */
10050 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10051 #endif
10052 
10053     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10054     return (flags & ANYOF_INVERT) ^ match;
10055 }
10056 
10057 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)10058 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10059 {
10060     /* return the position 'off' UTF-8 characters away from 's', forward if
10061      * 'off' >= 0, backwards if negative.  But don't go outside of position
10062      * 'lim', which better be < s  if off < 0 */
10063 
10064     PERL_ARGS_ASSERT_REGHOP3;
10065 
10066     if (off >= 0) {
10067 	while (off-- && s < lim) {
10068 	    /* XXX could check well-formedness here */
10069 	    U8 *new_s = s + UTF8SKIP(s);
10070             if (new_s > lim) /* lim may be in the middle of a long character */
10071                 return s;
10072             s = new_s;
10073 	}
10074     }
10075     else {
10076         while (off++ && s > lim) {
10077             s--;
10078             if (UTF8_IS_CONTINUED(*s)) {
10079                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10080                     s--;
10081                 if (! UTF8_IS_START(*s)) {
10082                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10083                 }
10084 	    }
10085             /* XXX could check well-formedness here */
10086 	}
10087     }
10088     return s;
10089 }
10090 
10091 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)10092 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10093 {
10094     PERL_ARGS_ASSERT_REGHOP4;
10095 
10096     if (off >= 0) {
10097         while (off-- && s < rlim) {
10098             /* XXX could check well-formedness here */
10099             s += UTF8SKIP(s);
10100         }
10101     }
10102     else {
10103         while (off++ && s > llim) {
10104             s--;
10105             if (UTF8_IS_CONTINUED(*s)) {
10106                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10107                     s--;
10108                 if (! UTF8_IS_START(*s)) {
10109                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10110                 }
10111             }
10112             /* XXX could check well-formedness here */
10113         }
10114     }
10115     return s;
10116 }
10117 
10118 /* like reghop3, but returns NULL on overrun, rather than returning last
10119  * char pos */
10120 
10121 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)10122 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10123 {
10124     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10125 
10126     if (off >= 0) {
10127 	while (off-- && s < lim) {
10128 	    /* XXX could check well-formedness here */
10129 	    s += UTF8SKIP(s);
10130 	}
10131 	if (off >= 0)
10132 	    return NULL;
10133     }
10134     else {
10135         while (off++ && s > lim) {
10136             s--;
10137             if (UTF8_IS_CONTINUED(*s)) {
10138                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10139                     s--;
10140                 if (! UTF8_IS_START(*s)) {
10141                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10142                 }
10143 	    }
10144             /* XXX could check well-formedness here */
10145 	}
10146 	if (off <= 0)
10147 	    return NULL;
10148     }
10149     return s;
10150 }
10151 
10152 
10153 /* when executing a regex that may have (?{}), extra stuff needs setting
10154    up that will be visible to the called code, even before the current
10155    match has finished. In particular:
10156 
10157    * $_ is localised to the SV currently being matched;
10158    * pos($_) is created if necessary, ready to be updated on each call-out
10159      to code;
10160    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10161      isn't set until the current pattern is successfully finished), so that
10162      $1 etc of the match-so-far can be seen;
10163    * save the old values of subbeg etc of the current regex, and  set then
10164      to the current string (again, this is normally only done at the end
10165      of execution)
10166 */
10167 
10168 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)10169 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10170 {
10171     MAGIC *mg;
10172     regexp *const rex = ReANY(reginfo->prog);
10173     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10174 
10175     eval_state->rex = rex;
10176     eval_state->sv  = reginfo->sv;
10177 
10178     if (reginfo->sv) {
10179         /* Make $_ available to executed code. */
10180         if (reginfo->sv != DEFSV) {
10181             SAVE_DEFSV;
10182             DEFSV_set(reginfo->sv);
10183         }
10184         /* will be dec'd by S_cleanup_regmatch_info_aux */
10185         SvREFCNT_inc_NN(reginfo->sv);
10186 
10187         if (!(mg = mg_find_mglob(reginfo->sv))) {
10188             /* prepare for quick setting of pos */
10189             mg = sv_magicext_mglob(reginfo->sv);
10190             mg->mg_len = -1;
10191         }
10192         eval_state->pos_magic = mg;
10193         eval_state->pos       = mg->mg_len;
10194         eval_state->pos_flags = mg->mg_flags;
10195     }
10196     else
10197         eval_state->pos_magic = NULL;
10198 
10199     if (!PL_reg_curpm) {
10200         /* PL_reg_curpm is a fake PMOP that we can attach the current
10201          * regex to and point PL_curpm at, so that $1 et al are visible
10202          * within a /(?{})/. It's just allocated once per interpreter the
10203          * first time its needed */
10204         Newxz(PL_reg_curpm, 1, PMOP);
10205 #ifdef USE_ITHREADS
10206         {
10207             SV* const repointer = &PL_sv_undef;
10208             /* this regexp is also owned by the new PL_reg_curpm, which
10209                will try to free it.  */
10210             av_push(PL_regex_padav, repointer);
10211             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10212             PL_regex_pad = AvARRAY(PL_regex_padav);
10213         }
10214 #endif
10215     }
10216     SET_reg_curpm(reginfo->prog);
10217     eval_state->curpm = PL_curpm;
10218     PL_curpm_under = PL_curpm;
10219     PL_curpm = PL_reg_curpm;
10220     if (RXp_MATCH_COPIED(rex)) {
10221         /*  Here is a serious problem: we cannot rewrite subbeg,
10222             since it may be needed if this match fails.  Thus
10223             $` inside (?{}) could fail... */
10224         eval_state->subbeg     = rex->subbeg;
10225         eval_state->sublen     = rex->sublen;
10226         eval_state->suboffset  = rex->suboffset;
10227         eval_state->subcoffset = rex->subcoffset;
10228 #ifdef PERL_ANY_COW
10229         eval_state->saved_copy = rex->saved_copy;
10230 #endif
10231         RXp_MATCH_COPIED_off(rex);
10232     }
10233     else
10234         eval_state->subbeg = NULL;
10235     rex->subbeg = (char *)reginfo->strbeg;
10236     rex->suboffset = 0;
10237     rex->subcoffset = 0;
10238     rex->sublen = reginfo->strend - reginfo->strbeg;
10239 }
10240 
10241 
10242 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10243 
10244 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)10245 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10246 {
10247     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10248     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10249     regmatch_slab *s;
10250 
10251     Safefree(aux->poscache);
10252 
10253     if (eval_state) {
10254 
10255         /* undo the effects of S_setup_eval_state() */
10256 
10257         if (eval_state->subbeg) {
10258             regexp * const rex = eval_state->rex;
10259             rex->subbeg     = eval_state->subbeg;
10260             rex->sublen     = eval_state->sublen;
10261             rex->suboffset  = eval_state->suboffset;
10262             rex->subcoffset = eval_state->subcoffset;
10263 #ifdef PERL_ANY_COW
10264             rex->saved_copy = eval_state->saved_copy;
10265 #endif
10266             RXp_MATCH_COPIED_on(rex);
10267         }
10268         if (eval_state->pos_magic)
10269         {
10270             eval_state->pos_magic->mg_len = eval_state->pos;
10271             eval_state->pos_magic->mg_flags =
10272                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10273                | (eval_state->pos_flags & MGf_BYTES);
10274         }
10275 
10276         PL_curpm = eval_state->curpm;
10277         SvREFCNT_dec(eval_state->sv);
10278     }
10279 
10280     PL_regmatch_state = aux->old_regmatch_state;
10281     PL_regmatch_slab  = aux->old_regmatch_slab;
10282 
10283     /* free all slabs above current one - this must be the last action
10284      * of this function, as aux and eval_state are allocated within
10285      * slabs and may be freed here */
10286 
10287     s = PL_regmatch_slab->next;
10288     if (s) {
10289         PL_regmatch_slab->next = NULL;
10290         while (s) {
10291             regmatch_slab * const osl = s;
10292             s = s->next;
10293             Safefree(osl);
10294         }
10295     }
10296 }
10297 
10298 
10299 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)10300 S_to_utf8_substr(pTHX_ regexp *prog)
10301 {
10302     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10303      * on the converted value */
10304 
10305     int i = 1;
10306 
10307     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10308 
10309     do {
10310 	if (prog->substrs->data[i].substr
10311 	    && !prog->substrs->data[i].utf8_substr) {
10312 	    SV* const sv = newSVsv(prog->substrs->data[i].substr);
10313 	    prog->substrs->data[i].utf8_substr = sv;
10314 	    sv_utf8_upgrade(sv);
10315 	    if (SvVALID(prog->substrs->data[i].substr)) {
10316 		if (SvTAIL(prog->substrs->data[i].substr)) {
10317 		    /* Trim the trailing \n that fbm_compile added last
10318 		       time.  */
10319 		    SvCUR_set(sv, SvCUR(sv) - 1);
10320 		    /* Whilst this makes the SV technically "invalid" (as its
10321 		       buffer is no longer followed by "\0") when fbm_compile()
10322 		       adds the "\n" back, a "\0" is restored.  */
10323 		    fbm_compile(sv, FBMcf_TAIL);
10324 		} else
10325 		    fbm_compile(sv, 0);
10326 	    }
10327 	    if (prog->substrs->data[i].substr == prog->check_substr)
10328 		prog->check_utf8 = sv;
10329 	}
10330     } while (i--);
10331 }
10332 
10333 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)10334 S_to_byte_substr(pTHX_ regexp *prog)
10335 {
10336     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10337      * on the converted value; returns FALSE if can't be converted. */
10338 
10339     int i = 1;
10340 
10341     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10342 
10343     do {
10344 	if (prog->substrs->data[i].utf8_substr
10345 	    && !prog->substrs->data[i].substr) {
10346 	    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10347 	    if (! sv_utf8_downgrade(sv, TRUE)) {
10348                 SvREFCNT_dec_NN(sv);
10349                 return FALSE;
10350             }
10351             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10352                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10353                     /* Trim the trailing \n that fbm_compile added last
10354                         time.  */
10355                     SvCUR_set(sv, SvCUR(sv) - 1);
10356                     fbm_compile(sv, FBMcf_TAIL);
10357                 } else
10358                     fbm_compile(sv, 0);
10359             }
10360 	    prog->substrs->data[i].substr = sv;
10361 	    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10362 		prog->check_substr = sv;
10363 	}
10364     } while (i--);
10365 
10366     return TRUE;
10367 }
10368 
10369 #ifndef PERL_IN_XSUB_RE
10370 
10371 bool
Perl__is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)10372 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10373 {
10374     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10375      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10376      * the larger string bounded by 'strbeg' and 'strend'.
10377      *
10378      * 'cp' needs to be assigned (if not a future version of the Unicode
10379      * Standard could make it something that combines with adjacent characters,
10380      * so code using it would then break), and there has to be a GCB break
10381      * before and after the character. */
10382 
10383     dVAR;
10384 
10385     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10386     const U8 * prev_cp_start;
10387 
10388     PERL_ARGS_ASSERT__IS_GRAPHEME;
10389 
10390     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10391         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10392     {
10393         /* These are considered graphemes */
10394         return TRUE;
10395     }
10396 
10397     /* Otherwise, unassigned code points are forbidden */
10398     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10399                                     _invlist_search(PL_Assigned_invlist, cp))))
10400     {
10401         return FALSE;
10402     }
10403 
10404     cp_gcb_val = getGCB_VAL_CP(cp);
10405 
10406     /* Find the GCB value of the previous code point in the input */
10407     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10408     if (UNLIKELY(prev_cp_start == s)) {
10409         prev_cp_gcb_val = GCB_EDGE;
10410     }
10411     else {
10412         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10413     }
10414 
10415     /* And check that is a grapheme boundary */
10416     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10417                 TRUE /* is UTF-8 encoded */ ))
10418     {
10419         return FALSE;
10420     }
10421 
10422     /* Similarly verify there is a break between the current character and the
10423      * following one */
10424     s += UTF8SKIP(s);
10425     if (s >= strend) {
10426         next_cp_gcb_val = GCB_EDGE;
10427     }
10428     else {
10429         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10430     }
10431 
10432     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10433 }
10434 
10435 /*
10436 =head1 Unicode Support
10437 
10438 =for apidoc isSCRIPT_RUN
10439 
10440 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10441 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10442 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10443 two degenerate cases given below, this function returns TRUE iff all code
10444 points in it come from any combination of three "scripts" given by the Unicode
10445 "Script Extensions" property: Common, Inherited, and possibly one other.
10446 Additionally all decimal digits must come from the same consecutive sequence of
10447 10.
10448 
10449 For example, if all the characters in the sequence are Greek, or Common, or
10450 Inherited, this function will return TRUE, provided any decimal digits in it
10451 are from the same block of digits in Common.  (These are the ASCII digits
10452 "0".."9" and additionally a block for full width forms of these, and several
10453 others used in mathematical notation.)   For scripts (unlike Greek) that have
10454 their own digits defined this will accept either digits from that set or from
10455 one of the Common digit sets, but not a combination of the two.  Some scripts,
10456 such as Arabic, have more than one set of digits.  All digits must come from
10457 the same set for this function to return TRUE.
10458 
10459 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10460 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10461 C<SCX_INVALID> if the function returns FALSE.
10462 
10463 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10464 will be C<SCX_INVALID>.
10465 
10466 If the sequence contains a single code point which is unassigned to a character
10467 in the version of Unicode being used, the function will return TRUE, and the
10468 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10469 in the input sequence will result in the function treating the input as not
10470 being a script run.
10471 
10472 The returned script will be C<SCX_Inherited> iff all the code points in it are
10473 from the Inherited script.
10474 
10475 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10476 it are from the Inherited or Common scripts.
10477 
10478 =cut
10479 
10480 */
10481 
10482 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)10483 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10484 {
10485     /* Basically, it looks at each character in the sequence to see if the
10486      * above conditions are met; if not it fails.  It uses an inversion map to
10487      * find the enum corresponding to the script of each character.  But this
10488      * is complicated by the fact that a few code points can be in any of
10489      * several scripts.  The data has been constructed so that there are
10490      * additional enum values (all negative) for these situations.  The
10491      * absolute value of those is an index into another table which contains
10492      * pointers to auxiliary tables for each such situation.  Each aux array
10493      * lists all the scripts for the given situation.  There is another,
10494      * parallel, table that gives the number of entries in each aux table.
10495      * These are all defined in charclass_invlists.h */
10496 
10497     /* XXX Here are the additional things UTS 39 says could be done:
10498      *
10499      * Forbid sequences of the same nonspacing mark
10500      *
10501      * Check to see that all the characters are in the sets of exemplar
10502      * characters for at least one language in the Unicode Common Locale Data
10503      * Repository [CLDR]. */
10504 
10505     dVAR;
10506 
10507     /* Things that match /\d/u */
10508     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10509     UV * decimals_array = invlist_array(decimals_invlist);
10510 
10511     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10512      * not currently known) */
10513     UV zero_of_run = 0;
10514 
10515     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10516     SCX_enum script_of_char = SCX_INVALID;
10517 
10518     /* If the script remains not fully determined from iteration to iteration,
10519      * this is the current intersection of the possiblities.  */
10520     SCX_enum * intersection = NULL;
10521     PERL_UINT_FAST8_T intersection_len = 0;
10522 
10523     bool retval = TRUE;
10524     SCX_enum * ret_script = NULL;
10525 
10526     assert(send >= s);
10527 
10528     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10529 
10530     /* All code points in 0..255 are either Common or Latin, so must be a
10531      * script run.  We can return immediately unless we need to know which
10532      * script it is. */
10533     if (! utf8_target && LIKELY(send > s)) {
10534         if (ret_script == NULL) {
10535             return TRUE;
10536         }
10537 
10538         /* If any character is Latin, the run is Latin */
10539         while (s < send) {
10540             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10541                 *ret_script = SCX_Latin;
10542                 return TRUE;
10543             }
10544         }
10545 
10546         /* Here, all are Common */
10547         *ret_script = SCX_Common;
10548         return TRUE;
10549     }
10550 
10551     /* Look at each character in the sequence */
10552     while (s < send) {
10553         /* If the current character being examined is a digit, this is the code
10554          * point of the zero for its sequence of 10 */
10555         UV zero_of_char;
10556 
10557         UV cp;
10558 
10559         /* The code allows all scripts to use the ASCII digits.  This is
10560          * because they are in the Common script.  Hence any ASCII ones found
10561          * are ok, unless and until a digit from another set has already been
10562          * encountered.  digit ranges in Common are not similarly blessed) */
10563         if (UNLIKELY(isDIGIT(*s))) {
10564             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10565                 retval = FALSE;
10566                 break;
10567             }
10568             if (zero_of_run) {
10569                 if (zero_of_run != '0') {
10570                     retval = FALSE;
10571                     break;
10572                 }
10573             }
10574             else {
10575                 zero_of_run = '0';
10576             }
10577             s++;
10578             continue;
10579         }
10580 
10581         /* Here, isn't an ASCII digit.  Find the code point of the character */
10582         if (! UTF8_IS_INVARIANT(*s)) {
10583             Size_t len;
10584             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10585             s += len;
10586         }
10587         else {
10588             cp = *(s++);
10589         }
10590 
10591         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10592          * digit in that script.  We can skip the rest of this code for this
10593          * character. */
10594         if (UNLIKELY(   zero_of_run
10595                      && cp >= zero_of_run
10596                      && cp - zero_of_run <= 9))
10597         {
10598             continue;
10599         }
10600 
10601         /* Find the character's script.  The correct values are hard-coded here
10602          * for small-enough code points. */
10603         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10604                                unlikely to change */
10605             if (       cp > 255
10606                 || (   isALPHA_L1(cp)
10607                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10608             {
10609                 script_of_char = SCX_Latin;
10610             }
10611             else {
10612                 script_of_char = SCX_Common;
10613             }
10614         }
10615         else {
10616             script_of_char = _Perl_SCX_invmap[
10617                                        _invlist_search(PL_SCX_invlist, cp)];
10618         }
10619 
10620         /* We arbitrarily accept a single unassigned character, but not in
10621          * combination with anything else, and not a run of them. */
10622         if (   UNLIKELY(script_of_run == SCX_Unknown)
10623             || UNLIKELY(   script_of_run != SCX_INVALID
10624                         && script_of_char == SCX_Unknown))
10625         {
10626             retval = FALSE;
10627             break;
10628         }
10629 
10630         /* For the first character, or the run is inherited, the run's script
10631          * is set to the char's */
10632         if (   UNLIKELY(script_of_run == SCX_INVALID)
10633             || UNLIKELY(script_of_run == SCX_Inherited))
10634         {
10635             script_of_run = script_of_char;
10636         }
10637 
10638         /* For the character's script to be Unknown, it must be the first
10639          * character in the sequence (for otherwise a test above would have
10640          * prevented us from reaching here), and we have set the run's script
10641          * to it.  Nothing further to be done for this character */
10642         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10643             continue;
10644         }
10645 
10646         /* We accept 'inherited' script characters currently even at the
10647          * beginning.  (We know that no characters in Inherited are digits, or
10648          * we'd have to check for that) */
10649         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10650             continue;
10651         }
10652 
10653         /* If the run so far is Common, and the new character isn't, change the
10654          * run's script to that of this character */
10655         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10656             script_of_run = script_of_char;
10657         }
10658 
10659         /* Now we can see if the script of the new character is the same as
10660          * that of the run */
10661         if (LIKELY(script_of_char == script_of_run)) {
10662             /* By far the most common case */
10663             goto scripts_match;
10664         }
10665 
10666         /* Here, the script of the run isn't Common.  But characters in Common
10667          * match any script */
10668         if (script_of_char == SCX_Common) {
10669             goto scripts_match;
10670         }
10671 
10672 #ifndef HAS_SCX_AUX_TABLES
10673 
10674         /* Too early a Unicode version to have a code point belonging to more
10675          * than one script, so, if the scripts don't exactly match, fail */
10676         PERL_UNUSED_VAR(intersection_len);
10677         retval = FALSE;
10678         break;
10679 
10680 #else
10681 
10682         /* Here there is no exact match between the character's script and the
10683          * run's.  And we've handled the special cases of scripts Unknown,
10684          * Inherited, and Common.
10685          *
10686          * Negative script numbers signify that the value may be any of several
10687          * scripts, and we need to look at auxiliary information to make our
10688          * deterimination.  But if both are non-negative, we can fail now */
10689         if (LIKELY(script_of_char >= 0)) {
10690             const SCX_enum * search_in;
10691             PERL_UINT_FAST8_T search_in_len;
10692             PERL_UINT_FAST8_T i;
10693 
10694             if (LIKELY(script_of_run >= 0)) {
10695                 retval = FALSE;
10696                 break;
10697             }
10698 
10699             /* Use the previously constructed set of possible scripts, if any.
10700              * */
10701             if (intersection) {
10702                 search_in = intersection;
10703                 search_in_len = intersection_len;
10704             }
10705             else {
10706                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10707                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10708             }
10709 
10710             for (i = 0; i < search_in_len; i++) {
10711                 if (search_in[i] == script_of_char) {
10712                     script_of_run = script_of_char;
10713                     goto scripts_match;
10714                 }
10715             }
10716 
10717             retval = FALSE;
10718             break;
10719         }
10720         else if (LIKELY(script_of_run >= 0)) {
10721             /* script of character could be one of several, but run is a single
10722              * script */
10723             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10724             const PERL_UINT_FAST8_T search_in_len
10725                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10726             PERL_UINT_FAST8_T i;
10727 
10728             for (i = 0; i < search_in_len; i++) {
10729                 if (search_in[i] == script_of_run) {
10730                     script_of_char = script_of_run;
10731                     goto scripts_match;
10732                 }
10733             }
10734 
10735             retval = FALSE;
10736             break;
10737         }
10738         else {
10739             /* Both run and char could be in one of several scripts.  If the
10740              * intersection is empty, then this character isn't in this script
10741              * run.  Otherwise, we need to calculate the intersection to use
10742              * for future iterations of the loop, unless we are already at the
10743              * final character */
10744             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10745             const PERL_UINT_FAST8_T char_len
10746                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10747             const SCX_enum * search_run;
10748             PERL_UINT_FAST8_T run_len;
10749 
10750             SCX_enum * new_overlap = NULL;
10751             PERL_UINT_FAST8_T i, j;
10752 
10753             if (intersection) {
10754                 search_run = intersection;
10755                 run_len = intersection_len;
10756             }
10757             else {
10758                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10759                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10760             }
10761 
10762             intersection_len = 0;
10763 
10764             for (i = 0; i < run_len; i++) {
10765                 for (j = 0; j < char_len; j++) {
10766                     if (search_run[i] == search_char[j]) {
10767 
10768                         /* Here, the script at i,j matches.  That means this
10769                          * character is in the run.  But continue on to find
10770                          * the complete intersection, for the next loop
10771                          * iteration, and for the digit check after it.
10772                          *
10773                          * On the first found common script, we malloc space
10774                          * for the intersection list for the worst case of the
10775                          * intersection, which is the minimum of the number of
10776                          * scripts remaining in each set. */
10777                         if (intersection_len == 0) {
10778                             Newx(new_overlap,
10779                                  MIN(run_len - i, char_len - j),
10780                                  SCX_enum);
10781                         }
10782                         new_overlap[intersection_len++] = search_run[i];
10783                     }
10784                 }
10785             }
10786 
10787             /* Here we've looked through everything.  If they have no scripts
10788              * in common, not a run */
10789             if (intersection_len == 0) {
10790                 retval = FALSE;
10791                 break;
10792             }
10793 
10794             /* If there is only a single script in common, set to that.
10795              * Otherwise, use the intersection going forward */
10796             Safefree(intersection);
10797             intersection = NULL;
10798             if (intersection_len == 1) {
10799                 script_of_run = script_of_char = new_overlap[0];
10800                 Safefree(new_overlap);
10801                 new_overlap = NULL;
10802             }
10803             else {
10804                 intersection = new_overlap;
10805             }
10806         }
10807 
10808 #endif
10809 
10810   scripts_match:
10811 
10812         /* Here, the script of the character is compatible with that of the
10813          * run.  That means that in most cases, it continues the script run.
10814          * Either it and the run match exactly, or one or both can be in any of
10815          * several scripts, and the intersection is not empty.  However, if the
10816          * character is a decimal digit, it could still mean failure if it is
10817          * from the wrong sequence of 10.  So, we need to look at if it's a
10818          * digit.  We've already handled the 10 decimal digits, and the next
10819          * lowest one is this one: */
10820         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10821             continue;   /* Not a digit; this character is part of the run */
10822         }
10823 
10824         /* If we have a definitive '0' for the script of this character, we
10825          * know that for this to be a digit, it must be in the range of +0..+9
10826          * of that zero. */
10827         if (   script_of_char >= 0
10828             && (zero_of_char = script_zeros[script_of_char]))
10829         {
10830             if (   cp < zero_of_char
10831                 || cp > zero_of_char + 9)
10832             {
10833                 continue;   /* Not a digit; this character is part of the run
10834                              */
10835             }
10836 
10837         }
10838         else {  /* Need to look up if this character is a digit or not */
10839             SSize_t index_of_zero_of_char;
10840             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10841             if (     UNLIKELY(index_of_zero_of_char < 0)
10842                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10843             {
10844                 continue;   /* Not a digit; this character is part of the run.
10845                              */
10846             }
10847 
10848             zero_of_char = decimals_array[index_of_zero_of_char];
10849         }
10850 
10851         /* Here, the character is a decimal digit, and the zero of its sequence
10852          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10853          * they better be the same. */
10854         if (zero_of_run) {
10855             if (zero_of_run != zero_of_char) {
10856                 retval = FALSE;
10857                 break;
10858             }
10859         }
10860         else {  /* Otherwise we now have a zero for this run */
10861             zero_of_run = zero_of_char;
10862         }
10863     } /* end of looping through CLOSESR text */
10864 
10865     Safefree(intersection);
10866 
10867     if (ret_script != NULL) {
10868         if (retval) {
10869             *ret_script = script_of_run;
10870         }
10871         else {
10872             *ret_script = SCX_INVALID;
10873         }
10874     }
10875 
10876     return retval;
10877 }
10878 
10879 #endif /* ifndef PERL_IN_XSUB_RE */
10880 
10881 /*
10882  * ex: set ts=8 sts=4 sw=4 et:
10883  */
10884