1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEXEC_C
75 #include "perl.h"
76 
77 #ifdef PERL_IN_XSUB_RE
78 #  include "re_comp.h"
79 #else
80 #  include "regcomp.h"
81 #endif
82 
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
85 
86 static const char b_utf8_locale_required[] =
87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88                                                 "  Assuming a UTF-8 locale";
89 
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
91     STMT_START {                                                            \
92         if (! IN_UTF8_CTYPE_LOCALE) {                                       \
93           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
94                                                 b_utf8_locale_required);    \
95         }                                                                   \
96     } STMT_END
97 
98 static const char sets_utf8_locale_required[] =
99       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
100 
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
102     STMT_START {                                                            \
103         if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) {  \
104           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
105                                              sets_utf8_locale_required);    \
106         }                                                                   \
107     } STMT_END
108 
109 #ifdef DEBUGGING
110 /* At least one required character in the target string is expressible only in
111  * UTF-8. */
112 static const char non_utf8_target_but_utf8_required[]
113                 = "Can't match, because target string needs to be in UTF-8\n";
114 #endif
115 
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
117     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
118     goto target;                                                         \
119 } STMT_END
120 
121 #ifndef STATIC
122 #define	STATIC	static
123 #endif
124 
125 /*
126  * Forwards.
127  */
128 
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
130 
131 #define HOPc(pos,off) \
132         (char *)(reginfo->is_utf8_target \
133             ? reghop3((U8*)pos, off, \
134                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
135             : (U8*)(pos + off))
136 
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139         (reginfo->is_utf8_target                          \
140             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141             : (pos - off >= lim)	                         \
142                 ? (U8*)pos - off		                 \
143                 : NULL)
144 
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
146 
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
149 
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152         (reginfo->is_utf8_target                        \
153             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
154             : ((U8*)pos + off <= lim)                   \
155                 ? (U8*)pos + off                        \
156                 : NULL)
157 
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159  * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162     : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
164 
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
167     : (U8*)(pos + off))
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
169 
170 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
172 
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
175  *
176  * NOTE that *nothing* that affects backtracking should be in here, specifically
177  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
178  * node that is in between two EXACT like nodes when ascertaining what the required
179  * "follow" character is. This should probably be moved to regex compile time
180  * although it may be done at run time beause of the REF possibility - more
181  * investigation required. -- demerphq
182 */
183 #define JUMPABLE(rn) (                                                             \
184     OP(rn) == OPEN ||                                                              \
185     (OP(rn) == CLOSE &&                                                            \
186      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
187     OP(rn) == EVAL ||                                                              \
188     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
189     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
190     OP(rn) == KEEPS ||                                                             \
191     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
192 )
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
194 
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
196 
197 /*
198   Search for mandatory following text node; for lookahead, the text must
199   follow but for lookbehind (rn->flags != 0) we skip to the next step.
200 */
201 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
202     while (JUMPABLE(rn)) { \
203         const OPCODE type = OP(rn); \
204         if (type == SUSPEND || PL_regkind[type] == CURLY) \
205             rn = NEXTOPER(NEXTOPER(rn)); \
206         else if (type == PLUS) \
207             rn = NEXTOPER(rn); \
208         else if (type == IFMATCH) \
209             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210         else rn += NEXT_OFF(rn); \
211     } \
212 } STMT_END
213 
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
216 
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
220 
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225  * are needed for the regexp context stack bookkeeping. */
226 
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen _pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
229 {
230     const int retval = PL_savestack_ix;
231     const int paren_elems_to_push =
232                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
235     I32 p;
236     DECLARE_AND_GET_RE_DEBUG_FLAGS;
237 
238     PERL_ARGS_ASSERT_REGCPPUSH;
239 
240     if (paren_elems_to_push < 0)
241         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242                    (int)paren_elems_to_push, (int)maxopenparen,
243                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
244 
245     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247                    " out of range (%lu-%ld)",
248                    total_elems,
249                    (unsigned long)maxopenparen,
250                    (long)parenfloor);
251 
252     SSGROW(total_elems + REGCP_FRAME_ELEMS);
253 
254     DEBUG_BUFFERS_r(
255         if ((int)maxopenparen > (int)parenfloor)
256             Perl_re_exec_indentf( aTHX_
257                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
258                 depth,
259                 PTR2UV(rex),
260                 PTR2UV(rex->offs)
261             );
262     );
263     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265         SSPUSHIV(rex->offs[p].end);
266         SSPUSHIV(rex->offs[p].start);
267         SSPUSHINT(rex->offs[p].start_tmp);
268         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
270             depth,
271             (UV)p,
272             (IV)rex->offs[p].start,
273             (IV)rex->offs[p].start_tmp,
274             (IV)rex->offs[p].end
275         ));
276     }
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278     SSPUSHINT(maxopenparen);
279     SSPUSHINT(rex->lastparen);
280     SSPUSHINT(rex->lastcloseparen);
281     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
282 
283     return retval;
284 }
285 
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp)                                           \
288     DEBUG_STATE_r(                                              \
289         Perl_re_exec_indentf( aTHX_                             \
290             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
291             depth, (IV)PL_savestack_ix                          \
292         )                                                       \
293     );                                                          \
294     cp = PL_savestack_ix
295 
296 #define REGCP_UNWIND(cp)                                        \
297     DEBUG_STATE_r(                                              \
298         if (cp != PL_savestack_ix)                              \
299             Perl_re_exec_indentf( aTHX_                         \
300                 "Clearing an EVAL scope, savestack=%"           \
301                 IVdf "..%" IVdf "\n",                           \
302                 depth, (IV)(cp), (IV)PL_savestack_ix            \
303             )                                                   \
304     );                                                          \
305     regcpblow(cp)
306 
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e)                                            \
309     rex->offs[ix].start = s;                                               \
310     rex->offs[ix].end = e;                                                 \
311     if (ix > rex->lastparen)                                               \
312         rex->lastparen = ix;                                               \
313     rex->lastcloseparen = ix;                                              \
314     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
315         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
316         depth,                                                             \
317         PTR2UV(rex),                                                       \
318         PTR2UV(rex->offs),                                                 \
319         (UV)ix,                                                            \
320         (IV)rex->offs[ix].start,                                           \
321         (IV)rex->offs[ix].end,                                             \
322         (UV)rex->lastparen                                                 \
323     ))
324 
325 #define UNWIND_PAREN(lp, lcp)               \
326     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
327         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
328         depth,                              \
329         PTR2UV(rex),                        \
330         PTR2UV(rex->offs),                  \
331         (UV)(lp),                           \
332         (UV)(rex->lastparen),               \
333         (UV)(lcp)                           \
334     ));                                     \
335     for (n = rex->lastparen; n > lp; n--)   \
336         rex->offs[n].end = -1;              \
337     rex->lastparen = n;                     \
338     rex->lastcloseparen = lcp;
339 
340 
341 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p _pDEPTH)342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
343 {
344     UV i;
345     U32 paren;
346     DECLARE_AND_GET_RE_DEBUG_FLAGS;
347 
348     PERL_ARGS_ASSERT_REGCPPOP;
349 
350     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
351     i = SSPOPUV;
352     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354     rex->lastcloseparen = SSPOPINT;
355     rex->lastparen = SSPOPINT;
356     *maxopenparen_p = SSPOPINT;
357 
358     i -= REGCP_OTHER_ELEMS;
359     /* Now restore the parentheses context. */
360     DEBUG_BUFFERS_r(
361         if (i || rex->lastparen + 1 <= rex->nparens)
362             Perl_re_exec_indentf( aTHX_
363                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
364                 depth,
365                 PTR2UV(rex),
366                 PTR2UV(rex->offs)
367             );
368     );
369     paren = *maxopenparen_p;
370     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
371         SSize_t tmps;
372         rex->offs[paren].start_tmp = SSPOPINT;
373         rex->offs[paren].start = SSPOPIV;
374         tmps = SSPOPIV;
375         if (paren <= rex->lastparen)
376             rex->offs[paren].end = tmps;
377         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
379             depth,
380             (UV)paren,
381             (IV)rex->offs[paren].start,
382             (IV)rex->offs[paren].start_tmp,
383             (IV)rex->offs[paren].end,
384             (paren > rex->lastparen ? "(skipped)" : ""));
385         );
386         paren--;
387     }
388 #if 1
389     /* It would seem that the similar code in regtry()
390      * already takes care of this, and in fact it is in
391      * a better location to since this code can #if 0-ed out
392      * but the code in regtry() is needed or otherwise tests
393      * requiring null fields (pat.t#187 and split.t#{13,14}
394      * (as of patchlevel 7877)  will fail.  Then again,
395      * this code seems to be necessary or otherwise
396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397      * --jhi updated by dapm */
398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399         if (i > *maxopenparen_p)
400             rex->offs[i].start = -1;
401         rex->offs[i].end = -1;
402         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403             "    \\%" UVuf ": %s   ..-1 undeffing\n",
404             depth,
405             (UV)i,
406             (i > *maxopenparen_p) ? "-1" : "  "
407         ));
408     }
409 #endif
410 }
411 
412 /* restore the parens and associated vars at savestack position ix,
413  * but without popping the stack */
414 
415 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p _pDEPTH)416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
417 {
418     I32 tmpix = PL_savestack_ix;
419     PERL_ARGS_ASSERT_REGCP_RESTORE;
420 
421     PL_savestack_ix = ix;
422     regcppop(rex, maxopenparen_p);
423     PL_savestack_ix = tmpix;
424 }
425 
426 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
427 
428 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
430 {
431     /* Returns a boolean as to whether or not 'character' is a member of the
432      * Posix character class given by 'classnum' that should be equivalent to a
433      * value in the typedef '_char_class_number'.
434      *
435      * Ideally this could be replaced by a just an array of function pointers
436      * to the C library functions that implement the macros this calls.
437      * However, to compile, the precise function signatures are required, and
438      * these may vary from platform to platform.  To avoid having to figure
439      * out what those all are on each platform, I (khw) am using this method,
440      * which adds an extra layer of function call overhead (unless the C
441      * optimizer strips it away).  But we don't particularly care about
442      * performance with locales anyway. */
443 
444     switch ((_char_class_number) classnum) {
445         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
447         case _CC_ENUM_ASCII:     return isASCII_LC(character);
448         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
449         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
450                                         || isUPPER_LC(character);
451         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
452         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
453         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
454         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
455         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
456         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
457         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
458         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
459         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
460         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
461         default:    /* VERTSPACE should never occur in locales */
462             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
463     }
464 
465     NOT_REACHED; /* NOTREACHED */
466     return FALSE;
467 }
468 
469 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(const char * s1,const char * s2,I32 len)470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
471 {
472     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
473      * folded.  Works on all folds representable without UTF-8, except for
474      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
475      * check that the strings each have at least 'len' characters.
476      *
477      * There is almost an identical API function where s2 need not be folded:
478      * Perl_foldEQ_latin1() */
479 
480     const U8 *a = (const U8 *)s1;
481     const U8 *b = (const U8 *)s2;
482 
483     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
484 
485     assert(len >= 0);
486 
487     while (len--) {
488         assert(! isUPPER_L1(*b));
489         if (toLOWER_L1(*a) != *b) {
490             return 0;
491         }
492         a++, b++;
493     }
494     return 1;
495 }
496 
497 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
499 {
500     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501      * 'character' is a member of the Posix character class given by 'classnum'
502      * that should be equivalent to a value in the typedef
503      * '_char_class_number'.
504      *
505      * This just calls isFOO_lc on the code point for the character if it is in
506      * the range 0-255.  Outside that range, all characters use Unicode
507      * rules, ignoring any locale.  So use the Unicode function if this class
508      * requires an inversion list, and use the Unicode macro otherwise. */
509 
510 
511     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
512 
513     if (UTF8_IS_INVARIANT(*character)) {
514         return isFOO_lc(classnum, *character);
515     }
516     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517         return isFOO_lc(classnum,
518                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
519     }
520 
521     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
522 
523     switch ((_char_class_number) classnum) {
524         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
525         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
526         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
527         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
528         default:
529             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530                                         utf8_to_uvchr_buf(character, e, NULL));
531     }
532 
533     return FALSE; /* Things like CNTRL are always below 256 */
534 }
535 
536 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
538 {
539     /* Returns the position of the first byte in the sequence between 's' and
540      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
541      * */
542 
543     PERL_ARGS_ASSERT_FIND_SPAN_END;
544 
545     assert(send >= s);
546 
547     if ((STRLEN) (send - s) >= PERL_WORDSIZE
548                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
550     {
551         PERL_UINTMAX_T span_word;
552 
553         /* Process per-byte until reach word boundary.  XXX This loop could be
554          * eliminated if we knew that this platform had fast unaligned reads */
555         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556             if (*s != span_byte) {
557                 return s;
558             }
559             s++;
560         }
561 
562         /* Create a word filled with the bytes we are spanning */
563         span_word = PERL_COUNT_MULTIPLIER * span_byte;
564 
565         /* Process per-word as long as we have at least a full word left */
566         do {
567 
568             /* Keep going if the whole word is composed of 'span_byte's */
569             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
570                 s += PERL_WORDSIZE;
571                 continue;
572             }
573 
574             /* Here, at least one byte in the word isn't 'span_byte'. */
575 
576 #ifdef EBCDIC
577 
578             break;
579 
580 #else
581 
582             /* This xor leaves 1 bits only in those non-matching bytes */
583             span_word ^= * (PERL_UINTMAX_T *) s;
584 
585             /* Make sure the upper bit of each non-matching byte is set.  This
586              * makes each such byte look like an ASCII platform variant byte */
587             span_word |= span_word << 1;
588             span_word |= span_word << 2;
589             span_word |= span_word << 4;
590 
591             /* That reduces the problem to what this function solves */
592             return s + variant_byte_number(span_word);
593 
594 #endif
595 
596         } while (s + PERL_WORDSIZE <= send);
597     }
598 
599     /* Process the straggler bytes beyond the final word boundary */
600     while (s < send) {
601         if (*s != span_byte) {
602             return s;
603         }
604         s++;
605     }
606 
607     return s;
608 }
609 
610 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
612 {
613     /* Returns the position of the first byte in the sequence between 's'
614      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615      * returns 'send' if none found.  It uses word-level operations instead of
616      * byte to speed up the process */
617 
618     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
619 
620     assert(send >= s);
621     assert((byte & mask) == byte);
622 
623 #ifndef EBCDIC
624 
625     if ((STRLEN) (send - s) >= PERL_WORDSIZE
626                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
628     {
629         PERL_UINTMAX_T word, mask_word;
630 
631         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632             if (((*s) & mask) == byte) {
633                 return s;
634             }
635             s++;
636         }
637 
638         word      = PERL_COUNT_MULTIPLIER * byte;
639         mask_word = PERL_COUNT_MULTIPLIER * mask;
640 
641         do {
642             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
643 
644             /* If 'masked' contains bytes with the bit pattern of 'byte' within
645              * it, xoring with 'word' will leave each of the 8 bits in such
646              * bytes be 0, and no byte containing any other bit pattern will be
647              * 0. */
648             masked ^= word;
649 
650             /* This causes the most significant bit to be set to 1 for any
651              * bytes in the word that aren't completely 0 */
652             masked |= masked << 1;
653             masked |= masked << 2;
654             masked |= masked << 4;
655 
656             /* The msbits are the same as what marks a byte as variant, so we
657              * can use this mask.  If all msbits are 1, the word doesn't
658              * contain 'byte' */
659             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
660                 s += PERL_WORDSIZE;
661                 continue;
662             }
663 
664             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665              * and any that are, are 0.  Complement and re-AND to swap that */
666             masked = ~ masked;
667             masked &= PERL_VARIANTS_WORD_MASK;
668 
669             /* This reduces the problem to that solved by this function */
670             s += variant_byte_number(masked);
671             return s;
672 
673         } while (s + PERL_WORDSIZE <= send);
674     }
675 
676 #endif
677 
678     while (s < send) {
679         if (((*s) & mask) == byte) {
680             return s;
681         }
682         s++;
683     }
684 
685     return s;
686 }
687 
688 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
690 {
691     /* Returns the position of the first byte in the sequence between 's' and
692      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693      * 'span_byte' should have been ANDed with 'mask' in the call of this
694      * function.  Returns 'send' if none found.  Works like find_span_end(),
695      * except for the AND */
696 
697     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
698 
699     assert(send >= s);
700     assert((span_byte & mask) == span_byte);
701 
702     if ((STRLEN) (send - s) >= PERL_WORDSIZE
703                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
705     {
706         PERL_UINTMAX_T span_word, mask_word;
707 
708         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709             if (((*s) & mask) != span_byte) {
710                 return s;
711             }
712             s++;
713         }
714 
715         span_word = PERL_COUNT_MULTIPLIER * span_byte;
716         mask_word = PERL_COUNT_MULTIPLIER * mask;
717 
718         do {
719             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
720 
721             if (masked == span_word) {
722                 s += PERL_WORDSIZE;
723                 continue;
724             }
725 
726 #ifdef EBCDIC
727 
728             break;
729 
730 #else
731 
732             masked ^= span_word;
733             masked |= masked << 1;
734             masked |= masked << 2;
735             masked |= masked << 4;
736             return s + variant_byte_number(masked);
737 
738 #endif
739 
740         } while (s + PERL_WORDSIZE <= send);
741     }
742 
743     while (s < send) {
744         if (((*s) & mask) != span_byte) {
745             return s;
746         }
747         s++;
748     }
749 
750     return s;
751 }
752 
753 /*
754  * pregexec and friends
755  */
756 
757 #ifndef PERL_IN_XSUB_RE
758 /*
759  - pregexec - match a regexp against a string
760  */
761 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend:    pointer to null at end of string */
766 /* strbeg:    real beginning of string */
767 /* minend:    end of match must be >= minend bytes after stringarg. */
768 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
769  *            itself is accessed via the pointers above */
770 /* nosave:    For optimizations. */
771 {
772     PERL_ARGS_ASSERT_PREGEXEC;
773 
774     return
775         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776                       nosave ? 0 : REXEC_COPY_STR);
777 }
778 #endif
779 
780 
781 
782 /* re_intuit_start():
783  *
784  * Based on some optimiser hints, try to find the earliest position in the
785  * string where the regex could match.
786  *
787  *   rx:     the regex to match against
788  *   sv:     the SV being matched: only used for utf8 flag; the string
789  *           itself is accessed via the pointers below. Note that on
790  *           something like an overloaded SV, SvPOK(sv) may be false
791  *           and the string pointers may point to something unrelated to
792  *           the SV itself.
793  *   strbeg: real beginning of string
794  *   strpos: the point in the string at which to begin matching
795  *   strend: pointer to the byte following the last char of the string
796  *   flags   currently unused; set to 0
797  *   data:   currently unused; set to NULL
798  *
799  * The basic idea of re_intuit_start() is to use some known information
800  * about the pattern, namely:
801  *
802  *   a) the longest known anchored substring (i.e. one that's at a
803  *      constant offset from the beginning of the pattern; but not
804  *      necessarily at a fixed offset from the beginning of the
805  *      string);
806  *   b) the longest floating substring (i.e. one that's not at a constant
807  *      offset from the beginning of the pattern);
808  *   c) Whether the pattern is anchored to the string; either
809  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
810  *      or anchored to pos(): /\G/;
811  *   d) A start class: a real or synthetic character class which
812  *      represents which characters are legal at the start of the pattern;
813  *
814  * to either quickly reject the match, or to find the earliest position
815  * within the string at which the pattern might match, thus avoiding
816  * running the full NFA engine at those earlier locations, only to
817  * eventually fail and retry further along.
818  *
819  * Returns NULL if the pattern can't match, or returns the address within
820  * the string which is the earliest place the match could occur.
821  *
822  * The longest of the anchored and floating substrings is called 'check'
823  * and is checked first. The other is called 'other' and is checked
824  * second. The 'other' substring may not be present.  For example,
825  *
826  *    /(abc|xyz)ABC\d{0,3}DEFG/
827  *
828  * will have
829  *
830  *   check substr (float)    = "DEFG", offset 6..9 chars
831  *   other substr (anchored) = "ABC",  offset 3..3 chars
832  *   stclass = [ax]
833  *
834  * Be aware that during the course of this function, sometimes 'anchored'
835  * refers to a substring being anchored relative to the start of the
836  * pattern, and sometimes to the pattern itself being anchored relative to
837  * the string. For example:
838  *
839  *   /\dabc/:   "abc" is anchored to the pattern;
840  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
841  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
842  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843  *                    but the pattern is anchored to the string.
844  */
845 
846 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)847 Perl_re_intuit_start(pTHX_
848                     REGEXP * const rx,
849                     SV *sv,
850                     const char * const strbeg,
851                     char *strpos,
852                     char *strend,
853                     const U32 flags,
854                     re_scream_pos_data *data)
855 {
856     struct regexp *const prog = ReANY(rx);
857     SSize_t start_shift = prog->check_offset_min;
858     /* Should be nonnegative! */
859     SSize_t end_shift   = 0;
860     /* current lowest pos in string where the regex can start matching */
861     char *rx_origin = strpos;
862     SV *check;
863     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864     U8   other_ix = 1 - prog->substrs->check_ix;
865     bool ml_anch = 0;
866     char *other_last = strpos;/* latest pos 'other' substr already checked to */
867     char *check_at = NULL;		/* check substr found at this pos */
868     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869     RXi_GET_DECL(prog,progi);
870     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
871     regmatch_info *const reginfo = &reginfo_buf;
872     DECLARE_AND_GET_RE_DEBUG_FLAGS;
873 
874     PERL_ARGS_ASSERT_RE_INTUIT_START;
875     PERL_UNUSED_ARG(flags);
876     PERL_UNUSED_ARG(data);
877 
878     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879                 "Intuit: trying to determine minimum start position...\n"));
880 
881     /* for now, assume that all substr offsets are positive. If at some point
882      * in the future someone wants to do clever things with lookbehind and
883      * -ve offsets, they'll need to fix up any code in this function
884      * which uses these offsets. See the thread beginning
885      * <20140113145929.GF27210@iabyn.com>
886      */
887     assert(prog->substrs->data[0].min_offset >= 0);
888     assert(prog->substrs->data[0].max_offset >= 0);
889     assert(prog->substrs->data[1].min_offset >= 0);
890     assert(prog->substrs->data[1].max_offset >= 0);
891     assert(prog->substrs->data[2].min_offset >= 0);
892     assert(prog->substrs->data[2].max_offset >= 0);
893 
894     /* for now, assume that if both present, that the floating substring
895      * doesn't start before the anchored substring.
896      * If you break this assumption (e.g. doing better optimisations
897      * with lookahead/behind), then you'll need to audit the code in this
898      * function carefully first
899      */
900     assert(
901             ! (  (prog->anchored_utf8 || prog->anchored_substr)
902               && (prog->float_utf8    || prog->float_substr))
903            || (prog->float_min_offset >= prog->anchored_offset));
904 
905     /* byte rather than char calculation for efficiency. It fails
906      * to quickly reject some cases that can't match, but will reject
907      * them later after doing full char arithmetic */
908     if (prog->minlen > strend - strpos) {
909         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910                               "  String too short...\n"));
911         goto fail;
912     }
913 
914     RXp_MATCH_UTF8_set(prog, utf8_target);
915     reginfo->is_utf8_target = cBOOL(utf8_target);
916     reginfo->info_aux = NULL;
917     reginfo->strbeg = strbeg;
918     reginfo->strend = strend;
919     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
920     reginfo->intuit = 1;
921     /* not actually used within intuit, but zero for safety anyway */
922     reginfo->poscache_maxiter = 0;
923 
924     if (utf8_target) {
925         if ((!prog->anchored_utf8 && prog->anchored_substr)
926                 || (!prog->float_utf8 && prog->float_substr))
927             to_utf8_substr(prog);
928         check = prog->check_utf8;
929     } else {
930         if (!prog->check_substr && prog->check_utf8) {
931             if (! to_byte_substr(prog)) {
932                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
933             }
934         }
935         check = prog->check_substr;
936     }
937 
938     /* dump the various substring data */
939     DEBUG_OPTIMISE_MORE_r({
940         int i;
941         for (i=0; i<=2; i++) {
942             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943                                   : prog->substrs->data[i].substr);
944             if (!sv)
945                 continue;
946 
947             Perl_re_printf( aTHX_
948                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949                 " useful=%" IVdf " utf8=%d [%s]\n",
950                 i,
951                 (IV)prog->substrs->data[i].min_offset,
952                 (IV)prog->substrs->data[i].max_offset,
953                 (IV)prog->substrs->data[i].end_shift,
954                 BmUSEFUL(sv),
955                 utf8_target ? 1 : 0,
956                 SvPEEK(sv));
957         }
958     });
959 
960     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
961 
962         /* ml_anch: check after \n?
963          *
964          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965          * with /.*.../, these flags will have been added by the
966          * compiler:
967          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
968          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
969          */
970         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
971                    && !(prog->intflags & PREGf_IMPLICIT);
972 
973         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974             /* we are only allowed to match at BOS or \G */
975 
976             /* trivially reject if there's a BOS anchor and we're not at BOS.
977              *
978              * Note that we don't try to do a similar quick reject for
979              * \G, since generally the caller will have calculated strpos
980              * based on pos() and gofs, so the string is already correctly
981              * anchored by definition; and handling the exceptions would
982              * be too fiddly (e.g. REXEC_IGNOREPOS).
983              */
984             if (   strpos != strbeg
985                 && (prog->intflags & PREGf_ANCH_SBOL))
986             {
987                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988                                 "  Not at start...\n"));
989                 goto fail;
990             }
991 
992             /* in the presence of an anchor, the anchored (relative to the
993              * start of the regex) substr must also be anchored relative
994              * to strpos. So quickly reject if substr isn't found there.
995              * This works for \G too, because the caller will already have
996              * subtracted gofs from pos, and gofs is the offset from the
997              * \G to the start of the regex. For example, in /.abc\Gdef/,
998              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999              * caller will have set strpos=pos()-4; we look for the substr
1000              * at position pos()-4+1, which lines up with the "a" */
1001 
1002             if (prog->check_offset_min == prog->check_offset_max) {
1003                 /* Substring at constant offset from beg-of-str... */
1004                 SSize_t slen = SvCUR(check);
1005                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1006 
1007                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1009                     (IV)prog->check_offset_min));
1010 
1011                 if (SvTAIL(check)) {
1012                     /* In this case, the regex is anchored at the end too.
1013                      * Unless it's a multiline match, the lengths must match
1014                      * exactly, give or take a \n.  NB: slen >= 1 since
1015                      * the last char of check is \n */
1016                     if (!multiline
1017                         && (   strend - s > slen
1018                             || strend - s < slen - 1
1019                             || (strend - s == slen && strend[-1] != '\n')))
1020                     {
1021                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022                                             "  String too long...\n"));
1023                         goto fail_finish;
1024                     }
1025                     /* Now should match s[0..slen-2] */
1026                     slen--;
1027                 }
1028                 if (slen && (strend - s < slen
1029                     || *SvPVX_const(check) != *s
1030                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1031                 {
1032                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033                                     "  String not equal...\n"));
1034                     goto fail_finish;
1035                 }
1036 
1037                 check_at = s;
1038                 goto success_at_start;
1039             }
1040         }
1041     }
1042 
1043     end_shift = prog->check_end_shift;
1044 
1045 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1046     if (end_shift < 0)
1047         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048                    (IV)end_shift, RX_PRECOMP(rx));
1049 #endif
1050 
1051   restart:
1052 
1053     /* This is the (re)entry point of the main loop in this function.
1054      * The goal of this loop is to:
1055      * 1) find the "check" substring in the region rx_origin..strend
1056      *    (adjusted by start_shift / end_shift). If not found, reject
1057      *    immediately.
1058      * 2) If it exists, look for the "other" substr too if defined; for
1059      *    example, if the check substr maps to the anchored substr, then
1060      *    check the floating substr, and vice-versa. If not found, go
1061      *    back to (1) with rx_origin suitably incremented.
1062      * 3) If we find an rx_origin position that doesn't contradict
1063      *    either of the substrings, then check the possible additional
1064      *    constraints on rx_origin of /^.../m or a known start class.
1065      *    If these fail, then depending on which constraints fail, jump
1066      *    back to here, or to various other re-entry points further along
1067      *    that skip some of the first steps.
1068      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069      *    substring. If the start position was determined to be at the
1070      *    beginning of the string  - so, not rejected, but not optimised,
1071      *    since we have to run regmatch from position 0 - decrement the
1072      *    BmUSEFUL() count. Otherwise increment it.
1073      */
1074 
1075 
1076     /* first, look for the 'check' substring */
1077 
1078     {
1079         U8* start_point;
1080         U8* end_point;
1081 
1082         DEBUG_OPTIMISE_MORE_r({
1083             Perl_re_printf( aTHX_
1084                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085                 " Start shift: %" IVdf " End shift %" IVdf
1086                 " Real end Shift: %" IVdf "\n",
1087                 (IV)(rx_origin - strbeg),
1088                 (IV)prog->check_offset_min,
1089                 (IV)start_shift,
1090                 (IV)end_shift,
1091                 (IV)prog->check_end_shift);
1092         });
1093 
1094         end_point = HOPBACK3(strend, end_shift, rx_origin);
1095         if (!end_point)
1096             goto fail_finish;
1097         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1098         if (!start_point)
1099             goto fail_finish;
1100 
1101 
1102         /* If the regex is absolutely anchored to either the start of the
1103          * string (SBOL) or to pos() (ANCH_GPOS), then
1104          * check_offset_max represents an upper bound on the string where
1105          * the substr could start. For the ANCH_GPOS case, we assume that
1106          * the caller of intuit will have already set strpos to
1107          * pos()-gofs, so in this case strpos + offset_max will still be
1108          * an upper bound on the substr.
1109          */
1110         if (!ml_anch
1111             && prog->intflags & PREGf_ANCH
1112             && prog->check_offset_max != SSize_t_MAX)
1113         {
1114             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115             const char * const anchor =
1116                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117             SSize_t targ_len = (char*)end_point - anchor;
1118 
1119             if (check_len > targ_len) {
1120                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                               "Target string too short to match required substring...\n"));
1122                 goto fail_finish;
1123             }
1124 
1125             /* do a bytes rather than chars comparison. It's conservative;
1126              * so it skips doing the HOP if the result can't possibly end
1127              * up earlier than the old value of end_point.
1128              */
1129             assert(anchor + check_len <= (char *)end_point);
1130             if (prog->check_offset_max + check_len < targ_len) {
1131                 end_point = HOP3lim((U8*)anchor,
1132                                 prog->check_offset_max,
1133                                 end_point - check_len
1134                             )
1135                             + check_len;
1136                 if (end_point < start_point)
1137                     goto fail_finish;
1138             }
1139         }
1140 
1141         check_at = fbm_instr( start_point, end_point,
1142                       check, multiline ? FBMrf_MULTILINE : 0);
1143 
1144         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146             (IV)((char*)start_point - strbeg),
1147             (IV)((char*)end_point   - strbeg),
1148             (IV)(check_at ? check_at - strbeg : -1)
1149         ));
1150 
1151         /* Update the count-of-usability, remove useless subpatterns,
1152             unshift s.  */
1153 
1154         DEBUG_EXECUTE_r({
1155             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1158                               (check_at ? "Found" : "Did not find"),
1159                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160                     ? "anchored" : "floating"),
1161                 quoted,
1162                 RE_SV_TAIL(check),
1163                 (check_at ? " at offset " : "...\n") );
1164         });
1165 
1166         if (!check_at)
1167             goto fail_finish;
1168         /* set rx_origin to the minimum position where the regex could start
1169          * matching, given the constraint of the just-matched check substring.
1170          * But don't set it lower than previously.
1171          */
1172 
1173         if (check_at - rx_origin > prog->check_offset_max)
1174             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175         /* Finish the diagnostic message */
1176         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177             "%ld (rx_origin now %" IVdf ")...\n",
1178             (long)(check_at - strbeg),
1179             (IV)(rx_origin - strbeg)
1180         ));
1181     }
1182 
1183 
1184     /* now look for the 'other' substring if defined */
1185 
1186     if (prog->substrs->data[other_ix].utf8_substr
1187         || prog->substrs->data[other_ix].substr)
1188     {
1189         /* Take into account the "other" substring. */
1190         char *last, *last1;
1191         char *s;
1192         SV* must;
1193         struct reg_substr_datum *other;
1194 
1195       do_other_substr:
1196         other = &prog->substrs->data[other_ix];
1197         if (!utf8_target && !other->substr) {
1198             if (!to_byte_substr(prog)) {
1199                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1200             }
1201         }
1202 
1203         /* if "other" is anchored:
1204          * we've previously found a floating substr starting at check_at.
1205          * This means that the regex origin must lie somewhere
1206          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207          * and max:                 HOP3(check_at, -check_offset_min)
1208          * (except that min will be >= strpos)
1209          * So the fixed  substr must lie somewhere between
1210          *  HOP3(min, anchored_offset)
1211          *  HOP3(max, anchored_offset) + SvCUR(substr)
1212          */
1213 
1214         /* if "other" is floating
1215          * Calculate last1, the absolute latest point where the
1216          * floating substr could start in the string, ignoring any
1217          * constraints from the earlier fixed match. It is calculated
1218          * as follows:
1219          *
1220          * strend - prog->minlen (in chars) is the absolute latest
1221          * position within the string where the origin of the regex
1222          * could appear. The latest start point for the floating
1223          * substr is float_min_offset(*) on from the start of the
1224          * regex.  last1 simply combines thee two offsets.
1225          *
1226          * (*) You might think the latest start point should be
1227          * float_max_offset from the regex origin, and technically
1228          * you'd be correct. However, consider
1229          *    /a\d{2,4}bcd\w/
1230          * Here, float min, max are 3,5 and minlen is 7.
1231          * This can match either
1232          *    /a\d\dbcd\w/
1233          *    /a\d\d\dbcd\w/
1234          *    /a\d\d\d\dbcd\w/
1235          * In the first case, the regex matches minlen chars; in the
1236          * second, minlen+1, in the third, minlen+2.
1237          * In the first case, the floating offset is 3 (which equals
1238          * float_min), in the second, 4, and in the third, 5 (which
1239          * equals float_max). In all cases, the floating string bcd
1240          * can never start more than 4 chars from the end of the
1241          * string, which equals minlen - float_min. As the substring
1242          * starts to match more than float_min from the start of the
1243          * regex, it makes the regex match more than minlen chars,
1244          * and the two cancel each other out. So we can always use
1245          * float_min - minlen, rather than float_max - minlen for the
1246          * latest position in the string.
1247          *
1248          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1250          */
1251 
1252         assert(prog->minlen >= other->min_offset);
1253         last1 = HOP3c(strend,
1254                         other->min_offset - prog->minlen, strbeg);
1255 
1256         if (other_ix) {/* i.e. if (other-is-float) */
1257             /* last is the latest point where the floating substr could
1258              * start, *given* any constraints from the earlier fixed
1259              * match. This constraint is that the floating string starts
1260              * <= float_max_offset chars from the regex origin (rx_origin).
1261              * If this value is less than last1, use it instead.
1262              */
1263             assert(rx_origin <= last1);
1264             last =
1265                 /* this condition handles the offset==infinity case, and
1266                  * is a short-cut otherwise. Although it's comparing a
1267                  * byte offset to a char length, it does so in a safe way,
1268                  * since 1 char always occupies 1 or more bytes,
1269                  * so if a string range is  (last1 - rx_origin) bytes,
1270                  * it will be less than or equal to  (last1 - rx_origin)
1271                  * chars; meaning it errs towards doing the accurate HOP3
1272                  * rather than just using last1 as a short-cut */
1273                 (last1 - rx_origin) < other->max_offset
1274                     ? last1
1275                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1276         }
1277         else {
1278             assert(strpos + start_shift <= check_at);
1279             last = HOP4c(check_at, other->min_offset - start_shift,
1280                         strbeg, strend);
1281         }
1282 
1283         s = HOP3c(rx_origin, other->min_offset, strend);
1284         if (s < other_last)	/* These positions already checked */
1285             s = other_last;
1286 
1287         must = utf8_target ? other->utf8_substr : other->substr;
1288         assert(SvPOK(must));
1289         {
1290             char *from = s;
1291             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1292 
1293             if (to > strend)
1294                 to = strend;
1295             if (from > to) {
1296                 s = NULL;
1297                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299                     (IV)(from - strbeg),
1300                     (IV)(to   - strbeg)
1301                 ));
1302             }
1303             else {
1304                 s = fbm_instr(
1305                     (unsigned char*)from,
1306                     (unsigned char*)to,
1307                     must,
1308                     multiline ? FBMrf_MULTILINE : 0
1309                 );
1310                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312                     (IV)(from - strbeg),
1313                     (IV)(to   - strbeg),
1314                     (IV)(s ? s - strbeg : -1)
1315                 ));
1316             }
1317         }
1318 
1319         DEBUG_EXECUTE_r({
1320             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1323                 s ? "Found" : "Contradicts",
1324                 other_ix ? "floating" : "anchored",
1325                 quoted, RE_SV_TAIL(must));
1326         });
1327 
1328 
1329         if (!s) {
1330             /* last1 is latest possible substr location. If we didn't
1331              * find it before there, we never will */
1332             if (last >= last1) {
1333                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334                                         "; giving up...\n"));
1335                 goto fail_finish;
1336             }
1337 
1338             /* try to find the check substr again at a later
1339              * position. Maybe next time we'll find the "other" substr
1340              * in range too */
1341             other_last = HOP3c(last, 1, strend) /* highest failure */;
1342             rx_origin =
1343                 other_ix /* i.e. if other-is-float */
1344                     ? HOP3c(rx_origin, 1, strend)
1345                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348                 (other_ix ? "floating" : "anchored"),
1349                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350                 (IV)(rx_origin - strbeg)
1351             ));
1352             goto restart;
1353         }
1354         else {
1355             if (other_ix) { /* if (other-is-float) */
1356                 /* other_last is set to s, not s+1, since its possible for
1357                  * a floating substr to fail first time, then succeed
1358                  * second time at the same floating position; e.g.:
1359                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1360                  * The first time round, anchored and float match at
1361                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1362                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1363                  */
1364                 other_last = s;
1365             }
1366             else {
1367                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368                 other_last = HOP3c(s, 1, strend);
1369             }
1370             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1372                   (long)(s - strbeg),
1373                 (IV)(rx_origin - strbeg)
1374               ));
1375 
1376         }
1377     }
1378     else {
1379         DEBUG_OPTIMISE_MORE_r(
1380             Perl_re_printf( aTHX_
1381                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1382                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383                 " strend:%" IVdf "\n",
1384                 (IV)prog->check_offset_min,
1385                 (IV)prog->check_offset_max,
1386                 (IV)(check_at-strbeg),
1387                 (IV)(rx_origin-strbeg),
1388                 (IV)(rx_origin-check_at),
1389                 (IV)(strend-strbeg)
1390             )
1391         );
1392     }
1393 
1394   postprocess_substr_matches:
1395 
1396     /* handle the extra constraint of /^.../m if present */
1397 
1398     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1399         char *s;
1400 
1401         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402                         "  looking for /^/m anchor"));
1403 
1404         /* we have failed the constraint of a \n before rx_origin.
1405          * Find the next \n, if any, even if it's beyond the current
1406          * anchored and/or floating substrings. Whether we should be
1407          * scanning ahead for the next \n or the next substr is debatable.
1408          * On the one hand you'd expect rare substrings to appear less
1409          * often than \n's. On the other hand, searching for \n means
1410          * we're effectively flipping between check_substr and "\n" on each
1411          * iteration as the current "rarest" candidate string, which
1412          * means for example that we'll quickly reject the whole string if
1413          * hasn't got a \n, rather than trying every substr position
1414          * first
1415          */
1416 
1417         s = HOP3c(strend, - prog->minlen, strpos);
1418         if (s <= rx_origin ||
1419             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1420         {
1421             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                             "  Did not find /%s^%s/m...\n",
1423                             PL_colors[0], PL_colors[1]));
1424             goto fail_finish;
1425         }
1426 
1427         /* earliest possible origin is 1 char after the \n.
1428          * (since *rx_origin == '\n', it's safe to ++ here rather than
1429          * HOP(rx_origin, 1)) */
1430         rx_origin++;
1431 
1432         if (prog->substrs->check_ix == 0  /* check is anchored */
1433             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1434         {
1435             /* Position contradicts check-string; either because
1436              * check was anchored (and thus has no wiggle room),
1437              * or check was float and rx_origin is above the float range */
1438             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1441             goto restart;
1442         }
1443 
1444         /* if we get here, the check substr must have been float,
1445          * is in range, and we may or may not have had an anchored
1446          * "other" substr which still contradicts */
1447         assert(prog->substrs->check_ix); /* check is float */
1448 
1449         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450             /* whoops, the anchored "other" substr exists, so we still
1451              * contradict. On the other hand, the float "check" substr
1452              * didn't contradict, so just retry the anchored "other"
1453              * substr */
1454             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456                 PL_colors[0], PL_colors[1],
1457                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458                 (IV)(rx_origin - strbeg)
1459             ));
1460             goto do_other_substr;
1461         }
1462 
1463         /* success: we don't contradict the found floating substring
1464          * (and there's no anchored substr). */
1465         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466             "  Found /%s^%s/m with rx_origin %ld...\n",
1467             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1468     }
1469     else {
1470         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471             "  (multiline anchor test skipped)\n"));
1472     }
1473 
1474   success_at_start:
1475 
1476 
1477     /* if we have a starting character class, then test that extra constraint.
1478      * (trie stclasses are too expensive to use here, we are better off to
1479      * leave it to regmatch itself) */
1480 
1481     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482         const U8* const str = (U8*)STRING(progi->regstclass);
1483 
1484         /* XXX this value could be pre-computed */
1485         const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486                     ?  (reginfo->is_utf8_pat
1487                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488                         : (SSize_t)STR_LEN(progi->regstclass))
1489                     : 1);
1490         char * endpos;
1491         char *s;
1492         /* latest pos that a matching float substr constrains rx start to */
1493         char *rx_max_float = NULL;
1494 
1495         /* if the current rx_origin is anchored, either by satisfying an
1496          * anchored substring constraint, or a /^.../m constraint, then we
1497          * can reject the current origin if the start class isn't found
1498          * at the current position. If we have a float-only match, then
1499          * rx_origin is constrained to a range; so look for the start class
1500          * in that range. if neither, then look for the start class in the
1501          * whole rest of the string */
1502 
1503         /* XXX DAPM it's not clear what the minlen test is for, and why
1504          * it's not used in the floating case. Nothing in the test suite
1505          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506          * Here are some old comments, which may or may not be correct:
1507          *
1508          *   minlen == 0 is possible if regstclass is \b or \B,
1509          *   and the fixed substr is ''$.
1510          *   Since minlen is already taken into account, rx_origin+1 is
1511          *   before strend; accidentally, minlen >= 1 guaranties no false
1512          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1513          *   0) below assumes that regstclass does not come from lookahead...
1514          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1515          *   This leaves EXACTF-ish only, which are dealt with in
1516          *   find_byclass().
1517          */
1518 
1519         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521         else if (prog->float_substr || prog->float_utf8) {
1522             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523             endpos = HOP3clim(rx_max_float, cl_l, strend);
1524         }
1525         else
1526             endpos= strend;
1527 
1528         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531               (IV)start_shift, (IV)(check_at - strbeg),
1532               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1533 
1534         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1535                             reginfo);
1536         if (!s) {
1537             if (endpos == strend) {
1538                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539                                 "  Could not match STCLASS...\n") );
1540                 goto fail;
1541             }
1542             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543                                "  This position contradicts STCLASS...\n") );
1544             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545                         && !(prog->intflags & PREGf_IMPLICIT))
1546                 goto fail;
1547 
1548             /* Contradict one of substrings */
1549             if (prog->anchored_substr || prog->anchored_utf8) {
1550                 if (prog->substrs->check_ix == 1) { /* check is float */
1551                     /* Have both, check_string is floating */
1552                     assert(rx_origin + start_shift <= check_at);
1553                     if (rx_origin + start_shift != check_at) {
1554                         /* not at latest position float substr could match:
1555                          * Recheck anchored substring, but not floating.
1556                          * The condition above is in bytes rather than
1557                          * chars for efficiency. It's conservative, in
1558                          * that it errs on the side of doing 'goto
1559                          * do_other_substr'. In this case, at worst,
1560                          * an extra anchored search may get done, but in
1561                          * practice the extra fbm_instr() is likely to
1562                          * get skipped anyway. */
1563                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565                             (long)(other_last - strbeg),
1566                             (IV)(rx_origin - strbeg)
1567                         ));
1568                         goto do_other_substr;
1569                     }
1570                 }
1571             }
1572             else {
1573                 /* float-only */
1574 
1575                 if (ml_anch) {
1576                     /* In the presence of ml_anch, we might be able to
1577                      * find another \n without breaking the current float
1578                      * constraint. */
1579 
1580                     /* strictly speaking this should be HOP3c(..., 1, ...),
1581                      * but since we goto a block of code that's going to
1582                      * search for the next \n if any, its safe here */
1583                     rx_origin++;
1584                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586                               PL_colors[0], PL_colors[1],
1587                               (long)(rx_origin - strbeg)) );
1588                     goto postprocess_substr_matches;
1589                 }
1590 
1591                 /* strictly speaking this can never be true; but might
1592                  * be if we ever allow intuit without substrings */
1593                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1594                     goto fail;
1595 
1596                 rx_origin = rx_max_float;
1597             }
1598 
1599             /* at this point, any matching substrings have been
1600              * contradicted. Start again... */
1601 
1602             rx_origin = HOP3c(rx_origin, 1, strend);
1603 
1604             /* uses bytes rather than char calculations for efficiency.
1605              * It's conservative: it errs on the side of doing 'goto restart',
1606              * where there is code that does a proper char-based test */
1607             if (rx_origin + start_shift + end_shift > strend) {
1608                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609                                        "  Could not match STCLASS...\n") );
1610                 goto fail;
1611             }
1612             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614                 (prog->substrs->check_ix ? "floating" : "anchored"),
1615                 (long)(rx_origin + start_shift - strbeg),
1616                 (IV)(rx_origin - strbeg)
1617             ));
1618             goto restart;
1619         }
1620 
1621         /* Success !!! */
1622 
1623         if (rx_origin != s) {
1624             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625                         "  By STCLASS: moving %ld --> %ld\n",
1626                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1627                    );
1628         }
1629         else {
1630             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631                                   "  Does not contradict STCLASS...\n");
1632                    );
1633         }
1634     }
1635 
1636     /* Decide whether using the substrings helped */
1637 
1638     if (rx_origin != strpos) {
1639         /* Fixed substring is found far enough so that the match
1640            cannot start at strpos. */
1641 
1642         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1643         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1644     }
1645     else {
1646         /* The found rx_origin position does not prohibit matching at
1647          * strpos, so calling intuit didn't gain us anything. Decrement
1648          * the BmUSEFUL() count on the check substring, and if we reach
1649          * zero, free it.  */
1650         if (!(prog->intflags & PREGf_NAUGHTY)
1651             && (utf8_target ? (
1652                 prog->check_utf8		/* Could be deleted already */
1653                 && --BmUSEFUL(prog->check_utf8) < 0
1654                 && (prog->check_utf8 == prog->float_utf8)
1655             ) : (
1656                 prog->check_substr		/* Could be deleted already */
1657                 && --BmUSEFUL(prog->check_substr) < 0
1658                 && (prog->check_substr == prog->float_substr)
1659             )))
1660         {
1661             /* If flags & SOMETHING - do not do it many times on the same match */
1662             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1663             /* XXX Does the destruction order has to change with utf8_target? */
1664             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666             prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1667             prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1668             check = NULL;			/* abort */
1669             /* XXXX This is a remnant of the old implementation.  It
1670                     looks wasteful, since now INTUIT can use many
1671                     other heuristics. */
1672             prog->extflags &= ~RXf_USE_INTUIT;
1673         }
1674     }
1675 
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1679 
1680     return rx_origin;
1681 
1682   fail_finish:				/* Substring not found */
1683     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1684         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1685   fail:
1686     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1687                           PL_colors[4], PL_colors[5]));
1688     return NULL;
1689 }
1690 
1691 
1692 #define DECL_TRIE_TYPE(scan) \
1693     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1694                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1695                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1696                     trie_type = ((scan->flags == EXACT)                             \
1697                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1698                                  : (scan->flags == EXACTL)                          \
1699                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1700                                     : (scan->flags == EXACTFAA)                     \
1701                                       ? (utf8_target                                \
1702                                          ? trie_utf8_exactfa_fold                   \
1703                                          : trie_latin_utf8_exactfa_fold)            \
1704                                       : (scan->flags == EXACTFLU8                   \
1705                                          ? (utf8_target                             \
1706                                            ? trie_flu8                              \
1707                                            : trie_flu8_latin)                       \
1708                                          : (utf8_target                             \
1709                                            ? trie_utf8_fold                         \
1710                                            : trie_latin_utf8_fold)))
1711 
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713  * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1715 STMT_START {                                                                        \
1716     STRLEN skiplen;                                                                 \
1717     U8 flags = FOLD_FLAGS_FULL;                                                     \
1718     switch (trie_type) {                                                            \
1719     case trie_flu8:                                                                 \
1720         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1721         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1722             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1723         }                                                                           \
1724         goto do_trie_utf8_fold;                                                     \
1725     case trie_utf8_exactfa_fold:                                                    \
1726         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1727         /* FALLTHROUGH */                                                           \
1728     case trie_utf8_fold:                                                            \
1729       do_trie_utf8_fold:                                                            \
1730         if ( foldlen>0 ) {                                                          \
1731             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1732             foldlen -= len;                                                         \
1733             uscan += len;                                                           \
1734             len=0;                                                                  \
1735         } else {                                                                    \
1736             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1737                                                                             flags); \
1738             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1739             skiplen = UVCHR_SKIP( uvc );                                            \
1740             foldlen -= skiplen;                                                     \
1741             uscan = foldbuf + skiplen;                                              \
1742         }                                                                           \
1743         break;                                                                      \
1744     case trie_flu8_latin:                                                           \
1745         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1746         goto do_trie_latin_utf8_fold;                                               \
1747     case trie_latin_utf8_exactfa_fold:                                              \
1748         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1749         /* FALLTHROUGH */                                                           \
1750     case trie_latin_utf8_fold:                                                      \
1751       do_trie_latin_utf8_fold:                                                      \
1752         if ( foldlen>0 ) {                                                          \
1753             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1754             foldlen -= len;                                                         \
1755             uscan += len;                                                           \
1756             len=0;                                                                  \
1757         } else {                                                                    \
1758             len = 1;                                                                \
1759             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1760             skiplen = UVCHR_SKIP( uvc );                                            \
1761             foldlen -= skiplen;                                                     \
1762             uscan = foldbuf + skiplen;                                              \
1763         }                                                                           \
1764         break;                                                                      \
1765     case trie_utf8l:                                                                \
1766         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1767         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1768             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1769         }                                                                           \
1770         /* FALLTHROUGH */                                                           \
1771     case trie_utf8:                                                                 \
1772         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1773         break;                                                                      \
1774     case trie_plain:                                                                \
1775         uvc = (UV)*uc;                                                              \
1776         len = 1;                                                                    \
1777     }                                                                               \
1778     if (uvc < 256) {                                                                \
1779         charid = trie->charmap[ uvc ];                                              \
1780     }                                                                               \
1781     else {                                                                          \
1782         charid = 0;                                                                 \
1783         if (widecharmap) {                                                          \
1784             SV** const svpp = hv_fetch(widecharmap,                                 \
1785                         (char*)&uvc, sizeof(UV), 0);                                \
1786             if (svpp)                                                               \
1787                 charid = (U16)SvIV(*svpp);                                          \
1788         }                                                                           \
1789     }                                                                               \
1790 } STMT_END
1791 
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1793     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794                 startpos, doutf8, depth)
1795 
1796 #define REXEC_FBC_UTF8_SCAN(CODE)                           \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             CODE                                            \
1800             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1801         }                                                   \
1802     } STMT_END
1803 
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1805     STMT_START {                                            \
1806         while (s < strend) {                                \
1807             CODE                                            \
1808             s++;                                            \
1809         }                                                   \
1810     } STMT_END
1811 
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1813     STMT_START {                                            \
1814         while (s < strend) {                                \
1815             REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1816         }                                                   \
1817     } STMT_END
1818 
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1820     STMT_START {                                            \
1821         while (s < strend) {                                \
1822             REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1823         }                                                   \
1824     } STMT_END
1825 
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1827     if (COND) {                                                \
1828         FBC_CHECK_AND_TRY                                      \
1829         s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1830         previous_occurrence_end = s;                           \
1831     }                                                          \
1832     else {                                                     \
1833         s += UTF8SKIP(s);                                      \
1834     }
1835 
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1837     if (COND) {                                                \
1838         FBC_CHECK_AND_TRY                                      \
1839         s++;                                                   \
1840         previous_occurrence_end = s;                           \
1841     }                                                          \
1842     else {                                                     \
1843         s++;                                                   \
1844     }
1845 
1846 /* We keep track of where the next character should start after an occurrence
1847  * of the one we're looking for.  Knowing that, we can see right away if the
1848  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1849  * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY                                           \
1851         if (   (   doevery                                          \
1852                 || s != previous_occurrence_end)                    \
1853             && (   reginfo->intuit                                  \
1854                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1855         {                                                           \
1856             goto got_it;                                            \
1857         }
1858 
1859 
1860 /* These differ from the above macros in that they call a function which
1861  * returns the next occurrence of the thing being looked for in 's'; and
1862  * 'strend' if there is no such occurrence.  'f' is something like fcn(a,b,c)
1863  * */
1864 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1865     while (s < strend) {                                    \
1866         s = (char *) (f);                                   \
1867         if (s >= strend) {                                  \
1868             break;                                          \
1869         }                                                   \
1870                                                             \
1871         FBC_CHECK_AND_TRY                                   \
1872         s += UTF8SKIP(s);                                   \
1873         previous_occurrence_end = s;                        \
1874     }
1875 
1876 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1877     while (s < strend) {                                    \
1878         s = (char *) (f);                                   \
1879         if (s >= strend) {                                  \
1880             break;                                          \
1881         }                                                   \
1882                                                             \
1883         FBC_CHECK_AND_TRY                                   \
1884         s++;                                                \
1885         previous_occurrence_end = s;                        \
1886     }
1887 
1888 /* This is like the above macro except the function returns NULL if there is no
1889  * occurrence, and there is a further condition that must be matched besides
1890  * the function */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
1892     while (s < strend) {                                    \
1893         s = (char *) (f);                                     \
1894         if (s == NULL) {                                    \
1895             s = (char *) strend;                            \
1896             break;                                          \
1897         }                                                   \
1898                                                             \
1899         if (COND) {                                         \
1900             FBC_CHECK_AND_TRY                               \
1901             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1902             previous_occurrence_end = s;                    \
1903         }                                                   \
1904         else {                                              \
1905             s += UTF8SKIP(s);                               \
1906         }                                                   \
1907     }
1908 
1909 /* This differs from the above macros in that it is passed a single byte that
1910  * is known to begin the next occurrence of the thing being looked for in 's'.
1911  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1912  * at that position. */
1913 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
1914     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
1915                                               COND)
1916 
1917 /* This is like the function above, but takes an entire string to look for
1918  * instead of a single byte */
1919 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
1920     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
1921                                      ninstr(s, strend, substr, substr_end), \
1922                                      COND)
1923 
1924 /* The four macros below are slightly different versions of the same logic.
1925  *
1926  * The first is for /a and /aa when the target string is UTF-8.  This can only
1927  * match ascii, but it must advance based on UTF-8.   The other three handle
1928  * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
1929  * looking for the boundary (or non-boundary) between a word and non-word
1930  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1931  * must be different.  Find the "wordness" of the character just prior to this
1932  * one, and compare it with the wordness of this one.  If they differ, we have
1933  * a boundary.  At the beginning of the string, pretend that the previous
1934  * character was a new-line.
1935  *
1936  * All these macros uncleanly have side-effects with each other and outside
1937  * variables.  So far it's been too much trouble to clean-up
1938  *
1939  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1940  *               a word character or not.
1941  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1942  *               word/non-word
1943  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1944  *
1945  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1946  * are looking for a boundary or for a non-boundary.  If we are looking for a
1947  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1948  * see if this tentative match actually works, and if so, to quit the loop
1949  * here.  And vice-versa if we are looking for a non-boundary.
1950  *
1951  * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1952  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1953  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1954  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1955  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1956  * complement.  But in that branch we complement tmp, meaning that at the
1957  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1958  * which means at the top of the loop in the next iteration, it is
1959  * TEST_NON_UTF8(s-1) */
1960 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1961     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1962     tmp = TEST_NON_UTF8(tmp);                                                  \
1963     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1964         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1965             tmp = !tmp;                                                        \
1966             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1967         }                                                                      \
1968         else {                                                                 \
1969             IF_FAIL;                                                           \
1970         }                                                                      \
1971     );                                                                         \
1972 
1973 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1974  * TEST_UTF8 is a macro that for the same input code points returns identically
1975  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1976  * end pointer as well) */
1977 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1978     if (s == reginfo->strbeg) {                                                \
1979         tmp = '\n';                                                            \
1980     }                                                                          \
1981     else { /* Back-up to the start of the previous character */                \
1982         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1983         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1984                                                        0, UTF8_ALLOW_DEFAULT); \
1985     }                                                                          \
1986     tmp = TEST_UV(tmp);                                                        \
1987     REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
1988         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1989             tmp = !tmp;                                                        \
1990             IF_SUCCESS;                                                        \
1991         }                                                                      \
1992         else {                                                                 \
1993             IF_FAIL;                                                           \
1994         }                                                                      \
1995     );
1996 
1997 /* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
1998  * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
1999  * set-up by the FBC_BOUND, etc macros below */
2000 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
2001     UTF8_CODE;                                                                 \
2002     /* Here, things have been set up by the previous code so that tmp is the   \
2003      * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
2004      * against the EOS, which we treat as a \n */                              \
2005     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2006         IF_SUCCESS;                                                            \
2007     }                                                                          \
2008     else {                                                                     \
2009         IF_FAIL;                                                               \
2010     }
2011 
2012 /* Same as the macro above, but the target isn't UTF-8 */
2013 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
2014     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2015     tmp = TEST_NON_UTF8(tmp);                                               \
2016     REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2017         if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2018             IF_SUCCESS;                                                     \
2019             tmp = !tmp;                                                     \
2020         }                                                                   \
2021         else {                                                              \
2022             IF_FAIL;                                                        \
2023         }                                                                   \
2024     );                                                                      \
2025     /* Here, things have been set up by the previous code so that tmp is    \
2026      * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2027      * matches against the EOS, which we treat as a \n */                   \
2028     if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2029         IF_SUCCESS;                                                         \
2030     }                                                                       \
2031     else {                                                                  \
2032         IF_FAIL;                                                            \
2033     }
2034 
2035 /* This is the macro to use when we want to see if something that looks like it
2036  * could match, actually does, and if so exits the loop.  It needs to be used
2037  * only for bounds checking macros, as it allows for matching beyond the end of
2038  * string (which should be zero length without having to look at the string
2039  * contents) */
2040 #define REXEC_FBC_TRYIT                                                     \
2041     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2042         goto got_it
2043 
2044 /* The only difference between the BOUND and NBOUND cases is that
2045  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2046  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2047  * with the other one being empty (PLACEHOLDER is defined as empty).
2048  *
2049  * The TEST_FOO parameters are for operating on different forms of input, but
2050  * all should be ones that return identically for the same underlying code
2051  * points */
2052 
2053 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2054     FBC_BOUND_COMMON_UTF8(                                                  \
2055           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2056           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2057 
2058 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2059     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2060 
2061 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2062     FBC_BOUND_COMMON_UTF8(                                                  \
2063                     FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2064                     TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2065 
2066 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2067     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2068 
2069 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2070     FBC_BOUND_COMMON_UTF8(                                                  \
2071               FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2072               TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2073 
2074 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2075     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2076 
2077 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2078     FBC_BOUND_COMMON_UTF8(                                                  \
2079             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2080             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2081 
2082 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2083     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2084 
2085 #ifdef DEBUGGING
2086 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2088   IV cp_out = _invlist_search(invlist, cp_in);
2089   assert(cp_out >= 0);
2090   return cp_out;
2091 }
2092 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2093         invmap[S_get_break_val_cp_checked(invlist, cp)]
2094 #else
2095 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2096         invmap[_invlist_search(invlist, cp)]
2097 #endif
2098 
2099 /* Takes a pointer to an inversion list, a pointer to its corresponding
2100  * inversion map, and a code point, and returns the code point's value
2101  * according to the two arrays.  It assumes that all code points have a value.
2102  * This is used as the base macro for macros for particular properties */
2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2104         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2105 
2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2107  * of a code point, returning the value for the first code point in the string.
2108  * And it takes the particular macro name that finds the desired value given a
2109  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2111              (__ASSERT_(pos < strend)                                          \
2112                  /* Note assumes is valid UTF-8 */                             \
2113              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2114 
2115 /* Returns the GCB value for the input code point */
2116 #define getGCB_VAL_CP(cp)                                                      \
2117           _generic_GET_BREAK_VAL_CP(                                           \
2118                                     PL_GCB_invlist,                            \
2119                                     _Perl_GCB_invmap,                          \
2120                                     (cp))
2121 
2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2123  * bounded by pos and strend */
2124 #define getGCB_VAL_UTF8(pos, strend)                                           \
2125     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2126 
2127 /* Returns the LB value for the input code point */
2128 #define getLB_VAL_CP(cp)                                                       \
2129           _generic_GET_BREAK_VAL_CP(                                           \
2130                                     PL_LB_invlist,                             \
2131                                     _Perl_LB_invmap,                           \
2132                                     (cp))
2133 
2134 /* Returns the LB value for the first code point in the UTF-8 encoded string
2135  * bounded by pos and strend */
2136 #define getLB_VAL_UTF8(pos, strend)                                            \
2137     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2138 
2139 
2140 /* Returns the SB value for the input code point */
2141 #define getSB_VAL_CP(cp)                                                       \
2142           _generic_GET_BREAK_VAL_CP(                                           \
2143                                     PL_SB_invlist,                             \
2144                                     _Perl_SB_invmap,                     \
2145                                     (cp))
2146 
2147 /* Returns the SB value for the first code point in the UTF-8 encoded string
2148  * bounded by pos and strend */
2149 #define getSB_VAL_UTF8(pos, strend)                                            \
2150     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2151 
2152 /* Returns the WB value for the input code point */
2153 #define getWB_VAL_CP(cp)                                                       \
2154           _generic_GET_BREAK_VAL_CP(                                           \
2155                                     PL_WB_invlist,                             \
2156                                     _Perl_WB_invmap,                         \
2157                                     (cp))
2158 
2159 /* Returns the WB value for the first code point in the UTF-8 encoded string
2160  * bounded by pos and strend */
2161 #define getWB_VAL_UTF8(pos, strend)                                            \
2162     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2163 
2164 /* We know what class REx starts with.  Try to find this position... */
2165 /* if reginfo->intuit, its a dryrun */
2166 /* annoyingly all the vars in this routine have different names from their counterparts
2167    in regmatch. /grrr */
2168 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2170     const char *strend, regmatch_info *reginfo)
2171 {
2172 
2173     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2174     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2175 
2176     char *pat_string;   /* The pattern's exactish string */
2177     char *pat_end;	    /* ptr to end char of pat_string */
2178     re_fold_t folder;	/* Function for computing non-utf8 folds */
2179     const U8 *fold_array;   /* array for folding ords < 256 */
2180     STRLEN ln;
2181     STRLEN lnc;
2182     U8 c1;
2183     U8 c2;
2184     char *e = NULL;
2185 
2186     /* In some cases we accept only the first occurence of 'x' in a sequence of
2187      * them.  This variable points to just beyond the end of the previous
2188      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2189      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2190      * hop back.) */
2191     char * previous_occurrence_end = 0;
2192 
2193     I32 tmp;            /* Scratch variable */
2194     const bool utf8_target = reginfo->is_utf8_target;
2195     UV utf8_fold_flags = 0;
2196     const bool is_utf8_pat = reginfo->is_utf8_pat;
2197     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2198                                    with a result inverts that result, as 0^1 =
2199                                    1 and 1^1 = 0 */
2200     _char_class_number classnum;
2201 
2202     RXi_GET_DECL(prog,progi);
2203 
2204     PERL_ARGS_ASSERT_FIND_BYCLASS;
2205 
2206     /* We know what class it must start with. The case statements below have
2207      * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2208      * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2209      * ('p8' and 'pb'. */
2210     switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2211 
2212       case ANYOFPOSIXL_t8_pb:
2213       case ANYOFPOSIXL_t8_p8:
2214       case ANYOFL_t8_pb:
2215       case ANYOFL_t8_p8:
2216         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2217         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2218 
2219         /* FALLTHROUGH */
2220 
2221       case ANYOFD_t8_pb:
2222       case ANYOFD_t8_p8:
2223       case ANYOF_t8_pb:
2224       case ANYOF_t8_p8:
2225         REXEC_FBC_UTF8_CLASS_SCAN(
2226                 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2227         break;
2228 
2229       case ANYOFPOSIXL_tb_pb:
2230       case ANYOFPOSIXL_tb_p8:
2231       case ANYOFL_tb_pb:
2232       case ANYOFL_tb_p8:
2233         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2234         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2235 
2236         /* FALLTHROUGH */
2237 
2238       case ANYOFD_tb_pb:
2239       case ANYOFD_tb_p8:
2240       case ANYOF_tb_pb:
2241       case ANYOF_tb_p8:
2242         if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2243             /* We know that s is in the bitmap range since the target isn't
2244              * UTF-8, so what happens for out-of-range values is not relevant,
2245              * so exclude that from the flags */
2246             REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2247                                                      0));
2248         }
2249         else {
2250             REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2251         }
2252         break;
2253 
2254       case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2255       case ANYOFM_tb_p8:
2256         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2258         break;
2259 
2260       case ANYOFM_t8_pb:
2261       case ANYOFM_t8_p8:
2262         /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2263          * we do anyway for performance reasons, as otherwise we would have to
2264          * examine all the continuation characters */
2265         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2266              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2267         break;
2268 
2269       case NANYOFM_tb_pb:
2270       case NANYOFM_tb_p8:
2271         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2272            find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2273         break;
2274 
2275       case NANYOFM_t8_pb:
2276       case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2277                                   variants. */
2278         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2279                         (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2280                                                     (U8) ARG(c), FLAGS(c)));
2281         break;
2282 
2283       /* These nodes all require at least one code point to be in UTF-8 to
2284        * match */
2285       case ANYOFH_tb_pb:
2286       case ANYOFH_tb_p8:
2287       case ANYOFHb_tb_pb:
2288       case ANYOFHb_tb_p8:
2289       case ANYOFHr_tb_pb:
2290       case ANYOFHr_tb_p8:
2291       case ANYOFHs_tb_pb:
2292       case ANYOFHs_tb_p8:
2293       case EXACTFLU8_tb_pb:
2294       case EXACTFLU8_tb_p8:
2295       case EXACTFU_REQ8_tb_pb:
2296       case EXACTFU_REQ8_tb_p8:
2297         break;
2298 
2299       case ANYOFH_t8_pb:
2300       case ANYOFH_t8_p8:
2301         REXEC_FBC_UTF8_CLASS_SCAN(
2302               (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2303                && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2304         break;
2305 
2306       case ANYOFHb_t8_pb:
2307       case ANYOFHb_t8_p8:
2308         {
2309             /* We know what the first byte of any matched string should be. */
2310             U8 first_byte = FLAGS(c);
2311 
2312             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2313                     reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2314         }
2315         break;
2316 
2317       case ANYOFHr_t8_pb:
2318       case ANYOFHr_t8_p8:
2319         REXEC_FBC_UTF8_CLASS_SCAN(
2320                     (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2321                                 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2322                                 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2323                     && reginclass(prog, c, (U8*)s, (U8*) strend,
2324                                                            1 /* is utf8 */)));
2325         break;
2326 
2327       case ANYOFHs_t8_pb:
2328       case ANYOFHs_t8_p8:
2329         REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2330                         ((struct regnode_anyofhs *) c)->string,
2331                         /* Note FLAGS is the string length in this regnode */
2332                         ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2333                         reginclass(prog, c, (U8*)s, (U8*) strend,
2334                                    1 /* is utf8 */));
2335         break;
2336 
2337       case ANYOFR_tb_pb:
2338       case ANYOFR_tb_p8:
2339         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2340                                             ANYOFRbase(c), ANYOFRdelta(c)));
2341         break;
2342 
2343       case ANYOFR_t8_pb:
2344       case ANYOFR_t8_p8:
2345         REXEC_FBC_UTF8_CLASS_SCAN(
2346                             (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2347                              && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2348                                                               (U8 *) strend,
2349                                                               NULL),
2350                                             ANYOFRbase(c), ANYOFRdelta(c))));
2351         break;
2352 
2353       case ANYOFRb_tb_pb:
2354       case ANYOFRb_tb_p8:
2355         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2356                                             ANYOFRbase(c), ANYOFRdelta(c)));
2357         break;
2358 
2359       case ANYOFRb_t8_pb:
2360       case ANYOFRb_t8_p8:
2361         {   /* We know what the first byte of any matched string should be */
2362             U8 first_byte = FLAGS(c);
2363 
2364             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2365                                 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2366                                                               (U8 *) strend,
2367                                                               NULL),
2368                                             ANYOFRbase(c), ANYOFRdelta(c)));
2369         }
2370         break;
2371 
2372       case EXACTFAA_tb_pb:
2373 
2374         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2375          * which these functions don't handle anyway */
2376         fold_array = PL_fold_latin1;
2377         folder = foldEQ_latin1_s2_folded;
2378         goto do_exactf_non_utf8;
2379 
2380       case EXACTF_tb_pb:
2381         fold_array = PL_fold;
2382         folder = foldEQ;
2383         goto do_exactf_non_utf8;
2384 
2385       case EXACTFL_tb_pb:
2386         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2387 
2388         if (IN_UTF8_CTYPE_LOCALE) {
2389             utf8_fold_flags = FOLDEQ_LOCALE;
2390             goto do_exactf_utf8;
2391         }
2392 
2393         fold_array = PL_fold_locale;
2394         folder = foldEQ_locale;
2395         goto do_exactf_non_utf8;
2396 
2397       case EXACTFU_tb_pb:
2398         /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2399          * don't have to worry here about this single special case in the
2400          * Latin1 range */
2401         fold_array = PL_fold_latin1;
2402         folder = foldEQ_latin1_s2_folded;
2403 
2404         /* FALLTHROUGH */
2405 
2406        do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2407                               are no glitches with fold-length differences
2408                               between the target string and pattern */
2409 
2410         /* The idea in the non-utf8 EXACTF* cases is to first find the first
2411          * character of the EXACTF* node and then, if necessary,
2412          * case-insensitively compare the full text of the node.  c1 is the
2413          * first character.  c2 is its fold.  This logic will not work for
2414          * Unicode semantics and the german sharp ss, which hence should not be
2415          * compiled into a node that gets here. */
2416         pat_string = STRINGs(c);
2417         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2418 
2419         /* We know that we have to match at least 'ln' bytes (which is the same
2420          * as characters, since not utf8).  If we have to match 3 characters,
2421          * and there are only 2 availabe, we know without trying that it will
2422          * fail; so don't start a match past the required minimum number from
2423          * the far end */
2424         e = HOP3c(strend, -((SSize_t)ln), s);
2425         if (e < s)
2426             break;
2427 
2428         c1 = *pat_string;
2429         c2 = fold_array[c1];
2430         if (c1 == c2) { /* If char and fold are the same */
2431             while (s <= e) {
2432                 s = (char *) memchr(s, c1, e + 1 - s);
2433                 if (s == NULL) {
2434                     break;
2435                 }
2436 
2437                 /* Check that the rest of the node matches */
2438                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2439                     && (reginfo->intuit || regtry(reginfo, &s)) )
2440                 {
2441                     goto got_it;
2442                 }
2443                 s++;
2444             }
2445         }
2446         else {
2447             U8 bits_differing = c1 ^ c2;
2448 
2449             /* If the folds differ in one bit position only, we can mask to
2450              * match either of them, and can use this faster find method.  Both
2451              * ASCII and EBCDIC tend to have their case folds differ in only
2452              * one position, so this is very likely */
2453             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2454                 bits_differing = ~ bits_differing;
2455                 while (s <= e) {
2456                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2457                                         (c1 & bits_differing), bits_differing);
2458                     if (s > e) {
2459                         break;
2460                     }
2461 
2462                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2463                         && (reginfo->intuit || regtry(reginfo, &s)) )
2464                     {
2465                         goto got_it;
2466                     }
2467                     s++;
2468                 }
2469             }
2470             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2471                        should actually happen only in EXACTFL nodes */
2472                 while (s <= e) {
2473                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2474                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2475                         && (reginfo->intuit || regtry(reginfo, &s)) )
2476                     {
2477                         goto got_it;
2478                     }
2479                     s++;
2480                 }
2481             }
2482         }
2483         break;
2484 
2485       case EXACTFAA_tb_p8:
2486       case EXACTFAA_t8_p8:
2487         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2488                          |FOLDEQ_S2_ALREADY_FOLDED
2489                          |FOLDEQ_S2_FOLDS_SANE;
2490         goto do_exactf_utf8;
2491 
2492       case EXACTFAA_NO_TRIE_tb_pb:
2493       case EXACTFAA_NO_TRIE_t8_pb:
2494       case EXACTFAA_t8_pb:
2495 
2496         /* Here, and elsewhere in this file, the reason we can't consider a
2497          * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2498          * is because any MICRO SIGN in the pattern won't be folded.  Since the
2499          * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2500          * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2501         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2502         goto do_exactf_utf8;
2503 
2504       case EXACTFL_tb_p8:
2505       case EXACTFL_t8_pb:
2506       case EXACTFL_t8_p8:
2507         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2508         utf8_fold_flags = FOLDEQ_LOCALE;
2509         goto do_exactf_utf8;
2510 
2511       case EXACTFLU8_t8_pb:
2512       case EXACTFLU8_t8_p8:
2513         utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2514                                          | FOLDEQ_S2_FOLDS_SANE;
2515         goto do_exactf_utf8;
2516 
2517       case EXACTFU_REQ8_t8_p8:
2518         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2519         goto do_exactf_utf8;
2520 
2521       case EXACTFU_tb_p8:
2522       case EXACTFU_t8_pb:
2523       case EXACTFU_t8_p8:
2524         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2525         goto do_exactf_utf8;
2526 
2527       /* The following are problematic even though pattern isn't UTF-8.  Use
2528        * full functionality normally not done except for UTF-8. */
2529       case EXACTF_t8_pb:
2530       case EXACTFUP_tb_pb:
2531       case EXACTFUP_t8_pb:
2532 
2533        do_exactf_utf8:
2534         {
2535             unsigned expansion;
2536 
2537             /* If one of the operands is in utf8, we can't use the simpler
2538              * folding above, due to the fact that many different characters
2539              * can have the same fold, or portion of a fold, or different-
2540              * length fold */
2541             pat_string = STRINGs(c);
2542             ln  = STR_LENs(c);	/* length to match in octets/bytes */
2543             pat_end = pat_string + ln;
2544             lnc = is_utf8_pat       /* length to match in characters */
2545                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2546                   : ln;
2547 
2548             /* We have 'lnc' characters to match in the pattern, but because of
2549              * multi-character folding, each character in the target can match
2550              * up to 3 characters (Unicode guarantees it will never exceed
2551              * this) if it is utf8-encoded; and up to 2 if not (based on the
2552              * fact that the Latin 1 folds are already determined, and the only
2553              * multi-char fold in that range is the sharp-s folding to 'ss'.
2554              * Thus, a pattern character can match as little as 1/3 of a string
2555              * character.  Adjust lnc accordingly, rounding up, so that if we
2556              * need to match at least 4+1/3 chars, that really is 5. */
2557             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2558             lnc = (lnc + expansion - 1) / expansion;
2559 
2560             /* As in the non-UTF8 case, if we have to match 3 characters, and
2561              * only 2 are left, it's guaranteed to fail, so don't start a match
2562              * that would require us to go beyond the end of the string */
2563             e = HOP3c(strend, -((SSize_t)lnc), s);
2564 
2565             /* XXX Note that we could recalculate e to stop the loop earlier,
2566              * as the worst case expansion above will rarely be met, and as we
2567              * go along we would usually find that e moves further to the left.
2568              * This would happen only after we reached the point in the loop
2569              * where if there were no expansion we should fail.  Unclear if
2570              * worth the expense */
2571 
2572             while (s <= e) {
2573                 char *my_strend= (char *)strend;
2574                 if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2575                                          pat_string, NULL, ln, is_utf8_pat,
2576                                          utf8_fold_flags)
2577                     && (reginfo->intuit || regtry(reginfo, &s)) )
2578                 {
2579                     goto got_it;
2580                 }
2581                 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2582             }
2583         }
2584         break;
2585 
2586       case BOUNDA_tb_pb:
2587       case BOUNDA_tb_p8:
2588       case BOUND_tb_pb:  /* /d without utf8 target is /a */
2589       case BOUND_tb_p8:
2590         /* regcomp.c makes sure that these only have the traditional \b
2591          * meaning. */
2592         assert(FLAGS(c) == TRADITIONAL_BOUND);
2593 
2594         FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2595         break;
2596 
2597       case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2598       case BOUNDA_t8_p8:
2599         /* regcomp.c makes sure that these only have the traditional \b
2600          * meaning. */
2601         assert(FLAGS(c) == TRADITIONAL_BOUND);
2602 
2603         FBC_BOUND_A_UTF8(isWORDCHAR_A);
2604         break;
2605 
2606       case NBOUNDA_tb_pb:
2607       case NBOUNDA_tb_p8:
2608       case NBOUND_tb_pb: /* /d without utf8 target is /a */
2609       case NBOUND_tb_p8:
2610         /* regcomp.c makes sure that these only have the traditional \b
2611          * meaning. */
2612         assert(FLAGS(c) == TRADITIONAL_BOUND);
2613 
2614         FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2615         break;
2616 
2617       case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2618       case NBOUNDA_t8_p8:
2619         /* regcomp.c makes sure that these only have the traditional \b
2620          * meaning. */
2621         assert(FLAGS(c) == TRADITIONAL_BOUND);
2622 
2623         FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2624         break;
2625 
2626       case NBOUNDU_tb_pb:
2627       case NBOUNDU_tb_p8:
2628         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2629             FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2630             break;
2631         }
2632 
2633         to_complement = 1;
2634         goto do_boundu_non_utf8;
2635 
2636       case NBOUNDL_tb_pb:
2637       case NBOUNDL_tb_p8:
2638         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2639         if (FLAGS(c) == TRADITIONAL_BOUND) {
2640             FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2641             break;
2642         }
2643 
2644         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2645 
2646         to_complement = 1;
2647         goto do_boundu_non_utf8;
2648 
2649       case BOUNDL_tb_pb:
2650       case BOUNDL_tb_p8:
2651         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2652         if (FLAGS(c) == TRADITIONAL_BOUND) {
2653             FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2654             break;
2655         }
2656 
2657         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2658 
2659         goto do_boundu_non_utf8;
2660 
2661       case BOUNDU_tb_pb:
2662       case BOUNDU_tb_p8:
2663         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2664             FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2665             break;
2666         }
2667 
2668       do_boundu_non_utf8:
2669         if (s == reginfo->strbeg) {
2670             if (reginfo->intuit || regtry(reginfo, &s))
2671             {
2672                 goto got_it;
2673             }
2674 
2675             /* Didn't match.  Try at the next position (if there is one) */
2676             s++;
2677             if (UNLIKELY(s >= reginfo->strend)) {
2678                 break;
2679             }
2680         }
2681 
2682         switch((bound_type) FLAGS(c)) {
2683           case TRADITIONAL_BOUND: /* Should have already been handled */
2684             assert(0);
2685             break;
2686 
2687           case GCB_BOUND:
2688             /* Not utf8.  Everything is a GCB except between CR and LF */
2689             while (s < strend) {
2690                 if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2691                                       || UCHARAT(s) != '\n'))
2692                     && (reginfo->intuit || regtry(reginfo, &s)))
2693                 {
2694                     goto got_it;
2695                 }
2696                 s++;
2697             }
2698 
2699             break;
2700 
2701           case LB_BOUND:
2702             {
2703                 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2704                 while (s < strend) {
2705                     LB_enum after = getLB_VAL_CP((U8) *s);
2706                     if (to_complement ^ isLB(before,
2707                                              after,
2708                                              (U8*) reginfo->strbeg,
2709                                              (U8*) s,
2710                                              (U8*) reginfo->strend,
2711                                              0 /* target not utf8 */ )
2712                         && (reginfo->intuit || regtry(reginfo, &s)))
2713                     {
2714                         goto got_it;
2715                     }
2716                     before = after;
2717                     s++;
2718                 }
2719             }
2720 
2721             break;
2722 
2723           case SB_BOUND:
2724             {
2725                 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2726                 while (s < strend) {
2727                     SB_enum after = getSB_VAL_CP((U8) *s);
2728                     if ((to_complement ^ isSB(before,
2729                                               after,
2730                                               (U8*) reginfo->strbeg,
2731                                               (U8*) s,
2732                                               (U8*) reginfo->strend,
2733                                              0 /* target not utf8 */ ))
2734                         && (reginfo->intuit || regtry(reginfo, &s)))
2735                     {
2736                         goto got_it;
2737                     }
2738                     before = after;
2739                     s++;
2740                 }
2741             }
2742 
2743             break;
2744 
2745           case WB_BOUND:
2746             {
2747                 WB_enum previous = WB_UNKNOWN;
2748                 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2749                 while (s < strend) {
2750                     WB_enum after = getWB_VAL_CP((U8) *s);
2751                     if ((to_complement ^ isWB(previous,
2752                                               before,
2753                                               after,
2754                                               (U8*) reginfo->strbeg,
2755                                               (U8*) s,
2756                                               (U8*) reginfo->strend,
2757                                                0 /* target not utf8 */ ))
2758                         && (reginfo->intuit || regtry(reginfo, &s)))
2759                     {
2760                         goto got_it;
2761                     }
2762                     previous = before;
2763                     before = after;
2764                     s++;
2765                 }
2766             }
2767         }
2768 
2769         /* Here are at the final position in the target string, which is a
2770          * boundary by definition, so matches, depending on other constraints.
2771          * */
2772         if (   reginfo->intuit
2773             || (s <= reginfo->strend && regtry(reginfo, &s)))
2774         {
2775             goto got_it;
2776         }
2777 
2778         break;
2779 
2780       case BOUNDL_t8_pb:
2781       case BOUNDL_t8_p8:
2782         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2783         if (FLAGS(c) == TRADITIONAL_BOUND) {
2784             FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2785                            isWORDCHAR_LC_utf8_safe);
2786             break;
2787         }
2788 
2789         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2790 
2791         to_complement = 1;
2792         goto do_boundu_utf8;
2793 
2794       case NBOUNDL_t8_pb:
2795       case NBOUNDL_t8_p8:
2796         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2797         if (FLAGS(c) == TRADITIONAL_BOUND) {
2798             FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2799                             isWORDCHAR_LC_utf8_safe);
2800             break;
2801         }
2802 
2803         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2804 
2805         to_complement = 1;
2806         goto do_boundu_utf8;
2807 
2808       case NBOUND_t8_pb:
2809       case NBOUND_t8_p8:
2810         /* regcomp.c makes sure that these only have the traditional \b
2811          * meaning. */
2812         assert(FLAGS(c) == TRADITIONAL_BOUND);
2813 
2814         /* FALLTHROUGH */
2815 
2816       case NBOUNDU_t8_pb:
2817       case NBOUNDU_t8_p8:
2818         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2819             FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2820                             isWORDCHAR_utf8_safe);
2821             break;
2822         }
2823 
2824         to_complement = 1;
2825         goto do_boundu_utf8;
2826 
2827       case BOUND_t8_pb:
2828       case BOUND_t8_p8:
2829         /* regcomp.c makes sure that these only have the traditional \b
2830          * meaning. */
2831         assert(FLAGS(c) == TRADITIONAL_BOUND);
2832 
2833         /* FALLTHROUGH */
2834 
2835       case BOUNDU_t8_pb:
2836       case BOUNDU_t8_p8:
2837         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2838             FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2839             break;
2840         }
2841 
2842       do_boundu_utf8:
2843         if (s == reginfo->strbeg) {
2844             if (reginfo->intuit || regtry(reginfo, &s))
2845             {
2846                 goto got_it;
2847             }
2848 
2849             /* Didn't match.  Try at the next position (if there is one) */
2850             s += UTF8_SAFE_SKIP(s, reginfo->strend);
2851             if (UNLIKELY(s >= reginfo->strend)) {
2852                 break;
2853             }
2854         }
2855 
2856         switch((bound_type) FLAGS(c)) {
2857           case TRADITIONAL_BOUND: /* Should have already been handled */
2858             assert(0);
2859             break;
2860 
2861           case GCB_BOUND:
2862             {
2863                 GCB_enum before = getGCB_VAL_UTF8(
2864                                            reghop3((U8*)s, -1,
2865                                                    (U8*)(reginfo->strbeg)),
2866                                            (U8*) reginfo->strend);
2867                 while (s < strend) {
2868                     GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2869                                                     (U8*) reginfo->strend);
2870                     if (   (to_complement ^ isGCB(before,
2871                                                   after,
2872                                                   (U8*) reginfo->strbeg,
2873                                                   (U8*) s,
2874                                                   1 /* target is utf8 */ ))
2875                         && (reginfo->intuit || regtry(reginfo, &s)))
2876                     {
2877                         goto got_it;
2878                     }
2879                     before = after;
2880                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2881                 }
2882             }
2883             break;
2884 
2885           case LB_BOUND:
2886             {
2887                 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2888                                                         -1,
2889                                                         (U8*)(reginfo->strbeg)),
2890                                                    (U8*) reginfo->strend);
2891                 while (s < strend) {
2892                     LB_enum after = getLB_VAL_UTF8((U8*) s,
2893                                                    (U8*) reginfo->strend);
2894                     if (to_complement ^ isLB(before,
2895                                              after,
2896                                              (U8*) reginfo->strbeg,
2897                                              (U8*) s,
2898                                              (U8*) reginfo->strend,
2899                                              1 /* target is utf8 */ )
2900                         && (reginfo->intuit || regtry(reginfo, &s)))
2901                     {
2902                         goto got_it;
2903                     }
2904                     before = after;
2905                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2906                 }
2907             }
2908 
2909             break;
2910 
2911           case SB_BOUND:
2912             {
2913                 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2914                                                     -1,
2915                                                     (U8*)(reginfo->strbeg)),
2916                                                   (U8*) reginfo->strend);
2917                 while (s < strend) {
2918                     SB_enum after = getSB_VAL_UTF8((U8*) s,
2919                                                      (U8*) reginfo->strend);
2920                     if ((to_complement ^ isSB(before,
2921                                               after,
2922                                               (U8*) reginfo->strbeg,
2923                                               (U8*) s,
2924                                               (U8*) reginfo->strend,
2925                                               1 /* target is utf8 */ ))
2926                         && (reginfo->intuit || regtry(reginfo, &s)))
2927                     {
2928                         goto got_it;
2929                     }
2930                     before = after;
2931                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2932                 }
2933             }
2934 
2935             break;
2936 
2937           case WB_BOUND:
2938             {
2939                 /* We are at a boundary between char_sub_0 and char_sub_1.
2940                  * We also keep track of the value for char_sub_-1 as we
2941                  * loop through the line.   Context may be needed to make a
2942                  * determination, and if so, this can save having to
2943                  * recalculate it */
2944                 WB_enum previous = WB_UNKNOWN;
2945                 WB_enum before = getWB_VAL_UTF8(
2946                                           reghop3((U8*)s,
2947                                                   -1,
2948                                                   (U8*)(reginfo->strbeg)),
2949                                           (U8*) reginfo->strend);
2950                 while (s < strend) {
2951                     WB_enum after = getWB_VAL_UTF8((U8*) s,
2952                                                     (U8*) reginfo->strend);
2953                     if ((to_complement ^ isWB(previous,
2954                                               before,
2955                                               after,
2956                                               (U8*) reginfo->strbeg,
2957                                               (U8*) s,
2958                                               (U8*) reginfo->strend,
2959                                               1 /* target is utf8 */ ))
2960                         && (reginfo->intuit || regtry(reginfo, &s)))
2961                     {
2962                         goto got_it;
2963                     }
2964                     previous = before;
2965                     before = after;
2966                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2967                 }
2968             }
2969         }
2970 
2971         /* Here are at the final position in the target string, which is a
2972          * boundary by definition, so matches, depending on other constraints.
2973          * */
2974 
2975         if (   reginfo->intuit
2976             || (s <= reginfo->strend && regtry(reginfo, &s)))
2977         {
2978             goto got_it;
2979         }
2980         break;
2981 
2982       case LNBREAK_t8_pb:
2983       case LNBREAK_t8_p8:
2984         REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2985         break;
2986 
2987       case LNBREAK_tb_pb:
2988       case LNBREAK_tb_p8:
2989         REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2990         break;
2991 
2992       /* The argument to all the POSIX node types is the class number to pass
2993        * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2994 
2995       case NPOSIXL_t8_pb:
2996       case NPOSIXL_t8_p8:
2997         to_complement = 1;
2998         /* FALLTHROUGH */
2999 
3000       case POSIXL_t8_pb:
3001       case POSIXL_t8_p8:
3002         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3003         REXEC_FBC_UTF8_CLASS_SCAN(
3004             to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3005                                                           (U8 *) strend)));
3006         break;
3007 
3008       case NPOSIXL_tb_pb:
3009       case NPOSIXL_tb_p8:
3010         to_complement = 1;
3011         /* FALLTHROUGH */
3012 
3013       case POSIXL_tb_pb:
3014       case POSIXL_tb_p8:
3015         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3016         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3017                                 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3018         break;
3019 
3020       case NPOSIXA_t8_pb:
3021       case NPOSIXA_t8_p8:
3022         /* The complement of something that matches only ASCII matches all
3023          * non-ASCII, plus everything in ASCII that isn't in the class. */
3024         REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3025                                   || ! _generic_isCC_A(*s, FLAGS(c)));
3026         break;
3027 
3028       case POSIXA_t8_pb:
3029       case POSIXA_t8_p8:
3030         /* Don't need to worry about utf8, as it can match only a single
3031          * byte invariant character.  But we do anyway for performance reasons,
3032          * as otherwise we would have to examine all the continuation
3033          * characters */
3034         REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3035         break;
3036 
3037       case NPOSIXD_tb_pb:
3038       case NPOSIXD_tb_p8:
3039       case NPOSIXA_tb_pb:
3040       case NPOSIXA_tb_p8:
3041         to_complement = 1;
3042         /* FALLTHROUGH */
3043 
3044       case POSIXD_tb_pb:
3045       case POSIXD_tb_p8:
3046       case POSIXA_tb_pb:
3047       case POSIXA_tb_p8:
3048         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3049                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3050         break;
3051 
3052       case NPOSIXU_tb_pb:
3053       case NPOSIXU_tb_p8:
3054         to_complement = 1;
3055         /* FALLTHROUGH */
3056 
3057       case POSIXU_tb_pb:
3058       case POSIXU_tb_p8:
3059             REXEC_FBC_NON_UTF8_CLASS_SCAN(
3060                                  to_complement ^ cBOOL(_generic_isCC(*s,
3061                                                                     FLAGS(c))));
3062         break;
3063 
3064       case NPOSIXD_t8_pb:
3065       case NPOSIXD_t8_p8:
3066       case NPOSIXU_t8_pb:
3067       case NPOSIXU_t8_p8:
3068         to_complement = 1;
3069         /* FALLTHROUGH */
3070 
3071       case POSIXD_t8_pb:
3072       case POSIXD_t8_p8:
3073       case POSIXU_t8_pb:
3074       case POSIXU_t8_p8:
3075         classnum = (_char_class_number) FLAGS(c);
3076         switch (classnum) {
3077           default:
3078             REXEC_FBC_UTF8_CLASS_SCAN(
3079                         to_complement ^ cBOOL(_invlist_contains_cp(
3080                                                 PL_XPosix_ptrs[classnum],
3081                                                 utf8_to_uvchr_buf((U8 *) s,
3082                                                                 (U8 *) strend,
3083                                                                 NULL))));
3084             break;
3085 
3086           case _CC_ENUM_SPACE:
3087             REXEC_FBC_UTF8_CLASS_SCAN(
3088                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3089             break;
3090 
3091           case _CC_ENUM_BLANK:
3092             REXEC_FBC_UTF8_CLASS_SCAN(
3093                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3094             break;
3095 
3096           case _CC_ENUM_XDIGIT:
3097             REXEC_FBC_UTF8_CLASS_SCAN(
3098                         to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3099             break;
3100 
3101           case _CC_ENUM_VERTSPACE:
3102             REXEC_FBC_UTF8_CLASS_SCAN(
3103                         to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3104             break;
3105 
3106           case _CC_ENUM_CNTRL:
3107             REXEC_FBC_UTF8_CLASS_SCAN(
3108                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3109             break;
3110         }
3111         break;
3112 
3113       case AHOCORASICKC_tb_pb:
3114       case AHOCORASICKC_tb_p8:
3115       case AHOCORASICKC_t8_pb:
3116       case AHOCORASICKC_t8_p8:
3117       case AHOCORASICK_tb_pb:
3118       case AHOCORASICK_tb_p8:
3119       case AHOCORASICK_t8_pb:
3120       case AHOCORASICK_t8_p8:
3121         {
3122             DECL_TRIE_TYPE(c);
3123             /* what trie are we using right now */
3124             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3125             reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3126             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3127 
3128             const char *last_start = strend - trie->minlen;
3129 #ifdef DEBUGGING
3130             const char *real_start = s;
3131 #endif
3132             STRLEN maxlen = trie->maxlen;
3133             SV *sv_points;
3134             U8 **points; /* map of where we were in the input string
3135                             when reading a given char. For ASCII this
3136                             is unnecessary overhead as the relationship
3137                             is always 1:1, but for Unicode, especially
3138                             case folded Unicode this is not true. */
3139             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3140             U8 *bitmap=NULL;
3141 
3142 
3143             DECLARE_AND_GET_RE_DEBUG_FLAGS;
3144 
3145             /* We can't just allocate points here. We need to wrap it in
3146              * an SV so it gets freed properly if there is a croak while
3147              * running the match */
3148             ENTER;
3149             SAVETMPS;
3150             sv_points=newSV(maxlen * sizeof(U8 *));
3151             SvCUR_set(sv_points,
3152                 maxlen * sizeof(U8 *));
3153             SvPOK_on(sv_points);
3154             sv_2mortal(sv_points);
3155             points=(U8**)SvPV_nolen(sv_points );
3156             if ( trie_type != trie_utf8_fold
3157                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
3158             {
3159                 if (trie->bitmap)
3160                     bitmap=(U8*)trie->bitmap;
3161                 else
3162                     bitmap=(U8*)ANYOF_BITMAP(c);
3163             }
3164             /* this is the Aho-Corasick algorithm modified a touch
3165                to include special handling for long "unknown char" sequences.
3166                The basic idea being that we use AC as long as we are dealing
3167                with a possible matching char, when we encounter an unknown char
3168                (and we have not encountered an accepting state) we scan forward
3169                until we find a legal starting char.
3170                AC matching is basically that of trie matching, except that when
3171                we encounter a failing transition, we fall back to the current
3172                states "fail state", and try the current char again, a process
3173                we repeat until we reach the root state, state 1, or a legal
3174                transition. If we fail on the root state then we can either
3175                terminate if we have reached an accepting state previously, or
3176                restart the entire process from the beginning if we have not.
3177 
3178              */
3179             while (s <= last_start) {
3180                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3181                 U8 *uc = (U8*)s;
3182                 U16 charid = 0;
3183                 U32 base = 1;
3184                 U32 state = 1;
3185                 UV uvc = 0;
3186                 STRLEN len = 0;
3187                 STRLEN foldlen = 0;
3188                 U8 *uscan = (U8*)NULL;
3189                 U8 *leftmost = NULL;
3190 #ifdef DEBUGGING
3191                 U32 accepted_word= 0;
3192 #endif
3193                 U32 pointpos = 0;
3194 
3195                 while ( state && uc <= (U8*)strend ) {
3196                     int failed=0;
3197                     U32 word = aho->states[ state ].wordnum;
3198 
3199                     if( state==1 ) {
3200                         if ( bitmap ) {
3201                             DEBUG_TRIE_EXECUTE_r(
3202                                 if (  uc <= (U8*)last_start
3203                                     && !BITMAP_TEST(bitmap,*uc) )
3204                                 {
3205                                     dump_exec_pos( (char *)uc, c, strend,
3206                                         real_start,
3207                                         (char *)uc, utf8_target, 0 );
3208                                     Perl_re_printf( aTHX_
3209                                         " Scanning for legal start char...\n");
3210                                 }
3211                             );
3212                             if (utf8_target) {
3213                                 while (  uc <= (U8*)last_start
3214                                        && !BITMAP_TEST(bitmap,*uc) )
3215                                 {
3216                                     uc += UTF8SKIP(uc);
3217                                 }
3218                             } else {
3219                                 while (  uc <= (U8*)last_start
3220                                        && ! BITMAP_TEST(bitmap,*uc) )
3221                                 {
3222                                     uc++;
3223                                 }
3224                             }
3225                             s= (char *)uc;
3226                         }
3227                         if (uc >(U8*)last_start) break;
3228                     }
3229 
3230                     if ( word ) {
3231                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3232                                                                     % maxlen ];
3233                         if (!leftmost || lpos < leftmost) {
3234                             DEBUG_r(accepted_word=word);
3235                             leftmost= lpos;
3236                         }
3237                         if (base==0) break;
3238 
3239                     }
3240                     points[pointpos++ % maxlen]= uc;
3241                     if (foldlen || uc < (U8*)strend) {
3242                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3243                                              (U8 *) strend, uscan, len, uvc,
3244                                              charid, foldlen, foldbuf,
3245                                              uniflags);
3246                         DEBUG_TRIE_EXECUTE_r({
3247                             dump_exec_pos( (char *)uc, c, strend,
3248                                         real_start, s, utf8_target, 0);
3249                             Perl_re_printf( aTHX_
3250                                 " Charid:%3u CP:%4" UVxf " ",
3251                                  charid, uvc);
3252                         });
3253                     }
3254                     else {
3255                         len = 0;
3256                         charid = 0;
3257                     }
3258 
3259 
3260                     do {
3261 #ifdef DEBUGGING
3262                         word = aho->states[ state ].wordnum;
3263 #endif
3264                         base = aho->states[ state ].trans.base;
3265 
3266                         DEBUG_TRIE_EXECUTE_r({
3267                             if (failed)
3268                                 dump_exec_pos((char *)uc, c, strend, real_start,
3269                                     s,   utf8_target, 0 );
3270                             Perl_re_printf( aTHX_
3271                                 "%sState: %4" UVxf ", word=%" UVxf,
3272                                 failed ? " Fail transition to " : "",
3273                                 (UV)state, (UV)word);
3274                         });
3275                         if ( base ) {
3276                             U32 tmp;
3277                             I32 offset;
3278                             if (charid &&
3279                                  ( ((offset = base + charid
3280                                     - 1 - trie->uniquecharcount)) >= 0)
3281                                  && ((U32)offset < trie->lasttrans)
3282                                  && trie->trans[offset].check == state
3283                                  && (tmp=trie->trans[offset].next))
3284                             {
3285                                 DEBUG_TRIE_EXECUTE_r(
3286                                     Perl_re_printf( aTHX_ " - legal\n"));
3287                                 state = tmp;
3288                                 break;
3289                             }
3290                             else {
3291                                 DEBUG_TRIE_EXECUTE_r(
3292                                     Perl_re_printf( aTHX_ " - fail\n"));
3293                                 failed = 1;
3294                                 state = aho->fail[state];
3295                             }
3296                         }
3297                         else {
3298                             /* we must be accepting here */
3299                             DEBUG_TRIE_EXECUTE_r(
3300                                     Perl_re_printf( aTHX_ " - accepting\n"));
3301                             failed = 1;
3302                             break;
3303                         }
3304                     } while(state);
3305                     uc += len;
3306                     if (failed) {
3307                         if (leftmost)
3308                             break;
3309                         if (!state) state = 1;
3310                     }
3311                 }
3312                 if ( aho->states[ state ].wordnum ) {
3313                     U8 *lpos = points[ (pointpos
3314                                       - trie->wordinfo[aho->states[ state ]
3315                                                     .wordnum].len) % maxlen ];
3316                     if (!leftmost || lpos < leftmost) {
3317                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3318                         leftmost = lpos;
3319                     }
3320                 }
3321                 if (leftmost) {
3322                     s = (char*)leftmost;
3323                     DEBUG_TRIE_EXECUTE_r({
3324                         Perl_re_printf( aTHX_  "Matches word #%" UVxf
3325                                         " at position %" IVdf ". Trying full"
3326                                         " pattern...\n",
3327                             (UV)accepted_word, (IV)(s - real_start)
3328                         );
3329                     });
3330                     if (reginfo->intuit || regtry(reginfo, &s)) {
3331                         FREETMPS;
3332                         LEAVE;
3333                         goto got_it;
3334                     }
3335                     if (s < reginfo->strend) {
3336                         s = HOPc(s,1);
3337                     }
3338                     DEBUG_TRIE_EXECUTE_r({
3339                         Perl_re_printf( aTHX_
3340                                        "Pattern failed. Looking for new start"
3341                                        " point...\n");
3342                     });
3343                 } else {
3344                     DEBUG_TRIE_EXECUTE_r(
3345                         Perl_re_printf( aTHX_ "No match.\n"));
3346                     break;
3347                 }
3348             }
3349             FREETMPS;
3350             LEAVE;
3351         }
3352         break;
3353 
3354       case EXACTFU_REQ8_t8_pb:
3355       case EXACTFUP_tb_p8:
3356       case EXACTFUP_t8_p8:
3357       case EXACTF_tb_p8:
3358       case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3359       case EXACTFAA_NO_TRIE_tb_p8:
3360       case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3361                                       patterns */
3362         assert(0);
3363 
3364       default:
3365         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3366     } /* End of switch on node type */
3367 
3368     return 0;
3369 
3370   got_it:
3371     return s;
3372 }
3373 
3374 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3375  * flags have same meanings as with regexec_flags() */
3376 
3377 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3378 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3379                             char *strbeg,
3380                             char *strend,
3381                             SV *sv,
3382                             U32 flags,
3383                             bool utf8_target)
3384 {
3385     struct regexp *const prog = ReANY(rx);
3386 
3387     if (flags & REXEC_COPY_STR) {
3388 #ifdef PERL_ANY_COW
3389         if (SvCANCOW(sv)) {
3390             DEBUG_C(Perl_re_printf( aTHX_
3391                               "Copy on write: regexp capture, type %d\n",
3392                                     (int) SvTYPE(sv)));
3393             /* Create a new COW SV to share the match string and store
3394              * in saved_copy, unless the current COW SV in saved_copy
3395              * is valid and suitable for our purpose */
3396             if ((   prog->saved_copy
3397                  && SvIsCOW(prog->saved_copy)
3398                  && SvPOKp(prog->saved_copy)
3399                  && SvIsCOW(sv)
3400                  && SvPOKp(sv)
3401                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3402             {
3403                 /* just reuse saved_copy SV */
3404                 if (RXp_MATCH_COPIED(prog)) {
3405                     Safefree(prog->subbeg);
3406                     RXp_MATCH_COPIED_off(prog);
3407                 }
3408             }
3409             else {
3410                 /* create new COW SV to share string */
3411                 RXp_MATCH_COPY_FREE(prog);
3412                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3413             }
3414             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3415             assert (SvPOKp(prog->saved_copy));
3416             prog->sublen  = strend - strbeg;
3417             prog->suboffset = 0;
3418             prog->subcoffset = 0;
3419         } else
3420 #endif
3421         {
3422             SSize_t min = 0;
3423             SSize_t max = strend - strbeg;
3424             SSize_t sublen;
3425 
3426             if (    (flags & REXEC_COPY_SKIP_POST)
3427                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3428                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3429             ) { /* don't copy $' part of string */
3430                 U32 n = 0;
3431                 max = -1;
3432                 /* calculate the right-most part of the string covered
3433                  * by a capture. Due to lookahead, this may be to
3434                  * the right of $&, so we have to scan all captures */
3435                 while (n <= prog->lastparen) {
3436                     if (prog->offs[n].end > max)
3437                         max = prog->offs[n].end;
3438                     n++;
3439                 }
3440                 if (max == -1)
3441                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3442                             ? prog->offs[0].start
3443                             : 0;
3444                 assert(max >= 0 && max <= strend - strbeg);
3445             }
3446 
3447             if (    (flags & REXEC_COPY_SKIP_PRE)
3448                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3449                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3450             ) { /* don't copy $` part of string */
3451                 U32 n = 0;
3452                 min = max;
3453                 /* calculate the left-most part of the string covered
3454                  * by a capture. Due to lookbehind, this may be to
3455                  * the left of $&, so we have to scan all captures */
3456                 while (min && n <= prog->lastparen) {
3457                     if (   prog->offs[n].start != -1
3458                         && prog->offs[n].start < min)
3459                     {
3460                         min = prog->offs[n].start;
3461                     }
3462                     n++;
3463                 }
3464                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3465                     && min >  prog->offs[0].end
3466                 )
3467                     min = prog->offs[0].end;
3468 
3469             }
3470 
3471             assert(min >= 0 && min <= max && min <= strend - strbeg);
3472             sublen = max - min;
3473 
3474             if (RXp_MATCH_COPIED(prog)) {
3475                 if (sublen > prog->sublen)
3476                     prog->subbeg =
3477                             (char*)saferealloc(prog->subbeg, sublen+1);
3478             }
3479             else
3480                 prog->subbeg = (char*)safemalloc(sublen+1);
3481             Copy(strbeg + min, prog->subbeg, sublen, char);
3482             prog->subbeg[sublen] = '\0';
3483             prog->suboffset = min;
3484             prog->sublen = sublen;
3485             RXp_MATCH_COPIED_on(prog);
3486         }
3487         prog->subcoffset = prog->suboffset;
3488         if (prog->suboffset && utf8_target) {
3489             /* Convert byte offset to chars.
3490              * XXX ideally should only compute this if @-/@+
3491              * has been seen, a la PL_sawampersand ??? */
3492 
3493             /* If there's a direct correspondence between the
3494              * string which we're matching and the original SV,
3495              * then we can use the utf8 len cache associated with
3496              * the SV. In particular, it means that under //g,
3497              * sv_pos_b2u() will use the previously cached
3498              * position to speed up working out the new length of
3499              * subcoffset, rather than counting from the start of
3500              * the string each time. This stops
3501              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3502              * from going quadratic */
3503             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3504                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3505                                                 SV_GMAGIC|SV_CONST_RETURN);
3506             else
3507                 prog->subcoffset = utf8_length((U8*)strbeg,
3508                                     (U8*)(strbeg+prog->suboffset));
3509         }
3510     }
3511     else {
3512         RXp_MATCH_COPY_FREE(prog);
3513         prog->subbeg = strbeg;
3514         prog->suboffset = 0;
3515         prog->subcoffset = 0;
3516         prog->sublen = strend - strbeg;
3517     }
3518 }
3519 
3520 
3521 
3522 
3523 /*
3524  - regexec_flags - match a regexp against a string
3525  */
3526 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3527 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3528               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3529 /* stringarg: the point in the string at which to begin matching */
3530 /* strend:    pointer to null at end of string */
3531 /* strbeg:    real beginning of string */
3532 /* minend:    end of match must be >= minend bytes after stringarg. */
3533 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3534  *            itself is accessed via the pointers above */
3535 /* data:      May be used for some additional optimizations.
3536               Currently unused. */
3537 /* flags:     For optimizations. See REXEC_* in regexp.h */
3538 
3539 {
3540     struct regexp *const prog = ReANY(rx);
3541     char *s;
3542     regnode *c;
3543     char *startpos;
3544     SSize_t minlen;		/* must match at least this many chars */
3545     SSize_t dontbother = 0;	/* how many characters not to try at end */
3546     const bool utf8_target = cBOOL(DO_UTF8(sv));
3547     I32 multiline;
3548     RXi_GET_DECL(prog,progi);
3549     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3550     regmatch_info *const reginfo = &reginfo_buf;
3551     regexp_paren_pair *swap = NULL;
3552     I32 oldsave;
3553     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3554 
3555     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3556     PERL_UNUSED_ARG(data);
3557 
3558     /* Be paranoid... */
3559     if (prog == NULL) {
3560         Perl_croak(aTHX_ "NULL regexp parameter");
3561     }
3562 
3563     DEBUG_EXECUTE_r(
3564         debug_start_match(rx, utf8_target, stringarg, strend,
3565         "Matching");
3566     );
3567 
3568     startpos = stringarg;
3569 
3570     /* set these early as they may be used by the HOP macros below */
3571     reginfo->strbeg = strbeg;
3572     reginfo->strend = strend;
3573     reginfo->is_utf8_target = cBOOL(utf8_target);
3574 
3575     if (prog->intflags & PREGf_GPOS_SEEN) {
3576         MAGIC *mg;
3577 
3578         /* set reginfo->ganch, the position where \G can match */
3579 
3580         reginfo->ganch =
3581             (flags & REXEC_IGNOREPOS)
3582             ? stringarg /* use start pos rather than pos() */
3583             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3584               /* Defined pos(): */
3585             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3586             : strbeg; /* pos() not defined; use start of string */
3587 
3588         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3589             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3590 
3591         /* in the presence of \G, we may need to start looking earlier in
3592          * the string than the suggested start point of stringarg:
3593          * if prog->gofs is set, then that's a known, fixed minimum
3594          * offset, such as
3595          * /..\G/:   gofs = 2
3596          * /ab|c\G/: gofs = 1
3597          * or if the minimum offset isn't known, then we have to go back
3598          * to the start of the string, e.g. /w+\G/
3599          */
3600 
3601         if (prog->intflags & PREGf_ANCH_GPOS) {
3602             if (prog->gofs) {
3603                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3604                 if (!startpos ||
3605                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3606                 {
3607                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3608                             "fail: ganch-gofs before earliest possible start\n"));
3609                     return 0;
3610                 }
3611             }
3612             else
3613                 startpos = reginfo->ganch;
3614         }
3615         else if (prog->gofs) {
3616             startpos = HOPBACKc(startpos, prog->gofs);
3617             if (!startpos)
3618                 startpos = strbeg;
3619         }
3620         else if (prog->intflags & PREGf_GPOS_FLOAT)
3621             startpos = strbeg;
3622     }
3623 
3624     minlen = prog->minlen;
3625     if ((startpos + minlen) > strend || startpos < strbeg) {
3626         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3627                         "Regex match can't succeed, so not even tried\n"));
3628         return 0;
3629     }
3630 
3631     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3632      * which will call destuctors to reset PL_regmatch_state, free higher
3633      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3634      * regmatch_info_aux_eval */
3635 
3636     oldsave = PL_savestack_ix;
3637 
3638     s = startpos;
3639 
3640     if ((prog->extflags & RXf_USE_INTUIT)
3641         && !(flags & REXEC_CHECKED))
3642     {
3643         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3644                                     flags, NULL);
3645         if (!s)
3646             return 0;
3647 
3648         if (prog->extflags & RXf_CHECK_ALL) {
3649             /* we can match based purely on the result of INTUIT.
3650              * Set up captures etc just for $& and $-[0]
3651              * (an intuit-only match wont have $1,$2,..) */
3652             assert(!prog->nparens);
3653 
3654             /* s/// doesn't like it if $& is earlier than where we asked it to
3655              * start searching (which can happen on something like /.\G/) */
3656             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3657                     && (s < stringarg))
3658             {
3659                 /* this should only be possible under \G */
3660                 assert(prog->intflags & PREGf_GPOS_SEEN);
3661                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3662                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3663                 goto phooey;
3664             }
3665 
3666             /* match via INTUIT shouldn't have any captures.
3667              * Let @-, @+, $^N know */
3668             prog->lastparen = prog->lastcloseparen = 0;
3669             RXp_MATCH_UTF8_set(prog, utf8_target);
3670             prog->offs[0].start = s - strbeg;
3671             prog->offs[0].end = utf8_target
3672                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3673                 : s - strbeg + prog->minlenret;
3674             if ( !(flags & REXEC_NOT_FIRST) )
3675                 S_reg_set_capture_string(aTHX_ rx,
3676                                         strbeg, strend,
3677                                         sv, flags, utf8_target);
3678 
3679             return 1;
3680         }
3681     }
3682 
3683     multiline = prog->extflags & RXf_PMf_MULTILINE;
3684 
3685     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3686         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3687                               "String too short [regexec_flags]...\n"));
3688         goto phooey;
3689     }
3690 
3691     /* Check validity of program. */
3692     if (UCHARAT(progi->program) != REG_MAGIC) {
3693         Perl_croak(aTHX_ "corrupted regexp program");
3694     }
3695 
3696     RXp_MATCH_TAINTED_off(prog);
3697     RXp_MATCH_UTF8_set(prog, utf8_target);
3698 
3699     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3700     reginfo->intuit = 0;
3701     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3702     reginfo->warned = FALSE;
3703     reginfo->sv = sv;
3704     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3705     /* see how far we have to get to not match where we matched before */
3706     reginfo->till = stringarg + minend;
3707 
3708     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3709         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3710            S_cleanup_regmatch_info_aux has executed (registered by
3711            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3712            magic belonging to this SV.
3713            Not newSVsv, either, as it does not COW.
3714         */
3715         reginfo->sv = newSV(0);
3716         SvSetSV_nosteal(reginfo->sv, sv);
3717         SAVEFREESV(reginfo->sv);
3718     }
3719 
3720     /* reserve next 2 or 3 slots in PL_regmatch_state:
3721      * slot N+0: may currently be in use: skip it
3722      * slot N+1: use for regmatch_info_aux struct
3723      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3724      * slot N+3: ready for use by regmatch()
3725      */
3726 
3727     {
3728         regmatch_state *old_regmatch_state;
3729         regmatch_slab  *old_regmatch_slab;
3730         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3731 
3732         /* on first ever match, allocate first slab */
3733         if (!PL_regmatch_slab) {
3734             Newx(PL_regmatch_slab, 1, regmatch_slab);
3735             PL_regmatch_slab->prev = NULL;
3736             PL_regmatch_slab->next = NULL;
3737             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3738         }
3739 
3740         old_regmatch_state = PL_regmatch_state;
3741         old_regmatch_slab  = PL_regmatch_slab;
3742 
3743         for (i=0; i <= max; i++) {
3744             if (i == 1)
3745                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3746             else if (i ==2)
3747                 reginfo->info_aux_eval =
3748                 reginfo->info_aux->info_aux_eval =
3749                             &(PL_regmatch_state->u.info_aux_eval);
3750 
3751             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3752                 PL_regmatch_state = S_push_slab(aTHX);
3753         }
3754 
3755         /* note initial PL_regmatch_state position; at end of match we'll
3756          * pop back to there and free any higher slabs */
3757 
3758         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3759         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3760         reginfo->info_aux->poscache = NULL;
3761 
3762         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3763 
3764         if ((prog->extflags & RXf_EVAL_SEEN))
3765             S_setup_eval_state(aTHX_ reginfo);
3766         else
3767             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3768     }
3769 
3770     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3771         /* We have to be careful. If the previous successful match
3772            was from this regex we don't want a subsequent partially
3773            successful match to clobber the old results.
3774            So when we detect this possibility we add a swap buffer
3775            to the re, and switch the buffer each match. If we fail,
3776            we switch it back; otherwise we leave it swapped.
3777         */
3778         swap = prog->offs;
3779         /* avoid leak if we die, or clean up anyway if match completes */
3780         SAVEFREEPV(swap);
3781         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3782         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3783             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3784             0,
3785             PTR2UV(prog),
3786             PTR2UV(swap),
3787             PTR2UV(prog->offs)
3788         ));
3789     }
3790 
3791     if (prog->recurse_locinput)
3792         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3793 
3794     /* Simplest case: anchored match (but not \G) need be tried only once,
3795      * or with MBOL, only at the beginning of each line.
3796      *
3797      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3798      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3799      * match at the start of the string then it won't match anywhere else
3800      * either; while with /.*.../, if it doesn't match at the beginning,
3801      * the earliest it could match is at the start of the next line */
3802 
3803     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3804         char *end;
3805 
3806         if (regtry(reginfo, &s))
3807             goto got_it;
3808 
3809         if (!(prog->intflags & PREGf_ANCH_MBOL))
3810             goto phooey;
3811 
3812         /* didn't match at start, try at other newline positions */
3813 
3814         if (minlen)
3815             dontbother = minlen - 1;
3816         end = HOP3c(strend, -dontbother, strbeg) - 1;
3817 
3818         /* skip to next newline */
3819 
3820         while (s <= end) { /* note it could be possible to match at the end of the string */
3821             /* NB: newlines are the same in unicode as they are in latin */
3822             if (*s++ != '\n')
3823                 continue;
3824             if (prog->check_substr || prog->check_utf8) {
3825             /* note that with PREGf_IMPLICIT, intuit can only fail
3826              * or return the start position, so it's of limited utility.
3827              * Nevertheless, I made the decision that the potential for
3828              * quick fail was still worth it - DAPM */
3829                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3830                 if (!s)
3831                     goto phooey;
3832             }
3833             if (regtry(reginfo, &s))
3834                 goto got_it;
3835         }
3836         goto phooey;
3837     } /* end anchored search */
3838 
3839     /* anchored \G match */
3840     if (prog->intflags & PREGf_ANCH_GPOS)
3841     {
3842         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3843         assert(prog->intflags & PREGf_GPOS_SEEN);
3844         /* For anchored \G, the only position it can match from is
3845          * (ganch-gofs); we already set startpos to this above; if intuit
3846          * moved us on from there, we can't possibly succeed */
3847         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3848         if (s == startpos && regtry(reginfo, &s))
3849             goto got_it;
3850         goto phooey;
3851     }
3852 
3853     /* Messy cases:  unanchored match. */
3854 
3855     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3856         /* we have /x+whatever/ */
3857         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3858         char ch;
3859 #ifdef DEBUGGING
3860         int did_match = 0;
3861 #endif
3862         if (utf8_target) {
3863             if (! prog->anchored_utf8) {
3864                 to_utf8_substr(prog);
3865             }
3866             ch = SvPVX_const(prog->anchored_utf8)[0];
3867             REXEC_FBC_UTF8_SCAN(
3868                 if (*s == ch) {
3869                     DEBUG_EXECUTE_r( did_match = 1 );
3870                     if (regtry(reginfo, &s)) goto got_it;
3871                     s += UTF8_SAFE_SKIP(s, strend);
3872                     while (s < strend && *s == ch)
3873                         s += UTF8SKIP(s);
3874                 }
3875             );
3876 
3877         }
3878         else {
3879             if (! prog->anchored_substr) {
3880                 if (! to_byte_substr(prog)) {
3881                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3882                 }
3883             }
3884             ch = SvPVX_const(prog->anchored_substr)[0];
3885             REXEC_FBC_NON_UTF8_SCAN(
3886                 if (*s == ch) {
3887                     DEBUG_EXECUTE_r( did_match = 1 );
3888                     if (regtry(reginfo, &s)) goto got_it;
3889                     s++;
3890                     while (s < strend && *s == ch)
3891                         s++;
3892                 }
3893             );
3894         }
3895         DEBUG_EXECUTE_r(if (!did_match)
3896                 Perl_re_printf( aTHX_
3897                                   "Did not find anchored character...\n")
3898                );
3899     }
3900     else if (prog->anchored_substr != NULL
3901               || prog->anchored_utf8 != NULL
3902               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3903                   && prog->float_max_offset < strend - s)) {
3904         SV *must;
3905         SSize_t back_max;
3906         SSize_t back_min;
3907         char *last;
3908         char *last1;		/* Last position checked before */
3909 #ifdef DEBUGGING
3910         int did_match = 0;
3911 #endif
3912         if (prog->anchored_substr || prog->anchored_utf8) {
3913             if (utf8_target) {
3914                 if (! prog->anchored_utf8) {
3915                     to_utf8_substr(prog);
3916                 }
3917                 must = prog->anchored_utf8;
3918             }
3919             else {
3920                 if (! prog->anchored_substr) {
3921                     if (! to_byte_substr(prog)) {
3922                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3923                     }
3924                 }
3925                 must = prog->anchored_substr;
3926             }
3927             back_max = back_min = prog->anchored_offset;
3928         } else {
3929             if (utf8_target) {
3930                 if (! prog->float_utf8) {
3931                     to_utf8_substr(prog);
3932                 }
3933                 must = prog->float_utf8;
3934             }
3935             else {
3936                 if (! prog->float_substr) {
3937                     if (! to_byte_substr(prog)) {
3938                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3939                     }
3940                 }
3941                 must = prog->float_substr;
3942             }
3943             back_max = prog->float_max_offset;
3944             back_min = prog->float_min_offset;
3945         }
3946 
3947         if (back_min<0) {
3948             last = strend;
3949         } else {
3950             last = HOP3c(strend,	/* Cannot start after this */
3951                   -(SSize_t)(CHR_SVLEN(must)
3952                          - (SvTAIL(must) != 0) + back_min), strbeg);
3953         }
3954         if (s > reginfo->strbeg)
3955             last1 = HOPc(s, -1);
3956         else
3957             last1 = s - 1;	/* bogus */
3958 
3959         /* XXXX check_substr already used to find "s", can optimize if
3960            check_substr==must. */
3961         dontbother = 0;
3962         strend = HOPc(strend, -dontbother);
3963         while ( (s <= last) &&
3964                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3965                                   (unsigned char*)strend, must,
3966                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3967             DEBUG_EXECUTE_r( did_match = 1 );
3968             if (HOPc(s, -back_max) > last1) {
3969                 last1 = HOPc(s, -back_min);
3970                 s = HOPc(s, -back_max);
3971             }
3972             else {
3973                 char * const t = (last1 >= reginfo->strbeg)
3974                                     ? HOPc(last1, 1) : last1 + 1;
3975 
3976                 last1 = HOPc(s, -back_min);
3977                 s = t;
3978             }
3979             if (utf8_target) {
3980                 while (s <= last1) {
3981                     if (regtry(reginfo, &s))
3982                         goto got_it;
3983                     if (s >= last1) {
3984                         s++; /* to break out of outer loop */
3985                         break;
3986                     }
3987                     s += UTF8SKIP(s);
3988                 }
3989             }
3990             else {
3991                 while (s <= last1) {
3992                     if (regtry(reginfo, &s))
3993                         goto got_it;
3994                     s++;
3995                 }
3996             }
3997         }
3998         DEBUG_EXECUTE_r(if (!did_match) {
3999             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4000                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4001             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
4002                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
4003                                ? "anchored" : "floating"),
4004                 quoted, RE_SV_TAIL(must));
4005         });
4006         goto phooey;
4007     }
4008     else if ( (c = progi->regstclass) ) {
4009         if (minlen) {
4010             const OPCODE op = OP(progi->regstclass);
4011             /* don't bother with what can't match */
4012             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
4013                 strend = HOPc(strend, -(minlen - 1));
4014         }
4015         DEBUG_EXECUTE_r({
4016             SV * const prop = sv_newmortal();
4017             regprop(prog, prop, c, reginfo, NULL);
4018             {
4019                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4020                     s,strend-s,PL_dump_re_max_len);
4021                 Perl_re_printf( aTHX_
4022                     "Matching stclass %.*s against %s (%d bytes)\n",
4023                     (int)SvCUR(prop), SvPVX_const(prop),
4024                      quoted, (int)(strend - s));
4025             }
4026         });
4027         if (find_byclass(prog, c, s, strend, reginfo))
4028             goto got_it;
4029         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4030     }
4031     else {
4032         dontbother = 0;
4033         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4034             /* Trim the end. */
4035             char *last= NULL;
4036             SV* float_real;
4037             STRLEN len;
4038             const char *little;
4039 
4040             if (utf8_target) {
4041                 if (! prog->float_utf8) {
4042                     to_utf8_substr(prog);
4043                 }
4044                 float_real = prog->float_utf8;
4045             }
4046             else {
4047                 if (! prog->float_substr) {
4048                     if (! to_byte_substr(prog)) {
4049                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4050                     }
4051                 }
4052                 float_real = prog->float_substr;
4053             }
4054 
4055             little = SvPV_const(float_real, len);
4056             if (SvTAIL(float_real)) {
4057                     /* This means that float_real contains an artificial \n on
4058                      * the end due to the presence of something like this:
4059                      * /foo$/ where we can match both "foo" and "foo\n" at the
4060                      * end of the string.  So we have to compare the end of the
4061                      * string first against the float_real without the \n and
4062                      * then against the full float_real with the string.  We
4063                      * have to watch out for cases where the string might be
4064                      * smaller than the float_real or the float_real without
4065                      * the \n. */
4066                     char *checkpos= strend - len;
4067                     DEBUG_OPTIMISE_r(
4068                         Perl_re_printf( aTHX_
4069                             "%sChecking for float_real.%s\n",
4070                             PL_colors[4], PL_colors[5]));
4071                     if (checkpos + 1 < strbeg) {
4072                         /* can't match, even if we remove the trailing \n
4073                          * string is too short to match */
4074                         DEBUG_EXECUTE_r(
4075                             Perl_re_printf( aTHX_
4076                                 "%sString shorter than required trailing substring, cannot match.%s\n",
4077                                 PL_colors[4], PL_colors[5]));
4078                         goto phooey;
4079                     } else if (memEQ(checkpos + 1, little, len - 1)) {
4080                         /* can match, the end of the string matches without the
4081                          * "\n" */
4082                         last = checkpos + 1;
4083                     } else if (checkpos < strbeg) {
4084                         /* cant match, string is too short when the "\n" is
4085                          * included */
4086                         DEBUG_EXECUTE_r(
4087                             Perl_re_printf( aTHX_
4088                                 "%sString does not contain required trailing substring, cannot match.%s\n",
4089                                 PL_colors[4], PL_colors[5]));
4090                         goto phooey;
4091                     } else if (!multiline) {
4092                         /* non multiline match, so compare with the "\n" at the
4093                          * end of the string */
4094                         if (memEQ(checkpos, little, len)) {
4095                             last= checkpos;
4096                         } else {
4097                             DEBUG_EXECUTE_r(
4098                                 Perl_re_printf( aTHX_
4099                                     "%sString does not contain required trailing substring, cannot match.%s\n",
4100                                     PL_colors[4], PL_colors[5]));
4101                             goto phooey;
4102                         }
4103                     } else {
4104                         /* multiline match, so we have to search for a place
4105                          * where the full string is located */
4106                         goto find_last;
4107                     }
4108             } else {
4109                   find_last:
4110                     if (len)
4111                         last = rninstr(s, strend, little, little + len);
4112                     else
4113                         last = strend;	/* matching "$" */
4114             }
4115             if (!last) {
4116                 /* at one point this block contained a comment which was
4117                  * probably incorrect, which said that this was a "should not
4118                  * happen" case.  Even if it was true when it was written I am
4119                  * pretty sure it is not anymore, so I have removed the comment
4120                  * and replaced it with this one. Yves */
4121                 DEBUG_EXECUTE_r(
4122                     Perl_re_printf( aTHX_
4123                         "%sString does not contain required substring, cannot match.%s\n",
4124                         PL_colors[4], PL_colors[5]
4125                     ));
4126                 goto phooey;
4127             }
4128             dontbother = strend - last + prog->float_min_offset;
4129         }
4130         if (minlen && (dontbother < minlen))
4131             dontbother = minlen - 1;
4132         strend -= dontbother; 		   /* this one's always in bytes! */
4133         /* We don't know much -- general case. */
4134         if (utf8_target) {
4135             for (;;) {
4136                 if (regtry(reginfo, &s))
4137                     goto got_it;
4138                 if (s >= strend)
4139                     break;
4140                 s += UTF8SKIP(s);
4141             };
4142         }
4143         else {
4144             do {
4145                 if (regtry(reginfo, &s))
4146                     goto got_it;
4147             } while (s++ < strend);
4148         }
4149     }
4150 
4151     /* Failure. */
4152     goto phooey;
4153 
4154   got_it:
4155     /* s/// doesn't like it if $& is earlier than where we asked it to
4156      * start searching (which can happen on something like /.\G/) */
4157     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4158             && (prog->offs[0].start < stringarg - strbeg))
4159     {
4160         /* this should only be possible under \G */
4161         assert(prog->intflags & PREGf_GPOS_SEEN);
4162         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4163             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4164         goto phooey;
4165     }
4166 
4167     /* clean up; this will trigger destructors that will free all slabs
4168      * above the current one, and cleanup the regmatch_info_aux
4169      * and regmatch_info_aux_eval sructs */
4170 
4171     LEAVE_SCOPE(oldsave);
4172 
4173     if (RXp_PAREN_NAMES(prog))
4174         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4175 
4176     /* make sure $`, $&, $', and $digit will work later */
4177     if ( !(flags & REXEC_NOT_FIRST) )
4178         S_reg_set_capture_string(aTHX_ rx,
4179                                     strbeg, reginfo->strend,
4180                                     sv, flags, utf8_target);
4181 
4182     return 1;
4183 
4184   phooey:
4185     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4186                           PL_colors[4], PL_colors[5]));
4187 
4188     if (swap) {
4189         /* we failed :-( roll it back.
4190          * Since the swap buffer will be freed on scope exit which follows
4191          * shortly, restore the old captures by copying 'swap's original
4192          * data to the new offs buffer
4193          */
4194         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4195             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4196             0,
4197             PTR2UV(prog),
4198             PTR2UV(prog->offs),
4199             PTR2UV(swap)
4200         ));
4201 
4202         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4203     }
4204 
4205     /* clean up; this will trigger destructors that will free all slabs
4206      * above the current one, and cleanup the regmatch_info_aux
4207      * and regmatch_info_aux_eval sructs */
4208 
4209     LEAVE_SCOPE(oldsave);
4210 
4211     return 0;
4212 }
4213 
4214 
4215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4216  * Do inc before dec, in case old and new rex are the same */
4217 #define SET_reg_curpm(Re2)                          \
4218     if (reginfo->info_aux_eval) {                   \
4219         (void)ReREFCNT_inc(Re2);		    \
4220         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
4221         PM_SETRE((PL_reg_curpm), (Re2));	    \
4222     }
4223 
4224 
4225 /*
4226  - regtry - try match at specific point
4227  */
4228 STATIC bool			/* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4230 {
4231     CHECKPOINT lastcp;
4232     REGEXP *const rx = reginfo->prog;
4233     regexp *const prog = ReANY(rx);
4234     SSize_t result;
4235 #ifdef DEBUGGING
4236     U32 depth = 0; /* used by REGCP_SET */
4237 #endif
4238     RXi_GET_DECL(prog,progi);
4239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4240 
4241     PERL_ARGS_ASSERT_REGTRY;
4242 
4243     reginfo->cutpoint=NULL;
4244 
4245     prog->offs[0].start = *startposp - reginfo->strbeg;
4246     prog->lastparen = 0;
4247     prog->lastcloseparen = 0;
4248 
4249     /* XXXX What this code is doing here?!!!  There should be no need
4250        to do this again and again, prog->lastparen should take care of
4251        this!  --ilya*/
4252 
4253     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4254      * Actually, the code in regcppop() (which Ilya may be meaning by
4255      * prog->lastparen), is not needed at all by the test suite
4256      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4257      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4258      * Meanwhile, this code *is* needed for the
4259      * above-mentioned test suite tests to succeed.  The common theme
4260      * on those tests seems to be returning null fields from matches.
4261      * --jhi updated by dapm */
4262 
4263     /* After encountering a variant of the issue mentioned above I think
4264      * the point Ilya was making is that if we properly unwind whenever
4265      * we set lastparen to a smaller value then we should not need to do
4266      * this every time, only when needed. So if we have tests that fail if
4267      * we remove this, then it suggests somewhere else we are improperly
4268      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4269      * places it is called, and related regcp() routines. - Yves */
4270 #if 1
4271     if (prog->nparens) {
4272         regexp_paren_pair *pp = prog->offs;
4273         I32 i;
4274         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4275             ++pp;
4276             pp->start = -1;
4277             pp->end = -1;
4278         }
4279     }
4280 #endif
4281     REGCP_SET(lastcp);
4282     result = regmatch(reginfo, *startposp, progi->program + 1);
4283     if (result != -1) {
4284         prog->offs[0].end = result;
4285         return 1;
4286     }
4287     if (reginfo->cutpoint)
4288         *startposp= reginfo->cutpoint;
4289     REGCP_UNWIND(lastcp);
4290     return 0;
4291 }
4292 
4293 /* this is used to determine how far from the left messages like
4294    'failed...' are printed in regexec.c. It should be set such that
4295    messages are inline with the regop output that created them.
4296 */
4297 #define REPORT_CODE_OFF 29
4298 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4299 #ifdef DEBUGGING
4300 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4301 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4302 {
4303     va_list ap;
4304     int result;
4305     PerlIO *f= Perl_debug_log;
4306     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4307     va_start(ap, depth);
4308     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4309     result = PerlIO_vprintf(f, fmt, ap);
4310     va_end(ap);
4311     return result;
4312 }
4313 #endif /* DEBUGGING */
4314 
4315 /* grab a new slab and return the first slot in it */
4316 
4317 STATIC regmatch_state *
S_push_slab(pTHX)4318 S_push_slab(pTHX)
4319 {
4320     regmatch_slab *s = PL_regmatch_slab->next;
4321     if (!s) {
4322         Newx(s, 1, regmatch_slab);
4323         s->prev = PL_regmatch_slab;
4324         s->next = NULL;
4325         PL_regmatch_slab->next = s;
4326     }
4327     PL_regmatch_slab = s;
4328     return SLAB_FIRST(s);
4329 }
4330 
4331 #ifdef DEBUGGING
4332 
4333 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4334 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4335     const char *start, const char *end, const char *blurb)
4336 {
4337     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4338 
4339     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4340 
4341     if (!PL_colorset)
4342             reginitcolors();
4343     {
4344         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4345             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4346 
4347         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4348             start, end - start, PL_dump_re_max_len);
4349 
4350         Perl_re_printf( aTHX_
4351             "%s%s REx%s %s against %s\n",
4352                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4353 
4354         if (utf8_target||utf8_pat)
4355             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4356                 utf8_pat ? "pattern" : "",
4357                 utf8_pat && utf8_target ? " and " : "",
4358                 utf8_target ? "string" : ""
4359             );
4360     }
4361 }
4362 
4363 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4364 S_dump_exec_pos(pTHX_ const char *locinput,
4365                       const regnode *scan,
4366                       const char *loc_regeol,
4367                       const char *loc_bostr,
4368                       const char *loc_reg_starttry,
4369                       const bool utf8_target,
4370                       const U32 depth
4371                 )
4372 {
4373     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4374     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4375     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4376     /* The part of the string before starttry has one color
4377        (pref0_len chars), between starttry and current
4378        position another one (pref_len - pref0_len chars),
4379        after the current position the third one.
4380        We assume that pref0_len <= pref_len, otherwise we
4381        decrease pref0_len.  */
4382     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4383         ? (5 + taill) - l : locinput - loc_bostr;
4384     int pref0_len;
4385 
4386     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4387 
4388     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4389         pref_len++;
4390     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4391     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4392         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4393               ? (5 + taill) - pref_len : loc_regeol - locinput);
4394     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4395         l--;
4396     if (pref0_len < 0)
4397         pref0_len = 0;
4398     if (pref0_len > pref_len)
4399         pref0_len = pref_len;
4400     {
4401         const int is_uni = utf8_target ? 1 : 0;
4402 
4403         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4404             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4405 
4406         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4407                     (locinput - pref_len + pref0_len),
4408                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4409 
4410         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4411                     locinput, loc_regeol - locinput, 10, 0, 1);
4412 
4413         const STRLEN tlen=len0+len1+len2;
4414         Perl_re_printf( aTHX_
4415                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4416                     (IV)(locinput - loc_bostr),
4417                     len0, s0,
4418                     len1, s1,
4419                     (docolor ? "" : "> <"),
4420                     len2, s2,
4421                     (int)(tlen > 19 ? 0 :  19 - tlen),
4422                     "",
4423                     depth);
4424     }
4425 }
4426 
4427 #endif
4428 
4429 /* reg_check_named_buff_matched()
4430  * Checks to see if a named buffer has matched. The data array of
4431  * buffer numbers corresponding to the buffer is expected to reside
4432  * in the regexp->data->data array in the slot stored in the ARG() of
4433  * node involved. Note that this routine doesn't actually care about the
4434  * name, that information is not preserved from compilation to execution.
4435  * Returns the index of the leftmost defined buffer with the given name
4436  * or 0 if non of the buffers matched.
4437  */
4438 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4439 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4440 {
4441     I32 n;
4442     RXi_GET_DECL(rex,rexi);
4443     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4444     I32 *nums=(I32*)SvPVX(sv_dat);
4445 
4446     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4447 
4448     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4449         if ((I32)rex->lastparen >= nums[n] &&
4450             rex->offs[nums[n]].end != -1)
4451         {
4452             return nums[n];
4453         }
4454     }
4455     return 0;
4456 }
4457 
4458 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4459 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4460                           struct next_matchable_info * m,
4461                           regmatch_info *reginfo)
4462 {
4463     /* This function determines various characteristics about every possible
4464      * initial match of the passed-in EXACTish <text_node>, and stores them in
4465      * <*m>.
4466      *
4467      * That includes a match string and a parallel mask, such that if you AND
4468      * the target string with the mask and compare with the match string,
4469      * you'll have a pretty good idea, perhaps even perfect, if that portion of
4470      * the target matches or not.
4471      *
4472      * The motivation behind this function is to allow the caller to set up
4473      * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4474      * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4475      * beginning oF B, which is the passed in <text_node>  That's where this
4476      * function comes in.  The values it returns can quickly be used to rule
4477      * out many, or all, cases of possible matches not actually being the
4478      * beginning of B, <text_node>.  It is also used in regrepeat() where we
4479      * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4480      * determine where the span of 'A's stop.
4481      *
4482      * If <text_node> is of type EXACT, there is only one possible character
4483      * that can match its first character, and so the situation is quite
4484      * simple.  But things can get much more complicated if folding is
4485      * involved.  It may be that the first character of an EXACTFish node
4486      * doesn't participate in any possible fold, e.g., punctuation, so it can
4487      * be matched only by itself.  The vast majority of characters that are in
4488      * folds match just two things, their lower and upper-case equivalents.
4489      * But not all are like that; some have multiple possible matches, or match
4490      * sequences of more than one character.  This function sorts all that out.
4491      *
4492      * It returns information about all possibilities of what the first
4493      * character(s) of <text_node> could look like.  Again, if <text_node> is a
4494      * plain EXACT node, that's just the actual first bytes of the first
4495      * character; but otherwise it is the bytes, that when masked, match all
4496      * possible combinations of all the initial bytes of all the characters
4497      * that could match, folded.  (Actually, this is a slight over promise.  It
4498      * handles only up to the initial 5 bytes, which is enough for all Unicode
4499      * characters, but not for all non-Unicode ones.)
4500      *
4501      * Here's an example to clarify.  Suppose the first character of
4502      * <text_node> is the letter 'C', and we are under /i matching.  That means
4503      * 'c' also matches.  The representations of these two characters differ in
4504      * just one bit, so the mask would be a zero in that position and ones in
4505      * the other 7.  And the returned string would be the AND of these two
4506      * characters, and would be one byte long, since these characters are each
4507      * a single byte.  ANDing the target <text_node> with this mask will yield
4508      * the returned string if and only if <text_node> begins with one of these
4509      * two characters.  So, the function would also return that the definitive
4510      * length matched is 1 byte.
4511      *
4512      * Now, suppose instead of the letter 'C',  <text_node> begins with the
4513      * letter 'F'.  The situation is much more complicated because there are
4514      * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4515      * begins with 'f', and hence could match.  We add these into the returned
4516      * string and mask, but the result isn't definitive; the caller has to
4517      * check further if its AND and compare pass.  But the failure of that
4518      * compare will quickly rule out most possible inputs.
4519      *
4520      * Much of this could be done in regcomp.c at compile time, except for
4521      * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4522      * could be used for one or the other eventualities.
4523      *
4524      * If this function determines that no possible character in the target
4525      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4526      * situation occurs if the first character in <text_node> requires UTF-8 to
4527      * represent, and the target string isn't in UTF-8.)
4528      *
4529      * Some analysis is in GH #18414, located at the time of this writing at:
4530      * https://github.com/Perl/perl5/issues/18414
4531      */
4532 
4533     const bool utf8_target = reginfo->is_utf8_target;
4534     bool utf8_pat = reginfo->is_utf8_pat;
4535 
4536     PERL_UINT_FAST8_T i;
4537 
4538     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4539      */
4540     U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4541     U8 lengths[MAX_MATCHES] = { 0 };
4542 
4543     U8 index_of_longest = 0;
4544 
4545     U8 *pat = (U8*)STRING(text_node);
4546     Size_t pat_len = STR_LEN(text_node);
4547     U8 op = OP(text_node);
4548 
4549     U8 byte_mask[5]  = {0};
4550     U8 byte_anded[5] = {0};
4551 
4552     /* There are some folds in Unicode to multiple characters.  This will hold
4553      * such characters that could fold to the beginning of 'text_node' */
4554     UV multi_fold_from = 0;
4555 
4556     /* We may have to create a modified copy of the pattern */
4557     U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4558 
4559     m->max_length = 0;
4560     m->min_length = 255;
4561     m->count = 0;
4562 
4563     /* Even if the first character in the node can match something in Latin1,
4564      * if there is anything in the node that can't, the match must fail */
4565     if (! utf8_target && isEXACT_REQ8(op)) {
4566         return FALSE;
4567     }
4568 
4569 /* Define a temporary op for use in this function, using an existing one that
4570  * should never be a real op during execution */
4571 #define TURKISH  PSEUDO
4572 
4573     /* What to do about these two nodes had to be deferred to runtime (which is
4574      * now).  If the extra information we now have so indicates, turn them into
4575      * EXACTFU nodes */
4576     if (   (op == EXACTF && utf8_target)
4577         || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4578     {
4579         if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4580             op = TURKISH;
4581         }
4582         else {
4583             op = EXACTFU;
4584         }
4585 
4586         /* And certain situations are better handled if we create a modified
4587          * version of the pattern */
4588         if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4589                            specific problematic characters */
4590             if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4591 
4592                 /* The node could start with characters that are the first ones
4593                  * of a multi-character fold. */
4594                 multi_fold_from
4595                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4596                 if (multi_fold_from) {
4597 
4598                     /* Here, they do form a sequence that matches the fold of a
4599                      * single character.  That single character then is a
4600                      * possible match.  Below we will look again at this, but
4601                      * the code below is expecting every character in the
4602                      * pattern to be folded, which the input isn't required to
4603                      * be in this case.  So, just fold the single character,
4604                      * and the result will be in the expected form. */
4605                     _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4606                                        FOLD_FLAGS_FULL);
4607                     pat = mod_pat;
4608                 }
4609                          /* Turkish has a couple extra possibilities. */
4610                 else if (   UNLIKELY(op == TURKISH)
4611                          &&  pat_len >= 3
4612                          &&  isALPHA_FOLD_EQ(pat[0], 'f')
4613                          && (   memBEGINs(pat + 1, pat_len - 1,
4614                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4615                              || (   pat_len >= 4
4616                                  && isALPHA_FOLD_EQ(pat[1], 'f')
4617                                  && memBEGINs(pat + 2, pat_len - 2,
4618                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4619                 ))) {
4620                     /* The macros for finding a multi-char fold don't include
4621                      * the Turkish possibilities, in which U+130 folds to 'i'.
4622                      * Hard-code these.  It's very unlikely that Unicode will
4623                      * ever add any others.  */
4624                     if (pat[1] == 'f') {
4625                         pat_len = 3;
4626                         Copy("ffi", mod_pat, pat_len, U8);
4627                     }
4628                     else {
4629                         pat_len = 2;
4630                         Copy("fi", mod_pat, pat_len, U8);
4631                     }
4632                     pat = mod_pat;
4633                 }
4634                 else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4635                          &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4636                          &&  LIKELY(memNEs(pat, pat_len,
4637                                            LATIN_SMALL_LETTER_SHARP_S_UTF8))
4638                          && (LIKELY(op != TURKISH || *pat != 'I')))
4639                 {
4640                     /* For all cases of things between 0-255, except the ones
4641                      * in the conditional above, the fold is just the lower
4642                      * case, which is faster than the more general case. */
4643                     mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4644                                                                      pat[1]));
4645                     pat_len = 1;
4646                     pat = mod_pat;
4647                     utf8_pat = FALSE;
4648                 }
4649                 else {  /* Code point above 255, or needs special handling */
4650                     _to_utf8_fold_flags(pat, pat + pat_len,
4651                                         mod_pat, &pat_len,
4652                                         FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4653                     pat = mod_pat;
4654                 }
4655             }
4656         }
4657         else if /* Below is not a UTF-8 pattern; there's a somewhat different
4658                    set of problematic characters */
4659                 ((multi_fold_from
4660                           = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4661         {
4662             /* We may have to canonicalize a multi-char fold, as in the UTF-8
4663              * case */
4664             _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4665                                FOLD_FLAGS_FULL);
4666             pat = mod_pat;
4667         }
4668         else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4669             mod_pat[0] = mod_pat[1] = 's';
4670             pat_len = 2;
4671             utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4672                                        chars, and speeds copying */
4673             pat = mod_pat;
4674         }
4675         else if (LIKELY(op != TURKISH || *pat != 'I')) {
4676             mod_pat[0] = toLOWER_L1(*pat);
4677             pat_len = 1;
4678             pat = mod_pat;
4679         }
4680     }
4681     else if /* Below isn't a node that we convert to UTF-8 */
4682             (     utf8_target
4683              && ! utf8_pat
4684              &&   op == EXACTFAA_NO_TRIE
4685              &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4686     {
4687         /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4688          * did this at compile time when the pattern was UTF-8 , but otherwise
4689          * we couldn't do it earlier, because it requires a UTF-8 target for
4690          * this match to be legal. */
4691         pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4692         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4693              LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4694         pat = mod_pat;
4695         utf8_pat = TRUE;
4696     }
4697 
4698     /* Here, we have taken care of the initial work for a few very problematic
4699      * situations, possibly creating a modified pattern.
4700      *
4701      * Now ready for the general case.  We build up all the possible things
4702      * that could match the first character of the pattern into the elements of
4703      * 'matches[]'
4704      *
4705      * Everything generally matches at least itself.  But if there is a
4706      * UTF8ness mismatch, we have to convert to that of the target string. */
4707     if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4708         matches[0][0] = pat[0];
4709         lengths[0] = 1;
4710         m->count++;
4711     }
4712     else if (utf8_target) {
4713         if (utf8_pat) {
4714             lengths[0] = UTF8SKIP(pat);
4715             Copy(pat, matches[0], lengths[0], U8);
4716             m->count++;
4717         }
4718         else {  /* target is UTF-8, pattern isn't */
4719             matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4720             matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4721             lengths[0] = 2;
4722             m->count++;
4723         }
4724     }
4725     else if (! utf8_pat) {  /* Neither is UTF-8 */
4726         matches[0][0] = pat[0];
4727         lengths[0] = 1;
4728         m->count++;
4729     }
4730     else     /* target isn't UTF-8; pattern is.  No match possible unless the
4731                 pattern's first character can fit in a byte */
4732          if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4733     {
4734         matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4735         lengths[0] = 1;
4736         m->count++;
4737     }
4738 
4739     /* Here we have taken care of any necessary node-type changes */
4740 
4741     if (m->count) {
4742         m->max_length = lengths[0];
4743         m->min_length = lengths[0];
4744     }
4745 
4746     /* For non-folding nodes, there are no other possible candidate matches,
4747      * but for foldable ones, we have to look further. */
4748     if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4749         UV folded;  /* The first character in the pattern, folded */
4750         U32 first_fold_from;    /* A character that folds to it */
4751         const U32 * remaining_fold_froms;   /* The remaining characters that
4752                                                fold to it, if any */
4753         Size_t folds_to_count;  /* The total number of characters that fold to
4754                                    'folded' */
4755 
4756         /* If the node begins with a sequence of more than one character that
4757          * together form the fold of a single character, it is called a
4758          * 'multi-character fold', and the normal functions don't handle this
4759          * case.  We set 'multi_fold_from' to the single folded-from character,
4760          * which is handled in an extra iteration below */
4761         if (utf8_pat) {
4762             folded = valid_utf8_to_uvchr(pat, NULL);
4763             multi_fold_from
4764                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4765         }
4766         else {
4767             folded = *pat;
4768 
4769             /* This may generate illegal combinations for things like EXACTF,
4770              * but rather than repeat the logic and exclude them here, all such
4771              * illegalities are checked for and skipped below in the loop */
4772             multi_fold_from
4773                         = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4774         }
4775 
4776         /* Everything matches at least itself; initialize to that because the
4777          * only the branches below that set it are the ones where the number
4778          * isn't 1. */
4779         folds_to_count = 1;
4780 
4781         /* There are a few special cases for locale-dependent nodes, where the
4782          * run-time context was needed before we could know what matched */
4783         if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4784             first_fold_from = PL_fold_locale[folded];
4785         }
4786         else if (   op == EXACTFL && utf8_target && utf8_pat
4787                  && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4788                                             LATIN_SMALL_LETTER_LONG_S_UTF8))
4789         {
4790             first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4791         }
4792         else if (UNLIKELY(    op == TURKISH
4793                           && (   isALPHA_FOLD_EQ(folded, 'i')
4794                               || inRANGE(folded,
4795                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4796                                          LATIN_SMALL_LETTER_DOTLESS_I))))
4797         {   /* Turkish folding requires special handling */
4798             if (folded == 'i')
4799                 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4800             else if (folded == 'I')
4801                 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4802             else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4803                 first_fold_from = 'i';
4804             else first_fold_from = 'I';
4805         }
4806         else {
4807             /* Here, isn't a special case: use the generic function to
4808              * calculate what folds to this */
4809           redo_multi:
4810             /* Look up what code points (besides itself) fold to 'folded';
4811              * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4812             folds_to_count = _inverse_folds(folded, &first_fold_from,
4813                                                        &remaining_fold_froms);
4814         }
4815 
4816         /* Add each character that folds to 'folded' to the list of them,
4817          * subject to limitations based on the node type and target UTF8ness.
4818          * If there was a character that folded to multiple characters, do an
4819          * extra iteration for it.  (Note the extra iteration if there is a
4820          * multi-character fold) */
4821         for (i = 0; i < folds_to_count
4822                       + UNLIKELY(multi_fold_from != 0); i++)
4823         {
4824             UV fold_from = 0;
4825 
4826             if (i >= folds_to_count) {  /* Final iteration: handle the
4827                                            multi-char */
4828                 fold_from = multi_fold_from;
4829             }
4830             else if (i == 0) {
4831                 fold_from = first_fold_from;
4832             }
4833             else if (i < folds_to_count) {
4834                 fold_from = remaining_fold_froms[i-1];
4835             }
4836 
4837             if (folded == fold_from) {  /* We already added the character
4838                                            itself */
4839                 continue;
4840             }
4841 
4842             /* EXACTF doesn't have any non-ascii folds */
4843             if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4844                 continue;
4845             }
4846 
4847             /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4848              * */
4849             if (    isASCII(folded) != isASCII(fold_from)
4850                 &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4851 
4852             {
4853                 continue;
4854             }
4855 
4856             /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4857              * locale, but those have been converted to EXACTFU above) */
4858             if (   op == EXACTFL
4859                 && (folded < 256) != (fold_from < 256))
4860             {
4861                 continue;
4862             }
4863 
4864             /* If this triggers, it likely is because of the unlikely case
4865              * where a new Unicode standard has changed what MAX_MATCHES should
4866              * be set to */
4867             assert(m->count < MAX_MATCHES);
4868 
4869             /* Add this character to the list of possible matches */
4870             if (utf8_target) {
4871                 uvchr_to_utf8(matches[m->count], fold_from);
4872                 lengths[m->count] = UVCHR_SKIP(fold_from);
4873                 m->count++;
4874             }
4875             else { /* Non-UTF8 target: no code point above 255 can appear in it
4876                     */
4877                 if (fold_from > 255) {
4878                     continue;
4879                 }
4880 
4881                 matches[m->count][0] = fold_from;
4882                 lengths[m->count] = 1;
4883                 m->count++;
4884             }
4885 
4886             /* Update min and mlengths */
4887             if (m->min_length > lengths[m->count-1]) {
4888                 m->min_length = lengths[m->count-1];
4889             }
4890 
4891             if (m->max_length < lengths[m->count-1]) {
4892                 index_of_longest = m->count - 1;
4893                 m->max_length = lengths[index_of_longest];
4894             }
4895         } /* looped through each potential fold */
4896 
4897         /* If there is something that folded to an initial multi-character
4898          * fold, repeat, using it.  This catches some edge cases.  An example
4899          * of one is /ss/i when UTF-8 encoded.  The function
4900          * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4901          * (LATIN SMALL SHARP S).  If it returned a list of characters, this
4902          * code wouldn't be needed.  But since it doesn't, we have to look what
4903          * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
4904          * */
4905         if (multi_fold_from) {
4906             folded = multi_fold_from;
4907             multi_fold_from = 0;
4908             goto redo_multi;
4909         }
4910     } /* End of finding things that participate in this fold */
4911 
4912     if (m->count == 0) {    /* If nothing found, can't match */
4913         m->min_length = 0;
4914         return FALSE;
4915     }
4916 
4917     /* Have calculated all possible matches.  Now calculate the mask and AND
4918      * values */
4919     m->initial_exact = 0;
4920     m->initial_definitive = 0;
4921 
4922     {
4923         unsigned int mask_ones = 0;
4924         unsigned int possible_ones = 0;
4925         U8 j;
4926 
4927         /* For each byte that is in all possible matches ... */
4928         for (j = 0; j < MIN(m->min_length, 5); j++) {
4929 
4930             /* Initialize the accumulator for this byte */
4931             byte_mask[j] = 0xFF;
4932             byte_anded[j] = matches[0][j];
4933 
4934             /* Then the rest of the rows (folds).  The mask is based on, like,
4935              * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4936              * where they differ. */
4937             for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4938                 byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
4939                 byte_anded[j] &= matches[i][j];
4940             }
4941 
4942             /* Keep track of the number of initial mask bytes that are all one
4943              * bits.  The code calling this can use this number to know that
4944              * a string that matches this number of bytes in the pattern is an
4945              * exact match of that pattern for this number of bytes.  But also
4946              * counted are the number of initial bytes that in total have a
4947              * single zero bit.  If a string matches those, masked, it must be
4948              * one of two possibilites, both of which this function has
4949              * determined are legal.  (But if that single 0 is one of the
4950              * initial bits for masking a UTF-8 start byte, that could
4951              * incorrectly lead to different length strings appearing to be
4952              * equivalent, so only do this optimization when the matchables are
4953              * all the same length.  This was uncovered by testing
4954              * /\x{029E}/i.) */
4955             if (m->min_length == m->max_length) {
4956                 mask_ones += PL_bitcount[byte_mask[j]];
4957                 possible_ones += 8;
4958                 if (mask_ones + 1 >= possible_ones) {
4959                     m->initial_definitive++;
4960                     if (mask_ones >= possible_ones) {
4961                         m->initial_exact++;
4962                     }
4963                 }
4964             }
4965         }
4966     }
4967 
4968     /* The first byte is separate for speed */
4969     m->first_byte_mask = byte_mask[0];
4970     m->first_byte_anded = byte_anded[0];
4971 
4972     /* Then pack up to the next 4 bytes into a word */
4973     m->mask32 = m->anded32 = 0;
4974     for (i = 1; i < MIN(m->min_length, 5); i++) {
4975         U8 which = i;
4976         U8 shift = (which - 1) * 8;
4977         m->mask32  |= (U32) byte_mask[i]  << shift;
4978         m->anded32 |= (U32) byte_anded[i] << shift;
4979     }
4980 
4981     /* Finally, take the match strings and place them sequentially into a
4982      * one-dimensional array.  (This is done to save significant space in the
4983      * structure.) Sort so the longest (presumably the least likely) is last.
4984      * XXX When this gets moved to regcomp, may want to fully sort shortest
4985      * first, but above we generally used the folded code point first, and
4986      * those tend to be no longer than their upper case values, so this is
4987      * already pretty well sorted by size.
4988      *
4989      * If the asserts fail, it's most likely because a new version of the
4990      * Unicode standard requires more space; simply increase the declaration
4991      * size. */
4992     {
4993         U8 cur_pos = 0;
4994         U8 output_index = 0;
4995 
4996         if (m->count > 1) { /* No need to sort a single entry */
4997             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4998 
4999                 /* Keep the same order for all but the longest.  (If the
5000                  * asserts fail, it could be because m->matches is declared too
5001                  * short, either because of a new Unicode release, or an
5002                  * overlooked test case, or it could be a bug.) */
5003                 if (i != index_of_longest) {
5004                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5005                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5006                     cur_pos += lengths[i];
5007                     m->lengths[output_index++] = lengths[i];
5008                 }
5009             }
5010         }
5011 
5012         assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5013         Copy(matches[index_of_longest], m->matches + cur_pos,
5014              lengths[index_of_longest], U8);
5015 
5016         /* Place the longest match last */
5017         m->lengths[output_index] = lengths[index_of_longest];
5018     }
5019 
5020 
5021     return TRUE;
5022 }
5023 
5024 PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5025 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5026 S_test_EXACTISH_ST(const char * loc,
5027                    struct next_matchable_info info)
5028 {
5029     /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5030      * bytes starting at 'loc' can match based on 'next_matchable_info' */
5031 
5032     U32 input32 = 0;
5033 
5034     /* Check the first byte */
5035     if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5036         return FALSE;
5037 
5038     /* Pack the next up-to-4 bytes into a 32 bit word */
5039     switch (info.min_length) {
5040         default:
5041             input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5042             /* FALLTHROUGH */
5043         case 4:
5044             input32 |= (U8) loc[3] << 2 * 8;
5045             /* FALLTHROUGH */
5046         case 3:
5047             input32 |= (U8) loc[2] << 1 * 8;
5048             /* FALLTHROUGH */
5049         case 2:
5050             input32 |= (U8) loc[1];
5051             break;
5052         case 1:
5053             return TRUE;    /* We already tested and passed the 0th byte */
5054         case 0:
5055             ASSUME(0);
5056     }
5057 
5058     /* And AND that with the mask and compare that with the assembled ANDED
5059      * values */
5060     return (input32 & info.mask32) == info.anded32;
5061 }
5062 
5063 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5064 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5065 {
5066     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5067      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5068 
5069     PERL_ARGS_ASSERT_ISGCB;
5070 
5071     switch (GCB_table[before][after]) {
5072         case GCB_BREAKABLE:
5073             return TRUE;
5074 
5075         case GCB_NOBREAK:
5076             return FALSE;
5077 
5078         case GCB_RI_then_RI:
5079             {
5080                 int RI_count = 1;
5081                 U8 * temp_pos = (U8 *) curpos;
5082 
5083                 /* Do not break within emoji flag sequences. That is, do not
5084                  * break between regional indicator (RI) symbols if there is an
5085                  * odd number of RI characters before the break point.
5086                  *  GB12   sot (RI RI)* RI × RI
5087                  *  GB13 [^RI] (RI RI)* RI × RI */
5088 
5089                 while (backup_one_GCB(strbeg,
5090                                     &temp_pos,
5091                                     utf8_target) == GCB_Regional_Indicator)
5092                 {
5093                     RI_count++;
5094                 }
5095 
5096                 return RI_count % 2 != 1;
5097             }
5098 
5099         case GCB_EX_then_EM:
5100 
5101             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
5102             {
5103                 U8 * temp_pos = (U8 *) curpos;
5104                 GCB_enum prev;
5105 
5106                 do {
5107                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5108                 }
5109                 while (prev == GCB_Extend);
5110 
5111                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5112             }
5113 
5114         case GCB_Maybe_Emoji_NonBreak:
5115 
5116             {
5117 
5118             /* Do not break within emoji modifier sequences or emoji zwj sequences.
5119               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5120               */
5121                 U8 * temp_pos = (U8 *) curpos;
5122                 GCB_enum prev;
5123 
5124                 do {
5125                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5126                 }
5127                 while (prev == GCB_Extend);
5128 
5129                 return prev != GCB_ExtPict_XX;
5130             }
5131 
5132         default:
5133             break;
5134     }
5135 
5136 #ifdef DEBUGGING
5137     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5138                                   before, after, GCB_table[before][after]);
5139     assert(0);
5140 #endif
5141     return TRUE;
5142 }
5143 
5144 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5145 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5146 {
5147     GCB_enum gcb;
5148 
5149     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5150 
5151     if (*curpos < strbeg) {
5152         return GCB_EDGE;
5153     }
5154 
5155     if (utf8_target) {
5156         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5157         U8 * prev_prev_char_pos;
5158 
5159         if (! prev_char_pos) {
5160             return GCB_EDGE;
5161         }
5162 
5163         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5164             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5165             *curpos = prev_char_pos;
5166             prev_char_pos = prev_prev_char_pos;
5167         }
5168         else {
5169             *curpos = (U8 *) strbeg;
5170             return GCB_EDGE;
5171         }
5172     }
5173     else {
5174         if (*curpos - 2 < strbeg) {
5175             *curpos = (U8 *) strbeg;
5176             return GCB_EDGE;
5177         }
5178         (*curpos)--;
5179         gcb = getGCB_VAL_CP(*(*curpos - 1));
5180     }
5181 
5182     return gcb;
5183 }
5184 
5185 /* Combining marks attach to most classes that precede them, but this defines
5186  * the exceptions (from TR14) */
5187 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5188                                      || prev == LB_Mandatory_Break      \
5189                                      || prev == LB_Carriage_Return      \
5190                                      || prev == LB_Line_Feed            \
5191                                      || prev == LB_Next_Line            \
5192                                      || prev == LB_Space                \
5193                                      || prev == LB_ZWSpace))
5194 
5195 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5196 S_isLB(pTHX_ LB_enum before,
5197              LB_enum after,
5198              const U8 * const strbeg,
5199              const U8 * const curpos,
5200              const U8 * const strend,
5201              const bool utf8_target)
5202 {
5203     U8 * temp_pos = (U8 *) curpos;
5204     LB_enum prev = before;
5205 
5206     /* Is the boundary between 'before' and 'after' line-breakable?
5207      * Most of this is just a table lookup of a generated table from Unicode
5208      * rules.  But some rules require context to decide, and so have to be
5209      * implemented in code */
5210 
5211     PERL_ARGS_ASSERT_ISLB;
5212 
5213     /* Rule numbers in the comments below are as of Unicode 9.0 */
5214 
5215   redo:
5216     before = prev;
5217     switch (LB_table[before][after]) {
5218         case LB_BREAKABLE:
5219             return TRUE;
5220 
5221         case LB_NOBREAK:
5222         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5223             return FALSE;
5224 
5225         case LB_SP_foo + LB_BREAKABLE:
5226         case LB_SP_foo + LB_NOBREAK:
5227         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5228 
5229             /* When we have something following a SP, we have to look at the
5230              * context in order to know what to do.
5231              *
5232              * SP SP should not reach here because LB7: Do not break before
5233              * spaces.  (For two spaces in a row there is nothing that
5234              * overrides that) */
5235             assert(after != LB_Space);
5236 
5237             /* Here we have a space followed by a non-space.  Mostly this is a
5238              * case of LB18: "Break after spaces".  But there are complications
5239              * as the handling of spaces is somewhat tricky.  They are in a
5240              * number of rules, which have to be applied in priority order, but
5241              * something earlier in the string can cause a rule to be skipped
5242              * and a lower priority rule invoked.  A prime example is LB7 which
5243              * says don't break before a space.  But rule LB8 (lower priority)
5244              * says that the first break opportunity after a ZW is after any
5245              * span of spaces immediately after it.  If a ZW comes before a SP
5246              * in the input, rule LB8 applies, and not LB7.  Other such rules
5247              * involve combining marks which are rules 9 and 10, but they may
5248              * override higher priority rules if they come earlier in the
5249              * string.  Since we're doing random access into the middle of the
5250              * string, we have to look for rules that should get applied based
5251              * on both string position and priority.  Combining marks do not
5252              * attach to either ZW nor SP, so we don't have to consider them
5253              * until later.
5254              *
5255              * To check for LB8, we have to find the first non-space character
5256              * before this span of spaces */
5257             do {
5258                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5259             }
5260             while (prev == LB_Space);
5261 
5262             /* LB8 Break before any character following a zero-width space,
5263              * even if one or more spaces intervene.
5264              *      ZW SP* ÷
5265              * So if we have a ZW just before this span, and to get here this
5266              * is the final space in the span. */
5267             if (prev == LB_ZWSpace) {
5268                 return TRUE;
5269             }
5270 
5271             /* Here, not ZW SP+.  There are several rules that have higher
5272              * priority than LB18 and can be resolved now, as they don't depend
5273              * on anything earlier in the string (except ZW, which we have
5274              * already handled).  One of these rules is LB11 Do not break
5275              * before Word joiner, but we have specially encoded that in the
5276              * lookup table so it is caught by the single test below which
5277              * catches the other ones. */
5278             if (LB_table[LB_Space][after] - LB_SP_foo
5279                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5280             {
5281                 return FALSE;
5282             }
5283 
5284             /* If we get here, we have to XXX consider combining marks. */
5285             if (prev == LB_Combining_Mark) {
5286 
5287                 /* What happens with these depends on the character they
5288                  * follow.  */
5289                 do {
5290                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5291                 }
5292                 while (prev == LB_Combining_Mark);
5293 
5294                 /* Most times these attach to and inherit the characteristics
5295                  * of that character, but not always, and when not, they are to
5296                  * be treated as AL by rule LB10. */
5297                 if (! LB_CM_ATTACHES_TO(prev)) {
5298                     prev = LB_Alphabetic;
5299                 }
5300             }
5301 
5302             /* Here, we have the character preceding the span of spaces all set
5303              * up.  We follow LB18: "Break after spaces" unless the table shows
5304              * that is overriden */
5305             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5306 
5307         case LB_CM_ZWJ_foo:
5308 
5309             /* We don't know how to treat the CM except by looking at the first
5310              * non-CM character preceding it.  ZWJ is treated as CM */
5311             do {
5312                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5313             }
5314             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5315 
5316             /* Here, 'prev' is that first earlier non-CM character.  If the CM
5317              * attatches to it, then it inherits the behavior of 'prev'.  If it
5318              * doesn't attach, it is to be treated as an AL */
5319             if (! LB_CM_ATTACHES_TO(prev)) {
5320                 prev = LB_Alphabetic;
5321             }
5322 
5323             goto redo;
5324 
5325         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5326         case LB_HY_or_BA_then_foo + LB_NOBREAK:
5327 
5328             /* LB21a Don't break after Hebrew + Hyphen.
5329              * HL (HY | BA) × */
5330 
5331             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5332                                                           == LB_Hebrew_Letter)
5333             {
5334                 return FALSE;
5335             }
5336 
5337             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5338 
5339         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5340         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5341 
5342             /* LB25a (PR | PO) × ( OP | HY )? NU */
5343             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5344                 return FALSE;
5345             }
5346 
5347             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5348                                                                 == LB_BREAKABLE;
5349 
5350         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5351         case LB_SY_or_IS_then_various + LB_NOBREAK:
5352         {
5353             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5354 
5355             LB_enum temp = prev;
5356             do {
5357                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5358             }
5359             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5360             if (temp == LB_Numeric) {
5361                 return FALSE;
5362             }
5363 
5364             return LB_table[prev][after] - LB_SY_or_IS_then_various
5365                                                                == LB_BREAKABLE;
5366         }
5367 
5368         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5369         case LB_various_then_PO_or_PR + LB_NOBREAK:
5370         {
5371             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5372 
5373             LB_enum temp = prev;
5374             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5375             {
5376                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5377             }
5378             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5379                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5380             }
5381             if (temp == LB_Numeric) {
5382                 return FALSE;
5383             }
5384             return LB_various_then_PO_or_PR;
5385         }
5386 
5387         case LB_RI_then_RI + LB_NOBREAK:
5388         case LB_RI_then_RI + LB_BREAKABLE:
5389             {
5390                 int RI_count = 1;
5391 
5392                 /* LB30a Break between two regional indicator symbols if and
5393                  * only if there are an even number of regional indicators
5394                  * preceding the position of the break.
5395                  *
5396                  *    sot (RI RI)* RI × RI
5397                  *  [^RI] (RI RI)* RI × RI */
5398 
5399                 while (backup_one_LB(strbeg,
5400                                      &temp_pos,
5401                                      utf8_target) == LB_Regional_Indicator)
5402                 {
5403                     RI_count++;
5404                 }
5405 
5406                 return RI_count % 2 == 0;
5407             }
5408 
5409         default:
5410             break;
5411     }
5412 
5413 #ifdef DEBUGGING
5414     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5415                                   before, after, LB_table[before][after]);
5416     assert(0);
5417 #endif
5418     return TRUE;
5419 }
5420 
5421 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5422 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5423 {
5424 
5425     LB_enum lb;
5426 
5427     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5428 
5429     if (*curpos >= strend) {
5430         return LB_EDGE;
5431     }
5432 
5433     if (utf8_target) {
5434         *curpos += UTF8SKIP(*curpos);
5435         if (*curpos >= strend) {
5436             return LB_EDGE;
5437         }
5438         lb = getLB_VAL_UTF8(*curpos, strend);
5439     }
5440     else {
5441         (*curpos)++;
5442         if (*curpos >= strend) {
5443             return LB_EDGE;
5444         }
5445         lb = getLB_VAL_CP(**curpos);
5446     }
5447 
5448     return lb;
5449 }
5450 
5451 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5452 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5453 {
5454     LB_enum lb;
5455 
5456     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5457 
5458     if (*curpos < strbeg) {
5459         return LB_EDGE;
5460     }
5461 
5462     if (utf8_target) {
5463         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5464         U8 * prev_prev_char_pos;
5465 
5466         if (! prev_char_pos) {
5467             return LB_EDGE;
5468         }
5469 
5470         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5471             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5472             *curpos = prev_char_pos;
5473             prev_char_pos = prev_prev_char_pos;
5474         }
5475         else {
5476             *curpos = (U8 *) strbeg;
5477             return LB_EDGE;
5478         }
5479     }
5480     else {
5481         if (*curpos - 2 < strbeg) {
5482             *curpos = (U8 *) strbeg;
5483             return LB_EDGE;
5484         }
5485         (*curpos)--;
5486         lb = getLB_VAL_CP(*(*curpos - 1));
5487     }
5488 
5489     return lb;
5490 }
5491 
5492 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5493 S_isSB(pTHX_ SB_enum before,
5494              SB_enum after,
5495              const U8 * const strbeg,
5496              const U8 * const curpos,
5497              const U8 * const strend,
5498              const bool utf8_target)
5499 {
5500     /* returns a boolean indicating if there is a Sentence Boundary Break
5501      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5502 
5503     U8 * lpos = (U8 *) curpos;
5504     bool has_para_sep = FALSE;
5505     bool has_sp = FALSE;
5506 
5507     PERL_ARGS_ASSERT_ISSB;
5508 
5509     /* Break at the start and end of text.
5510         SB1.  sot  ÷
5511         SB2.  ÷  eot
5512       But unstated in Unicode is don't break if the text is empty */
5513     if (before == SB_EDGE || after == SB_EDGE) {
5514         return before != after;
5515     }
5516 
5517     /* SB 3: Do not break within CRLF. */
5518     if (before == SB_CR && after == SB_LF) {
5519         return FALSE;
5520     }
5521 
5522     /* Break after paragraph separators.  CR and LF are considered
5523      * so because Unicode views text as like word processing text where there
5524      * are no newlines except between paragraphs, and the word processor takes
5525      * care of wrapping without there being hard line-breaks in the text *./
5526        SB4.  Sep | CR | LF  ÷ */
5527     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5528         return TRUE;
5529     }
5530 
5531     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5532      * (See Section 6.2, Replacing Ignore Rules.)
5533         SB5.  X (Extend | Format)*  →  X */
5534     if (after == SB_Extend || after == SB_Format) {
5535 
5536         /* Implied is that the these characters attach to everything
5537          * immediately prior to them except for those separator-type
5538          * characters.  And the rules earlier have already handled the case
5539          * when one of those immediately precedes the extend char */
5540         return FALSE;
5541     }
5542 
5543     if (before == SB_Extend || before == SB_Format) {
5544         U8 * temp_pos = lpos;
5545         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5546         if (   backup != SB_EDGE
5547             && backup != SB_Sep
5548             && backup != SB_CR
5549             && backup != SB_LF)
5550         {
5551             before = backup;
5552             lpos = temp_pos;
5553         }
5554 
5555         /* Here, both 'before' and 'backup' are these types; implied is that we
5556          * don't break between them */
5557         if (backup == SB_Extend || backup == SB_Format) {
5558             return FALSE;
5559         }
5560     }
5561 
5562     /* Do not break after ambiguous terminators like period, if they are
5563      * immediately followed by a number or lowercase letter, if they are
5564      * between uppercase letters, if the first following letter (optionally
5565      * after certain punctuation) is lowercase, or if they are followed by
5566      * "continuation" punctuation such as comma, colon, or semicolon. For
5567      * example, a period may be an abbreviation or numeric period, and thus may
5568      * not mark the end of a sentence.
5569 
5570      * SB6. ATerm  ×  Numeric */
5571     if (before == SB_ATerm && after == SB_Numeric) {
5572         return FALSE;
5573     }
5574 
5575     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5576     if (before == SB_ATerm && after == SB_Upper) {
5577         U8 * temp_pos = lpos;
5578         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5579         if (backup == SB_Upper || backup == SB_Lower) {
5580             return FALSE;
5581         }
5582     }
5583 
5584     /* The remaining rules that aren't the final one, all require an STerm or
5585      * an ATerm after having backed up over some Close* Sp*, and in one case an
5586      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5587      * So do that backup now, setting flags if either Sp or a paragraph
5588      * separator are found */
5589 
5590     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5591         has_para_sep = TRUE;
5592         before = backup_one_SB(strbeg, &lpos, utf8_target);
5593     }
5594 
5595     if (before == SB_Sp) {
5596         has_sp = TRUE;
5597         do {
5598             before = backup_one_SB(strbeg, &lpos, utf8_target);
5599         }
5600         while (before == SB_Sp);
5601     }
5602 
5603     while (before == SB_Close) {
5604         before = backup_one_SB(strbeg, &lpos, utf8_target);
5605     }
5606 
5607     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5608      * most cases an STerm */
5609     if (before == SB_STerm || before == SB_ATerm) {
5610 
5611         /* So, here the lhs matches
5612          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5613          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5614          * The rules that apply here are:
5615          *
5616          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5617                                            | LF | STerm | ATerm) )* Lower
5618            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5619            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5620            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5621            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5622          */
5623 
5624         /* And all but SB11 forbid having seen a paragraph separator */
5625         if (! has_para_sep) {
5626             if (before == SB_ATerm) {          /* SB8 */
5627                 U8 * rpos = (U8 *) curpos;
5628                 SB_enum later = after;
5629 
5630                 while (    later != SB_OLetter
5631                         && later != SB_Upper
5632                         && later != SB_Lower
5633                         && later != SB_Sep
5634                         && later != SB_CR
5635                         && later != SB_LF
5636                         && later != SB_STerm
5637                         && later != SB_ATerm
5638                         && later != SB_EDGE)
5639                 {
5640                     later = advance_one_SB(&rpos, strend, utf8_target);
5641                 }
5642                 if (later == SB_Lower) {
5643                     return FALSE;
5644                 }
5645             }
5646 
5647             if (   after == SB_SContinue    /* SB8a */
5648                 || after == SB_STerm
5649                 || after == SB_ATerm)
5650             {
5651                 return FALSE;
5652             }
5653 
5654             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5655                 if (   after == SB_Close
5656                     || after == SB_Sp
5657                     || after == SB_Sep
5658                     || after == SB_CR
5659                     || after == SB_LF)
5660                 {
5661                     return FALSE;
5662                 }
5663             }
5664 
5665             /* SB10.  This and SB9 could probably be combined some way, but khw
5666              * has decided to follow the Unicode rule book precisely for
5667              * simplified maintenance */
5668             if (   after == SB_Sp
5669                 || after == SB_Sep
5670                 || after == SB_CR
5671                 || after == SB_LF)
5672             {
5673                 return FALSE;
5674             }
5675         }
5676 
5677         /* SB11.  */
5678         return TRUE;
5679     }
5680 
5681     /* Otherwise, do not break.
5682     SB12.  Any  ×  Any */
5683 
5684     return FALSE;
5685 }
5686 
5687 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5688 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5689 {
5690     SB_enum sb;
5691 
5692     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5693 
5694     if (*curpos >= strend) {
5695         return SB_EDGE;
5696     }
5697 
5698     if (utf8_target) {
5699         do {
5700             *curpos += UTF8SKIP(*curpos);
5701             if (*curpos >= strend) {
5702                 return SB_EDGE;
5703             }
5704             sb = getSB_VAL_UTF8(*curpos, strend);
5705         } while (sb == SB_Extend || sb == SB_Format);
5706     }
5707     else {
5708         do {
5709             (*curpos)++;
5710             if (*curpos >= strend) {
5711                 return SB_EDGE;
5712             }
5713             sb = getSB_VAL_CP(**curpos);
5714         } while (sb == SB_Extend || sb == SB_Format);
5715     }
5716 
5717     return sb;
5718 }
5719 
5720 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5721 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5722 {
5723     SB_enum sb;
5724 
5725     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5726 
5727     if (*curpos < strbeg) {
5728         return SB_EDGE;
5729     }
5730 
5731     if (utf8_target) {
5732         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5733         if (! prev_char_pos) {
5734             return SB_EDGE;
5735         }
5736 
5737         /* Back up over Extend and Format.  curpos is always just to the right
5738          * of the characater whose value we are getting */
5739         do {
5740             U8 * prev_prev_char_pos;
5741             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5742                                                                       strbeg)))
5743             {
5744                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5745                 *curpos = prev_char_pos;
5746                 prev_char_pos = prev_prev_char_pos;
5747             }
5748             else {
5749                 *curpos = (U8 *) strbeg;
5750                 return SB_EDGE;
5751             }
5752         } while (sb == SB_Extend || sb == SB_Format);
5753     }
5754     else {
5755         do {
5756             if (*curpos - 2 < strbeg) {
5757                 *curpos = (U8 *) strbeg;
5758                 return SB_EDGE;
5759             }
5760             (*curpos)--;
5761             sb = getSB_VAL_CP(*(*curpos - 1));
5762         } while (sb == SB_Extend || sb == SB_Format);
5763     }
5764 
5765     return sb;
5766 }
5767 
5768 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5769 S_isWB(pTHX_ WB_enum previous,
5770              WB_enum before,
5771              WB_enum after,
5772              const U8 * const strbeg,
5773              const U8 * const curpos,
5774              const U8 * const strend,
5775              const bool utf8_target)
5776 {
5777     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5778      *  a Unicode word break, using their published algorithm, but tailored for
5779      *  Perl by treating spans of white space as one unit.  Context may be
5780      *  needed to make this determination.  If the value for the character
5781      *  before 'before' is known, it is passed as 'previous'; otherwise that
5782      *  should be set to WB_UNKNOWN.  The other input parameters give the
5783      *  boundaries and current position in the matching of the string.  That
5784      *  is, 'curpos' marks the position where the character whose wb value is
5785      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5786 
5787     U8 * before_pos = (U8 *) curpos;
5788     U8 * after_pos = (U8 *) curpos;
5789     WB_enum prev = before;
5790     WB_enum next;
5791 
5792     PERL_ARGS_ASSERT_ISWB;
5793 
5794     /* Rule numbers in the comments below are as of Unicode 9.0 */
5795 
5796   redo:
5797     before = prev;
5798     switch (WB_table[before][after]) {
5799         case WB_BREAKABLE:
5800             return TRUE;
5801 
5802         case WB_NOBREAK:
5803             return FALSE;
5804 
5805         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5806             next = advance_one_WB(&after_pos, strend, utf8_target,
5807                                  FALSE /* Don't skip Extend nor Format */ );
5808             /* A space immediately preceding an Extend or Format is attached
5809              * to by them, and hence gets separated from previous spaces.
5810              * Otherwise don't break between horizontal white space */
5811             return next == WB_Extend || next == WB_Format;
5812 
5813         /* WB4 Ignore Format and Extend characters, except when they appear at
5814          * the beginning of a region of text.  This code currently isn't
5815          * general purpose, but it works as the rules are currently and likely
5816          * to be laid out.  The reason it works is that when 'they appear at
5817          * the beginning of a region of text', the rule is to break before
5818          * them, just like any other character.  Therefore, the default rule
5819          * applies and we don't have to look in more depth.  Should this ever
5820          * change, we would have to have 2 'case' statements, like in the rules
5821          * below, and backup a single character (not spacing over the extend
5822          * ones) and then see if that is one of the region-end characters and
5823          * go from there */
5824         case WB_Ex_or_FO_or_ZWJ_then_foo:
5825             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5826             goto redo;
5827 
5828         case WB_DQ_then_HL + WB_BREAKABLE:
5829         case WB_DQ_then_HL + WB_NOBREAK:
5830 
5831             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5832 
5833             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5834                                                             == WB_Hebrew_Letter)
5835             {
5836                 return FALSE;
5837             }
5838 
5839              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5840 
5841         case WB_HL_then_DQ + WB_BREAKABLE:
5842         case WB_HL_then_DQ + WB_NOBREAK:
5843 
5844             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5845 
5846             if (advance_one_WB(&after_pos, strend, utf8_target,
5847                                        TRUE /* Do skip Extend and Format */ )
5848                                                             == WB_Hebrew_Letter)
5849             {
5850                 return FALSE;
5851             }
5852 
5853             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5854 
5855         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5856         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5857 
5858             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5859              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5860 
5861             next = advance_one_WB(&after_pos, strend, utf8_target,
5862                                        TRUE /* Do skip Extend and Format */ );
5863 
5864             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5865             {
5866                 return FALSE;
5867             }
5868 
5869             return WB_table[before][after]
5870                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5871 
5872         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5873         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5874 
5875             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5876              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5877 
5878             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5879             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5880             {
5881                 return FALSE;
5882             }
5883 
5884             return WB_table[before][after]
5885                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5886 
5887         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5888         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5889 
5890             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5891              * */
5892 
5893             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5894                                                             == WB_Numeric)
5895             {
5896                 return FALSE;
5897             }
5898 
5899             return WB_table[before][after]
5900                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5901 
5902         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5903         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5904 
5905             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5906 
5907             if (advance_one_WB(&after_pos, strend, utf8_target,
5908                                        TRUE /* Do skip Extend and Format */ )
5909                                                             == WB_Numeric)
5910             {
5911                 return FALSE;
5912             }
5913 
5914             return WB_table[before][after]
5915                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5916 
5917         case WB_RI_then_RI + WB_NOBREAK:
5918         case WB_RI_then_RI + WB_BREAKABLE:
5919             {
5920                 int RI_count = 1;
5921 
5922                 /* Do not break within emoji flag sequences. That is, do not
5923                  * break between regional indicator (RI) symbols if there is an
5924                  * odd number of RI characters before the potential break
5925                  * point.
5926                  *
5927                  * WB15   sot (RI RI)* RI × RI
5928                  * WB16 [^RI] (RI RI)* RI × RI */
5929 
5930                 while (backup_one_WB(&previous,
5931                                      strbeg,
5932                                      &before_pos,
5933                                      utf8_target) == WB_Regional_Indicator)
5934                 {
5935                     RI_count++;
5936                 }
5937 
5938                 return RI_count % 2 != 1;
5939             }
5940 
5941         default:
5942             break;
5943     }
5944 
5945 #ifdef DEBUGGING
5946     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5947                                   before, after, WB_table[before][after]);
5948     assert(0);
5949 #endif
5950     return TRUE;
5951 }
5952 
5953 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)5954 S_advance_one_WB(pTHX_ U8 ** curpos,
5955                        const U8 * const strend,
5956                        const bool utf8_target,
5957                        const bool skip_Extend_Format)
5958 {
5959     WB_enum wb;
5960 
5961     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5962 
5963     if (*curpos >= strend) {
5964         return WB_EDGE;
5965     }
5966 
5967     if (utf8_target) {
5968 
5969         /* Advance over Extend and Format */
5970         do {
5971             *curpos += UTF8SKIP(*curpos);
5972             if (*curpos >= strend) {
5973                 return WB_EDGE;
5974             }
5975             wb = getWB_VAL_UTF8(*curpos, strend);
5976         } while (    skip_Extend_Format
5977                  && (wb == WB_Extend || wb == WB_Format));
5978     }
5979     else {
5980         do {
5981             (*curpos)++;
5982             if (*curpos >= strend) {
5983                 return WB_EDGE;
5984             }
5985             wb = getWB_VAL_CP(**curpos);
5986         } while (    skip_Extend_Format
5987                  && (wb == WB_Extend || wb == WB_Format));
5988     }
5989 
5990     return wb;
5991 }
5992 
5993 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5994 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5995 {
5996     WB_enum wb;
5997 
5998     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5999 
6000     /* If we know what the previous character's break value is, don't have
6001         * to look it up */
6002     if (*previous != WB_UNKNOWN) {
6003         wb = *previous;
6004 
6005         /* But we need to move backwards by one */
6006         if (utf8_target) {
6007             *curpos = reghopmaybe3(*curpos, -1, strbeg);
6008             if (! *curpos) {
6009                 *previous = WB_EDGE;
6010                 *curpos = (U8 *) strbeg;
6011             }
6012             else {
6013                 *previous = WB_UNKNOWN;
6014             }
6015         }
6016         else {
6017             (*curpos)--;
6018             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6019         }
6020 
6021         /* And we always back up over these three types */
6022         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6023             return wb;
6024         }
6025     }
6026 
6027     if (*curpos < strbeg) {
6028         return WB_EDGE;
6029     }
6030 
6031     if (utf8_target) {
6032         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6033         if (! prev_char_pos) {
6034             return WB_EDGE;
6035         }
6036 
6037         /* Back up over Extend and Format.  curpos is always just to the right
6038          * of the characater whose value we are getting */
6039         do {
6040             U8 * prev_prev_char_pos;
6041             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6042                                                    -1,
6043                                                    strbeg)))
6044             {
6045                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6046                 *curpos = prev_char_pos;
6047                 prev_char_pos = prev_prev_char_pos;
6048             }
6049             else {
6050                 *curpos = (U8 *) strbeg;
6051                 return WB_EDGE;
6052             }
6053         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6054     }
6055     else {
6056         do {
6057             if (*curpos - 2 < strbeg) {
6058                 *curpos = (U8 *) strbeg;
6059                 return WB_EDGE;
6060             }
6061             (*curpos)--;
6062             wb = getWB_VAL_CP(*(*curpos - 1));
6063         } while (wb == WB_Extend || wb == WB_Format);
6064     }
6065 
6066     return wb;
6067 }
6068 
6069 /* Macros for regmatch(), using its internal variables */
6070 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6071 #define NEXTCHR_IS_EOS (nextbyte < 0)
6072 
6073 #define SET_nextchr \
6074     nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6075 
6076 #define SET_locinput(p) \
6077     locinput = (p);  \
6078     SET_nextchr
6079 
6080 #define sayYES goto yes
6081 #define sayNO goto no
6082 #define sayNO_SILENT goto no_silent
6083 
6084 /* we dont use STMT_START/END here because it leads to
6085    "unreachable code" warnings, which are bogus, but distracting. */
6086 #define CACHEsayNO \
6087     if (ST.cache_mask) \
6088        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6089     sayNO
6090 
6091 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6092 (                                                           \
6093     (   ( st )                                         ) && \
6094     (   ( st )->u.eval.close_paren                     ) && \
6095     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6096 )
6097 
6098 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6099 (                                                           \
6100     (   ( st )                                         ) && \
6101     (   ( st )->u.eval.close_paren                     ) && \
6102     (   ( expr )                                       ) && \
6103     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6104 )
6105 
6106 
6107 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6108     (st)->u.eval.close_paren = ( (expr) + 1 )
6109 
6110 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6111     (st)->u.eval.close_paren = 0
6112 
6113 /* push a new state then goto it */
6114 
6115 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6116     pushinput = input; \
6117     pusheol = eol; \
6118     pushsr0 = sr0; \
6119     scan = node; \
6120     st->resume_state = state; \
6121     goto push_state;
6122 
6123 /* push a new state with success backtracking, then goto it */
6124 
6125 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6126     pushinput = input; \
6127     pusheol = eol;     \
6128     pushsr0 = sr0; \
6129     scan = node; \
6130     st->resume_state = state; \
6131     goto push_yes_state;
6132 
6133 #define DEBUG_STATE_pp(pp)                                  \
6134     DEBUG_STATE_r({                                         \
6135         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6136         Perl_re_printf( aTHX_                               \
6137             "%*s" pp " %s%s%s%s%s\n",                       \
6138             INDENT_CHARS(depth), "",                        \
6139             PL_reg_name[st->resume_state],                  \
6140             ((st==yes_state||st==mark_state) ? "[" : ""),   \
6141             ((st==yes_state) ? "Y" : ""),                   \
6142             ((st==mark_state) ? "M" : ""),                  \
6143             ((st==yes_state||st==mark_state) ? "]" : "")    \
6144         );                                                  \
6145     });
6146 
6147 /*
6148 
6149 regmatch() - main matching routine
6150 
6151 This is basically one big switch statement in a loop. We execute an op,
6152 set 'next' to point the next op, and continue. If we come to a point which
6153 we may need to backtrack to on failure such as (A|B|C), we push a
6154 backtrack state onto the backtrack stack. On failure, we pop the top
6155 state, and re-enter the loop at the state indicated. If there are no more
6156 states to pop, we return failure.
6157 
6158 Sometimes we also need to backtrack on success; for example /A+/, where
6159 after successfully matching one A, we need to go back and try to
6160 match another one; similarly for lookahead assertions: if the assertion
6161 completes successfully, we backtrack to the state just before the assertion
6162 and then carry on.  In these cases, the pushed state is marked as
6163 'backtrack on success too'. This marking is in fact done by a chain of
6164 pointers, each pointing to the previous 'yes' state. On success, we pop to
6165 the nearest yes state, discarding any intermediate failure-only states.
6166 Sometimes a yes state is pushed just to force some cleanup code to be
6167 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6168 it to free the inner regex.
6169 
6170 Note that failure backtracking rewinds the cursor position, while
6171 success backtracking leaves it alone.
6172 
6173 A pattern is complete when the END op is executed, while a subpattern
6174 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6175 ops trigger the "pop to last yes state if any, otherwise return true"
6176 behaviour.
6177 
6178 A common convention in this function is to use A and B to refer to the two
6179 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6180 the subpattern to be matched possibly multiple times, while B is the entire
6181 rest of the pattern. Variable and state names reflect this convention.
6182 
6183 The states in the main switch are the union of ops and failure/success of
6184 substates associated with that op.  For example, IFMATCH is the op
6185 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6186 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6187 successfully matched A and IFMATCH_A_fail is a state saying that we have
6188 just failed to match A. Resume states always come in pairs. The backtrack
6189 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6190 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6191 on success or failure.
6192 
6193 The struct that holds a backtracking state is actually a big union, with
6194 one variant for each major type of op. The variable st points to the
6195 top-most backtrack struct. To make the code clearer, within each
6196 block of code we #define ST to alias the relevant union.
6197 
6198 Here's a concrete example of a (vastly oversimplified) IFMATCH
6199 implementation:
6200 
6201     switch (state) {
6202     ....
6203 
6204 #define ST st->u.ifmatch
6205 
6206     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6207         ST.foo = ...; // some state we wish to save
6208         ...
6209         // push a yes backtrack state with a resume value of
6210         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6211         // first node of A:
6212         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6213         // NOTREACHED
6214 
6215     case IFMATCH_A: // we have successfully executed A; now continue with B
6216         next = B;
6217         bar = ST.foo; // do something with the preserved value
6218         break;
6219 
6220     case IFMATCH_A_fail: // A failed, so the assertion failed
6221         ...;   // do some housekeeping, then ...
6222         sayNO; // propagate the failure
6223 
6224 #undef ST
6225 
6226     ...
6227     }
6228 
6229 For any old-timers reading this who are familiar with the old recursive
6230 approach, the code above is equivalent to:
6231 
6232     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6233     {
6234         int foo = ...
6235         ...
6236         if (regmatch(A)) {
6237             next = B;
6238             bar = foo;
6239             break;
6240         }
6241         ...;   // do some housekeeping, then ...
6242         sayNO; // propagate the failure
6243     }
6244 
6245 The topmost backtrack state, pointed to by st, is usually free. If you
6246 want to claim it, populate any ST.foo fields in it with values you wish to
6247 save, then do one of
6248 
6249         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6250         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6251 
6252 which sets that backtrack state's resume value to 'resume_state', pushes a
6253 new free entry to the top of the backtrack stack, then goes to 'node'.
6254 On backtracking, the free slot is popped, and the saved state becomes the
6255 new free state. An ST.foo field in this new top state can be temporarily
6256 accessed to retrieve values, but once the main loop is re-entered, it
6257 becomes available for reuse.
6258 
6259 Note that the depth of the backtrack stack constantly increases during the
6260 left-to-right execution of the pattern, rather than going up and down with
6261 the pattern nesting. For example the stack is at its maximum at Z at the
6262 end of the pattern, rather than at X in the following:
6263 
6264     /(((X)+)+)+....(Y)+....Z/
6265 
6266 The only exceptions to this are lookahead/behind assertions and the cut,
6267 (?>A), which pop all the backtrack states associated with A before
6268 continuing.
6269 
6270 Backtrack state structs are allocated in slabs of about 4K in size.
6271 PL_regmatch_state and st always point to the currently active state,
6272 and PL_regmatch_slab points to the slab currently containing
6273 PL_regmatch_state.  The first time regmatch() is called, the first slab is
6274 allocated, and is never freed until interpreter destruction. When the slab
6275 is full, a new one is allocated and chained to the end. At exit from
6276 regmatch(), slabs allocated since entry are freed.
6277 
6278 In order to work with variable length lookbehinds, an upper limit is placed on
6279 lookbehinds which is set to where the match position is at the end of where the
6280 lookbehind would get to.  Nothing in the lookbehind should match above that,
6281 except we should be able to look beyond if for things like \b, which need the
6282 next character in the string to be able to determine if this is a boundary or
6283 not.  We also can't match the end of string/line unless we are also at the end
6284 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6285 that match a width, we have to add a condition that they are within the legal
6286 bounds of our window into the string.
6287 
6288 */
6289 
6290 /* returns -1 on failure, $+[0] on success */
6291 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6292 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6293 {
6294     const bool utf8_target = reginfo->is_utf8_target;
6295     const U32 uniflags = UTF8_ALLOW_DEFAULT;
6296     REGEXP *rex_sv = reginfo->prog;
6297     regexp *rex = ReANY(rex_sv);
6298     RXi_GET_DECL(rex,rexi);
6299     /* the current state. This is a cached copy of PL_regmatch_state */
6300     regmatch_state *st;
6301     /* cache heavy used fields of st in registers */
6302     regnode *scan;
6303     regnode *next;
6304     U32 n = 0;	/* general value; init to avoid compiler warning */
6305     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6306     SSize_t endref = 0; /* offset of end of backref when ln is start */
6307     char *locinput = startpos;
6308     char *loceol = reginfo->strend;
6309     char *pushinput; /* where to continue after a PUSH */
6310     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6311     U8   *pushsr0;   /* save starting pos of script run */
6312     PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6313                                      at EOS */
6314 
6315     bool result = 0;	    /* return value of S_regmatch */
6316     U32 depth = 0;            /* depth of backtrack stack */
6317     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6318     const U32 max_nochange_depth =
6319         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6320         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6321     regmatch_state *yes_state = NULL; /* state to pop to on success of
6322                                                             subpattern */
6323     /* mark_state piggy backs on the yes_state logic so that when we unwind
6324        the stack on success we can update the mark_state as we go */
6325     regmatch_state *mark_state = NULL; /* last mark state we have seen */
6326     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6327     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6328     U32 state_num;
6329     bool no_final = 0;      /* prevent failure from backtracking? */
6330     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6331     char *startpoint = locinput;
6332     SV *popmark = NULL;     /* are we looking for a mark? */
6333     SV *sv_commit = NULL;   /* last mark name seen in failure */
6334     SV *sv_yes_mark = NULL; /* last mark name we have seen
6335                                during a successful match */
6336     U32 lastopen = 0;       /* last open we saw */
6337     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6338     SV* const oreplsv = GvSVn(PL_replgv);
6339     /* these three flags are set by various ops to signal information to
6340      * the very next op. They have a useful lifetime of exactly one loop
6341      * iteration, and are not preserved or restored by state pushes/pops
6342      */
6343     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
6344     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
6345     int logical = 0;	    /* the following EVAL is:
6346                                 0: (?{...})
6347                                 1: (?(?{...})X|Y)
6348                                 2: (??{...})
6349                                or the following IFMATCH/UNLESSM is:
6350                                 false: plain (?=foo)
6351                                 true:  used as a condition: (?(?=foo))
6352                             */
6353     PAD* last_pad = NULL;
6354     dMULTICALL;
6355     U8 gimme = G_SCALAR;
6356     CV *caller_cv = NULL;	/* who called us */
6357     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
6358     U32 maxopenparen = 0;       /* max '(' index seen so far */
6359     int to_complement;  /* Invert the result? */
6360     _char_class_number classnum;
6361     bool is_utf8_pat = reginfo->is_utf8_pat;
6362     bool match = FALSE;
6363     I32 orig_savestack_ix = PL_savestack_ix;
6364     U8 * script_run_begin = NULL;
6365 
6366 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6367 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6368 #  define SOLARIS_BAD_OPTIMIZER
6369     const U32 *pl_charclass_dup = PL_charclass;
6370 #  define PL_charclass pl_charclass_dup
6371 #endif
6372 
6373 #ifdef DEBUGGING
6374     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6375 #endif
6376 
6377     /* protect against undef(*^R) */
6378     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6379 
6380     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6381     multicall_oldcatch = 0;
6382     PERL_UNUSED_VAR(multicall_cop);
6383 
6384     PERL_ARGS_ASSERT_REGMATCH;
6385 
6386     st = PL_regmatch_state;
6387 
6388     /* Note that nextbyte is a byte even in UTF */
6389     SET_nextchr;
6390     scan = prog;
6391 
6392     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6393             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6394             Perl_re_printf( aTHX_ "regmatch start\n" );
6395     }));
6396 
6397     while (scan != NULL) {
6398         next = scan + NEXT_OFF(scan);
6399         if (next == scan)
6400             next = NULL;
6401         state_num = OP(scan);
6402 
6403       reenter_switch:
6404         DEBUG_EXECUTE_r(
6405             if (state_num <= REGNODE_MAX) {
6406                 SV * const prop = sv_newmortal();
6407                 regnode *rnext = regnext(scan);
6408 
6409                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6410                 regprop(rex, prop, scan, reginfo, NULL);
6411                 Perl_re_printf( aTHX_
6412                     "%*s%" IVdf ":%s(%" IVdf ")\n",
6413                     INDENT_CHARS(depth), "",
6414                     (IV)(scan - rexi->program),
6415                     SvPVX_const(prop),
6416                     (PL_regkind[OP(scan)] == END || !rnext) ?
6417                         0 : (IV)(rnext - rexi->program));
6418             }
6419         );
6420 
6421         to_complement = 0;
6422 
6423         SET_nextchr;
6424         assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6425 
6426         switch (state_num) {
6427         case SBOL: /*  /^../ and /\A../  */
6428             if (locinput == reginfo->strbeg)
6429                 break;
6430             sayNO;
6431 
6432         case MBOL: /*  /^../m  */
6433             if (locinput == reginfo->strbeg ||
6434                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6435             {
6436                 break;
6437             }
6438             sayNO;
6439 
6440         case GPOS: /*  \G  */
6441             if (locinput == reginfo->ganch)
6442                 break;
6443             sayNO;
6444 
6445         case KEEPS: /*   \K  */
6446             /* update the startpoint */
6447             st->u.keeper.val = rex->offs[0].start;
6448             rex->offs[0].start = locinput - reginfo->strbeg;
6449             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6450                             script_run_begin);
6451             NOT_REACHED; /* NOTREACHED */
6452 
6453         case KEEPS_next_fail:
6454             /* rollback the start point change */
6455             rex->offs[0].start = st->u.keeper.val;
6456             sayNO_SILENT;
6457             NOT_REACHED; /* NOTREACHED */
6458 
6459         case MEOL: /* /..$/m  */
6460             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6461                 sayNO;
6462             break;
6463 
6464         case SEOL: /* /..$/  */
6465             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6466                 sayNO;
6467             if (reginfo->strend - locinput > 1)
6468                 sayNO;
6469             break;
6470 
6471         case EOS: /*  \z  */
6472             if (!NEXTCHR_IS_EOS)
6473                 sayNO;
6474             break;
6475 
6476         case SANY: /*  /./s  */
6477             if (NEXTCHR_IS_EOS || locinput >= loceol)
6478                 sayNO;
6479             goto increment_locinput;
6480 
6481         case REG_ANY: /*  /./  */
6482             if (   NEXTCHR_IS_EOS
6483                 || locinput >= loceol
6484                 || nextbyte == '\n')
6485             {
6486                 sayNO;
6487             }
6488             goto increment_locinput;
6489 
6490 
6491 #undef  ST
6492 #define ST st->u.trie
6493         case TRIEC: /* (ab|cd) with known charclass */
6494             /* In this case the charclass data is available inline so
6495                we can fail fast without a lot of extra overhead.
6496              */
6497             if ( !   NEXTCHR_IS_EOS
6498                 &&   locinput < loceol
6499                 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6500             {
6501                 DEBUG_EXECUTE_r(
6502                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6503                               depth, PL_colors[4], PL_colors[5])
6504                 );
6505                 sayNO_SILENT;
6506                 NOT_REACHED; /* NOTREACHED */
6507             }
6508             /* FALLTHROUGH */
6509         case TRIE:  /* (ab|cd)  */
6510             /* the basic plan of execution of the trie is:
6511              * At the beginning, run though all the states, and
6512              * find the longest-matching word. Also remember the position
6513              * of the shortest matching word. For example, this pattern:
6514              *    1  2 3 4    5
6515              *    ab|a|x|abcd|abc
6516              * when matched against the string "abcde", will generate
6517              * accept states for all words except 3, with the longest
6518              * matching word being 4, and the shortest being 2 (with
6519              * the position being after char 1 of the string).
6520              *
6521              * Then for each matching word, in word order (i.e. 1,2,4,5),
6522              * we run the remainder of the pattern; on each try setting
6523              * the current position to the character following the word,
6524              * returning to try the next word on failure.
6525              *
6526              * We avoid having to build a list of words at runtime by
6527              * using a compile-time structure, wordinfo[].prev, which
6528              * gives, for each word, the previous accepting word (if any).
6529              * In the case above it would contain the mappings 1->2, 2->0,
6530              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6531              * the longest word (4 above), a list of all words, by
6532              * following the list of prev pointers; this gives us the
6533              * unordered list 4,5,1,2. Then given the current word we have
6534              * just tried, we can go through the list and find the
6535              * next-biggest word to try (so if we just failed on word 2,
6536              * the next in the list is 4).
6537              *
6538              * Since at runtime we don't record the matching position in
6539              * the string for each word, we have to work that out for
6540              * each word we're about to process. The wordinfo table holds
6541              * the character length of each word; given that we recorded
6542              * at the start: the position of the shortest word and its
6543              * length in chars, we just need to move the pointer the
6544              * difference between the two char lengths. Depending on
6545              * Unicode status and folding, that's cheap or expensive.
6546              *
6547              * This algorithm is optimised for the case where are only a
6548              * small number of accept states, i.e. 0,1, or maybe 2.
6549              * With lots of accepts states, and having to try all of them,
6550              * it becomes quadratic on number of accept states to find all
6551              * the next words.
6552              */
6553 
6554             {
6555                 /* what type of TRIE am I? (utf8 makes this contextual) */
6556                 DECL_TRIE_TYPE(scan);
6557 
6558                 /* what trie are we using right now */
6559                 reg_trie_data * const trie
6560                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6561                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6562                 U32 state = trie->startstate;
6563 
6564                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6565                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6566                     if (utf8_target
6567                         && ! NEXTCHR_IS_EOS
6568                         && UTF8_IS_ABOVE_LATIN1(nextbyte)
6569                         && scan->flags == EXACTL)
6570                     {
6571                         /* We only output for EXACTL, as we let the folder
6572                          * output this message for EXACTFLU8 to avoid
6573                          * duplication */
6574                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6575                                                                reginfo->strend);
6576                     }
6577                 }
6578                 if (   trie->bitmap
6579                     && (     NEXTCHR_IS_EOS
6580                         ||   locinput >= loceol
6581                         || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6582                 {
6583                     if (trie->states[ state ].wordnum) {
6584                          DEBUG_EXECUTE_r(
6585                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6586                                           depth, PL_colors[4], PL_colors[5])
6587                         );
6588                         if (!trie->jump)
6589                             break;
6590                     } else {
6591                         DEBUG_EXECUTE_r(
6592                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6593                                           depth, PL_colors[4], PL_colors[5])
6594                         );
6595                         sayNO_SILENT;
6596                    }
6597                 }
6598 
6599             {
6600                 U8 *uc = ( U8* )locinput;
6601 
6602                 STRLEN len = 0;
6603                 STRLEN foldlen = 0;
6604                 U8 *uscan = (U8*)NULL;
6605                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6606                 U32 charcount = 0; /* how many input chars we have matched */
6607                 U32 accepted = 0; /* have we seen any accepting states? */
6608 
6609                 ST.jump = trie->jump;
6610                 ST.me = scan;
6611                 ST.firstpos = NULL;
6612                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6613                 ST.nextword = 0;
6614 
6615                 /* fully traverse the TRIE; note the position of the
6616                    shortest accept state and the wordnum of the longest
6617                    accept state */
6618 
6619                 while ( state && uc <= (U8*)(loceol) ) {
6620                     U32 base = trie->states[ state ].trans.base;
6621                     UV uvc = 0;
6622                     U16 charid = 0;
6623                     U16 wordnum;
6624                     wordnum = trie->states[ state ].wordnum;
6625 
6626                     if (wordnum) { /* it's an accept state */
6627                         if (!accepted) {
6628                             accepted = 1;
6629                             /* record first match position */
6630                             if (ST.longfold) {
6631                                 ST.firstpos = (U8*)locinput;
6632                                 ST.firstchars = 0;
6633                             }
6634                             else {
6635                                 ST.firstpos = uc;
6636                                 ST.firstchars = charcount;
6637                             }
6638                         }
6639                         if (!ST.nextword || wordnum < ST.nextword)
6640                             ST.nextword = wordnum;
6641                         ST.topword = wordnum;
6642                     }
6643 
6644                     DEBUG_TRIE_EXECUTE_r({
6645                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6646                                 /* HERE */
6647                                 PerlIO_printf( Perl_debug_log,
6648                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6649                                     INDENT_CHARS(depth), "", PL_colors[4],
6650                                     (UV)state, (accepted ? 'Y' : 'N'));
6651                     });
6652 
6653                     /* read a char and goto next state */
6654                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6655                         I32 offset;
6656                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6657                                              (U8 *) loceol, uscan,
6658                                              len, uvc, charid, foldlen,
6659                                              foldbuf, uniflags);
6660                         charcount++;
6661                         if (foldlen>0)
6662                             ST.longfold = TRUE;
6663                         if (charid &&
6664                              ( ((offset =
6665                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6666 
6667                              && ((U32)offset < trie->lasttrans)
6668                              && trie->trans[offset].check == state)
6669                         {
6670                             state = trie->trans[offset].next;
6671                         }
6672                         else {
6673                             state = 0;
6674                         }
6675                         uc += len;
6676 
6677                     }
6678                     else {
6679                         state = 0;
6680                     }
6681                     DEBUG_TRIE_EXECUTE_r(
6682                         Perl_re_printf( aTHX_
6683                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6684                             charid, uvc, (UV)state, PL_colors[5] );
6685                     );
6686                 }
6687                 if (!accepted)
6688                    sayNO;
6689 
6690                 /* calculate total number of accept states */
6691                 {
6692                     U16 w = ST.topword;
6693                     accepted = 0;
6694                     while (w) {
6695                         w = trie->wordinfo[w].prev;
6696                         accepted++;
6697                     }
6698                     ST.accepted = accepted;
6699                 }
6700 
6701                 DEBUG_EXECUTE_r(
6702                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6703                         depth,
6704                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6705                 );
6706                 goto trie_first_try; /* jump into the fail handler */
6707             }}
6708             NOT_REACHED; /* NOTREACHED */
6709 
6710         case TRIE_next_fail: /* we failed - try next alternative */
6711         {
6712             U8 *uc;
6713             if ( ST.jump ) {
6714                 /* undo any captures done in the tail part of a branch,
6715                  * e.g.
6716                  *    /(?:X(.)(.)|Y(.)).../
6717                  * where the trie just matches X then calls out to do the
6718                  * rest of the branch */
6719                 REGCP_UNWIND(ST.cp);
6720                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6721             }
6722             if (!--ST.accepted) {
6723                 DEBUG_EXECUTE_r({
6724                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6725                         depth,
6726                         PL_colors[4],
6727                         PL_colors[5] );
6728                 });
6729                 sayNO_SILENT;
6730             }
6731             {
6732                 /* Find next-highest word to process.  Note that this code
6733                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6734                 U16 min = 0;
6735                 U16 word;
6736                 U16 const nextword = ST.nextword;
6737                 reg_trie_wordinfo * const wordinfo
6738                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6739                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6740                     if (word > nextword && (!min || word < min))
6741                         min = word;
6742                 }
6743                 ST.nextword = min;
6744             }
6745 
6746           trie_first_try:
6747             if (do_cutgroup) {
6748                 do_cutgroup = 0;
6749                 no_final = 0;
6750             }
6751 
6752             if ( ST.jump ) {
6753                 ST.lastparen = rex->lastparen;
6754                 ST.lastcloseparen = rex->lastcloseparen;
6755                 REGCP_SET(ST.cp);
6756             }
6757 
6758             /* find start char of end of current word */
6759             {
6760                 U32 chars; /* how many chars to skip */
6761                 reg_trie_data * const trie
6762                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6763 
6764                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6765                             >=  ST.firstchars);
6766                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6767                             - ST.firstchars;
6768                 uc = ST.firstpos;
6769 
6770                 if (ST.longfold) {
6771                     /* the hard option - fold each char in turn and find
6772                      * its folded length (which may be different */
6773                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6774                     STRLEN foldlen;
6775                     STRLEN len;
6776                     UV uvc;
6777                     U8 *uscan;
6778 
6779                     while (chars) {
6780                         if (utf8_target) {
6781                             /* XXX This assumes the length is well-formed, as
6782                              * does the UTF8SKIP below */
6783                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6784                                                     uniflags);
6785                             uc += len;
6786                         }
6787                         else {
6788                             uvc = *uc;
6789                             uc++;
6790                         }
6791                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6792                         uscan = foldbuf;
6793                         while (foldlen) {
6794                             if (!--chars)
6795                                 break;
6796                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6797                                                  uniflags);
6798                             uscan += len;
6799                             foldlen -= len;
6800                         }
6801                     }
6802                 }
6803                 else {
6804                     if (utf8_target)
6805                         uc = utf8_hop(uc, chars);
6806                     else
6807                         uc += chars;
6808                 }
6809             }
6810 
6811             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6812                             ? ST.jump[ST.nextword]
6813                             : NEXT_OFF(ST.me));
6814 
6815             DEBUG_EXECUTE_r({
6816                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6817                     depth,
6818                     PL_colors[4],
6819                     ST.nextword,
6820                     PL_colors[5]
6821                     );
6822             });
6823 
6824             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6825                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6826                                 script_run_begin);
6827                 NOT_REACHED; /* NOTREACHED */
6828             }
6829             /* only one choice left - just continue */
6830             DEBUG_EXECUTE_r({
6831                 AV *const trie_words
6832                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6833                 SV ** const tmp = trie_words
6834                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6835                 SV *sv= tmp ? sv_newmortal() : NULL;
6836 
6837                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6838                     depth, PL_colors[4],
6839                     ST.nextword,
6840                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6841                             PL_colors[0], PL_colors[1],
6842                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6843                         )
6844                     : "not compiled under -Dr",
6845                     PL_colors[5] );
6846             });
6847 
6848             locinput = (char*)uc;
6849             continue; /* execute rest of RE */
6850             /* NOTREACHED */
6851         }
6852 #undef  ST
6853 
6854         case LEXACT_REQ8:
6855             if (! utf8_target) {
6856                 sayNO;
6857             }
6858             /* FALLTHROUGH */
6859 
6860         case LEXACT:
6861         {
6862             char *s;
6863 
6864             s = STRINGl(scan);
6865             ln = STR_LENl(scan);
6866             goto join_short_long_exact;
6867 
6868         case EXACTL:             /*  /abc/l       */
6869             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6870 
6871             /* Complete checking would involve going through every character
6872              * matched by the string to see if any is above latin1.  But the
6873              * comparision otherwise might very well be a fast assembly
6874              * language routine, and I (khw) don't think slowing things down
6875              * just to check for this warning is worth it.  So this just checks
6876              * the first character */
6877             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6878                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6879             }
6880             goto do_exact;
6881         case EXACT_REQ8:
6882             if (! utf8_target) {
6883                 sayNO;
6884             }
6885             /* FALLTHROUGH */
6886 
6887         case EXACT:             /*  /abc/        */
6888           do_exact:
6889             s = STRINGs(scan);
6890             ln = STR_LENs(scan);
6891 
6892           join_short_long_exact:
6893             if (utf8_target != is_utf8_pat) {
6894                 /* The target and the pattern have differing utf8ness. */
6895                 char *l = locinput;
6896                 const char * const e = s + ln;
6897 
6898                 if (utf8_target) {
6899                     /* The target is utf8, the pattern is not utf8.
6900                      * Above-Latin1 code points can't match the pattern;
6901                      * invariants match exactly, and the other Latin1 ones need
6902                      * to be downgraded to a single byte in order to do the
6903                      * comparison.  (If we could be confident that the target
6904                      * is not malformed, this could be refactored to have fewer
6905                      * tests by just assuming that if the first bytes match, it
6906                      * is an invariant, but there are tests in the test suite
6907                      * dealing with (??{...}) which violate this) */
6908                     while (s < e) {
6909                         if (   l >= loceol
6910                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6911                         {
6912                             sayNO;
6913                         }
6914                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6915                             if (*l != *s) {
6916                                 sayNO;
6917                             }
6918                             l++;
6919                         }
6920                         else {
6921                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6922                             {
6923                                 sayNO;
6924                             }
6925                             l += 2;
6926                         }
6927                         s++;
6928                     }
6929                 }
6930                 else {
6931                     /* The target is not utf8, the pattern is utf8. */
6932                     while (s < e) {
6933                         if (   l >= loceol
6934                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6935                         {
6936                             sayNO;
6937                         }
6938                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6939                             if (*s != *l) {
6940                                 sayNO;
6941                             }
6942                             s++;
6943                         }
6944                         else {
6945                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6946                             {
6947                                 sayNO;
6948                             }
6949                             s += 2;
6950                         }
6951                         l++;
6952                     }
6953                 }
6954                 locinput = l;
6955             }
6956             else {
6957                 /* The target and the pattern have the same utf8ness. */
6958                 /* Inline the first character, for speed. */
6959                 if (   loceol - locinput < ln
6960                     || UCHARAT(s) != nextbyte
6961                     || (ln > 1 && memNE(s, locinput, ln)))
6962                 {
6963                     sayNO;
6964                 }
6965                 locinput += ln;
6966             }
6967             break;
6968             }
6969 
6970         case EXACTFL:            /*  /abc/il      */
6971           {
6972             re_fold_t folder;
6973             const U8 * fold_array;
6974             const char * s;
6975             U32 fold_utf8_flags;
6976 
6977             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6978             folder = foldEQ_locale;
6979             fold_array = PL_fold_locale;
6980             fold_utf8_flags = FOLDEQ_LOCALE;
6981             goto do_exactf;
6982 
6983         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6984                                       is effectively /u; hence to match, target
6985                                       must be UTF-8. */
6986             if (! utf8_target) {
6987                 sayNO;
6988             }
6989             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6990                                              | FOLDEQ_S2_FOLDS_SANE;
6991             folder = foldEQ_latin1_s2_folded;
6992             fold_array = PL_fold_latin1;
6993             goto do_exactf;
6994 
6995         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
6996             if (! utf8_target) {
6997                 sayNO;
6998             }
6999             assert(is_utf8_pat);
7000             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7001             goto do_exactf;
7002 
7003         case EXACTFUP:          /*  /foo/iu, and something is problematic in
7004                                     'foo' so can't take shortcuts. */
7005             assert(! is_utf8_pat);
7006             folder = foldEQ_latin1;
7007             fold_array = PL_fold_latin1;
7008             fold_utf8_flags = 0;
7009             goto do_exactf;
7010 
7011         case EXACTFU:            /*  /abc/iu      */
7012             folder = foldEQ_latin1_s2_folded;
7013             fold_array = PL_fold_latin1;
7014             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7015             goto do_exactf;
7016 
7017         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7018                                    patterns */
7019             assert(! is_utf8_pat);
7020             /* FALLTHROUGH */
7021         case EXACTFAA:            /*  /abc/iaa     */
7022             folder = foldEQ_latin1_s2_folded;
7023             fold_array = PL_fold_latin1;
7024             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7025             if (is_utf8_pat || ! utf8_target) {
7026 
7027                 /* The possible presence of a MICRO SIGN in the pattern forbids
7028                  * us to view a non-UTF-8 pattern as folded when there is a
7029                  * UTF-8 target */
7030                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7031                                   |FOLDEQ_S2_FOLDS_SANE;
7032             }
7033             goto do_exactf;
7034 
7035 
7036         case EXACTF:             /*  /abc/i    This node only generated for
7037                                                non-utf8 patterns */
7038             assert(! is_utf8_pat);
7039             folder = foldEQ;
7040             fold_array = PL_fold;
7041             fold_utf8_flags = 0;
7042 
7043           do_exactf:
7044             s = STRINGs(scan);
7045             ln = STR_LENs(scan);
7046 
7047             if (   utf8_target
7048                 || is_utf8_pat
7049                 || state_num == EXACTFUP
7050                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7051             {
7052               /* Either target or the pattern are utf8, or has the issue where
7053                * the fold lengths may differ. */
7054                 const char * const l = locinput;
7055                 char *e = loceol;
7056 
7057                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7058                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7059                 {
7060                     sayNO;
7061                 }
7062                 locinput = e;
7063                 break;
7064             }
7065 
7066             /* Neither the target nor the pattern are utf8 */
7067             if (UCHARAT(s) != nextbyte
7068                 && !NEXTCHR_IS_EOS
7069                 && UCHARAT(s) != fold_array[nextbyte])
7070             {
7071                 sayNO;
7072             }
7073             if (loceol - locinput < ln)
7074                 sayNO;
7075             if (ln > 1 && ! folder(locinput, s, ln))
7076                 sayNO;
7077             locinput += ln;
7078             break;
7079         }
7080 
7081         case NBOUNDL: /*  /\B/l  */
7082             to_complement = 1;
7083             /* FALLTHROUGH */
7084 
7085         case BOUNDL:  /*  /\b/l  */
7086         {
7087             bool b1, b2;
7088             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7089 
7090             if (FLAGS(scan) != TRADITIONAL_BOUND) {
7091                 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7092                 goto boundu;
7093             }
7094 
7095             if (utf8_target) {
7096                 if (locinput == reginfo->strbeg)
7097                     b1 = isWORDCHAR_LC('\n');
7098                 else {
7099                     U8 *p = reghop3((U8*)locinput, -1,
7100                                     (U8*)(reginfo->strbeg));
7101                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7102                 }
7103                 b2 = (NEXTCHR_IS_EOS)
7104                     ? isWORDCHAR_LC('\n')
7105                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7106                                               (U8*) reginfo->strend);
7107             }
7108             else { /* Here the string isn't utf8 */
7109                 b1 = (locinput == reginfo->strbeg)
7110                      ? isWORDCHAR_LC('\n')
7111                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
7112                 b2 = (NEXTCHR_IS_EOS)
7113                     ? isWORDCHAR_LC('\n')
7114                     : isWORDCHAR_LC(nextbyte);
7115             }
7116             if (to_complement ^ (b1 == b2)) {
7117                 sayNO;
7118             }
7119             break;
7120         }
7121 
7122         case NBOUND:  /*  /\B/   */
7123             to_complement = 1;
7124             /* FALLTHROUGH */
7125 
7126         case BOUND:   /*  /\b/   */
7127             if (utf8_target) {
7128                 goto bound_utf8;
7129             }
7130             goto bound_ascii_match_only;
7131 
7132         case NBOUNDA: /*  /\B/a  */
7133             to_complement = 1;
7134             /* FALLTHROUGH */
7135 
7136         case BOUNDA:  /*  /\b/a  */
7137         {
7138             bool b1, b2;
7139 
7140           bound_ascii_match_only:
7141             /* Here the string isn't utf8, or is utf8 and only ascii characters
7142              * are to match \w.  In the latter case looking at the byte just
7143              * prior to the current one may be just the final byte of a
7144              * multi-byte character.  This is ok.  There are two cases:
7145              * 1) it is a single byte character, and then the test is doing
7146              *    just what it's supposed to.
7147              * 2) it is a multi-byte character, in which case the final byte is
7148              *    never mistakable for ASCII, and so the test will say it is
7149              *    not a word character, which is the correct answer. */
7150             b1 = (locinput == reginfo->strbeg)
7151                  ? isWORDCHAR_A('\n')
7152                  : isWORDCHAR_A(UCHARAT(locinput - 1));
7153             b2 = (NEXTCHR_IS_EOS)
7154                 ? isWORDCHAR_A('\n')
7155                 : isWORDCHAR_A(nextbyte);
7156             if (to_complement ^ (b1 == b2)) {
7157                 sayNO;
7158             }
7159             break;
7160         }
7161 
7162         case NBOUNDU: /*  /\B/u  */
7163             to_complement = 1;
7164             /* FALLTHROUGH */
7165 
7166         case BOUNDU:  /*  /\b/u  */
7167 
7168           boundu:
7169             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7170                 match = FALSE;
7171             }
7172             else if (utf8_target) {
7173               bound_utf8:
7174                 switch((bound_type) FLAGS(scan)) {
7175                     case TRADITIONAL_BOUND:
7176                     {
7177                         bool b1, b2;
7178                         if (locinput == reginfo->strbeg) {
7179                             b1 = 0 /* isWORDCHAR_L1('\n') */;
7180                         }
7181                         else {
7182                             U8 *p = reghop3((U8*)locinput, -1,
7183                                             (U8*)(reginfo->strbeg));
7184 
7185                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7186                         }
7187                         b2 = (NEXTCHR_IS_EOS)
7188                             ? 0 /* isWORDCHAR_L1('\n') */
7189                             : isWORDCHAR_utf8_safe((U8*)locinput,
7190                                                    (U8*) reginfo->strend);
7191                         match = cBOOL(b1 != b2);
7192                         break;
7193                     }
7194                     case GCB_BOUND:
7195                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7196                             match = TRUE; /* GCB always matches at begin and
7197                                              end */
7198                         }
7199                         else {
7200                             /* Find the gcb values of previous and current
7201                              * chars, then see if is a break point */
7202                             match = isGCB(getGCB_VAL_UTF8(
7203                                                 reghop3((U8*)locinput,
7204                                                         -1,
7205                                                         (U8*)(reginfo->strbeg)),
7206                                                 (U8*) reginfo->strend),
7207                                           getGCB_VAL_UTF8((U8*) locinput,
7208                                                         (U8*) reginfo->strend),
7209                                           (U8*) reginfo->strbeg,
7210                                           (U8*) locinput,
7211                                           utf8_target);
7212                         }
7213                         break;
7214 
7215                     case LB_BOUND:
7216                         if (locinput == reginfo->strbeg) {
7217                             match = FALSE;
7218                         }
7219                         else if (NEXTCHR_IS_EOS) {
7220                             match = TRUE;
7221                         }
7222                         else {
7223                             match = isLB(getLB_VAL_UTF8(
7224                                                 reghop3((U8*)locinput,
7225                                                         -1,
7226                                                         (U8*)(reginfo->strbeg)),
7227                                                 (U8*) reginfo->strend),
7228                                           getLB_VAL_UTF8((U8*) locinput,
7229                                                         (U8*) reginfo->strend),
7230                                           (U8*) reginfo->strbeg,
7231                                           (U8*) locinput,
7232                                           (U8*) reginfo->strend,
7233                                           utf8_target);
7234                         }
7235                         break;
7236 
7237                     case SB_BOUND: /* Always matches at begin and end */
7238                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7239                             match = TRUE;
7240                         }
7241                         else {
7242                             match = isSB(getSB_VAL_UTF8(
7243                                                 reghop3((U8*)locinput,
7244                                                         -1,
7245                                                         (U8*)(reginfo->strbeg)),
7246                                                 (U8*) reginfo->strend),
7247                                           getSB_VAL_UTF8((U8*) locinput,
7248                                                         (U8*) reginfo->strend),
7249                                           (U8*) reginfo->strbeg,
7250                                           (U8*) locinput,
7251                                           (U8*) reginfo->strend,
7252                                           utf8_target);
7253                         }
7254                         break;
7255 
7256                     case WB_BOUND:
7257                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7258                             match = TRUE;
7259                         }
7260                         else {
7261                             match = isWB(WB_UNKNOWN,
7262                                          getWB_VAL_UTF8(
7263                                                 reghop3((U8*)locinput,
7264                                                         -1,
7265                                                         (U8*)(reginfo->strbeg)),
7266                                                 (U8*) reginfo->strend),
7267                                           getWB_VAL_UTF8((U8*) locinput,
7268                                                         (U8*) reginfo->strend),
7269                                           (U8*) reginfo->strbeg,
7270                                           (U8*) locinput,
7271                                           (U8*) reginfo->strend,
7272                                           utf8_target);
7273                         }
7274                         break;
7275                 }
7276             }
7277             else {  /* Not utf8 target */
7278                 switch((bound_type) FLAGS(scan)) {
7279                     case TRADITIONAL_BOUND:
7280                     {
7281                         bool b1, b2;
7282                         b1 = (locinput == reginfo->strbeg)
7283                             ? 0 /* isWORDCHAR_L1('\n') */
7284                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
7285                         b2 = (NEXTCHR_IS_EOS)
7286                             ? 0 /* isWORDCHAR_L1('\n') */
7287                             : isWORDCHAR_L1(nextbyte);
7288                         match = cBOOL(b1 != b2);
7289                         break;
7290                     }
7291 
7292                     case GCB_BOUND:
7293                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7294                             match = TRUE; /* GCB always matches at begin and
7295                                              end */
7296                         }
7297                         else {  /* Only CR-LF combo isn't a GCB in 0-255
7298                                    range */
7299                             match =    UCHARAT(locinput - 1) != '\r'
7300                                     || UCHARAT(locinput) != '\n';
7301                         }
7302                         break;
7303 
7304                     case LB_BOUND:
7305                         if (locinput == reginfo->strbeg) {
7306                             match = FALSE;
7307                         }
7308                         else if (NEXTCHR_IS_EOS) {
7309                             match = TRUE;
7310                         }
7311                         else {
7312                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7313                                          getLB_VAL_CP(UCHARAT(locinput)),
7314                                          (U8*) reginfo->strbeg,
7315                                          (U8*) locinput,
7316                                          (U8*) reginfo->strend,
7317                                          utf8_target);
7318                         }
7319                         break;
7320 
7321                     case SB_BOUND: /* Always matches at begin and end */
7322                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7323                             match = TRUE;
7324                         }
7325                         else {
7326                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7327                                          getSB_VAL_CP(UCHARAT(locinput)),
7328                                          (U8*) reginfo->strbeg,
7329                                          (U8*) locinput,
7330                                          (U8*) reginfo->strend,
7331                                          utf8_target);
7332                         }
7333                         break;
7334 
7335                     case WB_BOUND:
7336                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7337                             match = TRUE;
7338                         }
7339                         else {
7340                             match = isWB(WB_UNKNOWN,
7341                                          getWB_VAL_CP(UCHARAT(locinput -1)),
7342                                          getWB_VAL_CP(UCHARAT(locinput)),
7343                                          (U8*) reginfo->strbeg,
7344                                          (U8*) locinput,
7345                                          (U8*) reginfo->strend,
7346                                          utf8_target);
7347                         }
7348                         break;
7349                 }
7350             }
7351 
7352             if (to_complement ^ ! match) {
7353                 sayNO;
7354             }
7355             break;
7356 
7357         case ANYOFPOSIXL:
7358         case ANYOFL:  /*  /[abc]/l      */
7359             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7360             CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7361 
7362             /* FALLTHROUGH */
7363         case ANYOFD:  /*   /[abc]/d       */
7364         case ANYOF:  /*   /[abc]/       */
7365             if (NEXTCHR_IS_EOS || locinput >= loceol)
7366                 sayNO;
7367             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7368                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7369             {
7370                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7371                     sayNO;
7372                 }
7373                 locinput++;
7374             }
7375             else {
7376                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7377                                                                    utf8_target))
7378                 {
7379                     sayNO;
7380                 }
7381                 goto increment_locinput;
7382             }
7383             break;
7384 
7385         case ANYOFM:
7386             if (   NEXTCHR_IS_EOS
7387                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7388                 || locinput >= loceol)
7389             {
7390                 sayNO;
7391             }
7392             locinput++; /* ANYOFM is always single byte */
7393             break;
7394 
7395         case NANYOFM:
7396             if (   NEXTCHR_IS_EOS
7397                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7398                 || locinput >= loceol)
7399             {
7400                 sayNO;
7401             }
7402             goto increment_locinput;
7403             break;
7404 
7405         case ANYOFH:
7406             if (   ! utf8_target
7407                 ||   NEXTCHR_IS_EOS
7408                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7409                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7410                                                                    utf8_target))
7411             {
7412                 sayNO;
7413             }
7414             goto increment_locinput;
7415             break;
7416 
7417         case ANYOFHb:
7418             if (   ! utf8_target
7419                 ||   NEXTCHR_IS_EOS
7420                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
7421                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7422                                                                   utf8_target))
7423             {
7424                 sayNO;
7425             }
7426             goto increment_locinput;
7427             break;
7428 
7429         case ANYOFHr:
7430             if (   ! utf8_target
7431                 ||   NEXTCHR_IS_EOS
7432                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7433                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7434                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7435                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7436                                                                    utf8_target))
7437             {
7438                 sayNO;
7439             }
7440             goto increment_locinput;
7441             break;
7442 
7443         case ANYOFHs:
7444             if (   ! utf8_target
7445                 ||   NEXTCHR_IS_EOS
7446                 ||   loceol - locinput < FLAGS(scan)
7447                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7448                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7449                                                                    utf8_target))
7450             {
7451                 sayNO;
7452             }
7453             goto increment_locinput;
7454             break;
7455 
7456         case ANYOFR:
7457             if (NEXTCHR_IS_EOS) {
7458                 sayNO;
7459             }
7460 
7461             if (utf8_target) {
7462                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7463                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7464                                                 (U8 *) reginfo->strend,
7465                                                 NULL),
7466                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7467                 {
7468                     sayNO;
7469                 }
7470             }
7471             else {
7472                 if (! withinCOUNT((U8) *locinput,
7473                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7474                 {
7475                     sayNO;
7476                 }
7477             }
7478             goto increment_locinput;
7479             break;
7480 
7481         case ANYOFRb:
7482             if (NEXTCHR_IS_EOS) {
7483                 sayNO;
7484             }
7485 
7486             if (utf8_target) {
7487                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
7488                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7489                                                 (U8 *) reginfo->strend,
7490                                                 NULL),
7491                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
7492                 {
7493                     sayNO;
7494                 }
7495             }
7496             else {
7497                 if (! withinCOUNT((U8) *locinput,
7498                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7499                 {
7500                     sayNO;
7501                 }
7502             }
7503             goto increment_locinput;
7504             break;
7505 
7506         /* The argument (FLAGS) to all the POSIX node types is the class number
7507          * */
7508 
7509         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7510             to_complement = 1;
7511             /* FALLTHROUGH */
7512 
7513         case POSIXL:    /* \w or [:punct:] etc. under /l */
7514             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7515             if (NEXTCHR_IS_EOS || locinput >= loceol)
7516                 sayNO;
7517 
7518             /* Use isFOO_lc() for characters within Latin1.  (Note that
7519              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7520              * wouldn't be invariant) */
7521             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7522                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7523                     sayNO;
7524                 }
7525 
7526                 locinput++;
7527                 break;
7528             }
7529 
7530             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7531                 /* An above Latin-1 code point, or malformed */
7532                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7533                                                        reginfo->strend);
7534                 goto utf8_posix_above_latin1;
7535             }
7536 
7537             /* Here is a UTF-8 variant code point below 256 and the target is
7538              * UTF-8 */
7539             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7540                                             EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7541                                             *(locinput + 1))))))
7542             {
7543                 sayNO;
7544             }
7545 
7546             goto increment_locinput;
7547 
7548         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7549             to_complement = 1;
7550             /* FALLTHROUGH */
7551 
7552         case POSIXD:    /* \w or [:punct:] etc. under /d */
7553             if (utf8_target) {
7554                 goto utf8_posix;
7555             }
7556             goto posixa;
7557 
7558         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7559 
7560             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7561                 sayNO;
7562             }
7563 
7564             /* All UTF-8 variants match */
7565             if (! UTF8_IS_INVARIANT(nextbyte)) {
7566                 goto increment_locinput;
7567             }
7568 
7569             to_complement = 1;
7570             goto join_nposixa;
7571 
7572         case POSIXA:    /* \w or [:punct:] etc. under /a */
7573 
7574           posixa:
7575             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7576              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7577              * character is a single byte */
7578 
7579             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7580                 sayNO;
7581             }
7582 
7583           join_nposixa:
7584 
7585             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7586                                                                 FLAGS(scan)))))
7587             {
7588                 sayNO;
7589             }
7590 
7591             /* Here we are either not in utf8, or we matched a utf8-invariant,
7592              * so the next char is the next byte */
7593             locinput++;
7594             break;
7595 
7596         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7597             to_complement = 1;
7598             /* FALLTHROUGH */
7599 
7600         case POSIXU:    /* \w or [:punct:] etc. under /u */
7601           utf8_posix:
7602             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7603                 sayNO;
7604             }
7605 
7606             /* Use _generic_isCC() for characters within Latin1.  (Note that
7607              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7608              * wouldn't be invariant) */
7609             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7610                 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7611                                                            FLAGS(scan)))))
7612                 {
7613                     sayNO;
7614                 }
7615                 locinput++;
7616             }
7617             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7618                 if (! (to_complement
7619                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7620                                                                *(locinput + 1)),
7621                                              FLAGS(scan)))))
7622                 {
7623                     sayNO;
7624                 }
7625                 locinput += 2;
7626             }
7627             else {  /* Handle above Latin-1 code points */
7628               utf8_posix_above_latin1:
7629                 classnum = (_char_class_number) FLAGS(scan);
7630                 switch (classnum) {
7631                     default:
7632                         if (! (to_complement
7633                            ^ cBOOL(_invlist_contains_cp(
7634                                       PL_XPosix_ptrs[classnum],
7635                                       utf8_to_uvchr_buf((U8 *) locinput,
7636                                                         (U8 *) reginfo->strend,
7637                                                         NULL)))))
7638                         {
7639                             sayNO;
7640                         }
7641                         break;
7642                     case _CC_ENUM_SPACE:
7643                         if (! (to_complement
7644                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7645                         {
7646                             sayNO;
7647                         }
7648                         break;
7649                     case _CC_ENUM_BLANK:
7650                         if (! (to_complement
7651                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7652                         {
7653                             sayNO;
7654                         }
7655                         break;
7656                     case _CC_ENUM_XDIGIT:
7657                         if (! (to_complement
7658                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7659                         {
7660                             sayNO;
7661                         }
7662                         break;
7663                     case _CC_ENUM_VERTSPACE:
7664                         if (! (to_complement
7665                                         ^ cBOOL(is_VERTWS_high(locinput))))
7666                         {
7667                             sayNO;
7668                         }
7669                         break;
7670                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
7671                     case _CC_ENUM_ASCII:
7672                         if (! to_complement) {
7673                             sayNO;
7674                         }
7675                         break;
7676                 }
7677                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7678             }
7679             break;
7680 
7681         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7682                        a Unicode extended Grapheme Cluster */
7683             if (NEXTCHR_IS_EOS || locinput >= loceol)
7684                 sayNO;
7685             if  (! utf8_target) {
7686 
7687                 /* Match either CR LF  or '.', as all the other possibilities
7688                  * require utf8 */
7689                 locinput++;	    /* Match the . or CR */
7690                 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7691                                        match the LF */
7692                     && locinput <  loceol
7693                     && UCHARAT(locinput) == '\n')
7694                 {
7695                     locinput++;
7696                 }
7697             }
7698             else {
7699 
7700                 /* Get the gcb type for the current character */
7701                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7702                                                        (U8*) reginfo->strend);
7703 
7704                 /* Then scan through the input until we get to the first
7705                  * character whose type is supposed to be a gcb with the
7706                  * current character.  (There is always a break at the
7707                  * end-of-input) */
7708                 locinput += UTF8SKIP(locinput);
7709                 while (locinput < loceol) {
7710                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7711                                                          (U8*) reginfo->strend);
7712                     if (isGCB(prev_gcb, cur_gcb,
7713                               (U8*) reginfo->strbeg, (U8*) locinput,
7714                               utf8_target))
7715                     {
7716                         break;
7717                     }
7718 
7719                     prev_gcb = cur_gcb;
7720                     locinput += UTF8SKIP(locinput);
7721                 }
7722 
7723 
7724             }
7725             break;
7726 
7727         case REFFLN:  /*  /\g{name}/il  */
7728         {   /* The capture buffer cases.  The ones beginning with N for the
7729                named buffers just convert to the equivalent numbered and
7730                pretend they were called as the corresponding numbered buffer
7731                op.  */
7732             /* don't initialize these in the declaration, it makes C++
7733                unhappy */
7734             const char *s;
7735             char type;
7736             re_fold_t folder;
7737             const U8 *fold_array;
7738             UV utf8_fold_flags;
7739 
7740             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7741             folder = foldEQ_locale;
7742             fold_array = PL_fold_locale;
7743             type = REFFL;
7744             utf8_fold_flags = FOLDEQ_LOCALE;
7745             goto do_nref;
7746 
7747         case REFFAN:  /*  /\g{name}/iaa  */
7748             folder = foldEQ_latin1;
7749             fold_array = PL_fold_latin1;
7750             type = REFFA;
7751             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7752             goto do_nref;
7753 
7754         case REFFUN:  /*  /\g{name}/iu  */
7755             folder = foldEQ_latin1;
7756             fold_array = PL_fold_latin1;
7757             type = REFFU;
7758             utf8_fold_flags = 0;
7759             goto do_nref;
7760 
7761         case REFFN:  /*  /\g{name}/i  */
7762             folder = foldEQ;
7763             fold_array = PL_fold;
7764             type = REFF;
7765             utf8_fold_flags = 0;
7766             goto do_nref;
7767 
7768         case REFN:  /*  /\g{name}/   */
7769             type = REF;
7770             folder = NULL;
7771             fold_array = NULL;
7772             utf8_fold_flags = 0;
7773           do_nref:
7774 
7775             /* For the named back references, find the corresponding buffer
7776              * number */
7777             n = reg_check_named_buff_matched(rex,scan);
7778 
7779             if ( ! n ) {
7780                 sayNO;
7781             }
7782             goto do_nref_ref_common;
7783 
7784         case REFFL:  /*  /\1/il  */
7785             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7786             folder = foldEQ_locale;
7787             fold_array = PL_fold_locale;
7788             utf8_fold_flags = FOLDEQ_LOCALE;
7789             goto do_ref;
7790 
7791         case REFFA:  /*  /\1/iaa  */
7792             folder = foldEQ_latin1;
7793             fold_array = PL_fold_latin1;
7794             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7795             goto do_ref;
7796 
7797         case REFFU:  /*  /\1/iu  */
7798             folder = foldEQ_latin1;
7799             fold_array = PL_fold_latin1;
7800             utf8_fold_flags = 0;
7801             goto do_ref;
7802 
7803         case REFF:  /*  /\1/i  */
7804             folder = foldEQ;
7805             fold_array = PL_fold;
7806             utf8_fold_flags = 0;
7807             goto do_ref;
7808 
7809         case REF:  /*  /\1/    */
7810             folder = NULL;
7811             fold_array = NULL;
7812             utf8_fold_flags = 0;
7813 
7814           do_ref:
7815             type = OP(scan);
7816             n = ARG(scan);  /* which paren pair */
7817 
7818           do_nref_ref_common:
7819             ln = rex->offs[n].start;
7820             endref = rex->offs[n].end;
7821             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7822             if (rex->lastparen < n || ln == -1 || endref == -1)
7823                 sayNO;			/* Do not match unless seen CLOSEn. */
7824             if (ln == endref)
7825                 break;
7826 
7827             s = reginfo->strbeg + ln;
7828             if (type != REF	/* REF can do byte comparison */
7829                 && (utf8_target || type == REFFU || type == REFFL))
7830             {
7831                 char * limit = loceol;
7832 
7833                 /* This call case insensitively compares the entire buffer
7834                     * at s, with the current input starting at locinput, but
7835                     * not going off the end given by loceol, and
7836                     * returns in <limit> upon success, how much of the
7837                     * current input was matched */
7838                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7839                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7840                 {
7841                     sayNO;
7842                 }
7843                 locinput = limit;
7844                 break;
7845             }
7846 
7847             /* Not utf8:  Inline the first character, for speed. */
7848             if ( ! NEXTCHR_IS_EOS
7849                 && locinput < loceol
7850                 && UCHARAT(s) != nextbyte
7851                 && (   type == REF
7852                     || UCHARAT(s) != fold_array[nextbyte]))
7853             {
7854                 sayNO;
7855             }
7856             ln = endref - ln;
7857             if (locinput + ln > loceol)
7858                 sayNO;
7859             if (ln > 1 && (type == REF
7860                            ? memNE(s, locinput, ln)
7861                            : ! folder(locinput, s, ln)))
7862                 sayNO;
7863             locinput += ln;
7864             break;
7865         }
7866 
7867         case NOTHING: /* null op; e.g. the 'nothing' following
7868                        * the '*' in m{(a+|b)*}' */
7869             break;
7870         case TAIL: /* placeholder while compiling (A|B|C) */
7871             break;
7872 
7873 #undef  ST
7874 #define ST st->u.eval
7875 #define CUR_EVAL cur_eval->u.eval
7876 
7877         {
7878             SV *ret;
7879             REGEXP *re_sv;
7880             regexp *re;
7881             regexp_internal *rei;
7882             regnode *startpoint;
7883             U32 arg;
7884 
7885         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7886             arg= (U32)ARG(scan);
7887             if (cur_eval && cur_eval->locinput == locinput) {
7888                 if ( ++nochange_depth > max_nochange_depth )
7889                     Perl_croak(aTHX_
7890                         "Pattern subroutine nesting without pos change"
7891                         " exceeded limit in regex");
7892             } else {
7893                 nochange_depth = 0;
7894             }
7895             re_sv = rex_sv;
7896             re = rex;
7897             rei = rexi;
7898             startpoint = scan + ARG2L(scan);
7899             EVAL_CLOSE_PAREN_SET( st, arg );
7900             /* Detect infinite recursion
7901              *
7902              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7903              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7904              * So we track the position in the string we are at each time
7905              * we recurse and if we try to enter the same routine twice from
7906              * the same position we throw an error.
7907              */
7908             if ( rex->recurse_locinput[arg] == locinput ) {
7909                 /* FIXME: we should show the regop that is failing as part
7910                  * of the error message. */
7911                 Perl_croak(aTHX_ "Infinite recursion in regex");
7912             } else {
7913                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7914                 rex->recurse_locinput[arg]= locinput;
7915 
7916                 DEBUG_r({
7917                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918                     DEBUG_STACK_r({
7919                         Perl_re_exec_indentf( aTHX_
7920                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7921                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7922                         );
7923                     });
7924                 });
7925             }
7926 
7927             /* Save all the positions seen so far. */
7928             ST.cp = regcppush(rex, 0, maxopenparen);
7929             REGCP_SET(ST.lastcp);
7930 
7931             /* and then jump to the code we share with EVAL */
7932             goto eval_recurse_doit;
7933             /* NOTREACHED */
7934 
7935         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7936             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7937                 if ( ++nochange_depth > max_nochange_depth )
7938                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7939             } else {
7940                 nochange_depth = 0;
7941             }
7942             {
7943                 /* execute the code in the {...} */
7944 
7945                 dSP;
7946                 IV before;
7947                 OP * const oop = PL_op;
7948                 COP * const ocurcop = PL_curcop;
7949                 OP *nop;
7950                 CV *newcv;
7951 
7952                 /* save *all* paren positions */
7953                 regcppush(rex, 0, maxopenparen);
7954                 REGCP_SET(ST.lastcp);
7955 
7956                 if (!caller_cv)
7957                     caller_cv = find_runcv(NULL);
7958 
7959                 n = ARG(scan);
7960 
7961                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7962                     newcv = (ReANY(
7963                                     (REGEXP*)(rexi->data->data[n])
7964                             ))->qr_anoncv;
7965                     nop = (OP*)rexi->data->data[n+1];
7966                 }
7967                 else if (rexi->data->what[n] == 'l') { /* literal code */
7968                     newcv = caller_cv;
7969                     nop = (OP*)rexi->data->data[n];
7970                     assert(CvDEPTH(newcv));
7971                 }
7972                 else {
7973                     /* literal with own CV */
7974                     assert(rexi->data->what[n] == 'L');
7975                     newcv = rex->qr_anoncv;
7976                     nop = (OP*)rexi->data->data[n];
7977                 }
7978 
7979                 /* Some notes about MULTICALL and the context and save stacks.
7980                  *
7981                  * In something like
7982                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7983                  * since codeblocks don't introduce a new scope (so that
7984                  * local() etc accumulate), at the end of a successful
7985                  * match there will be a SAVEt_CLEARSV on the savestack
7986                  * for each of $x, $y, $z. If the three code blocks above
7987                  * happen to have come from different CVs (e.g. via
7988                  * embedded qr//s), then we must ensure that during any
7989                  * savestack unwinding, PL_comppad always points to the
7990                  * right pad at each moment. We achieve this by
7991                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7992                  * there is a change of pad.
7993                  * In theory whenever we call a code block, we should
7994                  * push a CXt_SUB context, then pop it on return from
7995                  * that code block. This causes a bit of an issue in that
7996                  * normally popping a context also clears the savestack
7997                  * back to cx->blk_oldsaveix, but here we specifically
7998                  * don't want to clear the save stack on exit from the
7999                  * code block.
8000                  * Also for efficiency we don't want to keep pushing and
8001                  * popping the single SUB context as we backtrack etc.
8002                  * So instead, we push a single context the first time
8003                  * we need, it, then hang onto it until the end of this
8004                  * function. Whenever we encounter a new code block, we
8005                  * update the CV etc if that's changed. During the times
8006                  * in this function where we're not executing a code
8007                  * block, having the SUB context still there is a bit
8008                  * naughty - but we hope that no-one notices.
8009                  * When the SUB context is initially pushed, we fake up
8010                  * cx->blk_oldsaveix to be as if we'd pushed this context
8011                  * on first entry to S_regmatch rather than at some random
8012                  * point during the regexe execution. That way if we
8013                  * croak, popping the context stack will ensure that
8014                  * *everything* SAVEd by this function is undone and then
8015                  * the context popped, rather than e.g., popping the
8016                  * context (and restoring the original PL_comppad) then
8017                  * popping more of the savestack and restoring a bad
8018                  * PL_comppad.
8019                  */
8020 
8021                 /* If this is the first EVAL, push a MULTICALL. On
8022                  * subsequent calls, if we're executing a different CV, or
8023                  * if PL_comppad has got messed up from backtracking
8024                  * through SAVECOMPPADs, then refresh the context.
8025                  */
8026                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8027                 {
8028                     U8 flags = (CXp_SUB_RE |
8029                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8030                     SAVECOMPPAD();
8031                     if (last_pushed_cv) {
8032                         CHANGE_MULTICALL_FLAGS(newcv, flags);
8033                     }
8034                     else {
8035                         PUSH_MULTICALL_FLAGS(newcv, flags);
8036                     }
8037                     /* see notes above */
8038                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8039 
8040                     last_pushed_cv = newcv;
8041                 }
8042                 else {
8043                     /* these assignments are just to silence compiler
8044                      * warnings */
8045                     multicall_cop = NULL;
8046                 }
8047                 last_pad = PL_comppad;
8048 
8049                 /* the initial nextstate you would normally execute
8050                  * at the start of an eval (which would cause error
8051                  * messages to come from the eval), may be optimised
8052                  * away from the execution path in the regex code blocks;
8053                  * so manually set PL_curcop to it initially */
8054                 {
8055                     OP *o = cUNOPx(nop)->op_first;
8056                     assert(o->op_type == OP_NULL);
8057                     if (o->op_targ == OP_SCOPE) {
8058                         o = cUNOPo->op_first;
8059                     }
8060                     else {
8061                         assert(o->op_targ == OP_LEAVE);
8062                         o = cUNOPo->op_first;
8063                         assert(o->op_type == OP_ENTER);
8064                         o = OpSIBLING(o);
8065                     }
8066 
8067                     if (o->op_type != OP_STUB) {
8068                         assert(    o->op_type == OP_NEXTSTATE
8069                                 || o->op_type == OP_DBSTATE
8070                                 || (o->op_type == OP_NULL
8071                                     &&  (  o->op_targ == OP_NEXTSTATE
8072                                         || o->op_targ == OP_DBSTATE
8073                                         )
8074                                     )
8075                         );
8076                         PL_curcop = (COP*)o;
8077                     }
8078                 }
8079                 nop = nop->op_next;
8080 
8081                 DEBUG_STATE_r( Perl_re_printf( aTHX_
8082                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8083 
8084                 rex->offs[0].end = locinput - reginfo->strbeg;
8085                 if (reginfo->info_aux_eval->pos_magic)
8086                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8087                                   reginfo->sv, reginfo->strbeg,
8088                                   locinput - reginfo->strbeg);
8089 
8090                 if (sv_yes_mark) {
8091                     SV *sv_mrk = get_sv("REGMARK", 1);
8092                     sv_setsv(sv_mrk, sv_yes_mark);
8093                 }
8094 
8095                 /* we don't use MULTICALL here as we want to call the
8096                  * first op of the block of interest, rather than the
8097                  * first op of the sub. Also, we don't want to free
8098                  * the savestack frame */
8099                 before = (IV)(SP-PL_stack_base);
8100                 PL_op = nop;
8101                 CALLRUNOPS(aTHX);			/* Scalar context. */
8102                 SPAGAIN;
8103                 if ((IV)(SP-PL_stack_base) == before)
8104                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8105                 else {
8106                     ret = POPs;
8107                     PUTBACK;
8108                 }
8109 
8110                 /* before restoring everything, evaluate the returned
8111                  * value, so that 'uninit' warnings don't use the wrong
8112                  * PL_op or pad. Also need to process any magic vars
8113                  * (e.g. $1) *before* parentheses are restored */
8114 
8115                 PL_op = NULL;
8116 
8117                 re_sv = NULL;
8118                 if (logical == 0) {       /*   (?{})/   */
8119                     SV *replsv = save_scalar(PL_replgv);
8120                     sv_setsv(replsv, ret); /* $^R */
8121                     SvSETMAGIC(replsv);
8122                 }
8123                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8124                     sw = cBOOL(SvTRUE_NN(ret));
8125                     logical = 0;
8126                 }
8127                 else {                   /*  /(??{})  */
8128                     /*  if its overloaded, let the regex compiler handle
8129                      *  it; otherwise extract regex, or stringify  */
8130                     if (SvGMAGICAL(ret))
8131                         ret = sv_mortalcopy(ret);
8132                     if (!SvAMAGIC(ret)) {
8133                         SV *sv = ret;
8134                         if (SvROK(sv))
8135                             sv = SvRV(sv);
8136                         if (SvTYPE(sv) == SVt_REGEXP)
8137                             re_sv = (REGEXP*) sv;
8138                         else if (SvSMAGICAL(ret)) {
8139                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8140                             if (mg)
8141                                 re_sv = (REGEXP *) mg->mg_obj;
8142                         }
8143 
8144                         /* force any undef warnings here */
8145                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8146                             ret = sv_mortalcopy(ret);
8147                             (void) SvPV_force_nolen(ret);
8148                         }
8149                     }
8150 
8151                 }
8152 
8153                 /* *** Note that at this point we don't restore
8154                  * PL_comppad, (or pop the CxSUB) on the assumption it may
8155                  * be used again soon. This is safe as long as nothing
8156                  * in the regexp code uses the pad ! */
8157                 PL_op = oop;
8158                 PL_curcop = ocurcop;
8159                 regcp_restore(rex, ST.lastcp, &maxopenparen);
8160                 PL_curpm_under = PL_curpm;
8161                 PL_curpm = PL_reg_curpm;
8162 
8163                 if (logical != 2) {
8164                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8165                                     script_run_begin);
8166                     /* NOTREACHED */
8167                 }
8168             }
8169 
8170                 /* only /(??{})/  from now on */
8171                 logical = 0;
8172                 {
8173                     /* extract RE object from returned value; compiling if
8174                      * necessary */
8175 
8176                     if (re_sv) {
8177                         re_sv = reg_temp_copy(NULL, re_sv);
8178                     }
8179                     else {
8180                         U32 pm_flags = 0;
8181 
8182                         if (SvUTF8(ret) && IN_BYTES) {
8183                             /* In use 'bytes': make a copy of the octet
8184                              * sequence, but without the flag on */
8185                             STRLEN len;
8186                             const char *const p = SvPV(ret, len);
8187                             ret = newSVpvn_flags(p, len, SVs_TEMP);
8188                         }
8189                         if (rex->intflags & PREGf_USE_RE_EVAL)
8190                             pm_flags |= PMf_USE_RE_EVAL;
8191 
8192                         /* if we got here, it should be an engine which
8193                          * supports compiling code blocks and stuff */
8194                         assert(rex->engine && rex->engine->op_comp);
8195                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8196                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8197                                     rex->engine, NULL, NULL,
8198                                     /* copy /msixn etc to inner pattern */
8199                                     ARG2L(scan),
8200                                     pm_flags);
8201 
8202                         if (!(SvFLAGS(ret)
8203                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
8204                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8205                             /* This isn't a first class regexp. Instead, it's
8206                                caching a regexp onto an existing, Perl visible
8207                                scalar.  */
8208                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8209                         }
8210                     }
8211                     SAVEFREESV(re_sv);
8212                     re = ReANY(re_sv);
8213                 }
8214                 RXp_MATCH_COPIED_off(re);
8215                 re->subbeg = rex->subbeg;
8216                 re->sublen = rex->sublen;
8217                 re->suboffset = rex->suboffset;
8218                 re->subcoffset = rex->subcoffset;
8219                 re->lastparen = 0;
8220                 re->lastcloseparen = 0;
8221                 rei = RXi_GET(re);
8222                 DEBUG_EXECUTE_r(
8223                     debug_start_match(re_sv, utf8_target, locinput,
8224                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
8225                 );
8226                 startpoint = rei->program + 1;
8227                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8228                                              * close_paren only for GOSUB */
8229                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8230                 /* Save all the seen positions so far. */
8231                 ST.cp = regcppush(rex, 0, maxopenparen);
8232                 REGCP_SET(ST.lastcp);
8233                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8234                 maxopenparen = 0;
8235                 /* run the pattern returned from (??{...}) */
8236 
8237               eval_recurse_doit: /* Share code with GOSUB below this line
8238                             * At this point we expect the stack context to be
8239                             * set up correctly */
8240 
8241                 /* invalidate the S-L poscache. We're now executing a
8242                  * different set of WHILEM ops (and their associated
8243                  * indexes) against the same string, so the bits in the
8244                  * cache are meaningless. Setting maxiter to zero forces
8245                  * the cache to be invalidated and zeroed before reuse.
8246                  * XXX This is too dramatic a measure. Ideally we should
8247                  * save the old cache and restore when running the outer
8248                  * pattern again */
8249                 reginfo->poscache_maxiter = 0;
8250 
8251                 /* the new regexp might have a different is_utf8_pat than we do */
8252                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8253 
8254                 ST.prev_rex = rex_sv;
8255                 ST.prev_curlyx = cur_curlyx;
8256                 rex_sv = re_sv;
8257                 SET_reg_curpm(rex_sv);
8258                 rex = re;
8259                 rexi = rei;
8260                 cur_curlyx = NULL;
8261                 ST.B = next;
8262                 ST.prev_eval = cur_eval;
8263                 cur_eval = st;
8264                 /* now continue from first node in postoned RE */
8265                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8266                                     loceol, script_run_begin);
8267                 NOT_REACHED; /* NOTREACHED */
8268         }
8269 
8270         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8271             /* note: this is called twice; first after popping B, then A */
8272             DEBUG_STACK_r({
8273                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8274                     depth, cur_eval, ST.prev_eval);
8275             });
8276 
8277 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8278             if ( cur_eval && CUR_EVAL.close_paren ) {\
8279                 DEBUG_STACK_r({ \
8280                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8281                         depth,    \
8282                         CUR_EVAL.close_paren - 1,\
8283                         cur_eval, \
8284                         VAL);     \
8285                 });               \
8286                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8287             }
8288 
8289             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8290 
8291             rex_sv = ST.prev_rex;
8292             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8293             SET_reg_curpm(rex_sv);
8294             rex = ReANY(rex_sv);
8295             rexi = RXi_GET(rex);
8296             {
8297                 /* preserve $^R across LEAVE's. See Bug 121070. */
8298                 SV *save_sv= GvSV(PL_replgv);
8299                 SV *replsv;
8300                 SvREFCNT_inc(save_sv);
8301                 regcpblow(ST.cp); /* LEAVE in disguise */
8302                 /* don't move this initialization up */
8303                 replsv = GvSV(PL_replgv);
8304                 sv_setsv(replsv, save_sv);
8305                 SvSETMAGIC(replsv);
8306                 SvREFCNT_dec(save_sv);
8307             }
8308             cur_eval = ST.prev_eval;
8309             cur_curlyx = ST.prev_curlyx;
8310 
8311             /* Invalidate cache. See "invalidate" comment above. */
8312             reginfo->poscache_maxiter = 0;
8313             if ( nochange_depth )
8314                 nochange_depth--;
8315 
8316             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8317             sayYES;
8318 
8319 
8320         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8321             REGCP_UNWIND(ST.lastcp);
8322             sayNO;
8323 
8324         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8325             /* note: this is called twice; first after popping B, then A */
8326             DEBUG_STACK_r({
8327                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8328                     depth, cur_eval, ST.prev_eval);
8329             });
8330 
8331             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8332 
8333             rex_sv = ST.prev_rex;
8334             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8335             SET_reg_curpm(rex_sv);
8336             rex = ReANY(rex_sv);
8337             rexi = RXi_GET(rex);
8338 
8339             REGCP_UNWIND(ST.lastcp);
8340             regcppop(rex, &maxopenparen);
8341             cur_eval = ST.prev_eval;
8342             cur_curlyx = ST.prev_curlyx;
8343 
8344             /* Invalidate cache. See "invalidate" comment above. */
8345             reginfo->poscache_maxiter = 0;
8346             if ( nochange_depth )
8347                 nochange_depth--;
8348 
8349             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8350             sayNO_SILENT;
8351 #undef ST
8352 
8353         case OPEN: /*  (  */
8354             n = ARG(scan);  /* which paren pair */
8355             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8356             if (n > maxopenparen)
8357                 maxopenparen = n;
8358             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8359                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8360                 depth,
8361                 PTR2UV(rex),
8362                 PTR2UV(rex->offs),
8363                 (UV)n,
8364                 (IV)rex->offs[n].start_tmp,
8365                 (UV)maxopenparen
8366             ));
8367             lastopen = n;
8368             break;
8369 
8370         case SROPEN: /*  (*SCRIPT_RUN:  */
8371             script_run_begin = (U8 *) locinput;
8372             break;
8373 
8374 
8375         case CLOSE:  /*  )  */
8376             n = ARG(scan);  /* which paren pair */
8377             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8378                              locinput - reginfo->strbeg);
8379             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8380                 goto fake_end;
8381 
8382             break;
8383 
8384         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8385 
8386             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8387             {
8388                 sayNO;
8389             }
8390 
8391             break;
8392 
8393 
8394         case ACCEPT:  /*  (*ACCEPT)  */
8395             if (scan->flags)
8396                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8397             if (ARG2L(scan)){
8398                 regnode *cursor;
8399                 for (cursor=scan;
8400                      cursor && OP(cursor)!=END;
8401                      cursor=regnext(cursor))
8402                 {
8403                     if ( OP(cursor)==CLOSE ){
8404                         n = ARG(cursor);
8405                         if ( n <= lastopen ) {
8406                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8407                                              locinput - reginfo->strbeg);
8408                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8409                                 break;
8410                         }
8411                     }
8412                 }
8413             }
8414             goto fake_end;
8415             /* NOTREACHED */
8416 
8417         case GROUPP:  /*  (?(1))  */
8418             n = ARG(scan);  /* which paren pair */
8419             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8420             break;
8421 
8422         case GROUPPN:  /*  (?(<name>))  */
8423             /* reg_check_named_buff_matched returns 0 for no match */
8424             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8425             break;
8426 
8427         case INSUBP:   /*  (?(R))  */
8428             n = ARG(scan);
8429             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8430              * of SCAN is already set up as matches a eval.close_paren */
8431             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8432             break;
8433 
8434         case DEFINEP:  /*  (?(DEFINE))  */
8435             sw = 0;
8436             break;
8437 
8438         case IFTHEN:   /*  (?(cond)A|B)  */
8439             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8440             if (sw)
8441                 next = NEXTOPER(NEXTOPER(scan));
8442             else {
8443                 next = scan + ARG(scan);
8444                 if (OP(next) == IFTHEN) /* Fake one. */
8445                     next = NEXTOPER(NEXTOPER(next));
8446             }
8447             break;
8448 
8449         case LOGICAL:  /* modifier for EVAL and IFMATCH */
8450             logical = scan->flags;
8451             break;
8452 
8453 /*******************************************************************
8454 
8455 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8456 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8457 STAR/PLUS/CURLY/CURLYN are used instead.)
8458 
8459 A*B is compiled as <CURLYX><A><WHILEM><B>
8460 
8461 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8462 state, which contains the current count, initialised to -1. It also sets
8463 cur_curlyx to point to this state, with any previous value saved in the
8464 state block.
8465 
8466 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8467 since the pattern may possibly match zero times (i.e. it's a while {} loop
8468 rather than a do {} while loop).
8469 
8470 Each entry to WHILEM represents a successful match of A. The count in the
8471 CURLYX block is incremented, another WHILEM state is pushed, and execution
8472 passes to A or B depending on greediness and the current count.
8473 
8474 For example, if matching against the string a1a2a3b (where the aN are
8475 substrings that match /A/), then the match progresses as follows: (the
8476 pushed states are interspersed with the bits of strings matched so far):
8477 
8478     <CURLYX cnt=-1>
8479     <CURLYX cnt=0><WHILEM>
8480     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8481     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8482     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8483     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8484 
8485 (Contrast this with something like CURLYM, which maintains only a single
8486 backtrack state:
8487 
8488     <CURLYM cnt=0> a1
8489     a1 <CURLYM cnt=1> a2
8490     a1 a2 <CURLYM cnt=2> a3
8491     a1 a2 a3 <CURLYM cnt=3> b
8492 )
8493 
8494 Each WHILEM state block marks a point to backtrack to upon partial failure
8495 of A or B, and also contains some minor state data related to that
8496 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8497 overall state, such as the count, and pointers to the A and B ops.
8498 
8499 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8500 must always point to the *current* CURLYX block, the rules are:
8501 
8502 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8503 and set cur_curlyx to point the new block.
8504 
8505 When popping the CURLYX block after a successful or unsuccessful match,
8506 restore the previous cur_curlyx.
8507 
8508 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8509 to the outer one saved in the CURLYX block.
8510 
8511 When popping the WHILEM block after a successful or unsuccessful B match,
8512 restore the previous cur_curlyx.
8513 
8514 Here's an example for the pattern (AI* BI)*BO
8515 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8516 
8517 cur_
8518 curlyx backtrack stack
8519 ------ ---------------
8520 NULL
8521 CO     <CO prev=NULL> <WO>
8522 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8523 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8524 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8525 
8526 At this point the pattern succeeds, and we work back down the stack to
8527 clean up, restoring as we go:
8528 
8529 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8530 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8531 CO     <CO prev=NULL> <WO>
8532 NULL
8533 
8534 *******************************************************************/
8535 
8536 #define ST st->u.curlyx
8537 
8538         case CURLYX:    /* start of /A*B/  (for complex A) */
8539         {
8540             /* No need to save/restore up to this paren */
8541             I32 parenfloor = scan->flags;
8542 
8543             assert(next); /* keep Coverity happy */
8544             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8545                 next += ARG(next);
8546 
8547             /* XXXX Probably it is better to teach regpush to support
8548                parenfloor > maxopenparen ... */
8549             if (parenfloor > (I32)rex->lastparen)
8550                 parenfloor = rex->lastparen; /* Pessimization... */
8551 
8552             ST.prev_curlyx= cur_curlyx;
8553             cur_curlyx = st;
8554             ST.cp = PL_savestack_ix;
8555 
8556             /* these fields contain the state of the current curly.
8557              * they are accessed by subsequent WHILEMs */
8558             ST.parenfloor = parenfloor;
8559             ST.me = scan;
8560             ST.B = next;
8561             ST.minmod = minmod;
8562             minmod = 0;
8563             ST.count = -1;	/* this will be updated by WHILEM */
8564             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8565 
8566             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8567                                 script_run_begin);
8568             NOT_REACHED; /* NOTREACHED */
8569         }
8570 
8571         case CURLYX_end: /* just finished matching all of A*B */
8572             cur_curlyx = ST.prev_curlyx;
8573             sayYES;
8574             NOT_REACHED; /* NOTREACHED */
8575 
8576         case CURLYX_end_fail: /* just failed to match all of A*B */
8577             regcpblow(ST.cp);
8578             cur_curlyx = ST.prev_curlyx;
8579             sayNO;
8580             NOT_REACHED; /* NOTREACHED */
8581 
8582 
8583 #undef ST
8584 #define ST st->u.whilem
8585 
8586         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8587         {
8588             /* see the discussion above about CURLYX/WHILEM */
8589             I32 n;
8590             int min, max;
8591             regnode *A;
8592 
8593             assert(cur_curlyx); /* keep Coverity happy */
8594 
8595             min = ARG1(cur_curlyx->u.curlyx.me);
8596             max = ARG2(cur_curlyx->u.curlyx.me);
8597             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8598             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8599             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8600             ST.cache_offset = 0;
8601             ST.cache_mask = 0;
8602 
8603 
8604             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8605                   depth, (long)n, min, max)
8606             );
8607 
8608             /* First just match a string of min A's. */
8609 
8610             if (n < min) {
8611                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8612                 cur_curlyx->u.curlyx.lastloc = locinput;
8613                 REGCP_SET(ST.lastcp);
8614 
8615                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8616                                 script_run_begin);
8617                 NOT_REACHED; /* NOTREACHED */
8618             }
8619 
8620             /* If degenerate A matches "", assume A done. */
8621 
8622             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8623                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8624                    depth)
8625                 );
8626                 goto do_whilem_B_max;
8627             }
8628 
8629             /* super-linear cache processing.
8630              *
8631              * The idea here is that for certain types of CURLYX/WHILEM -
8632              * principally those whose upper bound is infinity (and
8633              * excluding regexes that have things like \1 and other very
8634              * non-regular expresssiony things), then if a pattern like
8635              * /....A*.../ fails and we backtrack to the WHILEM, then we
8636              * make a note that this particular WHILEM op was at string
8637              * position 47 (say) when the rest of pattern failed. Then, if
8638              * we ever find ourselves back at that WHILEM, and at string
8639              * position 47 again, we can just fail immediately rather than
8640              * running the rest of the pattern again.
8641              *
8642              * This is very handy when patterns start to go
8643              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8644              * with a combinatorial explosion of backtracking.
8645              *
8646              * The cache is implemented as a bit array, with one bit per
8647              * string byte position per WHILEM op (up to 16) - so its
8648              * between 0.25 and 2x the string size.
8649              *
8650              * To avoid allocating a poscache buffer every time, we do an
8651              * initially countdown; only after we have  executed a WHILEM
8652              * op (string-length x #WHILEMs) times do we allocate the
8653              * cache.
8654              *
8655              * The top 4 bits of scan->flags byte say how many different
8656              * relevant CURLLYX/WHILEM op pairs there are, while the
8657              * bottom 4-bits is the identifying index number of this
8658              * WHILEM.
8659              */
8660 
8661             if (scan->flags) {
8662 
8663                 if (!reginfo->poscache_maxiter) {
8664                     /* start the countdown: Postpone detection until we
8665                      * know the match is not *that* much linear. */
8666                     reginfo->poscache_maxiter
8667                         =    (reginfo->strend - reginfo->strbeg + 1)
8668                            * (scan->flags>>4);
8669                     /* possible overflow for long strings and many CURLYX's */
8670                     if (reginfo->poscache_maxiter < 0)
8671                         reginfo->poscache_maxiter = I32_MAX;
8672                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8673                 }
8674 
8675                 if (reginfo->poscache_iter-- == 0) {
8676                     /* initialise cache */
8677                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8678                     regmatch_info_aux *const aux = reginfo->info_aux;
8679                     if (aux->poscache) {
8680                         if ((SSize_t)reginfo->poscache_size < size) {
8681                             Renew(aux->poscache, size, char);
8682                             reginfo->poscache_size = size;
8683                         }
8684                         Zero(aux->poscache, size, char);
8685                     }
8686                     else {
8687                         reginfo->poscache_size = size;
8688                         Newxz(aux->poscache, size, char);
8689                     }
8690                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8691       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8692                               PL_colors[4], PL_colors[5])
8693                     );
8694                 }
8695 
8696                 if (reginfo->poscache_iter < 0) {
8697                     /* have we already failed at this position? */
8698                     SSize_t offset, mask;
8699 
8700                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8701                     offset  = (scan->flags & 0xf) - 1
8702                                 +   (locinput - reginfo->strbeg)
8703                                   * (scan->flags>>4);
8704                     mask    = 1 << (offset % 8);
8705                     offset /= 8;
8706                     if (reginfo->info_aux->poscache[offset] & mask) {
8707                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8708                             depth)
8709                         );
8710                         cur_curlyx->u.curlyx.count--;
8711                         sayNO; /* cache records failure */
8712                     }
8713                     ST.cache_offset = offset;
8714                     ST.cache_mask   = mask;
8715                 }
8716             }
8717 
8718             /* Prefer B over A for minimal matching. */
8719 
8720             if (cur_curlyx->u.curlyx.minmod) {
8721                 ST.save_curlyx = cur_curlyx;
8722                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8723                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8724                                     locinput, loceol, script_run_begin);
8725                 NOT_REACHED; /* NOTREACHED */
8726             }
8727 
8728             /* Prefer A over B for maximal matching. */
8729 
8730             if (n < max) { /* More greed allowed? */
8731                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8732                             maxopenparen);
8733                 cur_curlyx->u.curlyx.lastloc = locinput;
8734                 REGCP_SET(ST.lastcp);
8735                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8736                                 script_run_begin);
8737                 NOT_REACHED; /* NOTREACHED */
8738             }
8739             goto do_whilem_B_max;
8740         }
8741         NOT_REACHED; /* NOTREACHED */
8742 
8743         case WHILEM_B_min: /* just matched B in a minimal match */
8744         case WHILEM_B_max: /* just matched B in a maximal match */
8745             cur_curlyx = ST.save_curlyx;
8746             sayYES;
8747             NOT_REACHED; /* NOTREACHED */
8748 
8749         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8750             cur_curlyx = ST.save_curlyx;
8751             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8752             cur_curlyx->u.curlyx.count--;
8753             CACHEsayNO;
8754             NOT_REACHED; /* NOTREACHED */
8755 
8756         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8757             /* FALLTHROUGH */
8758         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8759             REGCP_UNWIND(ST.lastcp);
8760             regcppop(rex, &maxopenparen);
8761             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8762             cur_curlyx->u.curlyx.count--;
8763             CACHEsayNO;
8764             NOT_REACHED; /* NOTREACHED */
8765 
8766         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8767             REGCP_UNWIND(ST.lastcp);
8768             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8769             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8770                 depth)
8771             );
8772           do_whilem_B_max:
8773             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8774                 && ckWARN(WARN_REGEXP)
8775                 && !reginfo->warned)
8776             {
8777                 reginfo->warned	= TRUE;
8778                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8779                      "Complex regular subexpression recursion limit (%d) "
8780                      "exceeded",
8781                      REG_INFTY - 1);
8782             }
8783 
8784             /* now try B */
8785             ST.save_curlyx = cur_curlyx;
8786             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8787             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8788                                 locinput, loceol, script_run_begin);
8789             NOT_REACHED; /* NOTREACHED */
8790 
8791         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8792             cur_curlyx = ST.save_curlyx;
8793 
8794             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8795                 /* Maximum greed exceeded */
8796                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8797                     && ckWARN(WARN_REGEXP)
8798                     && !reginfo->warned)
8799                 {
8800                     reginfo->warned	= TRUE;
8801                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8802                         "Complex regular subexpression recursion "
8803                         "limit (%d) exceeded",
8804                         REG_INFTY - 1);
8805                 }
8806                 cur_curlyx->u.curlyx.count--;
8807                 CACHEsayNO;
8808             }
8809 
8810             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8811             );
8812             /* Try grabbing another A and see if it helps. */
8813             cur_curlyx->u.curlyx.lastloc = locinput;
8814             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8815                             maxopenparen);
8816             REGCP_SET(ST.lastcp);
8817             PUSH_STATE_GOTO(WHILEM_A_min,
8818                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8819                 locinput, loceol, script_run_begin);
8820             NOT_REACHED; /* NOTREACHED */
8821 
8822 #undef  ST
8823 #define ST st->u.branch
8824 
8825         case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
8826             next = scan + ARG(scan);
8827             if (next == scan)
8828                 next = NULL;
8829             scan = NEXTOPER(scan);
8830             /* FALLTHROUGH */
8831 
8832         case BRANCH:	    /*  /(...|A|...)/ */
8833             scan = NEXTOPER(scan); /* scan now points to inner node */
8834             ST.lastparen = rex->lastparen;
8835             ST.lastcloseparen = rex->lastcloseparen;
8836             ST.next_branch = next;
8837             REGCP_SET(ST.cp);
8838 
8839             /* Now go into the branch */
8840             if (has_cutgroup) {
8841                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8842                                     script_run_begin);
8843             } else {
8844                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8845                                 script_run_begin);
8846             }
8847             NOT_REACHED; /* NOTREACHED */
8848 
8849         case CUTGROUP:  /*  /(*THEN)/  */
8850             sv_yes_mark = st->u.mark.mark_name = scan->flags
8851                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8852                 : NULL;
8853             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8854                             script_run_begin);
8855             NOT_REACHED; /* NOTREACHED */
8856 
8857         case CUTGROUP_next_fail:
8858             do_cutgroup = 1;
8859             no_final = 1;
8860             if (st->u.mark.mark_name)
8861                 sv_commit = st->u.mark.mark_name;
8862             sayNO;
8863             NOT_REACHED; /* NOTREACHED */
8864 
8865         case BRANCH_next:
8866             sayYES;
8867             NOT_REACHED; /* NOTREACHED */
8868 
8869         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8870             if (do_cutgroup) {
8871                 do_cutgroup = 0;
8872                 no_final = 0;
8873             }
8874             REGCP_UNWIND(ST.cp);
8875             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8876             scan = ST.next_branch;
8877             /* no more branches? */
8878             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8879                 DEBUG_EXECUTE_r({
8880                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8881                         depth,
8882                         PL_colors[4],
8883                         PL_colors[5] );
8884                 });
8885                 sayNO_SILENT;
8886             }
8887             continue; /* execute next BRANCH[J] op */
8888             /* NOTREACHED */
8889 
8890         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8891             minmod = 1;
8892             break;
8893 
8894 #undef  ST
8895 #define ST st->u.curlym
8896 
8897         case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
8898 
8899             /* This is an optimisation of CURLYX that enables us to push
8900              * only a single backtracking state, no matter how many matches
8901              * there are in {m,n}. It relies on the pattern being constant
8902              * length, with no parens to influence future backrefs
8903              */
8904 
8905             ST.me = scan;
8906             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8907 
8908             ST.lastparen      = rex->lastparen;
8909             ST.lastcloseparen = rex->lastcloseparen;
8910 
8911             /* if paren positive, emulate an OPEN/CLOSE around A */
8912             if (ST.me->flags) {
8913                 U32 paren = ST.me->flags;
8914                 if (paren > maxopenparen)
8915                     maxopenparen = paren;
8916                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8917             }
8918             ST.A = scan;
8919             ST.B = next;
8920             ST.alen = 0;
8921             ST.count = 0;
8922             ST.minmod = minmod;
8923             minmod = 0;
8924             ST.Binfo.count = -1;
8925             REGCP_SET(ST.cp);
8926 
8927             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8928                 goto curlym_do_B;
8929 
8930           curlym_do_A: /* execute the A in /A{m,n}B/  */
8931             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8932                                 script_run_begin);
8933             NOT_REACHED; /* NOTREACHED */
8934 
8935         case CURLYM_A: /* we've just matched an A */
8936             ST.count++;
8937             /* after first match, determine A's length: u.curlym.alen */
8938             if (ST.count == 1) {
8939                 if (reginfo->is_utf8_target) {
8940                     char *s = st->locinput;
8941                     while (s < locinput) {
8942                         ST.alen++;
8943                         s += UTF8SKIP(s);
8944                     }
8945                 }
8946                 else {
8947                     ST.alen = locinput - st->locinput;
8948                 }
8949                 if (ST.alen == 0)
8950                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8951             }
8952             DEBUG_EXECUTE_r(
8953                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8954                           depth, (IV) ST.count, (IV)ST.alen)
8955             );
8956 
8957             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8958                 goto fake_end;
8959 
8960             {
8961                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8962                 if ( max == REG_INFTY || ST.count < max )
8963                     goto curlym_do_A; /* try to match another A */
8964             }
8965             goto curlym_do_B; /* try to match B */
8966 
8967         case CURLYM_A_fail: /* just failed to match an A */
8968             REGCP_UNWIND(ST.cp);
8969 
8970 
8971             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8972                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8973                 sayNO;
8974 
8975           curlym_do_B: /* execute the B in /A{m,n}B/  */
8976             if (ST.Binfo.count < 0) {
8977                 /* calculate possible match of 1st char following curly */
8978                 assert(ST.B);
8979                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8980                     regnode *text_node = ST.B;
8981                     if (! HAS_TEXT(text_node))
8982                         FIND_NEXT_IMPT(text_node);
8983                     if (PL_regkind[OP(text_node)] == EXACT) {
8984                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8985                                                         &ST.Binfo, reginfo))
8986                         {
8987                             sayNO;
8988                         }
8989                     }
8990                 }
8991             }
8992 
8993             DEBUG_EXECUTE_r(
8994                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8995                     depth, (IV)ST.count)
8996             );
8997             if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8998                 assert(ST.Binfo.count > 0);
8999 
9000                 /* Do a quick test to hopefully rule out most non-matches */
9001                 if (     locinput + ST.Binfo.min_length > loceol
9002                     || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9003                 {
9004                     DEBUG_OPTIMISE_r(
9005                         Perl_re_exec_indentf( aTHX_
9006                             "CURLYM Fast bail next target=0x%X anded==0x%X"
9007                                                                 " mask=0x%X\n",
9008                             depth,
9009                             (int) nextbyte, ST.Binfo.first_byte_anded,
9010                                             ST.Binfo.first_byte_mask)
9011                     );
9012                     state_num = CURLYM_B_fail;
9013                     goto reenter_switch;
9014                 }
9015             }
9016 
9017             if (ST.me->flags) {
9018                 /* emulate CLOSE: mark current A as captured */
9019                 U32 paren = (U32)ST.me->flags;
9020                 if (ST.count) {
9021                     CLOSE_CAPTURE(paren,
9022                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
9023                         locinput - reginfo->strbeg);
9024                 }
9025                 else
9026                     rex->offs[paren].end = -1;
9027 
9028                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9029                 {
9030                     if (ST.count)
9031                         goto fake_end;
9032                     else
9033                         sayNO;
9034                 }
9035             }
9036 
9037             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9038                             script_run_begin);
9039             NOT_REACHED; /* NOTREACHED */
9040 
9041         case CURLYM_B_fail: /* just failed to match a B */
9042             REGCP_UNWIND(ST.cp);
9043             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9044             if (ST.minmod) {
9045                 I32 max = ARG2(ST.me);
9046                 if (max != REG_INFTY && ST.count == max)
9047                     sayNO;
9048                 goto curlym_do_A; /* try to match a further A */
9049             }
9050             /* backtrack one A */
9051             if (ST.count == ARG1(ST.me) /* min */)
9052                 sayNO;
9053             ST.count--;
9054             SET_locinput(HOPc(locinput, -ST.alen));
9055             goto curlym_do_B; /* try to match B */
9056 
9057 #undef ST
9058 #define ST st->u.curly
9059 
9060 #define CURLY_SETPAREN(paren, success) \
9061     if (paren) { \
9062         if (success) { \
9063             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9064                                  locinput - reginfo->strbeg); \
9065         } \
9066         else { \
9067             rex->offs[paren].end = -1; \
9068             rex->lastparen      = ST.lastparen; \
9069             rex->lastcloseparen = ST.lastcloseparen; \
9070         } \
9071     }
9072 
9073         case STAR:		/*  /A*B/ where A is width 1 char */
9074             ST.paren = 0;
9075             ST.min = 0;
9076             ST.max = REG_INFTY;
9077             scan = NEXTOPER(scan);
9078             goto repeat;
9079 
9080         case PLUS:		/*  /A+B/ where A is width 1 char */
9081             ST.paren = 0;
9082             ST.min = 1;
9083             ST.max = REG_INFTY;
9084             scan = NEXTOPER(scan);
9085             goto repeat;
9086 
9087         case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
9088             ST.paren = scan->flags;	/* Which paren to set */
9089             ST.lastparen      = rex->lastparen;
9090             ST.lastcloseparen = rex->lastcloseparen;
9091             if (ST.paren > maxopenparen)
9092                 maxopenparen = ST.paren;
9093             ST.min = ARG1(scan);  /* min to match */
9094             ST.max = ARG2(scan);  /* max to match */
9095             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9096 
9097             /* handle the single-char capture called as a GOSUB etc */
9098             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9099             {
9100                 char *li = locinput;
9101                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9102                     sayNO;
9103                 SET_locinput(li);
9104                 goto fake_end;
9105             }
9106 
9107             goto repeat;
9108 
9109         case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
9110             ST.paren = 0;
9111             ST.min = ARG1(scan);  /* min to match */
9112             ST.max = ARG2(scan);  /* max to match */
9113             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9114           repeat:
9115             /*
9116             * Lookahead to avoid useless match attempts
9117             * when we know what character comes next.
9118             *
9119             * Used to only do .*x and .*?x, but now it allows
9120             * for )'s, ('s and (?{ ... })'s to be in the way
9121             * of the quantifier and the EXACT-like node.  -- japhy
9122             */
9123 
9124             assert(ST.min <= ST.max);
9125             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9126                 ST.Binfo.count = 0;
9127             }
9128             else {
9129                 regnode *text_node = next;
9130 
9131                 if (! HAS_TEXT(text_node))
9132                     FIND_NEXT_IMPT(text_node);
9133 
9134                 if (! HAS_TEXT(text_node))
9135                     ST.Binfo.count = 0;
9136                 else {
9137                     if ( PL_regkind[OP(text_node)] != EXACT ) {
9138                         ST.Binfo.count = 0;
9139                     }
9140                     else {
9141                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9142                                                         &ST.Binfo, reginfo))
9143                         {
9144                             sayNO;
9145                         }
9146                     }
9147                 }
9148             }
9149 
9150             ST.A = scan;
9151             ST.B = next;
9152             if (minmod) {
9153                 char *li = locinput;
9154                 minmod = 0;
9155                 if (ST.min &&
9156                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9157                             < ST.min)
9158                     sayNO;
9159                 SET_locinput(li);
9160                 ST.count = ST.min;
9161                 REGCP_SET(ST.cp);
9162 
9163                 if (ST.Binfo.count <= 0)
9164                     goto curly_try_B_min;
9165 
9166                 ST.oldloc = locinput;
9167 
9168                 /* set ST.maxpos to the furthest point along the
9169                  * string that could possibly match, i.e., that a match could
9170                  * start at. */
9171                 if  (ST.max == REG_INFTY) {
9172                     ST.maxpos = loceol - 1;
9173                     if (utf8_target)
9174                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9175                             ST.maxpos--;
9176                 }
9177                 else if (utf8_target) {
9178                     int m = ST.max - ST.min;
9179                     for (ST.maxpos = locinput;
9180                          m >0 && ST.maxpos <  loceol; m--)
9181                         ST.maxpos += UTF8SKIP(ST.maxpos);
9182                 }
9183                 else {
9184                     ST.maxpos = locinput + ST.max - ST.min;
9185                     if (ST.maxpos >=  loceol)
9186                         ST.maxpos =  loceol - 1;
9187                 }
9188                 goto curly_try_B_min_known;
9189 
9190             }
9191             else {
9192                 /* avoid taking address of locinput, so it can remain
9193                  * a register var */
9194                 char *li = locinput;
9195                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9196                 if (ST.count < ST.min)
9197                     sayNO;
9198                 SET_locinput(li);
9199                 if ((ST.count > ST.min)
9200                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9201                 {
9202                     /* A{m,n} must come at the end of the string, there's
9203                      * no point in backing off ... */
9204                     ST.min = ST.count;
9205                     /* ...except that $ and \Z can match before *and* after
9206                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9207                        We may back off by one in this case. */
9208                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9209                         ST.min--;
9210                 }
9211                 REGCP_SET(ST.cp);
9212                 goto curly_try_B_max;
9213             }
9214             NOT_REACHED; /* NOTREACHED */
9215 
9216         case CURLY_B_min_fail:
9217             /* failed to find B in a non-greedy match. */
9218 
9219             REGCP_UNWIND(ST.cp);
9220             if (ST.paren) {
9221                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9222             }
9223 
9224             if (ST.Binfo.count == 0) {
9225                 /* failed -- move forward one */
9226                 char *li = locinput;
9227                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9228                     sayNO;
9229                 }
9230                 locinput = li;
9231                 ST.count++;
9232                 if (!(   ST.count <= ST.max
9233                         /* count overflow ? */
9234                      || (ST.max == REG_INFTY && ST.count > 0))
9235                 )
9236                     sayNO;
9237             }
9238             else {
9239                 int n;
9240                 /* Couldn't or didn't -- move forward. */
9241                 ST.oldloc = locinput;
9242                 if (utf8_target)
9243                     locinput += UTF8SKIP(locinput);
9244                 else
9245                     locinput++;
9246                 ST.count++;
9247 
9248               curly_try_B_min_known:
9249                 /* find the next place where 'B' could work, then call B */
9250                 if (locinput + ST.Binfo.initial_exact < loceol) {
9251                     if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9252 
9253                         /* Here, the mask is all 1's for the entire length of
9254                          * any possible match.  (That actually means that there
9255                          * is only one possible match.)  Look for the next
9256                          * occurrence */
9257                         locinput = ninstr(locinput, loceol,
9258                                         (char *) ST.Binfo.matches,
9259                                         (char *) ST.Binfo.matches
9260                                                     + ST.Binfo.initial_exact);
9261                         if (locinput == NULL) {
9262                             sayNO;
9263                         }
9264                     }
9265                     else do {
9266                         /* If the first byte(s) of the mask are all ones, it
9267                          * means those bytes must match identically, so can use
9268                          * ninstr() to find the next possible matchpoint */
9269                         if (ST.Binfo.initial_exact > 0) {
9270                             locinput = ninstr(locinput, loceol,
9271                                               (char *) ST.Binfo.matches,
9272                                               (char *) ST.Binfo.matches
9273                                                      + ST.Binfo.initial_exact);
9274                         }
9275                         else { /* Otherwise find the next byte that matches,
9276                                   masked */
9277                             locinput = (char *) find_next_masked(
9278                                                 (U8 *) locinput, (U8 *) loceol,
9279                                                 ST.Binfo.first_byte_anded,
9280                                                 ST.Binfo.first_byte_mask);
9281                             /* Advance to the end of a multi-byte character */
9282                             if (utf8_target) {
9283                                 while (   locinput < loceol
9284                                     && UTF8_IS_CONTINUATION(*locinput))
9285                                 {
9286                                     locinput++;
9287                                 }
9288                             }
9289                         }
9290                         if (   locinput == NULL
9291                             || locinput + ST.Binfo.min_length > loceol)
9292                         {
9293                             sayNO;
9294                         }
9295 
9296                         /* Here, we have found a possible match point; if can't
9297                          * rule it out, quit the loop so can check fully */
9298                         if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9299                             break;
9300                         }
9301 
9302                         locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9303 
9304                     } while (locinput <= ST.maxpos);
9305                 }
9306 
9307                 if (locinput > ST.maxpos)
9308                     sayNO;
9309 
9310                 n = (utf8_target)
9311                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9312                     : (STRLEN) (locinput - ST.oldloc);
9313 
9314 
9315                 /* Here is at the beginning of a character that meets the mask
9316                  * criteria.  Need to make sure that some real possibility */
9317 
9318                 if (n) {
9319                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9320                      * at what may be the beginning of b; check that everything
9321                      * between oldloc and locinput matches */
9322                     char *li = ST.oldloc;
9323                     ST.count += n;
9324                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9325                         sayNO;
9326                     assert(n == REG_INFTY || locinput == li);
9327                 }
9328             }
9329 
9330           curly_try_B_min:
9331             CURLY_SETPAREN(ST.paren, ST.count);
9332             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9333                             script_run_begin);
9334             NOT_REACHED; /* NOTREACHED */
9335 
9336 
9337           curly_try_B_max:
9338             /* a successful greedy match: now try to match B */
9339             if (        ST.Binfo.count <= 0
9340                 || (    ST.Binfo.count > 0
9341                     &&  locinput + ST.Binfo.min_length <= loceol
9342                     &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9343             {
9344                 CURLY_SETPAREN(ST.paren, ST.count);
9345                 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9346                                 script_run_begin);
9347                 NOT_REACHED; /* NOTREACHED */
9348             }
9349             /* FALLTHROUGH */
9350 
9351         case CURLY_B_max_fail:
9352             /* failed to find B in a greedy match */
9353 
9354             REGCP_UNWIND(ST.cp);
9355             if (ST.paren) {
9356                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9357             }
9358             /*  back up. */
9359             if (--ST.count < ST.min)
9360                 sayNO;
9361             locinput = HOPc(locinput, -1);
9362             goto curly_try_B_max;
9363 
9364 #undef ST
9365 
9366         case END: /*  last op of main pattern  */
9367           fake_end:
9368             if (cur_eval) {
9369                 /* we've just finished A in /(??{A})B/; now continue with B */
9370                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9371                 st->u.eval.prev_rex = rex_sv;		/* inner */
9372 
9373                 /* Save *all* the positions. */
9374                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9375                 rex_sv = CUR_EVAL.prev_rex;
9376                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9377                 SET_reg_curpm(rex_sv);
9378                 rex = ReANY(rex_sv);
9379                 rexi = RXi_GET(rex);
9380 
9381                 st->u.eval.prev_curlyx = cur_curlyx;
9382                 cur_curlyx = CUR_EVAL.prev_curlyx;
9383 
9384                 REGCP_SET(st->u.eval.lastcp);
9385 
9386                 /* Restore parens of the outer rex without popping the
9387                  * savestack */
9388                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9389 
9390                 st->u.eval.prev_eval = cur_eval;
9391                 cur_eval = CUR_EVAL.prev_eval;
9392                 DEBUG_EXECUTE_r(
9393                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9394                                       depth, cur_eval););
9395                 if ( nochange_depth )
9396                     nochange_depth--;
9397 
9398                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9399 
9400                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9401                                     st->u.eval.prev_eval->u.eval.B,
9402                                     locinput, loceol, script_run_begin);
9403             }
9404 
9405             if (locinput < reginfo->till) {
9406                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9407                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9408                                       PL_colors[4],
9409                                       (long)(locinput - startpos),
9410                                       (long)(reginfo->till - startpos),
9411                                       PL_colors[5]));
9412 
9413                 sayNO_SILENT;		/* Cannot match: too short. */
9414             }
9415             sayYES;			/* Success! */
9416 
9417         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
9418             DEBUG_EXECUTE_r(
9419             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
9420                 depth, PL_colors[4], PL_colors[5]));
9421             sayYES;			/* Success! */
9422 
9423 #undef  ST
9424 #define ST st->u.ifmatch
9425 
9426         case SUSPEND:	/* (?>A) */
9427             ST.wanted = 1;
9428             ST.start = locinput;
9429             ST.end = loceol;
9430             ST.count = 1;
9431             goto do_ifmatch;
9432 
9433         case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9434             ST.wanted = 0;
9435             goto ifmatch_trivial_fail_test;
9436 
9437         case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9438             ST.wanted = 1;
9439           ifmatch_trivial_fail_test:
9440             ST.count = scan->next_off + 1; /* next_off repurposed to be
9441                                               lookbehind count, requires
9442                                               non-zero flags */
9443             if (! scan->flags) {    /* 'flags' zero means lookahed */
9444 
9445                 /* Lookahead starts here and ends at the normal place */
9446                 ST.start = locinput;
9447                 ST.end = loceol;
9448             }
9449             else {
9450                 PERL_UINT_FAST8_T back_count = scan->flags;
9451                 char * s;
9452 
9453                 /* Lookbehind can look beyond the current position */
9454                 ST.end = loceol;
9455 
9456                 /* ... and starts at the first place in the input that is in
9457                  * the range of the possible start positions */
9458                 for (; ST.count > 0; ST.count--, back_count--) {
9459                     s = HOPBACKc(locinput, back_count);
9460                     if (s) {
9461                         ST.start = s;
9462                         goto do_ifmatch;
9463                     }
9464                 }
9465 
9466                 /* If the lookbehind doesn't start in the actual string, is a
9467                  * trivial match failure */
9468                 if (logical) {
9469                     logical = 0;
9470                     sw = 1 - cBOOL(ST.wanted);
9471                 }
9472                 else if (ST.wanted)
9473                     sayNO;
9474 
9475                 /* Here, we didn't want it to match, so is actually success */
9476                 next = scan + ARG(scan);
9477                 if (next == scan)
9478                     next = NULL;
9479                 break;
9480             }
9481 
9482           do_ifmatch:
9483             ST.me = scan;
9484             ST.logical = logical;
9485             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9486 
9487             /* execute body of (?...A) */
9488             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9489                                 ST.end, script_run_begin);
9490             NOT_REACHED; /* NOTREACHED */
9491 
9492         {
9493             bool matched;
9494 
9495         case IFMATCH_A_fail: /* body of (?...A) failed */
9496             if (! ST.logical && ST.count > 1) {
9497 
9498                 /* It isn't a real failure until we've tried all starting
9499                  * positions.  Move to the next starting position and retry */
9500                 ST.count--;
9501                 ST.start = HOPc(ST.start, 1);
9502                 scan = ST.me;
9503                 logical = ST.logical;
9504                 goto do_ifmatch;
9505             }
9506 
9507             /* Here, all starting positions have been tried. */
9508             matched = FALSE;
9509             goto ifmatch_done;
9510 
9511         case IFMATCH_A: /* body of (?...A) succeeded */
9512             matched = TRUE;
9513           ifmatch_done:
9514             sw = matched == ST.wanted;
9515             if (! ST.logical && !sw) {
9516                 sayNO;
9517             }
9518 
9519             if (OP(ST.me) != SUSPEND) {
9520                 /* restore old position except for (?>...) */
9521                 locinput = st->locinput;
9522                 loceol = st->loceol;
9523                 script_run_begin = st->sr0;
9524             }
9525             scan = ST.me + ARG(ST.me);
9526             if (scan == ST.me)
9527                 scan = NULL;
9528             continue; /* execute B */
9529         }
9530 
9531 #undef ST
9532 
9533         case LONGJMP: /*  alternative with many branches compiles to
9534                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9535             next = scan + ARG(scan);
9536             if (next == scan)
9537                 next = NULL;
9538             break;
9539 
9540         case COMMIT:  /*  (*COMMIT)  */
9541             reginfo->cutpoint = loceol;
9542             /* FALLTHROUGH */
9543 
9544         case PRUNE:   /*  (*PRUNE)   */
9545             if (scan->flags)
9546                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9547             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9548                             script_run_begin);
9549             NOT_REACHED; /* NOTREACHED */
9550 
9551         case COMMIT_next_fail:
9552             no_final = 1;
9553             /* FALLTHROUGH */
9554             sayNO;
9555             NOT_REACHED; /* NOTREACHED */
9556 
9557         case OPFAIL:   /* (*FAIL)  */
9558             if (scan->flags)
9559                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9560             if (logical) {
9561                 /* deal with (?(?!)X|Y) properly,
9562                  * make sure we trigger the no branch
9563                  * of the trailing IFTHEN structure*/
9564                 sw= 0;
9565                 break;
9566             } else {
9567                 sayNO;
9568             }
9569             NOT_REACHED; /* NOTREACHED */
9570 
9571 #define ST st->u.mark
9572         case MARKPOINT: /*  (*MARK:foo)  */
9573             ST.prev_mark = mark_state;
9574             ST.mark_name = sv_commit = sv_yes_mark
9575                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9576             mark_state = st;
9577             ST.mark_loc = locinput;
9578             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9579                                 script_run_begin);
9580             NOT_REACHED; /* NOTREACHED */
9581 
9582         case MARKPOINT_next:
9583             mark_state = ST.prev_mark;
9584             sayYES;
9585             NOT_REACHED; /* NOTREACHED */
9586 
9587         case MARKPOINT_next_fail:
9588             if (popmark && sv_eq(ST.mark_name,popmark))
9589             {
9590                 if (ST.mark_loc > startpoint)
9591                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9592                 popmark = NULL; /* we found our mark */
9593                 sv_commit = ST.mark_name;
9594 
9595                 DEBUG_EXECUTE_r({
9596                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9597                             depth,
9598                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9599                 });
9600             }
9601             mark_state = ST.prev_mark;
9602             sv_yes_mark = mark_state ?
9603                 mark_state->u.mark.mark_name : NULL;
9604             sayNO;
9605             NOT_REACHED; /* NOTREACHED */
9606 
9607         case SKIP:  /*  (*SKIP)  */
9608             if (!scan->flags) {
9609                 /* (*SKIP) : if we fail we cut here*/
9610                 ST.mark_name = NULL;
9611                 ST.mark_loc = locinput;
9612                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9613                                 script_run_begin);
9614             } else {
9615                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9616                    otherwise do nothing.  Meaning we need to scan
9617                  */
9618                 regmatch_state *cur = mark_state;
9619                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9620 
9621                 while (cur) {
9622                     if ( sv_eq( cur->u.mark.mark_name,
9623                                 find ) )
9624                     {
9625                         ST.mark_name = find;
9626                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9627                                          script_run_begin);
9628                     }
9629                     cur = cur->u.mark.prev_mark;
9630                 }
9631             }
9632             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9633             break;
9634 
9635         case SKIP_next_fail:
9636             if (ST.mark_name) {
9637                 /* (*CUT:NAME) - Set up to search for the name as we
9638                    collapse the stack*/
9639                 popmark = ST.mark_name;
9640             } else {
9641                 /* (*CUT) - No name, we cut here.*/
9642                 if (ST.mark_loc > startpoint)
9643                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9644                 /* but we set sv_commit to latest mark_name if there
9645                    is one so they can test to see how things lead to this
9646                    cut */
9647                 if (mark_state)
9648                     sv_commit=mark_state->u.mark.mark_name;
9649             }
9650             no_final = 1;
9651             sayNO;
9652             NOT_REACHED; /* NOTREACHED */
9653 #undef ST
9654 
9655         case LNBREAK: /* \R */
9656             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9657                 locinput += n;
9658             } else
9659                 sayNO;
9660             break;
9661 
9662         default:
9663             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9664                           PTR2UV(scan), OP(scan));
9665             Perl_croak(aTHX_ "regexp memory corruption");
9666 
9667         /* this is a point to jump to in order to increment
9668          * locinput by one character */
9669           increment_locinput:
9670             assert(!NEXTCHR_IS_EOS);
9671             if (utf8_target) {
9672                 locinput += PL_utf8skip[nextbyte];
9673                 /* locinput is allowed to go 1 char off the end (signifying
9674                  * EOS), but not 2+ */
9675                 if (locinput >  loceol)
9676                     sayNO;
9677             }
9678             else
9679                 locinput++;
9680             break;
9681 
9682         } /* end switch */
9683 
9684         /* switch break jumps here */
9685         scan = next; /* prepare to execute the next op and ... */
9686         continue;    /* ... jump back to the top, reusing st */
9687         /* NOTREACHED */
9688 
9689       push_yes_state:
9690         /* push a state that backtracks on success */
9691         st->u.yes.prev_yes_state = yes_state;
9692         yes_state = st;
9693         /* FALLTHROUGH */
9694       push_state:
9695         /* push a new regex state, then continue at scan  */
9696         {
9697             regmatch_state *newst;
9698             DECLARE_AND_GET_RE_DEBUG_FLAGS;
9699 
9700             DEBUG_r( /* DEBUG_STACK_r */
9701               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9702                 regmatch_state *cur = st;
9703                 regmatch_state *curyes = yes_state;
9704                 U32 i;
9705                 regmatch_slab *slab = PL_regmatch_slab;
9706                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9707                     if (cur < SLAB_FIRST(slab)) {
9708                         slab = slab->prev;
9709                         cur = SLAB_LAST(slab);
9710                     }
9711                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9712                         depth,
9713                         i ? "    " : "push",
9714                         depth - i, PL_reg_name[cur->resume_state],
9715                         (curyes == cur) ? "yes" : ""
9716                     );
9717                     if (curyes == cur)
9718                         curyes = cur->u.yes.prev_yes_state;
9719                 }
9720             } else {
9721                 DEBUG_STATE_pp("push")
9722             });
9723             depth++;
9724             st->locinput = locinput;
9725             st->loceol = loceol;
9726             st->sr0 = script_run_begin;
9727             newst = st+1;
9728             if (newst >  SLAB_LAST(PL_regmatch_slab))
9729                 newst = S_push_slab(aTHX);
9730             PL_regmatch_state = newst;
9731 
9732             locinput = pushinput;
9733             loceol = pusheol;
9734             script_run_begin = pushsr0;
9735             st = newst;
9736             continue;
9737             /* NOTREACHED */
9738         }
9739     }
9740 #ifdef SOLARIS_BAD_OPTIMIZER
9741 #  undef PL_charclass
9742 #endif
9743 
9744     /*
9745     * We get here only if there's trouble -- normally "case END" is
9746     * the terminating point.
9747     */
9748     Perl_croak(aTHX_ "corrupted regexp pointers");
9749     NOT_REACHED; /* NOTREACHED */
9750 
9751   yes:
9752     if (yes_state) {
9753         /* we have successfully completed a subexpression, but we must now
9754          * pop to the state marked by yes_state and continue from there */
9755         assert(st != yes_state);
9756 #ifdef DEBUGGING
9757         while (st != yes_state) {
9758             st--;
9759             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9760                 PL_regmatch_slab = PL_regmatch_slab->prev;
9761                 st = SLAB_LAST(PL_regmatch_slab);
9762             }
9763             DEBUG_STATE_r({
9764                 if (no_final) {
9765                     DEBUG_STATE_pp("pop (no final)");
9766                 } else {
9767                     DEBUG_STATE_pp("pop (yes)");
9768                 }
9769             });
9770             depth--;
9771         }
9772 #else
9773         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9774             || yes_state > SLAB_LAST(PL_regmatch_slab))
9775         {
9776             /* not in this slab, pop slab */
9777             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9778             PL_regmatch_slab = PL_regmatch_slab->prev;
9779             st = SLAB_LAST(PL_regmatch_slab);
9780         }
9781         depth -= (st - yes_state);
9782 #endif
9783         st = yes_state;
9784         yes_state = st->u.yes.prev_yes_state;
9785         PL_regmatch_state = st;
9786 
9787         if (no_final) {
9788             locinput= st->locinput;
9789             loceol= st->loceol;
9790             script_run_begin = st->sr0;
9791         }
9792         state_num = st->resume_state + no_final;
9793         goto reenter_switch;
9794     }
9795 
9796     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9797                           PL_colors[4], PL_colors[5]));
9798 
9799     if (reginfo->info_aux_eval) {
9800         /* each successfully executed (?{...}) block does the equivalent of
9801          *   local $^R = do {...}
9802          * When popping the save stack, all these locals would be undone;
9803          * bypass this by setting the outermost saved $^R to the latest
9804          * value */
9805         /* I dont know if this is needed or works properly now.
9806          * see code related to PL_replgv elsewhere in this file.
9807          * Yves
9808          */
9809         if (oreplsv != GvSV(PL_replgv)) {
9810             sv_setsv(oreplsv, GvSV(PL_replgv));
9811             SvSETMAGIC(oreplsv);
9812         }
9813     }
9814     result = 1;
9815     goto final_exit;
9816 
9817   no:
9818     DEBUG_EXECUTE_r(
9819         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9820             depth,
9821             PL_colors[4], PL_colors[5])
9822         );
9823 
9824   no_silent:
9825     if (no_final) {
9826         if (yes_state) {
9827             goto yes;
9828         } else {
9829             goto final_exit;
9830         }
9831     }
9832     if (depth) {
9833         /* there's a previous state to backtrack to */
9834         st--;
9835         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9836             PL_regmatch_slab = PL_regmatch_slab->prev;
9837             st = SLAB_LAST(PL_regmatch_slab);
9838         }
9839         PL_regmatch_state = st;
9840         locinput= st->locinput;
9841         loceol= st->loceol;
9842         script_run_begin = st->sr0;
9843 
9844         DEBUG_STATE_pp("pop");
9845         depth--;
9846         if (yes_state == st)
9847             yes_state = st->u.yes.prev_yes_state;
9848 
9849         state_num = st->resume_state + 1; /* failure = success + 1 */
9850         PERL_ASYNC_CHECK();
9851         goto reenter_switch;
9852     }
9853     result = 0;
9854 
9855   final_exit:
9856     if (rex->intflags & PREGf_VERBARG_SEEN) {
9857         SV *sv_err = get_sv("REGERROR", 1);
9858         SV *sv_mrk = get_sv("REGMARK", 1);
9859         if (result) {
9860             sv_commit = &PL_sv_no;
9861             if (!sv_yes_mark)
9862                 sv_yes_mark = &PL_sv_yes;
9863         } else {
9864             if (!sv_commit)
9865                 sv_commit = &PL_sv_yes;
9866             sv_yes_mark = &PL_sv_no;
9867         }
9868         assert(sv_err);
9869         assert(sv_mrk);
9870         sv_setsv(sv_err, sv_commit);
9871         sv_setsv(sv_mrk, sv_yes_mark);
9872     }
9873 
9874 
9875     if (last_pushed_cv) {
9876         dSP;
9877         /* see "Some notes about MULTICALL" above */
9878         POP_MULTICALL;
9879         PERL_UNUSED_VAR(SP);
9880     }
9881     else
9882         LEAVE_SCOPE(orig_savestack_ix);
9883 
9884     assert(!result ||  locinput - reginfo->strbeg >= 0);
9885     return result ?  locinput - reginfo->strbeg : -1;
9886 }
9887 
9888 /*
9889  - regrepeat - repeatedly match something simple, report how many
9890  *
9891  * What 'simple' means is a node which can be the operand of a quantifier like
9892  * '+', or {1,3}
9893  *
9894  * startposp - pointer to a pointer to the start position.  This is updated
9895  *             to point to the byte following the highest successful
9896  *             match.
9897  * p         - the regnode to be repeatedly matched against.
9898  * loceol    - pointer to the end position beyond which we aren't supposed to
9899  *             look.
9900  * reginfo   - struct holding match state, such as utf8_target
9901  * max       - maximum number of things to match.
9902  * depth     - (for debugging) backtracking depth.
9903  */
9904 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max _pDEPTH)9905 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9906             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9907 {
9908     char *scan;     /* Pointer to current position in target string */
9909     I32 c;
9910     char *this_eol = loceol;   /* potentially adjusted version. */
9911     I32 hardcount = 0;  /* How many matches so far */
9912     bool utf8_target = reginfo->is_utf8_target;
9913     unsigned int to_complement = 0;  /* Invert the result? */
9914     _char_class_number classnum;
9915 
9916     PERL_ARGS_ASSERT_REGREPEAT;
9917 
9918     /* This routine is structured so that we switch on the input OP.  Each OP
9919      * case: statement contains a loop to repeatedly apply the OP, advancing
9920      * the input until it fails, or reaches the end of the input, or until it
9921      * reaches the upper limit of matches. */
9922 
9923     scan = *startposp;
9924     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9925                                max */
9926         max = I32_MAX;
9927     else if (! utf8_target && this_eol - scan > max)
9928         this_eol = scan + max;
9929 
9930     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
9931      * down to the maximum of how far we should go in it (but leaving it set to
9932      * the real end if the maximum permissible would take us beyond that).
9933      * This allows us to make the loop exit condition that we haven't gone past
9934      * <this_eol> to also mean that we haven't exceeded the max permissible
9935      * count, saving a test each time through the loop.  But it assumes that
9936      * the OP matches a single byte, which is true for most of the OPs below
9937      * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
9938      * have this characteristic have to compensate.
9939      *
9940      * There is no such adjustment for UTF-8 targets, since the number of bytes
9941      * per character can vary.  OPs will have to test both that the count is
9942      * less than the max permissible (using <hardcount> to keep track), and
9943      * that we are still within the bounds of the string (using <this_eol>.  A
9944      * few OPs match a single byte no matter what the encoding.  They can omit
9945      * the max test if, for the UTF-8 case, they do the adjustment that was
9946      * skipped above.
9947      *
9948      * Thus, the code above sets things up for the common case; and exceptional
9949      * cases need extra work; the common case is to make sure <scan> doesn't go
9950      * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9951      * count doesn't exceed the maximum permissible */
9952 
9953     switch (with_t_UTF8ness(OP(p), utf8_target)) {
9954       case REG_ANY_t8:
9955         while (scan < this_eol && hardcount < max && *scan != '\n') {
9956             scan += UTF8SKIP(scan);
9957             hardcount++;
9958         }
9959         break;
9960 
9961       case REG_ANY_tb:
9962         scan = (char *) memchr(scan, '\n', this_eol - scan);
9963         if (! scan) {
9964             scan = this_eol;
9965         }
9966         break;
9967 
9968       case SANY_t8:
9969         while (scan < this_eol && hardcount < max) {
9970             scan += UTF8SKIP(scan);
9971             hardcount++;
9972         }
9973         break;
9974 
9975       case SANY_tb:
9976         scan = this_eol;
9977         break;
9978 
9979       case EXACT_REQ8_tb:
9980       case LEXACT_REQ8_tb:
9981       case EXACTFU_REQ8_tb:
9982         break;
9983 
9984       case EXACTL_t8:
9985         if (UTF8_IS_ABOVE_LATIN1(*scan)) {
9986             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9987         }
9988         /* FALLTHROUGH */
9989 
9990       case EXACTL_tb:
9991       case EXACTFL_t8:
9992       case EXACTFL_tb:
9993       case EXACTFLU8_t8:
9994       case EXACTFLU8_tb:
9995         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9996         /* FALLTHROUGH */
9997 
9998       case EXACT_REQ8_t8:
9999       case LEXACT_REQ8_t8:
10000       case EXACTFU_REQ8_t8:
10001       case LEXACT_t8:
10002       case LEXACT_tb:
10003       case EXACT_t8:
10004       case EXACT_tb:
10005       case EXACTF_t8:
10006       case EXACTF_tb:
10007       case EXACTFAA_NO_TRIE_t8:
10008       case EXACTFAA_NO_TRIE_tb:
10009       case EXACTFAA_t8:
10010       case EXACTFAA_tb:
10011       case EXACTFU_t8:
10012       case EXACTFU_tb:
10013       case EXACTFUP_t8:
10014       case EXACTFUP_tb:
10015 
10016       {
10017         struct next_matchable_info Binfo;
10018         PERL_UINT_FAST8_T definitive_len;
10019 
10020         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10021 
10022         /* Set up termination info, and quit if we can rule out that we've
10023          * gotten a match of the termination criteria */
10024         if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10025             ||   scan + Binfo.min_length > this_eol
10026             || ! S_test_EXACTISH_ST(scan, Binfo))
10027         {
10028             break;
10029         }
10030 
10031         definitive_len = Binfo.initial_definitive;
10032 
10033         /* Here there are potential matches, and the first byte(s) matched our
10034          * filter
10035          *
10036          * If we got a definitive match of some initial bytes, there is no
10037          * possibility of false positives as far as it got */
10038         if (definitive_len > 0) {
10039 
10040             /* If as far as it got is the maximum possible, there were no false
10041              * positives at all.  Since we have everything set up, see how many
10042              * repeats there are. */
10043             if (definitive_len >= Binfo.max_length) {
10044 
10045                 /* We've already found one match */
10046                 scan += definitive_len;
10047                 hardcount++;
10048 
10049                 /* If want more than the one match, and there is room for more,
10050                  * see if there are any */
10051                 if (hardcount < max && scan + definitive_len <= this_eol) {
10052 
10053                     /* If the character is only a single byte long, just span
10054                      * all such bytes. */
10055                     if (definitive_len == 1) {
10056                         const char * orig_scan = scan;
10057 
10058                         if (this_eol - (scan - hardcount) > max) {
10059                             this_eol = scan - hardcount + max;
10060                         }
10061 
10062                         /* Use different routines depending on whether it's an
10063                          * exact match or matches with a mask */
10064                         if (Binfo.initial_exact == 1) {
10065                             scan = (char *) find_span_end((U8 *) scan,
10066                                                           (U8 *) this_eol,
10067                                                           Binfo.matches[0]);
10068                         }
10069                         else {
10070                             scan = (char *) find_span_end_mask(
10071                                                        (U8 *) scan,
10072                                                        (U8 *) this_eol,
10073                                                        Binfo.first_byte_anded,
10074                                                        Binfo.first_byte_mask);
10075                         }
10076 
10077                         hardcount += scan - orig_scan;
10078                     }
10079                     else { /* Here, the full character definitive match is more
10080                               than one byte */
10081                         while (   hardcount < max
10082                                && scan + definitive_len <= this_eol
10083                                && S_test_EXACTISH_ST(scan, Binfo))
10084                         {
10085                                 scan += definitive_len;
10086                                 hardcount++;
10087                         }
10088                     }
10089                 }
10090 
10091                 break;
10092             }   /* End of a full character is definitively matched */
10093 
10094             /* Here, an initial portion of the character matched definitively,
10095              * and the rest matched as well, but could have false positives */
10096 
10097             do {
10098                 PERL_INT_FAST8_T i;
10099                 U8 * matches = Binfo.matches;
10100 
10101                 /* The first bytes were definitive.  Look at the remaining */
10102                 for (i = 0; i < Binfo.count; i++) {
10103                     if (memEQ(scan + definitive_len,
10104                               matches + definitive_len,
10105                               Binfo.lengths[i] - definitive_len))
10106                     {
10107                         goto found_a_completion;
10108                     }
10109 
10110                     matches += Binfo.lengths[i];
10111                 }
10112 
10113                 /* Didn't find anything to complete our initial match.  Stop
10114                  * here */
10115                 break;
10116 
10117               found_a_completion:
10118 
10119                 /* Here, matched a full character, Include it in the result,
10120                  * and then look to see if the next char matches */
10121                 hardcount++;
10122                 scan += Binfo.lengths[i];
10123 
10124             } while (   hardcount < max
10125                      && scan + definitive_len < this_eol
10126                      && S_test_EXACTISH_ST(scan, Binfo));
10127 
10128             /* Here, have advanced as far as possible */
10129             break;
10130         } /* End of found some initial bytes that definitively matched */
10131 
10132         /* Here, we can't rule out that we have found the beginning of 'B', but
10133          * there were no initial bytes that could rule out anything
10134          * definitively. Use brute force to examine all the possibilities */
10135         while (scan < this_eol && hardcount < max) {
10136             PERL_INT_FAST8_T i;
10137             U8 * matches = Binfo.matches;
10138 
10139             for (i = 0; i < Binfo.count; i++) {
10140                 if (memEQ(scan, matches, Binfo.lengths[i])) {
10141                     goto found1;
10142                 }
10143 
10144                 matches += Binfo.lengths[i];
10145             }
10146 
10147             break;
10148 
10149           found1:
10150             hardcount++;
10151             scan += Binfo.lengths[i];
10152         }
10153 
10154         break;
10155       }
10156 
10157       case ANYOFPOSIXL_t8:
10158       case ANYOFL_t8:
10159         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10160          CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10161 
10162         /* FALLTHROUGH */
10163       case ANYOFD_t8:
10164       case ANYOF_t8:
10165         while (   hardcount < max
10166                && scan < this_eol
10167                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10168         {
10169             scan += UTF8SKIP(scan);
10170             hardcount++;
10171         }
10172         break;
10173 
10174       case ANYOFPOSIXL_tb:
10175       case ANYOFL_tb:
10176         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10177          CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10178         /* FALLTHROUGH */
10179 
10180       case ANYOFD_tb:
10181       case ANYOF_tb:
10182         if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10183             while (   scan < this_eol
10184                    && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10185                 scan++;
10186         }
10187         else {
10188             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10189                 scan++;
10190         }
10191         break;
10192 
10193       case ANYOFM_t8:
10194         if (this_eol - scan > max) {
10195 
10196             /* We didn't adjust <this_eol> at the beginning of this routine
10197              * because is UTF-8, but it is actually ok to do so, since here, to
10198              * match, 1 char == 1 byte. */
10199             this_eol = scan + max;
10200         }
10201         /* FALLTHROUGH */
10202 
10203       case ANYOFM_tb:
10204         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10205                                            (U8) ARG(p), FLAGS(p));
10206         break;
10207 
10208       case NANYOFM_t8:
10209         while (     hardcount < max
10210                &&   scan < this_eol
10211                &&  (*scan & FLAGS(p)) != ARG(p))
10212         {
10213             scan += UTF8SKIP(scan);
10214             hardcount++;
10215         }
10216         break;
10217 
10218       case NANYOFM_tb:
10219         scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10220                                          (U8) ARG(p), FLAGS(p));
10221         break;
10222 
10223       case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10224       case ANYOFHb_tb:
10225       case ANYOFHr_tb:
10226       case ANYOFHs_tb:
10227         break;
10228 
10229       case ANYOFH_t8:
10230         while (  hardcount < max
10231                && scan < this_eol
10232                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10233                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10234         {
10235             scan += UTF8SKIP(scan);
10236             hardcount++;
10237         }
10238         break;
10239 
10240       case ANYOFHb_t8:
10241         /* we know the first byte must be the FLAGS field */
10242         while (   hardcount < max
10243                && scan < this_eol
10244                && (U8) *scan == ANYOF_FLAGS(p)
10245                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10246         {
10247             scan += UTF8SKIP(scan);
10248             hardcount++;
10249         }
10250         break;
10251 
10252       case ANYOFHr_t8:
10253         while (  hardcount < max
10254                && scan < this_eol
10255                && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10256                           LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10257                           HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10258                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10259                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10260         {
10261             scan += UTF8SKIP(scan);
10262             hardcount++;
10263         }
10264         break;
10265 
10266       case ANYOFHs_t8:
10267         while (   hardcount < max
10268                && scan + FLAGS(p) < this_eol
10269                && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10270                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10271         {
10272             scan += UTF8SKIP(scan);
10273             hardcount++;
10274         }
10275         break;
10276 
10277       case ANYOFR_t8:
10278         while (   hardcount < max
10279                && scan < this_eol
10280                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10281                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10282                                             (U8 *) this_eol,
10283                                             NULL),
10284                               ANYOFRbase(p), ANYOFRdelta(p)))
10285         {
10286             scan += UTF8SKIP(scan);
10287             hardcount++;
10288         }
10289         break;
10290 
10291       case ANYOFR_tb:
10292         while (   hardcount < max
10293                && scan < this_eol
10294                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10295         {
10296             scan++;
10297             hardcount++;
10298         }
10299         break;
10300 
10301       case ANYOFRb_t8:
10302         while (   hardcount < max
10303                && scan < this_eol
10304                && (U8) *scan == ANYOF_FLAGS(p)
10305                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10306                                             (U8 *) this_eol,
10307                                             NULL),
10308                               ANYOFRbase(p), ANYOFRdelta(p)))
10309         {
10310             scan += UTF8SKIP(scan);
10311             hardcount++;
10312         }
10313         break;
10314 
10315       case ANYOFRb_tb:
10316         while (   hardcount < max
10317                && scan < this_eol
10318                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10319         {
10320             scan++;
10321             hardcount++;
10322         }
10323         break;
10324 
10325     /* The argument (FLAGS) to all the POSIX node types is the class number */
10326 
10327       case NPOSIXL_tb:
10328         to_complement = 1;
10329         /* FALLTHROUGH */
10330 
10331       case POSIXL_tb:
10332         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10333         while (   scan < this_eol
10334                && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10335         {
10336             scan++;
10337         }
10338         break;
10339 
10340       case NPOSIXL_t8:
10341         to_complement = 1;
10342         /* FALLTHROUGH */
10343 
10344       case POSIXL_t8:
10345         while (   hardcount < max && scan < this_eol
10346                && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10347                                                       (U8 *) scan,
10348                                                       (U8 *) this_eol)))
10349         {
10350             scan += UTF8SKIP(scan);
10351             hardcount++;
10352         }
10353         break;
10354 
10355       case POSIXD_tb:
10356         /* FALLTHROUGH */
10357 
10358       case POSIXA_t8:
10359         if (this_eol - scan > max) {
10360 
10361             /* We didn't adjust <this_eol> at the beginning of this routine
10362              * because is UTF-8, but it is actually ok to do so, since here, to
10363              * match, 1 char == 1 byte. */
10364             this_eol = scan + max;
10365         }
10366         /* FALLTHROUGH */
10367 
10368       case POSIXA_tb:
10369         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10370             scan++;
10371         }
10372         break;
10373 
10374       case NPOSIXD_tb:
10375         /* FALLTHROUGH */
10376 
10377       case NPOSIXA_tb:
10378         while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10379             scan++;
10380         }
10381         break;
10382 
10383       case NPOSIXA_t8:
10384 
10385         /* The complement of something that matches only ASCII matches all
10386          * non-ASCII, plus everything in ASCII that isn't in the class. */
10387         while (   hardcount < max && scan < this_eol
10388                && (   ! isASCII_utf8_safe(scan, loceol)
10389                    || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10390             {
10391                 scan += UTF8SKIP(scan);
10392                 hardcount++;
10393             }
10394         break;
10395 
10396       case NPOSIXU_tb:
10397         to_complement = 1;
10398         /* FALLTHROUGH */
10399 
10400       case POSIXU_tb:
10401         while (   scan < this_eol
10402                && to_complement ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10403         {
10404             scan++;
10405         }
10406         break;
10407 
10408       case NPOSIXU_t8:
10409       case NPOSIXD_t8:
10410         to_complement = 1;
10411         /* FALLTHROUGH */
10412 
10413       case POSIXD_t8:
10414       case POSIXU_t8:
10415         classnum = (_char_class_number) FLAGS(p);
10416         switch (classnum) {
10417           default:
10418             while (   hardcount < max && scan < this_eol
10419                    && to_complement
10420                     ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10421                        utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10422             {
10423                 scan += UTF8SKIP(scan);
10424                 hardcount++;
10425             }
10426             break;
10427 
10428             /* For the classes below, the knowledge of how to handle every code
10429              * point is compiled into Perl via a macro.  This code is written
10430              * for making the loops as tight as possible.  It could be
10431              * refactored to save space instead. */
10432 
10433           case _CC_ENUM_SPACE:
10434             while (   hardcount < max
10435                    && scan < this_eol
10436                    && (to_complement
10437                                    ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10438             {
10439                 scan += UTF8SKIP(scan);
10440                 hardcount++;
10441             }
10442             break;
10443           case _CC_ENUM_BLANK:
10444             while (   hardcount < max
10445                    && scan < this_eol
10446                    && (to_complement
10447                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10448             {
10449                 scan += UTF8SKIP(scan);
10450                 hardcount++;
10451             }
10452             break;
10453           case _CC_ENUM_XDIGIT:
10454             while (   hardcount < max
10455                    && scan < this_eol
10456                    && (to_complement
10457                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10458             {
10459                 scan += UTF8SKIP(scan);
10460                 hardcount++;
10461             }
10462             break;
10463           case _CC_ENUM_VERTSPACE:
10464             while (   hardcount < max
10465                    && scan < this_eol
10466                    && (to_complement
10467                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10468             {
10469                 scan += UTF8SKIP(scan);
10470                 hardcount++;
10471             }
10472             break;
10473           case _CC_ENUM_CNTRL:
10474             while (   hardcount < max
10475                    && scan < this_eol
10476                    && (to_complement
10477                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10478             {
10479                 scan += UTF8SKIP(scan);
10480                 hardcount++;
10481             }
10482             break;
10483         }
10484         break;
10485 
10486       case LNBREAK_t8:
10487         while (    hardcount < max && scan < this_eol
10488                && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10489         {
10490             scan += c;
10491             hardcount++;
10492         }
10493         break;
10494 
10495       case LNBREAK_tb:
10496         /* LNBREAK can match one or two latin chars, which is ok, but we have
10497          * to use hardcount in this situation, and throw away the adjustment to
10498          * <this_eol> done before the switch statement */
10499         while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10500             scan+=c;
10501             hardcount++;
10502         }
10503         break;
10504 
10505     default:
10506         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10507                          " node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10508         NOT_REACHED; /* NOTREACHED */
10509 
10510     }
10511 
10512     if (hardcount)
10513         c = hardcount;
10514     else
10515         c = scan - *startposp;
10516     *startposp = scan;
10517 
10518     DEBUG_r({
10519         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10520         DEBUG_EXECUTE_r({
10521             SV * const prop = sv_newmortal();
10522             regprop(prog, prop, p, reginfo, NULL);
10523             Perl_re_exec_indentf( aTHX_
10524                         "%s can match %" IVdf " times out of %" IVdf "...\n",
10525                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10526         });
10527     });
10528 
10529     return(c);
10530 }
10531 
10532 /*
10533  - reginclass - determine if a character falls into a character class
10534 
10535   n is the ANYOF-type regnode
10536   p is the target string
10537   p_end points to one byte beyond the end of the target string
10538   utf8_target tells whether p is in UTF-8.
10539 
10540   Returns true if matched; false otherwise.
10541 
10542   Note that this can be a synthetic start class, a combination of various
10543   nodes, so things you think might be mutually exclusive, such as locale,
10544   aren't.  It can match both locale and non-locale
10545 
10546  */
10547 
10548 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10549 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10550 {
10551     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10552                         ? 0
10553                         : ANYOF_FLAGS(n);
10554     bool match = FALSE;
10555     UV c = *p;
10556 
10557     PERL_ARGS_ASSERT_REGINCLASS;
10558 
10559     /* If c is not already the code point, get it.  Note that
10560      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10561     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10562         STRLEN c_len = 0;
10563         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10564         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10565         if (c_len == (STRLEN)-1) {
10566             _force_out_malformed_utf8_message(p, p_end,
10567                                               utf8n_flags,
10568                                               1 /* 1 means die */ );
10569             NOT_REACHED; /* NOTREACHED */
10570         }
10571         if (     c > 255
10572             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10573             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10574         {
10575             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10576         }
10577     }
10578 
10579     /* If this character is potentially in the bitmap, check it */
10580     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10581         if (ANYOF_BITMAP_TEST(n, c))
10582             match = TRUE;
10583         else if ((flags
10584                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10585                   && OP(n) == ANYOFD
10586                   && ! utf8_target
10587                   && ! isASCII(c))
10588         {
10589             match = TRUE;
10590         }
10591         else if (flags & ANYOF_LOCALE_FLAGS) {
10592             if (  (flags & ANYOFL_FOLD)
10593                 && c < 256
10594                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10595             {
10596                 match = TRUE;
10597             }
10598             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10599                      && c <= U8_MAX  /* param to isFOO_lc() */
10600             ) {
10601                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10602                  * if the class includes the Posix character class given by
10603                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10604                  * complemented Posix class given by int(bit/2), so the
10605                  * remainder modulo 2 tells us if to complement or not.
10606                  *
10607                  * Note that this code assumes that all the classes are closed
10608                  * under folding.  For example, if a character matches \w, then
10609                  * its fold does too; and vice versa.  This should be true for
10610                  * any well-behaved locale for all the currently defined Posix
10611                  * classes, except for :lower: and :upper:, which are handled
10612                  * by the pseudo-class :cased: which matches if either of the
10613                  * other two does.  To get rid of this assumption, an outer
10614                  * loop could be used below to iterate over both the source
10615                  * character, and its fold (if different) */
10616 
10617                 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10618 
10619                 do {
10620                     /* Find the next set bit indicating a class to try matching
10621                      * against */
10622                     U8 bit_pos = lsbit_pos32(posixl_bits);
10623 
10624                     if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10625                         match = TRUE;
10626                         break;
10627                     }
10628 
10629                     /* Remove this class from consideration; repeat */
10630                     POSIXL_CLEAR(posixl_bits, bit_pos);
10631                 } while(posixl_bits != 0);
10632             }
10633         }
10634     }
10635 
10636 
10637     /* If the bitmap didn't (or couldn't) match, and something outside the
10638      * bitmap could match, try that. */
10639     if (!match) {
10640         if (c >= NUM_ANYOF_CODE_POINTS
10641             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10642         {
10643             match = TRUE;	/* Everything above the bitmap matches */
10644         }
10645             /* Here doesn't match everything above the bitmap.  If there is
10646              * some information available beyond the bitmap, we may find a
10647              * match in it.  If so, this is most likely because the code point
10648              * is outside the bitmap range.  But rarely, it could be because of
10649              * some other reason.  If so, various flags are set to indicate
10650              * this possibility.  On ANYOFD nodes, there may be matches that
10651              * happen only when the target string is UTF-8; or for other node
10652              * types, because runtime lookup is needed, regardless of the
10653              * UTF-8ness of the target string.  Finally, under /il, there may
10654              * be some matches only possible if the locale is a UTF-8 one. */
10655         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
10656                  && (   c >= NUM_ANYOF_CODE_POINTS
10657                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10658                          && (   UNLIKELY(OP(n) != ANYOFD)
10659                              || (utf8_target && ! isASCII_uni(c)
10660 #                               if NUM_ANYOF_CODE_POINTS > 256
10661                                                                  && c < 256
10662 #                               endif
10663                                 )))
10664                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10665                          && IN_UTF8_CTYPE_LOCALE)))
10666         {
10667             SV* only_utf8_locale = NULL;
10668             SV * const definition =
10669 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10670                 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10671                                             &only_utf8_locale, NULL);
10672 #else
10673                 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10674                                              &only_utf8_locale, NULL);
10675 #endif
10676             if (definition) {
10677                 U8 utf8_buffer[2];
10678                 U8 * utf8_p;
10679                 if (utf8_target) {
10680                     utf8_p = (U8 *) p;
10681                 } else { /* Convert to utf8 */
10682                     utf8_p = utf8_buffer;
10683                     append_utf8_from_native_byte(*p, &utf8_p);
10684                     utf8_p = utf8_buffer;
10685                 }
10686 
10687                 /* Turkish locales have these hard-coded rules overriding
10688                  * normal ones */
10689                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10690                     && isALPHA_FOLD_EQ(*p, 'i'))
10691                 {
10692                     if (*p == 'i') {
10693                         if (_invlist_contains_cp(definition,
10694                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10695                         {
10696                             match = TRUE;
10697                         }
10698                     }
10699                     else if (*p == 'I') {
10700                         if (_invlist_contains_cp(definition,
10701                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10702                         {
10703                             match = TRUE;
10704                         }
10705                     }
10706                 }
10707                 else if (_invlist_contains_cp(definition, c)) {
10708                     match = TRUE;
10709                 }
10710             }
10711             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10712                 match = _invlist_contains_cp(only_utf8_locale, c);
10713             }
10714         }
10715 
10716         /* In a Turkic locale under folding, hard-code the I i case pair
10717          * matches */
10718         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10719             && ! match
10720             &&   (flags & ANYOFL_FOLD)
10721             &&   utf8_target)
10722         {
10723             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10724                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10725                     match = TRUE;
10726                 }
10727             }
10728             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10729                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10730                     match = TRUE;
10731                 }
10732             }
10733         }
10734 
10735         if (UNICODE_IS_SUPER(c)
10736             && (flags
10737                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10738             && OP(n) != ANYOFD
10739             && ckWARN_d(WARN_NON_UNICODE))
10740         {
10741             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10742                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10743         }
10744     }
10745 
10746 #if ANYOF_INVERT != 1
10747     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10748      * use it, don't */
10749 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10750 #endif
10751 
10752     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10753     return (flags & ANYOF_INVERT) ^ match;
10754 }
10755 
10756 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)10757 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10758 {
10759     /* return the position 'off' UTF-8 characters away from 's', forward if
10760      * 'off' >= 0, backwards if negative.  But don't go outside of position
10761      * 'lim', which better be < s  if off < 0 */
10762 
10763     PERL_ARGS_ASSERT_REGHOP3;
10764 
10765     if (off >= 0) {
10766         while (off-- && s < lim) {
10767             /* XXX could check well-formedness here */
10768             U8 *new_s = s + UTF8SKIP(s);
10769             if (new_s > lim) /* lim may be in the middle of a long character */
10770                 return s;
10771             s = new_s;
10772         }
10773     }
10774     else {
10775         while (off++ && s > lim) {
10776             s--;
10777             if (UTF8_IS_CONTINUED(*s)) {
10778                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10779                     s--;
10780                 if (! UTF8_IS_START(*s)) {
10781                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10782                 }
10783             }
10784             /* XXX could check well-formedness here */
10785         }
10786     }
10787     return s;
10788 }
10789 
10790 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)10791 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10792 {
10793     PERL_ARGS_ASSERT_REGHOP4;
10794 
10795     if (off >= 0) {
10796         while (off-- && s < rlim) {
10797             /* XXX could check well-formedness here */
10798             s += UTF8SKIP(s);
10799         }
10800     }
10801     else {
10802         while (off++ && s > llim) {
10803             s--;
10804             if (UTF8_IS_CONTINUED(*s)) {
10805                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10806                     s--;
10807                 if (! UTF8_IS_START(*s)) {
10808                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10809                 }
10810             }
10811             /* XXX could check well-formedness here */
10812         }
10813     }
10814     return s;
10815 }
10816 
10817 /* like reghop3, but returns NULL on overrun, rather than returning last
10818  * char pos */
10819 
10820 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)10821 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10822 {
10823     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10824 
10825     if (off >= 0) {
10826         while (off-- && s < lim) {
10827             /* XXX could check well-formedness here */
10828             s += UTF8SKIP(s);
10829         }
10830         if (off >= 0)
10831             return NULL;
10832     }
10833     else {
10834         while (off++ && s > lim) {
10835             s--;
10836             if (UTF8_IS_CONTINUED(*s)) {
10837                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10838                     s--;
10839                 if (! UTF8_IS_START(*s)) {
10840                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10841                 }
10842             }
10843             /* XXX could check well-formedness here */
10844         }
10845         if (off <= 0)
10846             return NULL;
10847     }
10848     return s;
10849 }
10850 
10851 
10852 /* when executing a regex that may have (?{}), extra stuff needs setting
10853    up that will be visible to the called code, even before the current
10854    match has finished. In particular:
10855 
10856    * $_ is localised to the SV currently being matched;
10857    * pos($_) is created if necessary, ready to be updated on each call-out
10858      to code;
10859    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10860      isn't set until the current pattern is successfully finished), so that
10861      $1 etc of the match-so-far can be seen;
10862    * save the old values of subbeg etc of the current regex, and  set then
10863      to the current string (again, this is normally only done at the end
10864      of execution)
10865 */
10866 
10867 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)10868 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10869 {
10870     MAGIC *mg;
10871     regexp *const rex = ReANY(reginfo->prog);
10872     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10873 
10874     eval_state->rex = rex;
10875     eval_state->sv  = reginfo->sv;
10876 
10877     if (reginfo->sv) {
10878         /* Make $_ available to executed code. */
10879         if (reginfo->sv != DEFSV) {
10880             SAVE_DEFSV;
10881             DEFSV_set(reginfo->sv);
10882         }
10883         /* will be dec'd by S_cleanup_regmatch_info_aux */
10884         SvREFCNT_inc_NN(reginfo->sv);
10885 
10886         if (!(mg = mg_find_mglob(reginfo->sv))) {
10887             /* prepare for quick setting of pos */
10888             mg = sv_magicext_mglob(reginfo->sv);
10889             mg->mg_len = -1;
10890         }
10891         eval_state->pos_magic = mg;
10892         eval_state->pos       = mg->mg_len;
10893         eval_state->pos_flags = mg->mg_flags;
10894     }
10895     else
10896         eval_state->pos_magic = NULL;
10897 
10898     if (!PL_reg_curpm) {
10899         /* PL_reg_curpm is a fake PMOP that we can attach the current
10900          * regex to and point PL_curpm at, so that $1 et al are visible
10901          * within a /(?{})/. It's just allocated once per interpreter the
10902          * first time its needed */
10903         Newxz(PL_reg_curpm, 1, PMOP);
10904 #ifdef USE_ITHREADS
10905         {
10906             SV* const repointer = &PL_sv_undef;
10907             /* this regexp is also owned by the new PL_reg_curpm, which
10908                will try to free it.  */
10909             av_push(PL_regex_padav, repointer);
10910             PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10911             PL_regex_pad = AvARRAY(PL_regex_padav);
10912         }
10913 #endif
10914     }
10915     SET_reg_curpm(reginfo->prog);
10916     eval_state->curpm = PL_curpm;
10917     PL_curpm_under = PL_curpm;
10918     PL_curpm = PL_reg_curpm;
10919     if (RXp_MATCH_COPIED(rex)) {
10920         /*  Here is a serious problem: we cannot rewrite subbeg,
10921             since it may be needed if this match fails.  Thus
10922             $` inside (?{}) could fail... */
10923         eval_state->subbeg     = rex->subbeg;
10924         eval_state->sublen     = rex->sublen;
10925         eval_state->suboffset  = rex->suboffset;
10926         eval_state->subcoffset = rex->subcoffset;
10927 #ifdef PERL_ANY_COW
10928         eval_state->saved_copy = rex->saved_copy;
10929 #endif
10930         RXp_MATCH_COPIED_off(rex);
10931     }
10932     else
10933         eval_state->subbeg = NULL;
10934     rex->subbeg = (char *)reginfo->strbeg;
10935     rex->suboffset = 0;
10936     rex->subcoffset = 0;
10937     rex->sublen = reginfo->strend - reginfo->strbeg;
10938 }
10939 
10940 
10941 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10942 
10943 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)10944 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10945 {
10946     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10947     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10948     regmatch_slab *s;
10949 
10950     Safefree(aux->poscache);
10951 
10952     if (eval_state) {
10953 
10954         /* undo the effects of S_setup_eval_state() */
10955 
10956         if (eval_state->subbeg) {
10957             regexp * const rex = eval_state->rex;
10958             rex->subbeg     = eval_state->subbeg;
10959             rex->sublen     = eval_state->sublen;
10960             rex->suboffset  = eval_state->suboffset;
10961             rex->subcoffset = eval_state->subcoffset;
10962 #ifdef PERL_ANY_COW
10963             rex->saved_copy = eval_state->saved_copy;
10964 #endif
10965             RXp_MATCH_COPIED_on(rex);
10966         }
10967         if (eval_state->pos_magic)
10968         {
10969             eval_state->pos_magic->mg_len = eval_state->pos;
10970             eval_state->pos_magic->mg_flags =
10971                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10972                | (eval_state->pos_flags & MGf_BYTES);
10973         }
10974 
10975         PL_curpm = eval_state->curpm;
10976         SvREFCNT_dec(eval_state->sv);
10977     }
10978 
10979     PL_regmatch_state = aux->old_regmatch_state;
10980     PL_regmatch_slab  = aux->old_regmatch_slab;
10981 
10982     /* free all slabs above current one - this must be the last action
10983      * of this function, as aux and eval_state are allocated within
10984      * slabs and may be freed here */
10985 
10986     s = PL_regmatch_slab->next;
10987     if (s) {
10988         PL_regmatch_slab->next = NULL;
10989         while (s) {
10990             regmatch_slab * const osl = s;
10991             s = s->next;
10992             Safefree(osl);
10993         }
10994     }
10995 }
10996 
10997 
10998 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)10999 S_to_utf8_substr(pTHX_ regexp *prog)
11000 {
11001     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11002      * on the converted value */
11003 
11004     int i = 1;
11005 
11006     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11007 
11008     do {
11009         if (prog->substrs->data[i].substr
11010             && !prog->substrs->data[i].utf8_substr) {
11011             SV* const sv = newSVsv(prog->substrs->data[i].substr);
11012             prog->substrs->data[i].utf8_substr = sv;
11013             sv_utf8_upgrade(sv);
11014             if (SvVALID(prog->substrs->data[i].substr)) {
11015                 if (SvTAIL(prog->substrs->data[i].substr)) {
11016                     /* Trim the trailing \n that fbm_compile added last
11017                        time.  */
11018                     SvCUR_set(sv, SvCUR(sv) - 1);
11019                     /* Whilst this makes the SV technically "invalid" (as its
11020                        buffer is no longer followed by "\0") when fbm_compile()
11021                        adds the "\n" back, a "\0" is restored.  */
11022                     fbm_compile(sv, FBMcf_TAIL);
11023                 } else
11024                     fbm_compile(sv, 0);
11025             }
11026             if (prog->substrs->data[i].substr == prog->check_substr)
11027                 prog->check_utf8 = sv;
11028         }
11029     } while (i--);
11030 }
11031 
11032 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11033 S_to_byte_substr(pTHX_ regexp *prog)
11034 {
11035     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11036      * on the converted value; returns FALSE if can't be converted. */
11037 
11038     int i = 1;
11039 
11040     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11041 
11042     do {
11043         if (prog->substrs->data[i].utf8_substr
11044             && !prog->substrs->data[i].substr) {
11045             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11046             if (! sv_utf8_downgrade(sv, TRUE)) {
11047                 SvREFCNT_dec_NN(sv);
11048                 return FALSE;
11049             }
11050             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11051                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11052                     /* Trim the trailing \n that fbm_compile added last
11053                         time.  */
11054                     SvCUR_set(sv, SvCUR(sv) - 1);
11055                     fbm_compile(sv, FBMcf_TAIL);
11056                 } else
11057                     fbm_compile(sv, 0);
11058             }
11059             prog->substrs->data[i].substr = sv;
11060             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11061                 prog->check_substr = sv;
11062         }
11063     } while (i--);
11064 
11065     return TRUE;
11066 }
11067 
11068 #ifndef PERL_IN_XSUB_RE
11069 
11070 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11071 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11072 {
11073     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11074      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11075      * the larger string bounded by 'strbeg' and 'strend'.
11076      *
11077      * 'cp' needs to be assigned (if not, a future version of the Unicode
11078      * Standard could make it something that combines with adjacent characters,
11079      * so code using it would then break), and there has to be a GCB break
11080      * before and after the character. */
11081 
11082 
11083     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11084     const U8 * prev_cp_start;
11085 
11086     PERL_ARGS_ASSERT_IS_GRAPHEME;
11087 
11088     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11089         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11090     {
11091         /* These are considered graphemes */
11092         return TRUE;
11093     }
11094 
11095     /* Otherwise, unassigned code points are forbidden */
11096     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11097                                     _invlist_search(PL_Assigned_invlist, cp))))
11098     {
11099         return FALSE;
11100     }
11101 
11102     cp_gcb_val = getGCB_VAL_CP(cp);
11103 
11104     /* Find the GCB value of the previous code point in the input */
11105     prev_cp_start = utf8_hop_back(s, -1, strbeg);
11106     if (UNLIKELY(prev_cp_start == s)) {
11107         prev_cp_gcb_val = GCB_EDGE;
11108     }
11109     else {
11110         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11111     }
11112 
11113     /* And check that is a grapheme boundary */
11114     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11115                 TRUE /* is UTF-8 encoded */ ))
11116     {
11117         return FALSE;
11118     }
11119 
11120     /* Similarly verify there is a break between the current character and the
11121      * following one */
11122     s += UTF8SKIP(s);
11123     if (s >= strend) {
11124         next_cp_gcb_val = GCB_EDGE;
11125     }
11126     else {
11127         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11128     }
11129 
11130     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11131 }
11132 
11133 /*
11134 =for apidoc_section $unicode
11135 
11136 =for apidoc isSCRIPT_RUN
11137 
11138 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11139 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11140 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11141 two degenerate cases given below, this function returns TRUE iff all code
11142 points in it come from any combination of three "scripts" given by the Unicode
11143 "Script Extensions" property: Common, Inherited, and possibly one other.
11144 Additionally all decimal digits must come from the same consecutive sequence of
11145 10.
11146 
11147 For example, if all the characters in the sequence are Greek, or Common, or
11148 Inherited, this function will return TRUE, provided any decimal digits in it
11149 are from the same block of digits in Common.  (These are the ASCII digits
11150 "0".."9" and additionally a block for full width forms of these, and several
11151 others used in mathematical notation.)   For scripts (unlike Greek) that have
11152 their own digits defined this will accept either digits from that set or from
11153 one of the Common digit sets, but not a combination of the two.  Some scripts,
11154 such as Arabic, have more than one set of digits.  All digits must come from
11155 the same set for this function to return TRUE.
11156 
11157 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11158 contain the script found, using the C<SCX_enum> typedef.  Its value will be
11159 C<SCX_INVALID> if the function returns FALSE.
11160 
11161 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11162 will be C<SCX_INVALID>.
11163 
11164 If the sequence contains a single code point which is unassigned to a character
11165 in the version of Unicode being used, the function will return TRUE, and the
11166 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11167 in the input sequence will result in the function treating the input as not
11168 being a script run.
11169 
11170 The returned script will be C<SCX_Inherited> iff all the code points in it are
11171 from the Inherited script.
11172 
11173 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11174 it are from the Inherited or Common scripts.
11175 
11176 =cut
11177 
11178 */
11179 
11180 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11181 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11182 {
11183     /* Basically, it looks at each character in the sequence to see if the
11184      * above conditions are met; if not it fails.  It uses an inversion map to
11185      * find the enum corresponding to the script of each character.  But this
11186      * is complicated by the fact that a few code points can be in any of
11187      * several scripts.  The data has been constructed so that there are
11188      * additional enum values (all negative) for these situations.  The
11189      * absolute value of those is an index into another table which contains
11190      * pointers to auxiliary tables for each such situation.  Each aux array
11191      * lists all the scripts for the given situation.  There is another,
11192      * parallel, table that gives the number of entries in each aux table.
11193      * These are all defined in charclass_invlists.h */
11194 
11195     /* XXX Here are the additional things UTS 39 says could be done:
11196      *
11197      * Forbid sequences of the same nonspacing mark
11198      *
11199      * Check to see that all the characters are in the sets of exemplar
11200      * characters for at least one language in the Unicode Common Locale Data
11201      * Repository [CLDR]. */
11202 
11203 
11204     /* Things that match /\d/u */
11205     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11206     UV * decimals_array = invlist_array(decimals_invlist);
11207 
11208     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11209      * not currently known) */
11210     UV zero_of_run = 0;
11211 
11212     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11213     SCX_enum script_of_char = SCX_INVALID;
11214 
11215     /* If the script remains not fully determined from iteration to iteration,
11216      * this is the current intersection of the possiblities.  */
11217     SCX_enum * intersection = NULL;
11218     PERL_UINT_FAST8_T intersection_len = 0;
11219 
11220     bool retval = TRUE;
11221     SCX_enum * ret_script = NULL;
11222 
11223     assert(send >= s);
11224 
11225     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11226 
11227     /* All code points in 0..255 are either Common or Latin, so must be a
11228      * script run.  We can return immediately unless we need to know which
11229      * script it is. */
11230     if (! utf8_target && LIKELY(send > s)) {
11231         if (ret_script == NULL) {
11232             return TRUE;
11233         }
11234 
11235         /* If any character is Latin, the run is Latin */
11236         while (s < send) {
11237             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11238                 *ret_script = SCX_Latin;
11239                 return TRUE;
11240             }
11241         }
11242 
11243         /* Here, all are Common */
11244         *ret_script = SCX_Common;
11245         return TRUE;
11246     }
11247 
11248     /* Look at each character in the sequence */
11249     while (s < send) {
11250         /* If the current character being examined is a digit, this is the code
11251          * point of the zero for its sequence of 10 */
11252         UV zero_of_char;
11253 
11254         UV cp;
11255 
11256         /* The code allows all scripts to use the ASCII digits.  This is
11257          * because they are in the Common script.  Hence any ASCII ones found
11258          * are ok, unless and until a digit from another set has already been
11259          * encountered.  digit ranges in Common are not similarly blessed) */
11260         if (UNLIKELY(isDIGIT(*s))) {
11261             if (UNLIKELY(script_of_run == SCX_Unknown)) {
11262                 retval = FALSE;
11263                 break;
11264             }
11265             if (zero_of_run) {
11266                 if (zero_of_run != '0') {
11267                     retval = FALSE;
11268                     break;
11269                 }
11270             }
11271             else {
11272                 zero_of_run = '0';
11273             }
11274             s++;
11275             continue;
11276         }
11277 
11278         /* Here, isn't an ASCII digit.  Find the code point of the character */
11279         if (! UTF8_IS_INVARIANT(*s)) {
11280             Size_t len;
11281             cp = valid_utf8_to_uvchr((U8 *) s, &len);
11282             s += len;
11283         }
11284         else {
11285             cp = *(s++);
11286         }
11287 
11288         /* If is within the range [+0 .. +9] of the script's zero, it also is a
11289          * digit in that script.  We can skip the rest of this code for this
11290          * character. */
11291         if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11292             continue;
11293         }
11294 
11295         /* Find the character's script.  The correct values are hard-coded here
11296          * for small-enough code points. */
11297         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11298                                unlikely to change */
11299             if (       cp > 255
11300                 || (   isALPHA_L1(cp)
11301                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
11302             {
11303                 script_of_char = SCX_Latin;
11304             }
11305             else {
11306                 script_of_char = SCX_Common;
11307             }
11308         }
11309         else {
11310             script_of_char = _Perl_SCX_invmap[
11311                                        _invlist_search(PL_SCX_invlist, cp)];
11312         }
11313 
11314         /* We arbitrarily accept a single unassigned character, but not in
11315          * combination with anything else, and not a run of them. */
11316         if (   UNLIKELY(script_of_run == SCX_Unknown)
11317             || UNLIKELY(   script_of_run != SCX_INVALID
11318                         && script_of_char == SCX_Unknown))
11319         {
11320             retval = FALSE;
11321             break;
11322         }
11323 
11324         /* For the first character, or the run is inherited, the run's script
11325          * is set to the char's */
11326         if (   UNLIKELY(script_of_run == SCX_INVALID)
11327             || UNLIKELY(script_of_run == SCX_Inherited))
11328         {
11329             script_of_run = script_of_char;
11330         }
11331 
11332         /* For the character's script to be Unknown, it must be the first
11333          * character in the sequence (for otherwise a test above would have
11334          * prevented us from reaching here), and we have set the run's script
11335          * to it.  Nothing further to be done for this character */
11336         if (UNLIKELY(script_of_char == SCX_Unknown)) {
11337             continue;
11338         }
11339 
11340         /* We accept 'inherited' script characters currently even at the
11341          * beginning.  (We know that no characters in Inherited are digits, or
11342          * we'd have to check for that) */
11343         if (UNLIKELY(script_of_char == SCX_Inherited)) {
11344             continue;
11345         }
11346 
11347         /* If the run so far is Common, and the new character isn't, change the
11348          * run's script to that of this character */
11349         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11350             script_of_run = script_of_char;
11351         }
11352 
11353         /* Now we can see if the script of the new character is the same as
11354          * that of the run */
11355         if (LIKELY(script_of_char == script_of_run)) {
11356             /* By far the most common case */
11357             goto scripts_match;
11358         }
11359 
11360         /* Here, the script of the run isn't Common.  But characters in Common
11361          * match any script */
11362         if (script_of_char == SCX_Common) {
11363             goto scripts_match;
11364         }
11365 
11366 #ifndef HAS_SCX_AUX_TABLES
11367 
11368         /* Too early a Unicode version to have a code point belonging to more
11369          * than one script, so, if the scripts don't exactly match, fail */
11370         PERL_UNUSED_VAR(intersection_len);
11371         retval = FALSE;
11372         break;
11373 
11374 #else
11375 
11376         /* Here there is no exact match between the character's script and the
11377          * run's.  And we've handled the special cases of scripts Unknown,
11378          * Inherited, and Common.
11379          *
11380          * Negative script numbers signify that the value may be any of several
11381          * scripts, and we need to look at auxiliary information to make our
11382          * deterimination.  But if both are non-negative, we can fail now */
11383         if (LIKELY(script_of_char >= 0)) {
11384             const SCX_enum * search_in;
11385             PERL_UINT_FAST8_T search_in_len;
11386             PERL_UINT_FAST8_T i;
11387 
11388             if (LIKELY(script_of_run >= 0)) {
11389                 retval = FALSE;
11390                 break;
11391             }
11392 
11393             /* Use the previously constructed set of possible scripts, if any.
11394              * */
11395             if (intersection) {
11396                 search_in = intersection;
11397                 search_in_len = intersection_len;
11398             }
11399             else {
11400                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11401                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11402             }
11403 
11404             for (i = 0; i < search_in_len; i++) {
11405                 if (search_in[i] == script_of_char) {
11406                     script_of_run = script_of_char;
11407                     goto scripts_match;
11408                 }
11409             }
11410 
11411             retval = FALSE;
11412             break;
11413         }
11414         else if (LIKELY(script_of_run >= 0)) {
11415             /* script of character could be one of several, but run is a single
11416              * script */
11417             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11418             const PERL_UINT_FAST8_T search_in_len
11419                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11420             PERL_UINT_FAST8_T i;
11421 
11422             for (i = 0; i < search_in_len; i++) {
11423                 if (search_in[i] == script_of_run) {
11424                     script_of_char = script_of_run;
11425                     goto scripts_match;
11426                 }
11427             }
11428 
11429             retval = FALSE;
11430             break;
11431         }
11432         else {
11433             /* Both run and char could be in one of several scripts.  If the
11434              * intersection is empty, then this character isn't in this script
11435              * run.  Otherwise, we need to calculate the intersection to use
11436              * for future iterations of the loop, unless we are already at the
11437              * final character */
11438             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11439             const PERL_UINT_FAST8_T char_len
11440                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11441             const SCX_enum * search_run;
11442             PERL_UINT_FAST8_T run_len;
11443 
11444             SCX_enum * new_overlap = NULL;
11445             PERL_UINT_FAST8_T i, j;
11446 
11447             if (intersection) {
11448                 search_run = intersection;
11449                 run_len = intersection_len;
11450             }
11451             else {
11452                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11453                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11454             }
11455 
11456             intersection_len = 0;
11457 
11458             for (i = 0; i < run_len; i++) {
11459                 for (j = 0; j < char_len; j++) {
11460                     if (search_run[i] == search_char[j]) {
11461 
11462                         /* Here, the script at i,j matches.  That means this
11463                          * character is in the run.  But continue on to find
11464                          * the complete intersection, for the next loop
11465                          * iteration, and for the digit check after it.
11466                          *
11467                          * On the first found common script, we malloc space
11468                          * for the intersection list for the worst case of the
11469                          * intersection, which is the minimum of the number of
11470                          * scripts remaining in each set. */
11471                         if (intersection_len == 0) {
11472                             Newx(new_overlap,
11473                                  MIN(run_len - i, char_len - j),
11474                                  SCX_enum);
11475                         }
11476                         new_overlap[intersection_len++] = search_run[i];
11477                     }
11478                 }
11479             }
11480 
11481             /* Here we've looked through everything.  If they have no scripts
11482              * in common, not a run */
11483             if (intersection_len == 0) {
11484                 retval = FALSE;
11485                 break;
11486             }
11487 
11488             /* If there is only a single script in common, set to that.
11489              * Otherwise, use the intersection going forward */
11490             Safefree(intersection);
11491             intersection = NULL;
11492             if (intersection_len == 1) {
11493                 script_of_run = script_of_char = new_overlap[0];
11494                 Safefree(new_overlap);
11495                 new_overlap = NULL;
11496             }
11497             else {
11498                 intersection = new_overlap;
11499             }
11500         }
11501 
11502 #endif
11503 
11504   scripts_match:
11505 
11506         /* Here, the script of the character is compatible with that of the
11507          * run.  That means that in most cases, it continues the script run.
11508          * Either it and the run match exactly, or one or both can be in any of
11509          * several scripts, and the intersection is not empty.  However, if the
11510          * character is a decimal digit, it could still mean failure if it is
11511          * from the wrong sequence of 10.  So, we need to look at if it's a
11512          * digit.  We've already handled the 10 digits [0-9], and the next
11513          * lowest one is this one: */
11514         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11515             continue;   /* Not a digit; this character is part of the run */
11516         }
11517 
11518         /* If we have a definitive '0' for the script of this character, we
11519          * know that for this to be a digit, it must be in the range of +0..+9
11520          * of that zero. */
11521         if (   script_of_char >= 0
11522             && (zero_of_char = script_zeros[script_of_char]))
11523         {
11524             if (! withinCOUNT(cp, zero_of_char, 9)) {
11525                 continue;   /* Not a digit; this character is part of the run
11526                              */
11527             }
11528 
11529         }
11530         else {  /* Need to look up if this character is a digit or not */
11531             SSize_t index_of_zero_of_char;
11532             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11533             if (     UNLIKELY(index_of_zero_of_char < 0)
11534                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11535             {
11536                 continue;   /* Not a digit; this character is part of the run.
11537                              */
11538             }
11539 
11540             zero_of_char = decimals_array[index_of_zero_of_char];
11541         }
11542 
11543         /* Here, the character is a decimal digit, and the zero of its sequence
11544          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11545          * they better be the same. */
11546         if (zero_of_run) {
11547             if (zero_of_run != zero_of_char) {
11548                 retval = FALSE;
11549                 break;
11550             }
11551         }
11552         else {  /* Otherwise we now have a zero for this run */
11553             zero_of_run = zero_of_char;
11554         }
11555     } /* end of looping through CLOSESR text */
11556 
11557     Safefree(intersection);
11558 
11559     if (ret_script != NULL) {
11560         if (retval) {
11561             *ret_script = script_of_run;
11562         }
11563         else {
11564             *ret_script = SCX_INVALID;
11565         }
11566     }
11567 
11568     return retval;
11569 }
11570 
11571 #endif /* ifndef PERL_IN_XSUB_RE */
11572 
11573 /*
11574  * ex: set ts=8 sts=4 sw=4 et:
11575  */
11576