xref: /openbsd/gnu/usr.bin/perl/regexec.c (revision 3d61058a)
1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEX_ENGINE
75 #define PERL_IN_REGEXEC_C
76 #include "perl.h"
77 
78 #ifdef PERL_IN_XSUB_RE
79 #  include "re_comp.h"
80 #else
81 #  include "regcomp.h"
82 #endif
83 
84 #include "invlist_inline.h"
85 #include "unicode_constants.h"
86 
87 static const char b_utf8_locale_required[] =
88  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
89                                                 "  Assuming a UTF-8 locale";
90 
91 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
92     STMT_START {                                                            \
93         if (! IN_UTF8_CTYPE_LOCALE) {                                       \
94           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
95                                                 b_utf8_locale_required);    \
96         }                                                                   \
97     } STMT_END
98 
99 static const char sets_utf8_locale_required[] =
100       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
101 
102 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
103     STMT_START {                                                            \
104         if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
105           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
106                                              sets_utf8_locale_required);    \
107         }                                                                   \
108     } STMT_END
109 
110 #ifdef DEBUGGING
111 /* At least one required character in the target string is expressible only in
112  * UTF-8. */
113 static const char non_utf8_target_but_utf8_required[]
114                 = "Can't match, because target string needs to be in UTF-8\n";
115 #endif
116 
117 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
118     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
119     goto target;                                                         \
120 } STMT_END
121 
122 #ifndef STATIC
123 #define STATIC  static
124 #endif
125 
126 /*
127  * Forwards.
128  */
129 
130 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 
132 #define HOPc(pos,off) \
133         (char *)(reginfo->is_utf8_target \
134             ? reghop3((U8*)pos, off, \
135                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
136             : (U8*)(pos + off))
137 
138 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
139 #define HOPBACK3(pos, off, lim) \
140         (reginfo->is_utf8_target                          \
141             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
142             : (pos - off >= lim)	                         \
143                 ? (U8*)pos - off		                 \
144                 : NULL)
145 
146 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 
148 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
149 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 
151 /* lim must be +ve. Returns NULL on overshoot */
152 #define HOPMAYBE3(pos,off,lim) \
153         (reginfo->is_utf8_target                        \
154             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
155             : ((U8*)pos + off <= lim)                   \
156                 ? (U8*)pos + off                        \
157                 : NULL)
158 
159 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
160  * off must be >=0; args should be vars rather than expressions */
161 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
162     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
163     : (U8*)((pos + off) > lim ? lim : (pos + off)))
164 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 
166 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
167     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168     : (U8*)(pos + off))
169 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 
171 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
172 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 
174 /* for use after a quantifier and before an EXACT-like node -- japhy */
175 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176  *
177  * NOTE that *nothing* that affects backtracking should be in here, specifically
178  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
179  * node that is in between two EXACT like nodes when ascertaining what the required
180  * "follow" character is. This should probably be moved to regex compile time
181  * although it may be done at run time because of the REF possibility - more
182  * investigation required. -- demerphq
183 */
184 #define JUMPABLE(rn) (                                                             \
185     OP(rn) == OPEN ||                                                              \
186     (OP(rn) == CLOSE &&                                                            \
187      !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) ||                                 \
188     OP(rn) == EVAL ||                                                              \
189     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
190     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
191     OP(rn) == KEEPS ||                                                             \
192     (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0)                                  \
193 )
194 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195 
196 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
197 
198 /*
199   Search for mandatory following text node; for lookahead, the text must
200   follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step.
201 */
202 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
203     while (JUMPABLE(rn)) { \
204         const OPCODE type = OP(rn); \
205         if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
206             rn = REGNODE_AFTER_opcode(rn,type); \
207         else if (type == PLUS) \
208             rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
209         else if (type == IFMATCH) \
210             rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \
211         else rn += NEXT_OFF(rn); \
212     } \
213 } STMT_END
214 
215 #define SLAB_FIRST(s) (&(s)->states[0])
216 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 
218 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
219 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
220 static regmatch_state * S_push_slab(pTHX);
221 
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225  * are needed for the regexp context stack bookkeeping. */
226 
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen comma_pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH)
229 {
230     const int retval = PL_savestack_ix;
231     /* Number of bytes about to be stored in the stack */
232     const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor);
233     /* Number of savestack[] entries to be filled by the paren data */
234     /* Rounding is performed in case we are few elements short */
235     const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack);
236     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
237     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
238 
239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
240 
241     PERL_ARGS_ASSERT_REGCPPUSH;
242 
243     if (paren_elems_to_push < 0)
244         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i",
245                    (int)paren_elems_to_push, (int)maxopenparen,
246                    (int)parenfloor);
247 
248     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
249         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
250                    " out of range (%lu-%ld)",
251                    total_elems,
252                    (unsigned long)maxopenparen,
253                    (long)parenfloor);
254 
255     DEBUG_BUFFERS_r(
256         if ((int)maxopenparen > (int)parenfloor)
257             Perl_re_exec_indentf( aTHX_
258                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
259                 depth,
260                 PTR2UV(rex),
261                 PTR2UV(RXp_OFFSp(rex))
262             );
263     );
264 
265     SSGROW(total_elems + REGCP_FRAME_ELEMS);
266     assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS));
267 
268     /* memcpy the offs inside the stack - it's faster than for loop */
269     memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push);
270     PL_savestack_ix += paren_elems_to_push;
271 
272     DEBUG_BUFFERS_r({
273 	I32 p;
274         for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) {
275             Perl_re_exec_indentf(aTHX_
276                 "    \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n",
277                 depth,
278                 (UV)p,
279                 (IV)RXp_OFFSp(rex)[p].start,
280                 (IV)RXp_OFFSp(rex)[p].start_tmp,
281                 (IV)RXp_OFFSp(rex)[p].end
282             );
283         }
284     });
285 
286 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
287     SSPUSHINT(maxopenparen);
288     SSPUSHINT(RXp_LASTPAREN(rex));
289     SSPUSHINT(RXp_LASTCLOSEPAREN(rex));
290     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
291 
292 
293     DEBUG_BUFFERS_r({
294         Perl_re_exec_indentf(aTHX_
295                 "finished regcppush returning %" IVdf " cur: %" IVdf "\n",
296                 depth, retval, PL_savestack_ix);
297     });
298 
299     return retval;
300 }
301 
302 /* These are needed since we do not localize EVAL nodes: */
303 #define REGCP_SET(cp)                                           \
304     DEBUG_STATE_r(                                              \
305         Perl_re_exec_indentf( aTHX_                             \
306             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
307             depth, (IV)PL_savestack_ix                          \
308         )                                                       \
309     );                                                          \
310     cp = PL_savestack_ix
311 
312 #define REGCP_UNWIND(cp)                                        \
313     DEBUG_STATE_r(                                              \
314         if (cp != PL_savestack_ix)                              \
315             Perl_re_exec_indentf( aTHX_                         \
316                 "Clearing an EVAL scope, savestack=%"           \
317                 IVdf "..%" IVdf "\n",                           \
318                 depth, (IV)(cp), (IV)PL_savestack_ix            \
319             )                                                   \
320     );                                                          \
321     regcpblow(cp)
322 
323 /* set the start and end positions of capture ix */
324 #define CLOSE_ANY_CAPTURE(rex, ix, s, e)                                    \
325     RXp_OFFSp(rex)[(ix)].start = (s);                                       \
326     RXp_OFFSp(rex)[(ix)].end = (e)
327 
328 #define CLOSE_CAPTURE(rex, ix, s, e)                                        \
329     CLOSE_ANY_CAPTURE(rex, ix, s, e);                                       \
330     if (ix > RXp_LASTPAREN(rex))                                            \
331         RXp_LASTPAREN(rex) = (ix);                                          \
332     RXp_LASTCLOSEPAREN(rex) = (ix);                                         \
333     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                             \
334         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \
335         depth,                                                              \
336         PTR2UV(rex),                                                        \
337         PTR2UV(RXp_OFFSp(rex)),                                             \
338         (UV)(ix),                                                           \
339         (IV)RXp_OFFSp(rex)[ix].start,                                       \
340         (IV)RXp_OFFSp(rex)[ix].end,                                         \
341         (UV)RXp_LASTPAREN(rex)                                              \
342     ))
343 
344 /* the lp and lcp args match the relevant members of the
345  * regexp structure, but in practice they should all be U16
346  * instead as we have a hard limit of U16_MAX parens. See
347  * line 4003 or so of regcomp.c where we parse OPEN parens
348  * of various types. */
349 PERL_STATIC_INLINE void
S_unwind_paren(pTHX_ regexp * rex,U32 lp,U32 lcp comma_pDEPTH)350 S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) {
351     PERL_ARGS_ASSERT_UNWIND_PAREN;
352     U32 n;
353     DECLARE_AND_GET_RE_DEBUG_FLAGS;
354     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
355         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf
356         ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n",
357         depth,
358         PTR2UV(rex),
359         PTR2UV(RXp_OFFSp(rex)),
360         (UV)(lp),
361         (UV)(RXp_LASTPAREN(rex)),
362         (UV)(lcp)
363     ));
364     for (n = RXp_LASTPAREN(rex); n > lp; n--) {
365         RXp_OFFSp(rex)[n].end = -1;
366     }
367     RXp_LASTPAREN(rex) = n;
368     RXp_LASTCLOSEPAREN(rex) = lcp;
369 }
370 #define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp)
371 
372 PERL_STATIC_INLINE void
S_capture_clear(pTHX_ regexp * rex,U16 from_ix,U16 to_ix,const char * str comma_pDEPTH)373 S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) {
374     PERL_ARGS_ASSERT_CAPTURE_CLEAR;
375     PERL_UNUSED_ARG(str); /* only used for debugging */
376     U16 my_ix;
377     DECLARE_AND_GET_RE_DEBUG_FLAGS;
378     for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) {
379         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
380                 "CAPTURE_CLEAR %s \\%" IVdf ": "
381                 "%" IVdf "(%" IVdf ") .. %" IVdf
382                 " => "
383                 "%" IVdf "(%" IVdf ") .. %" IVdf
384                 "\n",
385             depth, str, (IV)my_ix,
386             (IV)RXp_OFFSp(rex)[my_ix].start,
387             (IV)RXp_OFFSp(rex)[my_ix].start_tmp,
388             (IV)RXp_OFFSp(rex)[my_ix].end,
389             (IV)-1, (IV)-1, (IV)-1));
390         RXp_OFFSp(rex)[my_ix].start = -1;
391         RXp_OFFSp(rex)[my_ix].start_tmp = -1;
392         RXp_OFFSp(rex)[my_ix].end = -1;
393     }
394 }
395 
396 #define CAPTURE_CLEAR(from_ix, to_ix, str) \
397     if (from_ix) capture_clear(rex,from_ix, to_ix, str)
398 
399 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p comma_pDEPTH)400 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH)
401 {
402     UV i;
403     U32 paren;
404     DECLARE_AND_GET_RE_DEBUG_FLAGS;
405 
406     PERL_ARGS_ASSERT_REGCPPOP;
407 
408 
409     DEBUG_BUFFERS_r({
410         Perl_re_exec_indentf(aTHX_
411                 "starting regcppop at %" IVdf "\n",
412                 depth, PL_savestack_ix);
413     });
414 
415     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
416     i = SSPOPUV;
417     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419     RXp_LASTCLOSEPAREN(rex) = SSPOPINT;
420     RXp_LASTPAREN(rex) = SSPOPINT;
421     *maxopenparen_p = SSPOPINT;
422 
423     i -= REGCP_OTHER_ELEMS;
424     /* Now restore the parentheses context. */
425     DEBUG_BUFFERS_r(
426         if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens)
427             Perl_re_exec_indentf( aTHX_
428                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
429                 depth,
430                 PTR2UV(rex),
431                 PTR2UV(RXp_OFFSp(rex))
432             );
433     );
434     /* substract remaining elements from the stack */
435     PL_savestack_ix -= i;
436 
437     /* static assert that offs struc size is not less than stack elem size */
438     STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack));
439 
440     /* calculate actual number of offs/capture groups stored */
441     /* by doing integer division (leaving potential alignment aside) */
442     i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex));
443 
444     /* calculate paren starting point */
445     /* i is our number of entries which we are subtracting from *maxopenparen_p */
446     /* and we are storing + 1 this to get the beginning */
447     paren = *maxopenparen_p - i + 1;
448 
449     /* restore them */
450     memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex)));
451 
452     DEBUG_BUFFERS_r(
453         for (; paren <= *maxopenparen_p; ++paren) {
454             Perl_re_exec_indentf(aTHX_
455                 "    \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n",
456                 depth,
457                 (UV)paren,
458                 (IV)RXp_OFFSp(rex)[paren].start,
459                 (IV)RXp_OFFSp(rex)[paren].start_tmp,
460                 (IV)RXp_OFFSp(rex)[paren].end,
461                 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : ""));
462         }
463     );
464 #if 1
465     /* It would seem that the similar code in regtry()
466      * already takes care of this, and in fact it is in
467      * a better location to since this code can #if 0-ed out
468      * but the code in regtry() is needed or otherwise tests
469      * requiring null fields (pat.t#187 and split.t#{13,14}
470      * (as of patchlevel 7877)  will fail.  Then again,
471      * this code seems to be necessary or otherwise
472      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473      * --jhi updated by dapm */
474     for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) {
475         if (i > *maxopenparen_p) {
476             RXp_OFFSp(rex)[i].start = -1;
477         }
478         RXp_OFFSp(rex)[i].end = -1;
479         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
480             "    \\%" UVuf ": %s   ..-1 undeffing (regcppop)\n",
481             depth,
482             (UV)i,
483             (i > *maxopenparen_p) ? "-1" : "  "
484         ));
485     }
486 #endif
487     DEBUG_BUFFERS_r({
488         Perl_re_exec_indentf(aTHX_
489                 "finished regcppop at %" IVdf "\n",
490                 depth, PL_savestack_ix);
491     });
492 }
493 
494 /* restore the parens and associated vars at savestack position ix,
495  * but without popping the stack */
496 
497 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p comma_pDEPTH)498 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH)
499 {
500     I32 tmpix = PL_savestack_ix;
501     PERL_ARGS_ASSERT_REGCP_RESTORE;
502 
503     PL_savestack_ix = ix;
504     regcppop(rex, maxopenparen_p);
505     PL_savestack_ix = tmpix;
506 }
507 
508 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
509 
510 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)511 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
512 {
513     /* Returns a boolean as to whether or not 'character' is a member of the
514      * Posix character class given by 'classnum' that should be equivalent to a
515      * value in the typedef 'char_class_number_'.
516      *
517      * Ideally this could be replaced by a just an array of function pointers
518      * to the C library functions that implement the macros this calls.
519      * However, to compile, the precise function signatures are required, and
520      * these may vary from platform to platform.  To avoid having to figure
521      * out what those all are on each platform, I (khw) am using this method,
522      * which adds an extra layer of function call overhead (unless the C
523      * optimizer strips it away).  But we don't particularly care about
524      * performance with locales anyway. */
525 
526     if (IN_UTF8_CTYPE_LOCALE) {
527         return cBOOL(generic_isCC_(character, classnum));
528     }
529 
530     switch ((char_class_number_) classnum) {
531         case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
532         case CC_ENUM_ALPHA_:        return    isU8_ALPHA_LC(character);
533         case CC_ENUM_ASCII_:        return    isU8_ASCII_LC(character);
534         case CC_ENUM_BLANK_:        return    isU8_BLANK_LC(character);
535         case CC_ENUM_CASED_:        return    isU8_CASED_LC(character);
536         case CC_ENUM_CNTRL_:        return    isU8_CNTRL_LC(character);
537         case CC_ENUM_DIGIT_:        return    isU8_DIGIT_LC(character);
538         case CC_ENUM_GRAPH_:        return    isU8_GRAPH_LC(character);
539         case CC_ENUM_LOWER_:        return    isU8_LOWER_LC(character);
540         case CC_ENUM_PRINT_:        return    isU8_PRINT_LC(character);
541         case CC_ENUM_PUNCT_:        return    isU8_PUNCT_LC(character);
542         case CC_ENUM_SPACE_:        return    isU8_SPACE_LC(character);
543         case CC_ENUM_UPPER_:        return    isU8_UPPER_LC(character);
544         case CC_ENUM_WORDCHAR_:     return isU8_WORDCHAR_LC(character);
545         case CC_ENUM_XDIGIT_:       return   isU8_XDIGIT_LC(character);
546         default:    /* VERTSPACE should never occur in locales */
547             break;
548     }
549 
550     Perl_croak(aTHX_
551                "panic: isFOO_lc() has an unexpected character class '%d'",
552                classnum);
553 
554     NOT_REACHED; /* NOTREACHED */
555     return FALSE;
556 }
557 
558 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(pTHX_ const char * s1,const char * s2,I32 len)559 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
560 {
561     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
562      * folded.  Works on all folds representable without UTF-8, except for
563      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
564      * check that the strings each have at least 'len' characters.
565      *
566      * There is almost an identical API function where s2 need not be folded:
567      * Perl_foldEQ_latin1() */
568 
569     const U8 *a = (const U8 *)s1;
570     const U8 *b = (const U8 *)s2;
571 
572     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
573 
574     assert(len >= 0);
575 
576     while (len--) {
577         assert(! isUPPER_L1(*b));
578         if (toLOWER_L1(*a) != *b) {
579             return 0;
580         }
581         a++, b++;
582     }
583     return 1;
584 }
585 
586 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)587 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
588 {
589     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
590      * 'character' is a member of the Posix character class given by 'classnum'
591      * that should be equivalent to a value in the typedef
592      * 'char_class_number_'.
593      *
594      * This just calls isFOO_lc on the code point for the character if it is in
595      * the range 0-255.  Outside that range, all characters use Unicode
596      * rules, ignoring any locale.  So use the Unicode function if this class
597      * requires an inversion list, and use the Unicode macro otherwise. */
598 
599 
600     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
601 
602     if (UTF8_IS_INVARIANT(*character)) {
603         return isFOO_lc(classnum, *character);
604     }
605     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
606         return isFOO_lc(classnum,
607                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
608     }
609 
610     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
611 
612     switch ((char_class_number_) classnum) {
613         case CC_ENUM_SPACE_:     return is_XPERLSPACE_high(character);
614         case CC_ENUM_BLANK_:     return is_HORIZWS_high(character);
615         case CC_ENUM_XDIGIT_:    return is_XDIGIT_high(character);
616         case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
617         default:
618             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
619                                         utf8_to_uvchr_buf(character, e, NULL));
620     }
621     NOT_REACHED; /* NOTREACHED */
622 }
623 
624 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)625 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
626 {
627     /* Returns the position of the first byte in the sequence between 's' and
628      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
629      * */
630 
631     PERL_ARGS_ASSERT_FIND_SPAN_END;
632 
633     assert(send >= s);
634 
635     if ((STRLEN) (send - s) >= PERL_WORDSIZE
636                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638     {
639         PERL_UINTMAX_T span_word;
640 
641         /* Process per-byte until reach word boundary.  XXX This loop could be
642          * eliminated if we knew that this platform had fast unaligned reads */
643         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
644             if (*s != span_byte) {
645                 return s;
646             }
647             s++;
648         }
649 
650         /* Create a word filled with the bytes we are spanning */
651         span_word = PERL_COUNT_MULTIPLIER * span_byte;
652 
653         /* Process per-word as long as we have at least a full word left */
654         do {
655 
656             /* Keep going if the whole word is composed of 'span_byte's */
657             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
658                 s += PERL_WORDSIZE;
659                 continue;
660             }
661 
662             /* Here, at least one byte in the word isn't 'span_byte'. */
663 
664 #ifdef EBCDIC
665 
666             break;
667 
668 #else
669 
670             /* This xor leaves 1 bits only in those non-matching bytes */
671             span_word ^= * (PERL_UINTMAX_T *) s;
672 
673             /* Make sure the upper bit of each non-matching byte is set.  This
674              * makes each such byte look like an ASCII platform variant byte */
675             span_word |= span_word << 1;
676             span_word |= span_word << 2;
677             span_word |= span_word << 4;
678 
679             /* That reduces the problem to what this function solves */
680             return s + variant_byte_number(span_word);
681 
682 #endif
683 
684         } while (s + PERL_WORDSIZE <= send);
685     }
686 
687     /* Process the straggler bytes beyond the final word boundary */
688     while (s < send) {
689         if (*s != span_byte) {
690             return s;
691         }
692         s++;
693     }
694 
695     return s;
696 }
697 
698 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)699 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
700 {
701     /* Returns the position of the first byte in the sequence between 's'
702      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
703      * returns 'send' if none found.  It uses word-level operations instead of
704      * byte to speed up the process */
705 
706     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
707 
708     assert(send >= s);
709     assert((byte & mask) == byte);
710 
711 #ifndef EBCDIC
712 
713     if ((STRLEN) (send - s) >= PERL_WORDSIZE
714                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
715                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
716     {
717         PERL_UINTMAX_T word, mask_word;
718 
719         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
720             if (((*s) & mask) == byte) {
721                 return s;
722             }
723             s++;
724         }
725 
726         word      = PERL_COUNT_MULTIPLIER * byte;
727         mask_word = PERL_COUNT_MULTIPLIER * mask;
728 
729         do {
730             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
731 
732             /* If 'masked' contains bytes with the bit pattern of 'byte' within
733              * it, xoring with 'word' will leave each of the 8 bits in such
734              * bytes be 0, and no byte containing any other bit pattern will be
735              * 0. */
736             masked ^= word;
737 
738             /* This causes the most significant bit to be set to 1 for any
739              * bytes in the word that aren't completely 0 */
740             masked |= masked << 1;
741             masked |= masked << 2;
742             masked |= masked << 4;
743 
744             /* The msbits are the same as what marks a byte as variant, so we
745              * can use this mask.  If all msbits are 1, the word doesn't
746              * contain 'byte' */
747             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
748                 s += PERL_WORDSIZE;
749                 continue;
750             }
751 
752             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
753              * and any that are, are 0.  Complement and re-AND to swap that */
754             masked = ~ masked;
755             masked &= PERL_VARIANTS_WORD_MASK;
756 
757             /* This reduces the problem to that solved by this function */
758             s += variant_byte_number(masked);
759             return s;
760 
761         } while (s + PERL_WORDSIZE <= send);
762     }
763 
764 #endif
765 
766     while (s < send) {
767         if (((*s) & mask) == byte) {
768             return s;
769         }
770         s++;
771     }
772 
773     return s;
774 }
775 
776 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)777 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
778 {
779     /* Returns the position of the first byte in the sequence between 's' and
780      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
781      * 'span_byte' should have been ANDed with 'mask' in the call of this
782      * function.  Returns 'send' if none found.  Works like find_span_end(),
783      * except for the AND */
784 
785     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
786 
787     assert(send >= s);
788     assert((span_byte & mask) == span_byte);
789 
790     if ((STRLEN) (send - s) >= PERL_WORDSIZE
791                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
792                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
793     {
794         PERL_UINTMAX_T span_word, mask_word;
795 
796         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
797             if (((*s) & mask) != span_byte) {
798                 return s;
799             }
800             s++;
801         }
802 
803         span_word = PERL_COUNT_MULTIPLIER * span_byte;
804         mask_word = PERL_COUNT_MULTIPLIER * mask;
805 
806         do {
807             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
808 
809             if (masked == span_word) {
810                 s += PERL_WORDSIZE;
811                 continue;
812             }
813 
814 #ifdef EBCDIC
815 
816             break;
817 
818 #else
819 
820             masked ^= span_word;
821             masked |= masked << 1;
822             masked |= masked << 2;
823             masked |= masked << 4;
824             return s + variant_byte_number(masked);
825 
826 #endif
827 
828         } while (s + PERL_WORDSIZE <= send);
829     }
830 
831     while (s < send) {
832         if (((*s) & mask) != span_byte) {
833             return s;
834         }
835         s++;
836     }
837 
838     return s;
839 }
840 
841 /*
842  * pregexec and friends
843  */
844 
845 #ifndef PERL_IN_XSUB_RE
846 /*
847  - pregexec - match a regexp against a string
848  */
849 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)850 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
851          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
852 /* stringarg: the point in the string at which to begin matching */
853 /* strend:    pointer to null at end of string */
854 /* strbeg:    real beginning of string */
855 /* minend:    end of match must be >= minend bytes after stringarg. */
856 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
857  *            itself is accessed via the pointers above */
858 /* nosave:    For optimizations. */
859 {
860     PERL_ARGS_ASSERT_PREGEXEC;
861 
862     return
863         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
864                       nosave ? 0 : REXEC_COPY_STR);
865 }
866 #endif
867 
868 
869 
870 /* re_intuit_start():
871  *
872  * Based on some optimiser hints, try to find the earliest position in the
873  * string where the regex could match.
874  *
875  *   rx:     the regex to match against
876  *   sv:     the SV being matched: only used for utf8 flag; the string
877  *           itself is accessed via the pointers below. Note that on
878  *           something like an overloaded SV, SvPOK(sv) may be false
879  *           and the string pointers may point to something unrelated to
880  *           the SV itself.
881  *   strbeg: real beginning of string
882  *   strpos: the point in the string at which to begin matching
883  *   strend: pointer to the byte following the last char of the string
884  *   flags   currently unused; set to 0
885  *   data:   currently unused; set to NULL
886  *
887  * The basic idea of re_intuit_start() is to use some known information
888  * about the pattern, namely:
889  *
890  *   a) the longest known anchored substring (i.e. one that's at a
891  *      constant offset from the beginning of the pattern; but not
892  *      necessarily at a fixed offset from the beginning of the
893  *      string);
894  *   b) the longest floating substring (i.e. one that's not at a constant
895  *      offset from the beginning of the pattern);
896  *   c) Whether the pattern is anchored to the string; either
897  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
898  *      or anchored to pos(): /\G/;
899  *   d) A start class: a real or synthetic character class which
900  *      represents which characters are legal at the start of the pattern;
901  *
902  * to either quickly reject the match, or to find the earliest position
903  * within the string at which the pattern might match, thus avoiding
904  * running the full NFA engine at those earlier locations, only to
905  * eventually fail and retry further along.
906  *
907  * Returns NULL if the pattern can't match, or returns the address within
908  * the string which is the earliest place the match could occur.
909  *
910  * The longest of the anchored and floating substrings is called 'check'
911  * and is checked first. The other is called 'other' and is checked
912  * second. The 'other' substring may not be present.  For example,
913  *
914  *    /(abc|xyz)ABC\d{0,3}DEFG/
915  *
916  * will have
917  *
918  *   check substr (float)    = "DEFG", offset 6..9 chars
919  *   other substr (anchored) = "ABC",  offset 3..3 chars
920  *   stclass = [ax]
921  *
922  * Be aware that during the course of this function, sometimes 'anchored'
923  * refers to a substring being anchored relative to the start of the
924  * pattern, and sometimes to the pattern itself being anchored relative to
925  * the string. For example:
926  *
927  *   /\dabc/:   "abc" is anchored to the pattern;
928  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
929  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
930  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
931  *                    but the pattern is anchored to the string.
932  */
933 
934 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)935 Perl_re_intuit_start(pTHX_
936                     REGEXP * const rx,
937                     SV *sv,
938                     const char * const strbeg,
939                     char *strpos,
940                     char *strend,
941                     const U32 flags,
942                     re_scream_pos_data *data)
943 {
944     struct regexp *const prog = ReANY(rx);
945     SSize_t start_shift = prog->check_offset_min;
946     /* Should be nonnegative! */
947     SSize_t end_shift   = 0;
948     /* current lowest pos in string where the regex can start matching */
949     char *rx_origin = strpos;
950     SV *check;
951     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
952     U8   other_ix = 1 - prog->substrs->check_ix;
953     bool ml_anch = 0;
954     char *other_last = strpos;/* latest pos 'other' substr already checked to */
955     char *check_at = NULL;		/* check substr found at this pos */
956     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
957     RXi_GET_DECL(prog,progi);
958     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
959     regmatch_info *const reginfo = &reginfo_buf;
960     DECLARE_AND_GET_RE_DEBUG_FLAGS;
961 
962     PERL_ARGS_ASSERT_RE_INTUIT_START;
963     PERL_UNUSED_ARG(flags);
964     PERL_UNUSED_ARG(data);
965 
966     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
967                 "Intuit: trying to determine minimum start position...\n"));
968 
969     /* for now, assume that all substr offsets are positive. If at some point
970      * in the future someone wants to do clever things with lookbehind and
971      * -ve offsets, they'll need to fix up any code in this function
972      * which uses these offsets. See the thread beginning
973      * <20140113145929.GF27210@iabyn.com>
974      */
975     assert(prog->substrs->data[0].min_offset >= 0);
976     assert(prog->substrs->data[0].max_offset >= 0);
977     assert(prog->substrs->data[1].min_offset >= 0);
978     assert(prog->substrs->data[1].max_offset >= 0);
979     assert(prog->substrs->data[2].min_offset >= 0);
980     assert(prog->substrs->data[2].max_offset >= 0);
981 
982     /* for now, assume that if both present, that the floating substring
983      * doesn't start before the anchored substring.
984      * If you break this assumption (e.g. doing better optimisations
985      * with lookahead/behind), then you'll need to audit the code in this
986      * function carefully first
987      */
988     assert(
989             ! (  (prog->anchored_utf8 || prog->anchored_substr)
990               && (prog->float_utf8    || prog->float_substr))
991            || (prog->float_min_offset >= prog->anchored_offset));
992 
993     /* byte rather than char calculation for efficiency. It fails
994      * to quickly reject some cases that can't match, but will reject
995      * them later after doing full char arithmetic */
996     if (prog->minlen > strend - strpos) {
997         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998                               "  String too short...\n"));
999         goto fail;
1000     }
1001 
1002     RXp_MATCH_UTF8_set(prog, utf8_target);
1003     reginfo->is_utf8_target = cBOOL(utf8_target);
1004     reginfo->info_aux = NULL;
1005     reginfo->strbeg = strbeg;
1006     reginfo->strend = strend;
1007     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1008     reginfo->intuit = 1;
1009     /* not actually used within intuit, but zero for safety anyway */
1010     reginfo->poscache_maxiter = 0;
1011 
1012     if (utf8_target) {
1013         if ((!prog->anchored_utf8 && prog->anchored_substr)
1014                 || (!prog->float_utf8 && prog->float_substr))
1015             to_utf8_substr(prog);
1016         check = prog->check_utf8;
1017     } else {
1018         if (!prog->check_substr && prog->check_utf8) {
1019             if (! to_byte_substr(prog)) {
1020                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1021             }
1022         }
1023         check = prog->check_substr;
1024     }
1025 
1026     /* dump the various substring data */
1027     DEBUG_OPTIMISE_MORE_r({
1028         int i;
1029         for (i=0; i<=2; i++) {
1030             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1031                                   : prog->substrs->data[i].substr);
1032             if (!sv)
1033                 continue;
1034 
1035             Perl_re_printf( aTHX_
1036                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1037                 " useful=%" IVdf " utf8=%d [%s]\n",
1038                 i,
1039                 (IV)prog->substrs->data[i].min_offset,
1040                 (IV)prog->substrs->data[i].max_offset,
1041                 (IV)prog->substrs->data[i].end_shift,
1042                 BmUSEFUL(sv),
1043                 utf8_target ? 1 : 0,
1044                 SvPEEK(sv));
1045         }
1046     });
1047 
1048     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1049 
1050         /* ml_anch: check after \n?
1051          *
1052          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1053          * with /.*.../, these flags will have been added by the
1054          * compiler:
1055          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1056          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1057          */
1058         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1059                    && !(prog->intflags & PREGf_IMPLICIT);
1060 
1061         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1062             /* we are only allowed to match at BOS or \G */
1063 
1064             /* trivially reject if there's a BOS anchor and we're not at BOS.
1065              *
1066              * Note that we don't try to do a similar quick reject for
1067              * \G, since generally the caller will have calculated strpos
1068              * based on pos() and gofs, so the string is already correctly
1069              * anchored by definition; and handling the exceptions would
1070              * be too fiddly (e.g. REXEC_IGNOREPOS).
1071              */
1072             if (   strpos != strbeg
1073                 && (prog->intflags & PREGf_ANCH_SBOL))
1074             {
1075                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1076                                 "  Not at start...\n"));
1077                 goto fail;
1078             }
1079 
1080             /* in the presence of an anchor, the anchored (relative to the
1081              * start of the regex) substr must also be anchored relative
1082              * to strpos. So quickly reject if substr isn't found there.
1083              * This works for \G too, because the caller will already have
1084              * subtracted gofs from pos, and gofs is the offset from the
1085              * \G to the start of the regex. For example, in /.abc\Gdef/,
1086              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1087              * caller will have set strpos=pos()-4; we look for the substr
1088              * at position pos()-4+1, which lines up with the "a" */
1089 
1090             if (prog->check_offset_min == prog->check_offset_max) {
1091                 /* Substring at constant offset from beg-of-str... */
1092                 SSize_t slen = SvCUR(check);
1093                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1094 
1095                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1097                     (IV)prog->check_offset_min));
1098 
1099                 if (SvTAIL(check)) {
1100                     /* In this case, the regex is anchored at the end too.
1101                      * Unless it's a multiline match, the lengths must match
1102                      * exactly, give or take a \n.  NB: slen >= 1 since
1103                      * the last char of check is \n */
1104                     if (!multiline
1105                         && (   strend - s > slen
1106                             || strend - s < slen - 1
1107                             || (strend - s == slen && strend[-1] != '\n')))
1108                     {
1109                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1110                                             "  String too long...\n"));
1111                         goto fail_finish;
1112                     }
1113                     /* Now should match s[0..slen-2] */
1114                     slen--;
1115                 }
1116                 if (slen && (strend - s < slen
1117                     || *SvPVX_const(check) != *s
1118                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1119                 {
1120                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                                     "  String not equal...\n"));
1122                     goto fail_finish;
1123                 }
1124 
1125                 check_at = s;
1126                 goto success_at_start;
1127             }
1128         }
1129     }
1130 
1131     end_shift = prog->check_end_shift;
1132 
1133 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1134     if (end_shift < 0)
1135         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1136                    (IV)end_shift, RX_PRECOMP(rx));
1137 #endif
1138 
1139   restart:
1140 
1141     /* This is the (re)entry point of the main loop in this function.
1142      * The goal of this loop is to:
1143      * 1) find the "check" substring in the region rx_origin..strend
1144      *    (adjusted by start_shift / end_shift). If not found, reject
1145      *    immediately.
1146      * 2) If it exists, look for the "other" substr too if defined; for
1147      *    example, if the check substr maps to the anchored substr, then
1148      *    check the floating substr, and vice-versa. If not found, go
1149      *    back to (1) with rx_origin suitably incremented.
1150      * 3) If we find an rx_origin position that doesn't contradict
1151      *    either of the substrings, then check the possible additional
1152      *    constraints on rx_origin of /^.../m or a known start class.
1153      *    If these fail, then depending on which constraints fail, jump
1154      *    back to here, or to various other re-entry points further along
1155      *    that skip some of the first steps.
1156      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1157      *    substring. If the start position was determined to be at the
1158      *    beginning of the string  - so, not rejected, but not optimised,
1159      *    since we have to run regmatch from position 0 - decrement the
1160      *    BmUSEFUL() count. Otherwise increment it.
1161      */
1162 
1163 
1164     /* first, look for the 'check' substring */
1165 
1166     {
1167         U8* start_point;
1168         U8* end_point;
1169 
1170         DEBUG_OPTIMISE_MORE_r({
1171             Perl_re_printf( aTHX_
1172                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1173                 " Start shift: %" IVdf " End shift %" IVdf
1174                 " Real end Shift: %" IVdf "\n",
1175                 (IV)(rx_origin - strbeg),
1176                 (IV)prog->check_offset_min,
1177                 (IV)start_shift,
1178                 (IV)end_shift,
1179                 (IV)prog->check_end_shift);
1180         });
1181 
1182         end_point = HOPBACK3(strend, end_shift, rx_origin);
1183         if (!end_point)
1184             goto fail_finish;
1185         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1186         if (!start_point)
1187             goto fail_finish;
1188 
1189 
1190         /* If the regex is absolutely anchored to either the start of the
1191          * string (SBOL) or to pos() (ANCH_GPOS), then
1192          * check_offset_max represents an upper bound on the string where
1193          * the substr could start. For the ANCH_GPOS case, we assume that
1194          * the caller of intuit will have already set strpos to
1195          * pos()-gofs, so in this case strpos + offset_max will still be
1196          * an upper bound on the substr.
1197          */
1198         if (!ml_anch
1199             && prog->intflags & PREGf_ANCH
1200             && prog->check_offset_max != SSize_t_MAX)
1201         {
1202             SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1203             const char * const anchor =
1204                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1205             SSize_t targ_len = (char*)end_point - anchor;
1206 
1207             if (check_len > targ_len) {
1208                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209                               "Target string too short to match required substring...\n"));
1210                 goto fail_finish;
1211             }
1212 
1213             /* do a bytes rather than chars comparison. It's conservative;
1214              * so it skips doing the HOP if the result can't possibly end
1215              * up earlier than the old value of end_point.
1216              */
1217             assert(anchor + check_len <= (char *)end_point);
1218             if (prog->check_offset_max + check_len < targ_len) {
1219                 end_point = HOP3lim((U8*)anchor,
1220                                 prog->check_offset_max,
1221                                 end_point - check_len
1222                             )
1223                             + check_len;
1224                 if (end_point < start_point)
1225                     goto fail_finish;
1226             }
1227         }
1228 
1229         check_at = fbm_instr( start_point, end_point,
1230                       check, multiline ? FBMrf_MULTILINE : 0);
1231 
1232         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1234             (IV)((char*)start_point - strbeg),
1235             (IV)((char*)end_point   - strbeg),
1236             (IV)(check_at ? check_at - strbeg : -1)
1237         ));
1238 
1239         /* Update the count-of-usability, remove useless subpatterns,
1240             unshift s.  */
1241 
1242         DEBUG_EXECUTE_r({
1243             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1244                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1245             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1246                               (check_at ? "Found" : "Did not find"),
1247                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1248                     ? "anchored" : "floating"),
1249                 quoted,
1250                 RE_SV_TAIL(check),
1251                 (check_at ? " at offset " : "...\n") );
1252         });
1253 
1254         if (!check_at)
1255             goto fail_finish;
1256         /* set rx_origin to the minimum position where the regex could start
1257          * matching, given the constraint of the just-matched check substring.
1258          * But don't set it lower than previously.
1259          */
1260 
1261         if (check_at - rx_origin > prog->check_offset_max)
1262             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1263         /* Finish the diagnostic message */
1264         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265             "%ld (rx_origin now %" IVdf ")...\n",
1266             (long)(check_at - strbeg),
1267             (IV)(rx_origin - strbeg)
1268         ));
1269     }
1270 
1271 
1272     /* now look for the 'other' substring if defined */
1273 
1274     if (prog->substrs->data[other_ix].utf8_substr
1275         || prog->substrs->data[other_ix].substr)
1276     {
1277         /* Take into account the "other" substring. */
1278         char *last, *last1;
1279         char *s;
1280         SV* must;
1281         struct reg_substr_datum *other;
1282 
1283       do_other_substr:
1284         other = &prog->substrs->data[other_ix];
1285         if (!utf8_target && !other->substr) {
1286             if (!to_byte_substr(prog)) {
1287                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1288             }
1289         }
1290 
1291         /* if "other" is anchored:
1292          * we've previously found a floating substr starting at check_at.
1293          * This means that the regex origin must lie somewhere
1294          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1295          * and max:                 HOP3(check_at, -check_offset_min)
1296          * (except that min will be >= strpos)
1297          * So the fixed  substr must lie somewhere between
1298          *  HOP3(min, anchored_offset)
1299          *  HOP3(max, anchored_offset) + SvCUR(substr)
1300          */
1301 
1302         /* if "other" is floating
1303          * Calculate last1, the absolute latest point where the
1304          * floating substr could start in the string, ignoring any
1305          * constraints from the earlier fixed match. It is calculated
1306          * as follows:
1307          *
1308          * strend - prog->minlen (in chars) is the absolute latest
1309          * position within the string where the origin of the regex
1310          * could appear. The latest start point for the floating
1311          * substr is float_min_offset(*) on from the start of the
1312          * regex.  last1 simply combines thee two offsets.
1313          *
1314          * (*) You might think the latest start point should be
1315          * float_max_offset from the regex origin, and technically
1316          * you'd be correct. However, consider
1317          *    /a\d{2,4}bcd\w/
1318          * Here, float min, max are 3,5 and minlen is 7.
1319          * This can match either
1320          *    /a\d\dbcd\w/
1321          *    /a\d\d\dbcd\w/
1322          *    /a\d\d\d\dbcd\w/
1323          * In the first case, the regex matches minlen chars; in the
1324          * second, minlen+1, in the third, minlen+2.
1325          * In the first case, the floating offset is 3 (which equals
1326          * float_min), in the second, 4, and in the third, 5 (which
1327          * equals float_max). In all cases, the floating string bcd
1328          * can never start more than 4 chars from the end of the
1329          * string, which equals minlen - float_min. As the substring
1330          * starts to match more than float_min from the start of the
1331          * regex, it makes the regex match more than minlen chars,
1332          * and the two cancel each other out. So we can always use
1333          * float_min - minlen, rather than float_max - minlen for the
1334          * latest position in the string.
1335          *
1336          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1337          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1338          */
1339 
1340         assert(prog->minlen >= other->min_offset);
1341         last1 = HOP3c(strend,
1342                         other->min_offset - prog->minlen, strbeg);
1343 
1344         if (other_ix) {/* i.e. if (other-is-float) */
1345             /* last is the latest point where the floating substr could
1346              * start, *given* any constraints from the earlier fixed
1347              * match. This constraint is that the floating string starts
1348              * <= float_max_offset chars from the regex origin (rx_origin).
1349              * If this value is less than last1, use it instead.
1350              */
1351             assert(rx_origin <= last1);
1352             last =
1353                 /* this condition handles the offset==infinity case, and
1354                  * is a short-cut otherwise. Although it's comparing a
1355                  * byte offset to a char length, it does so in a safe way,
1356                  * since 1 char always occupies 1 or more bytes,
1357                  * so if a string range is  (last1 - rx_origin) bytes,
1358                  * it will be less than or equal to  (last1 - rx_origin)
1359                  * chars; meaning it errs towards doing the accurate HOP3
1360                  * rather than just using last1 as a short-cut */
1361                 (last1 - rx_origin) < other->max_offset
1362                     ? last1
1363                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1364         }
1365         else {
1366             assert(strpos + start_shift <= check_at);
1367             last = HOP4c(check_at, other->min_offset - start_shift,
1368                         strbeg, strend);
1369         }
1370 
1371         s = HOP3c(rx_origin, other->min_offset, strend);
1372         if (s < other_last)	/* These positions already checked */
1373             s = other_last;
1374 
1375         must = utf8_target ? other->utf8_substr : other->substr;
1376         assert(SvPOK(must));
1377         {
1378             char *from = s;
1379             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1380 
1381             if (to > strend)
1382                 to = strend;
1383             if (from > to) {
1384                 s = NULL;
1385                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1386                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1387                     (IV)(from - strbeg),
1388                     (IV)(to   - strbeg)
1389                 ));
1390             }
1391             else {
1392                 s = fbm_instr(
1393                     (unsigned char*)from,
1394                     (unsigned char*)to,
1395                     must,
1396                     multiline ? FBMrf_MULTILINE : 0
1397                 );
1398                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1399                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1400                     (IV)(from - strbeg),
1401                     (IV)(to   - strbeg),
1402                     (IV)(s ? s - strbeg : -1)
1403                 ));
1404             }
1405         }
1406 
1407         DEBUG_EXECUTE_r({
1408             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1409                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1410             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1411                 s ? "Found" : "Contradicts",
1412                 other_ix ? "floating" : "anchored",
1413                 quoted, RE_SV_TAIL(must));
1414         });
1415 
1416 
1417         if (!s) {
1418             /* last1 is latest possible substr location. If we didn't
1419              * find it before there, we never will */
1420             if (last >= last1) {
1421                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                                         "; giving up...\n"));
1423                 goto fail_finish;
1424             }
1425 
1426             /* try to find the check substr again at a later
1427              * position. Maybe next time we'll find the "other" substr
1428              * in range too */
1429             other_last = HOP3c(last, 1, strend) /* highest failure */;
1430             rx_origin =
1431                 other_ix /* i.e. if other-is-float */
1432                     ? HOP3c(rx_origin, 1, strend)
1433                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1434             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1435                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1436                 (other_ix ? "floating" : "anchored"),
1437                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1438                 (IV)(rx_origin - strbeg)
1439             ));
1440             goto restart;
1441         }
1442         else {
1443             if (other_ix) { /* if (other-is-float) */
1444                 /* other_last is set to s, not s+1, since its possible for
1445                  * a floating substr to fail first time, then succeed
1446                  * second time at the same floating position; e.g.:
1447                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1448                  * The first time round, anchored and float match at
1449                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1450                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1451                  */
1452                 other_last = s;
1453             }
1454             else {
1455                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1456                 other_last = HOP3c(s, 1, strend);
1457             }
1458             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1459                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1460                   (long)(s - strbeg),
1461                 (IV)(rx_origin - strbeg)
1462               ));
1463 
1464         }
1465     }
1466     else {
1467         DEBUG_OPTIMISE_MORE_r(
1468             Perl_re_printf( aTHX_
1469                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1470                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1471                 " strend:%" IVdf "\n",
1472                 (IV)prog->check_offset_min,
1473                 (IV)prog->check_offset_max,
1474                 (IV)(check_at-strbeg),
1475                 (IV)(rx_origin-strbeg),
1476                 (IV)(rx_origin-check_at),
1477                 (IV)(strend-strbeg)
1478             )
1479         );
1480     }
1481 
1482   postprocess_substr_matches:
1483 
1484     /* handle the extra constraint of /^.../m if present */
1485 
1486     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1487         char *s;
1488 
1489         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1490                         "  looking for /^/m anchor"));
1491 
1492         /* we have failed the constraint of a \n before rx_origin.
1493          * Find the next \n, if any, even if it's beyond the current
1494          * anchored and/or floating substrings. Whether we should be
1495          * scanning ahead for the next \n or the next substr is debatable.
1496          * On the one hand you'd expect rare substrings to appear less
1497          * often than \n's. On the other hand, searching for \n means
1498          * we're effectively flipping between check_substr and "\n" on each
1499          * iteration as the current "rarest" candidate string, which
1500          * means for example that we'll quickly reject the whole string if
1501          * hasn't got a \n, rather than trying every substr position
1502          * first
1503          */
1504 
1505         s = HOP3c(strend, - prog->minlen, strpos);
1506         if (s <= rx_origin ||
1507             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1508         {
1509             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1510                             "  Did not find /%s^%s/m...\n",
1511                             PL_colors[0], PL_colors[1]));
1512             goto fail_finish;
1513         }
1514 
1515         /* earliest possible origin is 1 char after the \n.
1516          * (since *rx_origin == '\n', it's safe to ++ here rather than
1517          * HOP(rx_origin, 1)) */
1518         rx_origin++;
1519 
1520         if (prog->substrs->check_ix == 0  /* check is anchored */
1521             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1522         {
1523             /* Position contradicts check-string; either because
1524              * check was anchored (and thus has no wiggle room),
1525              * or check was float and rx_origin is above the float range */
1526             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1527                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1528                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1529             goto restart;
1530         }
1531 
1532         /* if we get here, the check substr must have been float,
1533          * is in range, and we may or may not have had an anchored
1534          * "other" substr which still contradicts */
1535         assert(prog->substrs->check_ix); /* check is float */
1536 
1537         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1538             /* whoops, the anchored "other" substr exists, so we still
1539              * contradict. On the other hand, the float "check" substr
1540              * didn't contradict, so just retry the anchored "other"
1541              * substr */
1542             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1543                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1544                 PL_colors[0], PL_colors[1],
1545                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1546                 (IV)(rx_origin - strbeg)
1547             ));
1548             goto do_other_substr;
1549         }
1550 
1551         /* success: we don't contradict the found floating substring
1552          * (and there's no anchored substr). */
1553         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554             "  Found /%s^%s/m with rx_origin %ld...\n",
1555             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556     }
1557     else {
1558         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1559             "  (multiline anchor test skipped)\n"));
1560     }
1561 
1562   success_at_start:
1563 
1564 
1565     /* if we have a starting character class, then test that extra constraint.
1566      * (trie stclasses are too expensive to use here, we are better off to
1567      * leave it to regmatch itself) */
1568 
1569     if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1570         const U8* const str = (U8*)STRING(progi->regstclass);
1571 
1572         /* XXX this value could be pre-computed */
1573         const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1574                     ?  (reginfo->is_utf8_pat
1575                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1576                         : (SSize_t)STR_LEN(progi->regstclass))
1577                     : 1);
1578         char * endpos;
1579         char *s;
1580         /* latest pos that a matching float substr constrains rx start to */
1581         char *rx_max_float = NULL;
1582 
1583         /* if the current rx_origin is anchored, either by satisfying an
1584          * anchored substring constraint, or a /^.../m constraint, then we
1585          * can reject the current origin if the start class isn't found
1586          * at the current position. If we have a float-only match, then
1587          * rx_origin is constrained to a range; so look for the start class
1588          * in that range. if neither, then look for the start class in the
1589          * whole rest of the string */
1590 
1591         /* XXX DAPM it's not clear what the minlen test is for, and why
1592          * it's not used in the floating case. Nothing in the test suite
1593          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1594          * Here are some old comments, which may or may not be correct:
1595          *
1596          *   minlen == 0 is possible if regstclass is \b or \B,
1597          *   and the fixed substr is ''$.
1598          *   Since minlen is already taken into account, rx_origin+1 is
1599          *   before strend; accidentally, minlen >= 1 guaranties no false
1600          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1601          *   0) below assumes that regstclass does not come from lookahead...
1602          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1603          *   This leaves EXACTF-ish only, which are dealt with in
1604          *   find_byclass().
1605          */
1606 
1607         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1608             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1609         else if (prog->float_substr || prog->float_utf8) {
1610             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1611             endpos = HOP3clim(rx_max_float, cl_l, strend);
1612         }
1613         else
1614             endpos= strend;
1615 
1616         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1617             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1618             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1619               (IV)start_shift, (IV)(check_at - strbeg),
1620               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1621 
1622         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1623                             reginfo);
1624         if (!s) {
1625             if (endpos == strend) {
1626                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1627                                 "  Could not match STCLASS...\n") );
1628                 goto fail;
1629             }
1630             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1631                                "  This position contradicts STCLASS...\n") );
1632             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1633                         && !(prog->intflags & PREGf_IMPLICIT))
1634                 goto fail;
1635 
1636             /* Contradict one of substrings */
1637             if (prog->anchored_substr || prog->anchored_utf8) {
1638                 if (prog->substrs->check_ix == 1) { /* check is float */
1639                     /* Have both, check_string is floating */
1640                     assert(rx_origin + start_shift <= check_at);
1641                     if (rx_origin + start_shift != check_at) {
1642                         /* not at latest position float substr could match:
1643                          * Recheck anchored substring, but not floating.
1644                          * The condition above is in bytes rather than
1645                          * chars for efficiency. It's conservative, in
1646                          * that it errs on the side of doing 'goto
1647                          * do_other_substr'. In this case, at worst,
1648                          * an extra anchored search may get done, but in
1649                          * practice the extra fbm_instr() is likely to
1650                          * get skipped anyway. */
1651                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1652                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1653                             (long)(other_last - strbeg),
1654                             (IV)(rx_origin - strbeg)
1655                         ));
1656                         goto do_other_substr;
1657                     }
1658                 }
1659             }
1660             else {
1661                 /* float-only */
1662 
1663                 if (ml_anch) {
1664                     /* In the presence of ml_anch, we might be able to
1665                      * find another \n without breaking the current float
1666                      * constraint. */
1667 
1668                     /* strictly speaking this should be HOP3c(..., 1, ...),
1669                      * but since we goto a block of code that's going to
1670                      * search for the next \n if any, its safe here */
1671                     rx_origin++;
1672                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1673                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1674                               PL_colors[0], PL_colors[1],
1675                               (long)(rx_origin - strbeg)) );
1676                     goto postprocess_substr_matches;
1677                 }
1678 
1679                 /* strictly speaking this can never be true; but might
1680                  * be if we ever allow intuit without substrings */
1681                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1682                     goto fail;
1683 
1684                 rx_origin = rx_max_float;
1685             }
1686 
1687             /* at this point, any matching substrings have been
1688              * contradicted. Start again... */
1689 
1690             rx_origin = HOP3c(rx_origin, 1, strend);
1691 
1692             /* uses bytes rather than char calculations for efficiency.
1693              * It's conservative: it errs on the side of doing 'goto restart',
1694              * where there is code that does a proper char-based test */
1695             if (rx_origin + start_shift + end_shift > strend) {
1696                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1697                                        "  Could not match STCLASS...\n") );
1698                 goto fail;
1699             }
1700             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1702                 (prog->substrs->check_ix ? "floating" : "anchored"),
1703                 (long)(rx_origin + start_shift - strbeg),
1704                 (IV)(rx_origin - strbeg)
1705             ));
1706             goto restart;
1707         }
1708 
1709         /* Success !!! */
1710 
1711         if (rx_origin != s) {
1712             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1713                         "  By STCLASS: moving %ld --> %ld\n",
1714                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1715                    );
1716         }
1717         else {
1718             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1719                                   "  Does not contradict STCLASS...\n");
1720                    );
1721         }
1722     }
1723 
1724     /* Decide whether using the substrings helped */
1725 
1726     if (rx_origin != strpos) {
1727         /* Fixed substring is found far enough so that the match
1728            cannot start at strpos. */
1729 
1730         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1731         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1732     }
1733     else {
1734         /* The found rx_origin position does not prohibit matching at
1735          * strpos, so calling intuit didn't gain us anything. Decrement
1736          * the BmUSEFUL() count on the check substring, and if we reach
1737          * zero, free it.  */
1738         if (!(prog->intflags & PREGf_NAUGHTY)
1739             && (utf8_target ? (
1740                 prog->check_utf8		/* Could be deleted already */
1741                 && --BmUSEFUL(prog->check_utf8) < 0
1742                 && (prog->check_utf8 == prog->float_utf8)
1743             ) : (
1744                 prog->check_substr		/* Could be deleted already */
1745                 && --BmUSEFUL(prog->check_substr) < 0
1746                 && (prog->check_substr == prog->float_substr)
1747             )))
1748         {
1749             /* If flags & SOMETHING - do not do it many times on the same match */
1750             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1751             /* XXX Does the destruction order has to change with utf8_target? */
1752             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1753             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1754             prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1755             prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1756             check = NULL;			/* abort */
1757             /* XXXX This is a remnant of the old implementation.  It
1758                     looks wasteful, since now INTUIT can use many
1759                     other heuristics. */
1760             prog->extflags &= ~RXf_USE_INTUIT;
1761         }
1762     }
1763 
1764     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1765             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1766              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1767 
1768     return rx_origin;
1769 
1770   fail_finish:				/* Substring not found */
1771     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1772         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1773   fail:
1774     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1775                           PL_colors[4], PL_colors[5]));
1776     return NULL;
1777 }
1778 
1779 
1780 #define DECL_TRIE_TYPE(scan) \
1781     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1782                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1783                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1784                     trie_type = ((FLAGS(scan) == EXACT)                             \
1785                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1786                                  : (FLAGS(scan) == EXACTL)                          \
1787                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1788                                     : (FLAGS(scan) == EXACTFAA)                     \
1789                                       ? (utf8_target                                \
1790                                          ? trie_utf8_exactfa_fold                   \
1791                                          : trie_latin_utf8_exactfa_fold)            \
1792                                       : (FLAGS(scan) == EXACTFLU8                   \
1793                                          ? (utf8_target                             \
1794                                            ? trie_flu8                              \
1795                                            : trie_flu8_latin)                       \
1796                                          : (utf8_target                             \
1797                                            ? trie_utf8_fold                         \
1798                                            : trie_latin_utf8_fold)))
1799 
1800 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1801  * 'foldbuf+sizeof(foldbuf)' */
1802 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1803 STMT_START {                                                                        \
1804     STRLEN skiplen;                                                                 \
1805     U8 flags = FOLD_FLAGS_FULL;                                                     \
1806     switch (trie_type) {                                                            \
1807     case trie_flu8:                                                                 \
1808         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1809         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1810             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1811         }                                                                           \
1812         goto do_trie_utf8_fold;                                                     \
1813     case trie_utf8_exactfa_fold:                                                    \
1814         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1815         /* FALLTHROUGH */                                                           \
1816     case trie_utf8_fold:                                                            \
1817       do_trie_utf8_fold:                                                            \
1818         if ( foldlen>0 ) {                                                          \
1819             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1820             foldlen -= len;                                                         \
1821             uscan += len;                                                           \
1822             len=0;                                                                  \
1823         } else {                                                                    \
1824             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1825                                                                             flags); \
1826             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1827             skiplen = UVCHR_SKIP( uvc );                                            \
1828             foldlen -= skiplen;                                                     \
1829             uscan = foldbuf + skiplen;                                              \
1830         }                                                                           \
1831         break;                                                                      \
1832     case trie_flu8_latin:                                                           \
1833         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1834         goto do_trie_latin_utf8_fold;                                               \
1835     case trie_latin_utf8_exactfa_fold:                                              \
1836         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1837         /* FALLTHROUGH */                                                           \
1838     case trie_latin_utf8_fold:                                                      \
1839       do_trie_latin_utf8_fold:                                                      \
1840         if ( foldlen>0 ) {                                                          \
1841             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1842             foldlen -= len;                                                         \
1843             uscan += len;                                                           \
1844             len=0;                                                                  \
1845         } else {                                                                    \
1846             len = 1;                                                                \
1847             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1848             skiplen = UVCHR_SKIP( uvc );                                            \
1849             foldlen -= skiplen;                                                     \
1850             uscan = foldbuf + skiplen;                                              \
1851         }                                                                           \
1852         break;                                                                      \
1853     case trie_utf8l:                                                                \
1854         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1855         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1856             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1857         }                                                                           \
1858         /* FALLTHROUGH */                                                           \
1859     case trie_utf8:                                                                 \
1860         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1861         break;                                                                      \
1862     case trie_plain:                                                                \
1863         uvc = (UV)*uc;                                                              \
1864         len = 1;                                                                    \
1865     }                                                                               \
1866     if (uvc < 256) {                                                                \
1867         charid = trie->charmap[ uvc ];                                              \
1868     }                                                                               \
1869     else {                                                                          \
1870         charid = 0;                                                                 \
1871         if (widecharmap) {                                                          \
1872             SV** const svpp = hv_fetch(widecharmap,                                 \
1873                         (char*)&uvc, sizeof(UV), 0);                                \
1874             if (svpp)                                                               \
1875                 charid = (U16)SvIV(*svpp);                                          \
1876         }                                                                           \
1877     }                                                                               \
1878 } STMT_END
1879 
1880 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1881     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1882                 startpos, doutf8, depth)
1883 
1884 #define GET_ANYOFH_INVLIST(prog, n)                                         \
1885                         GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1886 
1887 #define REXEC_FBC_UTF8_SCAN(CODE)                           \
1888     STMT_START {                                            \
1889         while (s < strend) {                                \
1890             CODE                                            \
1891             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1892         }                                                   \
1893     } STMT_END
1894 
1895 #define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1896     STMT_START {                                            \
1897         while (s < strend) {                                \
1898             CODE                                            \
1899             s++;                                            \
1900         }                                                   \
1901     } STMT_END
1902 
1903 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1904     STMT_START {                                            \
1905         while (s < strend) {                                \
1906             REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1907         }                                                   \
1908     } STMT_END
1909 
1910 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1911     STMT_START {                                            \
1912         while (s < strend) {                                \
1913             REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1914         }                                                   \
1915     } STMT_END
1916 
1917 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1918     if (COND) {                                                \
1919         FBC_CHECK_AND_TRY                                      \
1920         s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1921         previous_occurrence_end = s;                           \
1922     }                                                          \
1923     else {                                                     \
1924         s += UTF8SKIP(s);                                      \
1925     }
1926 
1927 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1928     if (COND) {                                                \
1929         FBC_CHECK_AND_TRY                                      \
1930         s++;                                                   \
1931         previous_occurrence_end = s;                           \
1932     }                                                          \
1933     else {                                                     \
1934         s++;                                                   \
1935     }
1936 
1937 #define LAST_REGTRY_SKIPPED_FORWARD(reginfo) (reginfo->cutpoint)
1938 
1939 /* We keep track of where the next character should start after an occurrence
1940  * of the one we're looking for.  Knowing that, we can see right away if the
1941  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1942  * don't accept the 2nd and succeeding adjacent occurrences */
1943 #define FBC_CHECK_AND_TRY                                           \
1944         if (   (   doevery                                          \
1945                 || s != previous_occurrence_end                     \
1946                 || LAST_REGTRY_SKIPPED_FORWARD(reginfo) )           \
1947             && (   reginfo->intuit                                  \
1948                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1949         {                                                           \
1950             goto got_it;                                            \
1951         }
1952 
1953 
1954 /* These differ from the above macros in that they call a function which
1955  * returns the next occurrence of the thing being looked for in 's'; and
1956  * 'strend' if there is no such occurrence.  'f' is something like fcn(a,b,c)
1957  * */
1958 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1959     while (s < strend) {                                    \
1960         s = (char *) (f);                                   \
1961         if (s >= strend) {                                  \
1962             break;                                          \
1963         }                                                   \
1964                                                             \
1965         FBC_CHECK_AND_TRY                                   \
1966         s += UTF8SKIP(s);                                   \
1967         previous_occurrence_end = s;                        \
1968     }
1969 
1970 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1971     while (s < strend) {                                    \
1972         s = (char *) (f);                                   \
1973         if (s >= strend) {                                  \
1974             break;                                          \
1975         }                                                   \
1976                                                             \
1977         FBC_CHECK_AND_TRY                                   \
1978         s++;                                                \
1979         previous_occurrence_end = s;                        \
1980     }
1981 
1982 /* This is like the above macro except the function returns NULL if there is no
1983  * occurrence, and there is a further condition that must be matched besides
1984  * the function */
1985 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
1986     while (s < strend) {                                    \
1987         s = (char *) (f);                                     \
1988         if (s == NULL) {                                    \
1989             s = (char *) strend;                            \
1990             break;                                          \
1991         }                                                   \
1992                                                             \
1993         if (COND) {                                         \
1994             FBC_CHECK_AND_TRY                               \
1995             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1996             previous_occurrence_end = s;                    \
1997         }                                                   \
1998         else {                                              \
1999             s += UTF8SKIP(s);                               \
2000         }                                                   \
2001     }
2002 
2003 /* This differs from the above macros in that it is passed a single byte that
2004  * is known to begin the next occurrence of the thing being looked for in 's'.
2005  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
2006  * at that position. */
2007 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
2008     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
2009                                               COND)
2010 
2011 /* This is like the function above, but takes an entire string to look for
2012  * instead of a single byte */
2013 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
2014     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
2015                                      ninstr(s, strend, substr, substr_end), \
2016                                      COND)
2017 
2018 /* The four macros below are slightly different versions of the same logic.
2019  *
2020  * The first is for /a and /aa when the target string is UTF-8.  This can only
2021  * match ascii, but it must advance based on UTF-8.   The other three handle
2022  * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
2023  * looking for the boundary (or non-boundary) between a word and non-word
2024  * character.  The utf8 and non-utf8 cases have the same logic, but the details
2025  * must be different.  Find the "wordness" of the character just prior to this
2026  * one, and compare it with the wordness of this one.  If they differ, we have
2027  * a boundary.  At the beginning of the string, pretend that the previous
2028  * character was a new-line.
2029  *
2030  * All these macros uncleanly have side-effects with each other and outside
2031  * variables.  So far it's been too much trouble to clean-up
2032  *
2033  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
2034  *               a word character or not.
2035  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
2036  *               word/non-word
2037  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
2038  *
2039  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
2040  * are looking for a boundary or for a non-boundary.  If we are looking for a
2041  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
2042  * see if this tentative match actually works, and if so, to quit the loop
2043  * here.  And vice-versa if we are looking for a non-boundary.
2044  *
2045  * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
2046  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
2047  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
2048  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2049  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2050  * complement.  But in that branch we complement tmp, meaning that at the
2051  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2052  * which means at the top of the loop in the next iteration, it is
2053  * TEST_NON_UTF8(s-1) */
2054 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2055     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2056     tmp = TEST_NON_UTF8(tmp);                                                  \
2057     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
2058         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2059             tmp = !tmp;                                                        \
2060             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2061         }                                                                      \
2062         else {                                                                 \
2063             IF_FAIL;                                                           \
2064         }                                                                      \
2065     );                                                                         \
2066 
2067 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2068  * TEST_UTF8 is a macro that for the same input code points returns identically
2069  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
2070  * end pointer as well) */
2071 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2072     if (s == reginfo->strbeg) {                                                \
2073         tmp = '\n';                                                            \
2074     }                                                                          \
2075     else { /* Back-up to the start of the previous character */                \
2076         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2077         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2078                                                        0, UTF8_ALLOW_DEFAULT); \
2079     }                                                                          \
2080     tmp = TEST_UV(tmp);                                                        \
2081     REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
2082         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2083             tmp = !tmp;                                                        \
2084             IF_SUCCESS;                                                        \
2085         }                                                                      \
2086         else {                                                                 \
2087             IF_FAIL;                                                           \
2088         }                                                                      \
2089     );
2090 
2091 /* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
2092  * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
2093  * set-up by the FBC_BOUND, etc macros below */
2094 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
2095     UTF8_CODE;                                                                 \
2096     /* Here, things have been set up by the previous code so that tmp is the   \
2097      * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
2098      * against the EOS, which we treat as a \n */                              \
2099     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2100         IF_SUCCESS;                                                            \
2101     }                                                                          \
2102     else {                                                                     \
2103         IF_FAIL;                                                               \
2104     }
2105 
2106 /* Same as the macro above, but the target isn't UTF-8 */
2107 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
2108     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2109     tmp = TEST_NON_UTF8(tmp);                                               \
2110     REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2111         if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2112             IF_SUCCESS;                                                     \
2113             tmp = !tmp;                                                     \
2114         }                                                                   \
2115         else {                                                              \
2116             IF_FAIL;                                                        \
2117         }                                                                   \
2118     );                                                                      \
2119     /* Here, things have been set up by the previous code so that tmp is    \
2120      * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2121      * matches against the EOS, which we treat as a \n */                   \
2122     if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2123         IF_SUCCESS;                                                         \
2124     }                                                                       \
2125     else {                                                                  \
2126         IF_FAIL;                                                            \
2127     }
2128 
2129 /* This is the macro to use when we want to see if something that looks like it
2130  * could match, actually does, and if so exits the loop.  It needs to be used
2131  * only for bounds checking macros, as it allows for matching beyond the end of
2132  * string (which should be zero length without having to look at the string
2133  * contents) */
2134 #define REXEC_FBC_TRYIT                                                     \
2135     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2136         goto got_it
2137 
2138 /* The only difference between the BOUND and NBOUND cases is that
2139  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2140  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2141  * with the other one being empty (PLACEHOLDER is defined as empty).
2142  *
2143  * The TEST_FOO parameters are for operating on different forms of input, but
2144  * all should be ones that return identically for the same underlying code
2145  * points */
2146 
2147 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2148     FBC_BOUND_COMMON_UTF8(                                                  \
2149           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2150           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2151 
2152 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2153     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2154 
2155 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2156     FBC_BOUND_COMMON_UTF8(                                                  \
2157                     FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2158                     TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2159 
2160 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2161     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2162 
2163 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2164     FBC_BOUND_COMMON_UTF8(                                                  \
2165               FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2166               TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2167 
2168 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2169     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2170 
2171 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2172     FBC_BOUND_COMMON_UTF8(                                                  \
2173             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2174             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2175 
2176 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2177     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2178 
2179 #ifdef DEBUGGING
2180 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2181 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2182   IV cp_out = _invlist_search(invlist, cp_in);
2183   assert(cp_out >= 0);
2184   return cp_out;
2185 }
2186 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2187         invmap[S_get_break_val_cp_checked(invlist, cp)]
2188 #else
2189 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2190         invmap[_invlist_search(invlist, cp)]
2191 #endif
2192 
2193 /* Takes a pointer to an inversion list, a pointer to its corresponding
2194  * inversion map, and a code point, and returns the code point's value
2195  * according to the two arrays.  It assumes that all code points have a value.
2196  * This is used as the base macro for macros for particular properties */
2197 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2198         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2199 
2200 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2201  * of a code point, returning the value for the first code point in the string.
2202  * And it takes the particular macro name that finds the desired value given a
2203  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2204 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2205              (__ASSERT_(pos < strend)                                          \
2206                  /* Note assumes is valid UTF-8 */                             \
2207              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2208 
2209 /* Returns the GCB value for the input code point */
2210 #define getGCB_VAL_CP(cp)                                                      \
2211           _generic_GET_BREAK_VAL_CP(                                           \
2212                                     PL_GCB_invlist,                            \
2213                                     _Perl_GCB_invmap,                          \
2214                                     (cp))
2215 
2216 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2217  * bounded by pos and strend */
2218 #define getGCB_VAL_UTF8(pos, strend)                                           \
2219     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2220 
2221 /* Returns the LB value for the input code point */
2222 #define getLB_VAL_CP(cp)                                                       \
2223           _generic_GET_BREAK_VAL_CP(                                           \
2224                                     PL_LB_invlist,                             \
2225                                     _Perl_LB_invmap,                           \
2226                                     (cp))
2227 
2228 /* Returns the LB value for the first code point in the UTF-8 encoded string
2229  * bounded by pos and strend */
2230 #define getLB_VAL_UTF8(pos, strend)                                            \
2231     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2232 
2233 
2234 /* Returns the SB value for the input code point */
2235 #define getSB_VAL_CP(cp)                                                       \
2236           _generic_GET_BREAK_VAL_CP(                                           \
2237                                     PL_SB_invlist,                             \
2238                                     _Perl_SB_invmap,                     \
2239                                     (cp))
2240 
2241 /* Returns the SB value for the first code point in the UTF-8 encoded string
2242  * bounded by pos and strend */
2243 #define getSB_VAL_UTF8(pos, strend)                                            \
2244     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2245 
2246 /* Returns the WB value for the input code point */
2247 #define getWB_VAL_CP(cp)                                                       \
2248           _generic_GET_BREAK_VAL_CP(                                           \
2249                                     PL_WB_invlist,                             \
2250                                     _Perl_WB_invmap,                         \
2251                                     (cp))
2252 
2253 /* Returns the WB value for the first code point in the UTF-8 encoded string
2254  * bounded by pos and strend */
2255 #define getWB_VAL_UTF8(pos, strend)                                            \
2256     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2257 
2258 /* We know what class REx starts with.  Try to find this position... */
2259 /* if reginfo->intuit, its a dryrun */
2260 /* annoyingly all the vars in this routine have different names from their counterparts
2261    in regmatch. /grrr */
2262 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2263 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2264     const char *strend, regmatch_info *reginfo)
2265 {
2266 
2267     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2268     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2269 
2270     char *pat_string;   /* The pattern's exactish string */
2271     char *pat_end;	    /* ptr to end char of pat_string */
2272     re_fold_t folder;	/* Function for computing non-utf8 folds */
2273     const U8 *fold_array;   /* array for folding ords < 256 */
2274     STRLEN ln;
2275     STRLEN lnc;
2276     U8 c1;
2277     U8 c2;
2278     char *e = NULL;
2279 
2280     /* In some cases we accept only the first occurrence of 'x' in a sequence of
2281      * them.  This variable points to just beyond the end of the previous
2282      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2283      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2284      * hop back.) */
2285     char * previous_occurrence_end = 0;
2286 
2287     I32 tmp;            /* Scratch variable */
2288     const bool utf8_target = reginfo->is_utf8_target;
2289     UV utf8_fold_flags = 0;
2290     const bool is_utf8_pat = reginfo->is_utf8_pat;
2291     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2292                                    with a result inverts that result, as 0^1 =
2293                                    1 and 1^1 = 0 */
2294     char_class_number_ classnum;
2295 
2296     RXi_GET_DECL(prog,progi);
2297 
2298     PERL_ARGS_ASSERT_FIND_BYCLASS;
2299 
2300     /* We know what class it must start with. The case statements below have
2301      * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2302      * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2303      * ('p8' and 'pb'. */
2304     switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2305         SV * anyofh_list;
2306 
2307       case ANYOFPOSIXL_t8_pb:
2308       case ANYOFPOSIXL_t8_p8:
2309       case ANYOFL_t8_pb:
2310       case ANYOFL_t8_p8:
2311         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2312         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2313 
2314         /* FALLTHROUGH */
2315 
2316       case ANYOFD_t8_pb:
2317       case ANYOFD_t8_p8:
2318       case ANYOF_t8_pb:
2319       case ANYOF_t8_p8:
2320         REXEC_FBC_UTF8_CLASS_SCAN(
2321                 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2322         break;
2323 
2324       case ANYOFPOSIXL_tb_pb:
2325       case ANYOFPOSIXL_tb_p8:
2326       case ANYOFL_tb_pb:
2327       case ANYOFL_tb_p8:
2328         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2329         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2330 
2331         /* FALLTHROUGH */
2332 
2333       case ANYOFD_tb_pb:
2334       case ANYOFD_tb_p8:
2335       case ANYOF_tb_pb:
2336       case ANYOF_tb_p8:
2337         if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2338             /* We know that s is in the bitmap range since the target isn't
2339              * UTF-8, so what happens for out-of-range values is not relevant,
2340              * so exclude that from the flags */
2341             REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2342         }
2343         else {
2344             REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2345                                                      0));
2346         }
2347         break;
2348 
2349       case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */
2350       case ANYOFM_tb_p8:
2351         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2352              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2353         break;
2354 
2355       case ANYOFM_t8_pb:
2356       case ANYOFM_t8_p8:
2357         /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2358          * we do anyway for performance reasons, as otherwise we would have to
2359          * examine all the continuation characters */
2360         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2361              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2362         break;
2363 
2364       case NANYOFM_tb_pb:
2365       case NANYOFM_tb_p8:
2366         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2367            find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2368         break;
2369 
2370       case NANYOFM_t8_pb:
2371       case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2372                                   variants. */
2373         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2374                         (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2375                                                     (U8) ARG1u(c), FLAGS(c)));
2376         break;
2377 
2378       /* These nodes all require at least one code point to be in UTF-8 to
2379        * match */
2380       case ANYOFH_tb_pb:
2381       case ANYOFH_tb_p8:
2382       case ANYOFHb_tb_pb:
2383       case ANYOFHb_tb_p8:
2384       case ANYOFHbbm_tb_pb:
2385       case ANYOFHbbm_tb_p8:
2386       case ANYOFHr_tb_pb:
2387       case ANYOFHr_tb_p8:
2388       case ANYOFHs_tb_pb:
2389       case ANYOFHs_tb_p8:
2390       case EXACTFLU8_tb_pb:
2391       case EXACTFLU8_tb_p8:
2392       case EXACTFU_REQ8_tb_pb:
2393       case EXACTFU_REQ8_tb_p8:
2394         break;
2395 
2396       case ANYOFH_t8_pb:
2397       case ANYOFH_t8_p8:
2398         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2399         REXEC_FBC_UTF8_CLASS_SCAN(
2400               (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2401                && _invlist_contains_cp(anyofh_list,
2402                                        utf8_to_uvchr_buf((U8 *) s,
2403                                                          (U8 *) strend,
2404                                                          NULL))));
2405         break;
2406 
2407       case ANYOFHb_t8_pb:
2408       case ANYOFHb_t8_p8:
2409         {
2410             /* We know what the first byte of any matched string should be. */
2411             U8 first_byte = FLAGS(c);
2412 
2413             anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2414             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2415                    _invlist_contains_cp(anyofh_list,
2416                                            utf8_to_uvchr_buf((U8 *) s,
2417                                                               (U8 *) strend,
2418                                                               NULL)));
2419         }
2420         break;
2421 
2422       case ANYOFHbbm_t8_pb:
2423       case ANYOFHbbm_t8_p8:
2424         {
2425             /* We know what the first byte of any matched string should be. */
2426             U8 first_byte = FLAGS(c);
2427 
2428             /* And a bitmap defines all the legal 2nd byte matches */
2429             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2430                                (    s < strend
2431                                 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2432                                             (U8) s[1] & UTF_CONTINUATION_MASK)));
2433         }
2434         break;
2435 
2436       case ANYOFHr_t8_pb:
2437       case ANYOFHr_t8_p8:
2438         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2439         REXEC_FBC_UTF8_CLASS_SCAN(
2440                     (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2441                                 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2442                                 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2443                    && _invlist_contains_cp(anyofh_list,
2444                                            utf8_to_uvchr_buf((U8 *) s,
2445                                                               (U8 *) strend,
2446                                                               NULL))));
2447         break;
2448 
2449       case ANYOFHs_t8_pb:
2450       case ANYOFHs_t8_p8:
2451         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2452         REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2453                         ((struct regnode_anyofhs *) c)->string,
2454                         /* Note FLAGS is the string length in this regnode */
2455                         ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2456                         _invlist_contains_cp(anyofh_list,
2457                                              utf8_to_uvchr_buf((U8 *) s,
2458                                                                (U8 *) strend,
2459                                                                NULL)));
2460         break;
2461 
2462       case ANYOFR_tb_pb:
2463       case ANYOFR_tb_p8:
2464         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2465                                             ANYOFRbase(c), ANYOFRdelta(c)));
2466         break;
2467 
2468       case ANYOFR_t8_pb:
2469       case ANYOFR_t8_p8:
2470         REXEC_FBC_UTF8_CLASS_SCAN(
2471                             (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2472                              && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2473                                                               (U8 *) strend,
2474                                                               NULL),
2475                                             ANYOFRbase(c), ANYOFRdelta(c))));
2476         break;
2477 
2478       case ANYOFRb_tb_pb:
2479       case ANYOFRb_tb_p8:
2480         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2481                                             ANYOFRbase(c), ANYOFRdelta(c)));
2482         break;
2483 
2484       case ANYOFRb_t8_pb:
2485       case ANYOFRb_t8_p8:
2486         {   /* We know what the first byte of any matched string should be */
2487             U8 first_byte = FLAGS(c);
2488 
2489             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2490                                 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2491                                                               (U8 *) strend,
2492                                                               NULL),
2493                                             ANYOFRbase(c), ANYOFRdelta(c)));
2494         }
2495         break;
2496 
2497       case EXACTFAA_tb_pb:
2498 
2499         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2500          * which these functions don't handle anyway */
2501         fold_array = PL_fold_latin1;
2502         folder = S_foldEQ_latin1_s2_folded;
2503         goto do_exactf_non_utf8;
2504 
2505       case EXACTF_tb_pb:
2506         fold_array = PL_fold;
2507         folder = Perl_foldEQ;
2508         goto do_exactf_non_utf8;
2509 
2510       case EXACTFL_tb_pb:
2511         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2512 
2513         if (IN_UTF8_CTYPE_LOCALE) {
2514             utf8_fold_flags = FOLDEQ_LOCALE;
2515             goto do_exactf_utf8;
2516         }
2517 
2518         fold_array = PL_fold_locale;
2519         folder = Perl_foldEQ_locale;
2520         goto do_exactf_non_utf8;
2521 
2522       case EXACTFU_tb_pb:
2523         /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2524          * don't have to worry here about this single special case in the
2525          * Latin1 range */
2526         fold_array = PL_fold_latin1;
2527         folder = S_foldEQ_latin1_s2_folded;
2528 
2529         /* FALLTHROUGH */
2530 
2531        do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2532                               are no glitches with fold-length differences
2533                               between the target string and pattern */
2534 
2535         /* The idea in the non-utf8 EXACTF* cases is to first find the first
2536          * character of the EXACTF* node and then, if necessary,
2537          * case-insensitively compare the full text of the node.  c1 is the
2538          * first character.  c2 is its fold.  This logic will not work for
2539          * Unicode semantics and the german sharp ss, which hence should not be
2540          * compiled into a node that gets here. */
2541         pat_string = STRINGs(c);
2542         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2543 
2544         /* We know that we have to match at least 'ln' bytes (which is the same
2545          * as characters, since not utf8).  If we have to match 3 characters,
2546          * and there are only 2 available, we know without trying that it will
2547          * fail; so don't start a match past the required minimum number from
2548          * the far end */
2549         e = HOP3c(strend, -((SSize_t)ln), s);
2550         if (e < s)
2551             break;
2552 
2553         c1 = *pat_string;
2554         c2 = fold_array[c1];
2555         if (c1 == c2) { /* If char and fold are the same */
2556             while (s <= e) {
2557                 s = (char *) memchr(s, c1, e + 1 - s);
2558                 if (s == NULL) {
2559                     break;
2560                 }
2561 
2562                 /* Check that the rest of the node matches */
2563                 if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2564                     && (reginfo->intuit || regtry(reginfo, &s)) )
2565                 {
2566                     goto got_it;
2567                 }
2568                 s++;
2569             }
2570         }
2571         else {
2572             U8 bits_differing = c1 ^ c2;
2573 
2574             /* If the folds differ in one bit position only, we can mask to
2575              * match either of them, and can use this faster find method.  Both
2576              * ASCII and EBCDIC tend to have their case folds differ in only
2577              * one position, so this is very likely */
2578             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2579                 bits_differing = ~ bits_differing;
2580                 while (s <= e) {
2581                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2582                                         (c1 & bits_differing), bits_differing);
2583                     if (s > e) {
2584                         break;
2585                     }
2586 
2587                     if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2588                         && (reginfo->intuit || regtry(reginfo, &s)) )
2589                     {
2590                         goto got_it;
2591                     }
2592                     s++;
2593                 }
2594             }
2595             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2596                        should actually happen only in EXACTFL nodes */
2597                 while (s <= e) {
2598                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2599                         && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2600                         && (reginfo->intuit || regtry(reginfo, &s)) )
2601                     {
2602                         goto got_it;
2603                     }
2604                     s++;
2605                 }
2606             }
2607         }
2608         break;
2609 
2610       case EXACTFAA_tb_p8:
2611       case EXACTFAA_t8_p8:
2612         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2613                          |FOLDEQ_S2_ALREADY_FOLDED
2614                          |FOLDEQ_S2_FOLDS_SANE;
2615         goto do_exactf_utf8;
2616 
2617       case EXACTFAA_NO_TRIE_tb_pb:
2618       case EXACTFAA_NO_TRIE_t8_pb:
2619       case EXACTFAA_t8_pb:
2620 
2621         /* Here, and elsewhere in this file, the reason we can't consider a
2622          * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2623          * is because any MICRO SIGN in the pattern won't be folded.  Since the
2624          * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2625          * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2626         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2627         goto do_exactf_utf8;
2628 
2629       case EXACTFL_tb_p8:
2630       case EXACTFL_t8_pb:
2631       case EXACTFL_t8_p8:
2632         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2633         utf8_fold_flags = FOLDEQ_LOCALE;
2634         goto do_exactf_utf8;
2635 
2636       case EXACTFLU8_t8_pb:
2637       case EXACTFLU8_t8_p8:
2638         utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2639                                          | FOLDEQ_S2_FOLDS_SANE;
2640         goto do_exactf_utf8;
2641 
2642       case EXACTFU_REQ8_t8_p8:
2643         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2644         goto do_exactf_utf8;
2645 
2646       case EXACTFU_tb_p8:
2647       case EXACTFU_t8_pb:
2648       case EXACTFU_t8_p8:
2649         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2650         goto do_exactf_utf8;
2651 
2652       /* The following are problematic even though pattern isn't UTF-8.  Use
2653        * full functionality normally not done except for UTF-8. */
2654       case EXACTF_t8_pb:
2655       case EXACTFUP_tb_pb:
2656       case EXACTFUP_t8_pb:
2657 
2658        do_exactf_utf8:
2659         {
2660             unsigned expansion;
2661 
2662             /* If one of the operands is in utf8, we can't use the simpler
2663              * folding above, due to the fact that many different characters
2664              * can have the same fold, or portion of a fold, or different-
2665              * length fold */
2666             pat_string = STRINGs(c);
2667             ln  = STR_LENs(c);	/* length to match in octets/bytes */
2668             pat_end = pat_string + ln;
2669             lnc = is_utf8_pat       /* length to match in characters */
2670                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2671                   : ln;
2672 
2673             /* We have 'lnc' characters to match in the pattern, but because of
2674              * multi-character folding, each character in the target can match
2675              * up to 3 characters (Unicode guarantees it will never exceed
2676              * this) if it is utf8-encoded; and up to 2 if not (based on the
2677              * fact that the Latin 1 folds are already determined, and the only
2678              * multi-char fold in that range is the sharp-s folding to 'ss'.
2679              * Thus, a pattern character can match as little as 1/3 of a string
2680              * character.  Adjust lnc accordingly, rounding up, so that if we
2681              * need to match at least 4+1/3 chars, that really is 5. */
2682             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2683             lnc = (lnc + expansion - 1) / expansion;
2684 
2685             /* As in the non-UTF8 case, if we have to match 3 characters, and
2686              * only 2 are left, it's guaranteed to fail, so don't start a match
2687              * that would require us to go beyond the end of the string */
2688             e = HOP3c(strend, -((SSize_t)lnc), s);
2689 
2690             /* XXX Note that we could recalculate e to stop the loop earlier,
2691              * as the worst case expansion above will rarely be met, and as we
2692              * go along we would usually find that e moves further to the left.
2693              * This would happen only after we reached the point in the loop
2694              * where if there were no expansion we should fail.  Unclear if
2695              * worth the expense */
2696 
2697             while (s <= e) {
2698                 char *my_strend= (char *)strend;
2699                 if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2700                                          pat_string, NULL, ln, is_utf8_pat,
2701                                          utf8_fold_flags)
2702                     && (reginfo->intuit || regtry(reginfo, &s)) )
2703                 {
2704                     goto got_it;
2705                 }
2706                 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2707             }
2708         }
2709         break;
2710 
2711       case BOUNDA_tb_pb:
2712       case BOUNDA_tb_p8:
2713       case BOUND_tb_pb:  /* /d without utf8 target is /a */
2714       case BOUND_tb_p8:
2715         /* regcomp.c makes sure that these only have the traditional \b
2716          * meaning. */
2717         assert(FLAGS(c) == TRADITIONAL_BOUND);
2718 
2719         FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2720         break;
2721 
2722       case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2723       case BOUNDA_t8_p8:
2724         /* regcomp.c makes sure that these only have the traditional \b
2725          * meaning. */
2726         assert(FLAGS(c) == TRADITIONAL_BOUND);
2727 
2728         FBC_BOUND_A_UTF8(isWORDCHAR_A);
2729         break;
2730 
2731       case NBOUNDA_tb_pb:
2732       case NBOUNDA_tb_p8:
2733       case NBOUND_tb_pb: /* /d without utf8 target is /a */
2734       case NBOUND_tb_p8:
2735         /* regcomp.c makes sure that these only have the traditional \b
2736          * meaning. */
2737         assert(FLAGS(c) == TRADITIONAL_BOUND);
2738 
2739         FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2740         break;
2741 
2742       case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2743       case NBOUNDA_t8_p8:
2744         /* regcomp.c makes sure that these only have the traditional \b
2745          * meaning. */
2746         assert(FLAGS(c) == TRADITIONAL_BOUND);
2747 
2748         FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2749         break;
2750 
2751       case NBOUNDU_tb_pb:
2752       case NBOUNDU_tb_p8:
2753         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2754             FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2755             break;
2756         }
2757 
2758         to_complement = 1;
2759         goto do_boundu_non_utf8;
2760 
2761       case NBOUNDL_tb_pb:
2762       case NBOUNDL_tb_p8:
2763         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2764         if (FLAGS(c) == TRADITIONAL_BOUND) {
2765             FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2766             break;
2767         }
2768 
2769         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2770 
2771         to_complement = 1;
2772         goto do_boundu_non_utf8;
2773 
2774       case BOUNDL_tb_pb:
2775       case BOUNDL_tb_p8:
2776         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2777         if (FLAGS(c) == TRADITIONAL_BOUND) {
2778             FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2779             break;
2780         }
2781 
2782         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2783 
2784         goto do_boundu_non_utf8;
2785 
2786       case BOUNDU_tb_pb:
2787       case BOUNDU_tb_p8:
2788         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2789             FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2790             break;
2791         }
2792 
2793       do_boundu_non_utf8:
2794         if (s == reginfo->strbeg) {
2795             if (reginfo->intuit || regtry(reginfo, &s))
2796             {
2797                 goto got_it;
2798             }
2799 
2800             /* Didn't match.  Try at the next position (if there is one) */
2801             s++;
2802             if (UNLIKELY(s >= reginfo->strend)) {
2803                 break;
2804             }
2805         }
2806 
2807         switch((bound_type) FLAGS(c)) {
2808           case TRADITIONAL_BOUND: /* Should have already been handled */
2809             assert(0);
2810             break;
2811 
2812           case GCB_BOUND:
2813             /* Not utf8.  Everything is a GCB except between CR and LF */
2814             while (s < strend) {
2815                 if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2816                                       || UCHARAT(s) != '\n'))
2817                     && (reginfo->intuit || regtry(reginfo, &s)))
2818                 {
2819                     goto got_it;
2820                 }
2821                 s++;
2822             }
2823 
2824             break;
2825 
2826           case LB_BOUND:
2827             {
2828                 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2829                 while (s < strend) {
2830                     LB_enum after = getLB_VAL_CP((U8) *s);
2831                     if (to_complement ^ isLB(before,
2832                                              after,
2833                                              (U8*) reginfo->strbeg,
2834                                              (U8*) s,
2835                                              (U8*) reginfo->strend,
2836                                              0 /* target not utf8 */ )
2837                         && (reginfo->intuit || regtry(reginfo, &s)))
2838                     {
2839                         goto got_it;
2840                     }
2841                     before = after;
2842                     s++;
2843                 }
2844             }
2845 
2846             break;
2847 
2848           case SB_BOUND:
2849             {
2850                 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2851                 while (s < strend) {
2852                     SB_enum after = getSB_VAL_CP((U8) *s);
2853                     if ((to_complement ^ isSB(before,
2854                                               after,
2855                                               (U8*) reginfo->strbeg,
2856                                               (U8*) s,
2857                                               (U8*) reginfo->strend,
2858                                              0 /* target not utf8 */ ))
2859                         && (reginfo->intuit || regtry(reginfo, &s)))
2860                     {
2861                         goto got_it;
2862                     }
2863                     before = after;
2864                     s++;
2865                 }
2866             }
2867 
2868             break;
2869 
2870           case WB_BOUND:
2871             {
2872                 WB_enum previous = WB_UNKNOWN;
2873                 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2874                 while (s < strend) {
2875                     WB_enum after = getWB_VAL_CP((U8) *s);
2876                     if ((to_complement ^ isWB(previous,
2877                                               before,
2878                                               after,
2879                                               (U8*) reginfo->strbeg,
2880                                               (U8*) s,
2881                                               (U8*) reginfo->strend,
2882                                                0 /* target not utf8 */ ))
2883                         && (reginfo->intuit || regtry(reginfo, &s)))
2884                     {
2885                         goto got_it;
2886                     }
2887                     previous = before;
2888                     before = after;
2889                     s++;
2890                 }
2891             }
2892         }
2893 
2894         /* Here are at the final position in the target string, which is a
2895          * boundary by definition, so matches, depending on other constraints.
2896          * */
2897         if (   reginfo->intuit
2898             || (s <= reginfo->strend && regtry(reginfo, &s)))
2899         {
2900             goto got_it;
2901         }
2902 
2903         break;
2904 
2905       case BOUNDL_t8_pb:
2906       case BOUNDL_t8_p8:
2907         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2908         if (FLAGS(c) == TRADITIONAL_BOUND) {
2909             FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2910                            isWORDCHAR_LC_utf8_safe);
2911             break;
2912         }
2913 
2914         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2915 
2916         to_complement = 1;
2917         goto do_boundu_utf8;
2918 
2919       case NBOUNDL_t8_pb:
2920       case NBOUNDL_t8_p8:
2921         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2922         if (FLAGS(c) == TRADITIONAL_BOUND) {
2923             FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2924                             isWORDCHAR_LC_utf8_safe);
2925             break;
2926         }
2927 
2928         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2929 
2930         to_complement = 1;
2931         goto do_boundu_utf8;
2932 
2933       case NBOUND_t8_pb:
2934       case NBOUND_t8_p8:
2935         /* regcomp.c makes sure that these only have the traditional \b
2936          * meaning. */
2937         assert(FLAGS(c) == TRADITIONAL_BOUND);
2938 
2939         /* FALLTHROUGH */
2940 
2941       case NBOUNDU_t8_pb:
2942       case NBOUNDU_t8_p8:
2943         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2944             FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2945                             isWORDCHAR_utf8_safe);
2946             break;
2947         }
2948 
2949         to_complement = 1;
2950         goto do_boundu_utf8;
2951 
2952       case BOUND_t8_pb:
2953       case BOUND_t8_p8:
2954         /* regcomp.c makes sure that these only have the traditional \b
2955          * meaning. */
2956         assert(FLAGS(c) == TRADITIONAL_BOUND);
2957 
2958         /* FALLTHROUGH */
2959 
2960       case BOUNDU_t8_pb:
2961       case BOUNDU_t8_p8:
2962         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2963             FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2964             break;
2965         }
2966 
2967       do_boundu_utf8:
2968         if (s == reginfo->strbeg) {
2969             if (reginfo->intuit || regtry(reginfo, &s))
2970             {
2971                 goto got_it;
2972             }
2973 
2974             /* Didn't match.  Try at the next position (if there is one) */
2975             s += UTF8_SAFE_SKIP(s, reginfo->strend);
2976             if (UNLIKELY(s >= reginfo->strend)) {
2977                 break;
2978             }
2979         }
2980 
2981         switch((bound_type) FLAGS(c)) {
2982           case TRADITIONAL_BOUND: /* Should have already been handled */
2983             assert(0);
2984             break;
2985 
2986           case GCB_BOUND:
2987             {
2988                 GCB_enum before = getGCB_VAL_UTF8(
2989                                            reghop3((U8*)s, -1,
2990                                                    (U8*)(reginfo->strbeg)),
2991                                            (U8*) reginfo->strend);
2992                 while (s < strend) {
2993                     GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2994                                                     (U8*) reginfo->strend);
2995                     if (   (to_complement ^ isGCB(before,
2996                                                   after,
2997                                                   (U8*) reginfo->strbeg,
2998                                                   (U8*) s,
2999                                                   1 /* target is utf8 */ ))
3000                         && (reginfo->intuit || regtry(reginfo, &s)))
3001                     {
3002                         goto got_it;
3003                     }
3004                     before = after;
3005                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3006                 }
3007             }
3008             break;
3009 
3010           case LB_BOUND:
3011             {
3012                 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
3013                                                         -1,
3014                                                         (U8*)(reginfo->strbeg)),
3015                                                    (U8*) reginfo->strend);
3016                 while (s < strend) {
3017                     LB_enum after = getLB_VAL_UTF8((U8*) s,
3018                                                    (U8*) reginfo->strend);
3019                     if (to_complement ^ isLB(before,
3020                                              after,
3021                                              (U8*) reginfo->strbeg,
3022                                              (U8*) s,
3023                                              (U8*) reginfo->strend,
3024                                              1 /* target is utf8 */ )
3025                         && (reginfo->intuit || regtry(reginfo, &s)))
3026                     {
3027                         goto got_it;
3028                     }
3029                     before = after;
3030                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3031                 }
3032             }
3033 
3034             break;
3035 
3036           case SB_BOUND:
3037             {
3038                 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
3039                                                     -1,
3040                                                     (U8*)(reginfo->strbeg)),
3041                                                   (U8*) reginfo->strend);
3042                 while (s < strend) {
3043                     SB_enum after = getSB_VAL_UTF8((U8*) s,
3044                                                      (U8*) reginfo->strend);
3045                     if ((to_complement ^ isSB(before,
3046                                               after,
3047                                               (U8*) reginfo->strbeg,
3048                                               (U8*) s,
3049                                               (U8*) reginfo->strend,
3050                                               1 /* target is utf8 */ ))
3051                         && (reginfo->intuit || regtry(reginfo, &s)))
3052                     {
3053                         goto got_it;
3054                     }
3055                     before = after;
3056                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3057                 }
3058             }
3059 
3060             break;
3061 
3062           case WB_BOUND:
3063             {
3064                 /* We are at a boundary between char_sub_0 and char_sub_1.
3065                  * We also keep track of the value for char_sub_-1 as we
3066                  * loop through the line.   Context may be needed to make a
3067                  * determination, and if so, this can save having to
3068                  * recalculate it */
3069                 WB_enum previous = WB_UNKNOWN;
3070                 WB_enum before = getWB_VAL_UTF8(
3071                                           reghop3((U8*)s,
3072                                                   -1,
3073                                                   (U8*)(reginfo->strbeg)),
3074                                           (U8*) reginfo->strend);
3075                 while (s < strend) {
3076                     WB_enum after = getWB_VAL_UTF8((U8*) s,
3077                                                     (U8*) reginfo->strend);
3078                     if ((to_complement ^ isWB(previous,
3079                                               before,
3080                                               after,
3081                                               (U8*) reginfo->strbeg,
3082                                               (U8*) s,
3083                                               (U8*) reginfo->strend,
3084                                               1 /* target is utf8 */ ))
3085                         && (reginfo->intuit || regtry(reginfo, &s)))
3086                     {
3087                         goto got_it;
3088                     }
3089                     previous = before;
3090                     before = after;
3091                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3092                 }
3093             }
3094         }
3095 
3096         /* Here are at the final position in the target string, which is a
3097          * boundary by definition, so matches, depending on other constraints.
3098          * */
3099 
3100         if (   reginfo->intuit
3101             || (s <= reginfo->strend && regtry(reginfo, &s)))
3102         {
3103             goto got_it;
3104         }
3105         break;
3106 
3107       case LNBREAK_t8_pb:
3108       case LNBREAK_t8_p8:
3109         REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3110         break;
3111 
3112       case LNBREAK_tb_pb:
3113       case LNBREAK_tb_p8:
3114         REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3115         break;
3116 
3117       /* The argument to all the POSIX node types is the class number to pass
3118        * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3119 
3120       case NPOSIXL_t8_pb:
3121       case NPOSIXL_t8_p8:
3122         to_complement = 1;
3123         /* FALLTHROUGH */
3124 
3125       case POSIXL_t8_pb:
3126       case POSIXL_t8_p8:
3127         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3128         REXEC_FBC_UTF8_CLASS_SCAN(
3129             to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3130                                                           (U8 *) strend)));
3131         break;
3132 
3133       case NPOSIXL_tb_pb:
3134       case NPOSIXL_tb_p8:
3135         to_complement = 1;
3136         /* FALLTHROUGH */
3137 
3138       case POSIXL_tb_pb:
3139       case POSIXL_tb_p8:
3140         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3141         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3142                                 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3143         break;
3144 
3145       case NPOSIXA_t8_pb:
3146       case NPOSIXA_t8_p8:
3147         /* The complement of something that matches only ASCII matches all
3148          * non-ASCII, plus everything in ASCII that isn't in the class. */
3149         REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3150                                   || ! generic_isCC_A_(*s, FLAGS(c)));
3151         break;
3152 
3153       case POSIXA_t8_pb:
3154       case POSIXA_t8_p8:
3155         /* Don't need to worry about utf8, as it can match only a single
3156          * byte invariant character.  But we do anyway for performance reasons,
3157          * as otherwise we would have to examine all the continuation
3158          * characters */
3159         REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3160         break;
3161 
3162       case NPOSIXD_tb_pb:
3163       case NPOSIXD_tb_p8:
3164       case NPOSIXA_tb_pb:
3165       case NPOSIXA_tb_p8:
3166         to_complement = 1;
3167         /* FALLTHROUGH */
3168 
3169       case POSIXD_tb_pb:
3170       case POSIXD_tb_p8:
3171       case POSIXA_tb_pb:
3172       case POSIXA_tb_p8:
3173         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3174                         to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3175         break;
3176 
3177       case NPOSIXU_tb_pb:
3178       case NPOSIXU_tb_p8:
3179         to_complement = 1;
3180         /* FALLTHROUGH */
3181 
3182       case POSIXU_tb_pb:
3183       case POSIXU_tb_p8:
3184             REXEC_FBC_NON_UTF8_CLASS_SCAN(
3185                                  to_complement ^ cBOOL(generic_isCC_(*s,
3186                                                                     FLAGS(c))));
3187         break;
3188 
3189       case NPOSIXD_t8_pb:
3190       case NPOSIXD_t8_p8:
3191       case NPOSIXU_t8_pb:
3192       case NPOSIXU_t8_p8:
3193         to_complement = 1;
3194         /* FALLTHROUGH */
3195 
3196       case POSIXD_t8_pb:
3197       case POSIXD_t8_p8:
3198       case POSIXU_t8_pb:
3199       case POSIXU_t8_p8:
3200         classnum = (char_class_number_) FLAGS(c);
3201         switch (classnum) {
3202           default:
3203             REXEC_FBC_UTF8_CLASS_SCAN(
3204                         to_complement ^ cBOOL(_invlist_contains_cp(
3205                                                 PL_XPosix_ptrs[classnum],
3206                                                 utf8_to_uvchr_buf((U8 *) s,
3207                                                                 (U8 *) strend,
3208                                                                 NULL))));
3209             break;
3210 
3211           case CC_ENUM_SPACE_:
3212             REXEC_FBC_UTF8_CLASS_SCAN(
3213                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3214             break;
3215 
3216           case CC_ENUM_BLANK_:
3217             REXEC_FBC_UTF8_CLASS_SCAN(
3218                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3219             break;
3220 
3221           case CC_ENUM_XDIGIT_:
3222             REXEC_FBC_UTF8_CLASS_SCAN(
3223                         to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3224             break;
3225 
3226           case CC_ENUM_VERTSPACE_:
3227             REXEC_FBC_UTF8_CLASS_SCAN(
3228                         to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3229             break;
3230 
3231           case CC_ENUM_CNTRL_:
3232             REXEC_FBC_UTF8_CLASS_SCAN(
3233                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3234             break;
3235         }
3236         break;
3237 
3238       case AHOCORASICKC_tb_pb:
3239       case AHOCORASICKC_tb_p8:
3240       case AHOCORASICKC_t8_pb:
3241       case AHOCORASICKC_t8_p8:
3242       case AHOCORASICK_tb_pb:
3243       case AHOCORASICK_tb_p8:
3244       case AHOCORASICK_t8_pb:
3245       case AHOCORASICK_t8_p8:
3246         {
3247             DECL_TRIE_TYPE(c);
3248             /* what trie are we using right now */
3249             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ];
3250             reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3251             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3252 
3253             const char *last_start = strend - trie->minlen;
3254 #ifdef DEBUGGING
3255             const char *real_start = s;
3256 #endif
3257             STRLEN maxlen = trie->maxlen;
3258             SV *sv_points;
3259             U8 **points; /* map of where we were in the input string
3260                             when reading a given char. For ASCII this
3261                             is unnecessary overhead as the relationship
3262                             is always 1:1, but for Unicode, especially
3263                             case folded Unicode this is not true. */
3264             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3265             U8 *bitmap=NULL;
3266 
3267 
3268             DECLARE_AND_GET_RE_DEBUG_FLAGS;
3269 
3270             /* We can't just allocate points here. We need to wrap it in
3271              * an SV so it gets freed properly if there is a croak while
3272              * running the match */
3273             ENTER;
3274             SAVETMPS;
3275             sv_points=newSV(maxlen * sizeof(U8 *));
3276             SvCUR_set(sv_points,
3277                 maxlen * sizeof(U8 *));
3278             SvPOK_on(sv_points);
3279             sv_2mortal(sv_points);
3280             points=(U8**)SvPV_nolen(sv_points );
3281             if ( trie_type != trie_utf8_fold
3282                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
3283             {
3284                 if (trie->bitmap)
3285                     bitmap=(U8*)trie->bitmap;
3286                 else
3287                     bitmap=(U8*)ANYOF_BITMAP(c);
3288             }
3289             /* this is the Aho-Corasick algorithm modified a touch
3290                to include special handling for long "unknown char" sequences.
3291                The basic idea being that we use AC as long as we are dealing
3292                with a possible matching char, when we encounter an unknown char
3293                (and we have not encountered an accepting state) we scan forward
3294                until we find a legal starting char.
3295                AC matching is basically that of trie matching, except that when
3296                we encounter a failing transition, we fall back to the current
3297                states "fail state", and try the current char again, a process
3298                we repeat until we reach the root state, state 1, or a legal
3299                transition. If we fail on the root state then we can either
3300                terminate if we have reached an accepting state previously, or
3301                restart the entire process from the beginning if we have not.
3302 
3303              */
3304             while (s <= last_start) {
3305                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3306                 U8 *uc = (U8*)s;
3307                 U16 charid = 0;
3308                 U32 base = 1;
3309                 U32 state = 1;
3310                 UV uvc = 0;
3311                 STRLEN len = 0;
3312                 STRLEN foldlen = 0;
3313                 U8 *uscan = (U8*)NULL;
3314                 U8 *leftmost = NULL;
3315 #ifdef DEBUGGING
3316                 U32 accepted_word= 0;
3317 #endif
3318                 U32 pointpos = 0;
3319 
3320                 while ( state && uc <= (U8*)strend ) {
3321                     int failed=0;
3322                     U32 word = aho->states[ state ].wordnum;
3323 
3324                     if( state==1 ) {
3325                         if ( bitmap ) {
3326                             DEBUG_TRIE_EXECUTE_r(
3327                                 if (  uc <= (U8*)last_start
3328                                     && !BITMAP_TEST(bitmap,*uc) )
3329                                 {
3330                                     dump_exec_pos( (char *)uc, c, strend,
3331                                         real_start,
3332                                         (char *)uc, utf8_target, 0 );
3333                                     Perl_re_printf( aTHX_
3334                                         " Scanning for legal start char...\n");
3335                                 }
3336                             );
3337                             if (utf8_target) {
3338                                 while (  uc <= (U8*)last_start
3339                                        && !BITMAP_TEST(bitmap,*uc) )
3340                                 {
3341                                     uc += UTF8SKIP(uc);
3342                                 }
3343                             } else {
3344                                 while (  uc <= (U8*)last_start
3345                                        && ! BITMAP_TEST(bitmap,*uc) )
3346                                 {
3347                                     uc++;
3348                                 }
3349                             }
3350                             s= (char *)uc;
3351                         }
3352                         if (uc >(U8*)last_start) break;
3353                     }
3354 
3355                     if ( word ) {
3356                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3357                                                                     % maxlen ];
3358                         if (!leftmost || lpos < leftmost) {
3359                             DEBUG_r(accepted_word=word);
3360                             leftmost= lpos;
3361                         }
3362                         if (base==0) break;
3363 
3364                     }
3365                     points[pointpos++ % maxlen]= uc;
3366                     if (foldlen || uc < (U8*)strend) {
3367                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3368                                              (U8 *) strend, uscan, len, uvc,
3369                                              charid, foldlen, foldbuf,
3370                                              uniflags);
3371                         DEBUG_TRIE_EXECUTE_r({
3372                             dump_exec_pos( (char *)uc, c, strend,
3373                                         real_start, s, utf8_target, 0);
3374                             Perl_re_printf( aTHX_
3375                                 " Charid:%3u CP:%4" UVxf " ",
3376                                  charid, uvc);
3377                         });
3378                     }
3379                     else {
3380                         len = 0;
3381                         charid = 0;
3382                     }
3383 
3384 
3385                     do {
3386 #ifdef DEBUGGING
3387                         word = aho->states[ state ].wordnum;
3388 #endif
3389                         base = aho->states[ state ].trans.base;
3390 
3391                         DEBUG_TRIE_EXECUTE_r({
3392                             if (failed)
3393                                 dump_exec_pos((char *)uc, c, strend, real_start,
3394                                     s,   utf8_target, 0 );
3395                             Perl_re_printf( aTHX_
3396                                 "%sState: %4" UVxf ", word=%" UVxf,
3397                                 failed ? " Fail transition to " : "",
3398                                 (UV)state, (UV)word);
3399                         });
3400                         if ( base ) {
3401                             U32 tmp;
3402                             I32 offset;
3403                             if (charid &&
3404                                  ( ((offset = base + charid
3405                                     - 1 - trie->uniquecharcount)) >= 0)
3406                                  && ((U32)offset < trie->lasttrans)
3407                                  && trie->trans[offset].check == state
3408                                  && (tmp=trie->trans[offset].next))
3409                             {
3410                                 DEBUG_TRIE_EXECUTE_r(
3411                                     Perl_re_printf( aTHX_ " - legal\n"));
3412                                 state = tmp;
3413                                 break;
3414                             }
3415                             else {
3416                                 DEBUG_TRIE_EXECUTE_r(
3417                                     Perl_re_printf( aTHX_ " - fail\n"));
3418                                 failed = 1;
3419                                 state = aho->fail[state];
3420                             }
3421                         }
3422                         else {
3423                             /* we must be accepting here */
3424                             DEBUG_TRIE_EXECUTE_r(
3425                                     Perl_re_printf( aTHX_ " - accepting\n"));
3426                             failed = 1;
3427                             break;
3428                         }
3429                     } while(state);
3430                     uc += len;
3431                     if (failed) {
3432                         if (leftmost)
3433                             break;
3434                         if (!state) state = 1;
3435                     }
3436                 }
3437                 if ( aho->states[ state ].wordnum ) {
3438                     U8 *lpos = points[ (pointpos
3439                                       - trie->wordinfo[aho->states[ state ]
3440                                                     .wordnum].len) % maxlen ];
3441                     if (!leftmost || lpos < leftmost) {
3442                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3443                         leftmost = lpos;
3444                     }
3445                 }
3446                 if (leftmost) {
3447                     s = (char*)leftmost;
3448                     DEBUG_TRIE_EXECUTE_r({
3449                         Perl_re_printf( aTHX_  "Matches word #%" UVxf
3450                                         " at position %" IVdf ". Trying full"
3451                                         " pattern...\n",
3452                             (UV)accepted_word, (IV)(s - real_start)
3453                         );
3454                     });
3455                     if (reginfo->intuit || regtry(reginfo, &s)) {
3456                         FREETMPS;
3457                         LEAVE;
3458                         goto got_it;
3459                     }
3460                     if (s < reginfo->strend) {
3461                         s = HOPc(s,1);
3462                     }
3463                     DEBUG_TRIE_EXECUTE_r({
3464                         Perl_re_printf( aTHX_
3465                                        "Pattern failed. Looking for new start"
3466                                        " point...\n");
3467                     });
3468                 } else {
3469                     DEBUG_TRIE_EXECUTE_r(
3470                         Perl_re_printf( aTHX_ "No match.\n"));
3471                     break;
3472                 }
3473             }
3474             FREETMPS;
3475             LEAVE;
3476         }
3477         break;
3478 
3479       case EXACTFU_REQ8_t8_pb:
3480       case EXACTFUP_tb_p8:
3481       case EXACTFUP_t8_p8:
3482       case EXACTF_tb_p8:
3483       case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3484       case EXACTFAA_NO_TRIE_tb_p8:
3485       case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3486                                       patterns */
3487         assert(0);
3488 
3489       default:
3490         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3491     } /* End of switch on node type */
3492 
3493     return 0;
3494 
3495   got_it:
3496     return s;
3497 }
3498 
3499 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3500  * flags have same meanings as with regexec_flags() */
3501 
3502 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3503 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3504                             char *strbeg,
3505                             char *strend,
3506                             SV *sv,
3507                             U32 flags,
3508                             bool utf8_target)
3509 {
3510     struct regexp *const prog = ReANY(rx);
3511 
3512     if (flags & REXEC_COPY_STR) {
3513 #ifdef PERL_ANY_COW
3514         if (SvCANCOW(sv)) {
3515             DEBUG_C(Perl_re_printf( aTHX_
3516                               "Copy on write: regexp capture, type %d\n",
3517                                     (int) SvTYPE(sv)));
3518             /* Create a new COW SV to share the match string and store
3519              * in saved_copy, unless the current COW SV in saved_copy
3520              * is valid and suitable for our purpose */
3521             if ((   RXp_SAVED_COPY(prog)
3522                  && SvIsCOW(RXp_SAVED_COPY(prog))
3523                  && SvPOKp(RXp_SAVED_COPY(prog))
3524                  && SvIsCOW(sv)
3525                  && SvPOKp(sv)
3526                  && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog))))
3527             {
3528                 /* just reuse saved_copy SV */
3529                 if (RXp_MATCH_COPIED(prog)) {
3530                     Safefree(RXp_SUBBEG(prog));
3531                     RXp_MATCH_COPIED_off(prog);
3532                 }
3533             }
3534             else {
3535                 /* create new COW SV to share string */
3536                 RXp_MATCH_COPY_FREE(prog);
3537                 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv);
3538             }
3539             RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog));
3540             assert (SvPOKp(RXp_SAVED_COPY(prog)));
3541             RXp_SUBLEN(prog)  = strend - strbeg;
3542             RXp_SUBOFFSET(prog) = 0;
3543             RXp_SUBCOFFSET(prog) = 0;
3544         } else
3545 #endif
3546         {
3547             SSize_t min = 0;
3548             SSize_t max = strend - strbeg;
3549             SSize_t sublen;
3550 
3551             /* NOTE: the following if block is not used or tested
3552              * in standard builds. It is only used when PERL_SAWAMPERSAND is
3553              * defined */
3554 
3555             if (    (flags & REXEC_COPY_SKIP_POST)
3556                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3557                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3558             ) { /* don't copy $' part of string */
3559                 SSize_t offs_end;
3560                 U32 n = 0;
3561                 max = -1;
3562                 /* calculate the right-most part of the string covered
3563                  * by a capture. Due to lookahead, this may be to
3564                  * the right of $&, so we have to scan all captures */
3565                 while (n <= RXp_LASTPAREN(prog)) {
3566                     if ((offs_end = RXp_OFFS_END(prog,n)) > max)
3567                         max = offs_end;
3568                     n++;
3569                 }
3570                 if (max == -1)
3571                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3572                             ? RXp_OFFS_START(prog,0)
3573                             : 0;
3574                 assert(max >= 0 && max <= strend - strbeg);
3575             }
3576 
3577             /* NOTE: the following if block is not used or tested
3578              * in standard builds. It is only used when PERL_SAWAMPERSAND is
3579              * defined */
3580             if (    (flags & REXEC_COPY_SKIP_PRE)
3581                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3582                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3583             ) { /* don't copy $` part of string */
3584                 U32 n = 0;
3585                 min = max;
3586                 /* calculate the left-most part of the string covered
3587                  * by a capture. Due to lookbehind, this may be to
3588                  * the left of $&, so we have to scan all captures */
3589                 while (min && n <= RXp_LASTPAREN(prog)) {
3590                     SSize_t start = RXp_OFFS_START(prog,n);
3591                     if (   start != -1
3592                         && start < min)
3593                     {
3594                         min = start;
3595                     }
3596                     n++;
3597                 }
3598                 if (PL_sawampersand & SAWAMPERSAND_RIGHT) {
3599                     SSize_t end = RXp_OFFS_END(prog,0);
3600                     if ( min > end )
3601                         min = end;
3602                 }
3603             }
3604 
3605             assert(min >= 0 && min <= max && min <= strend - strbeg);
3606             sublen = max - min;
3607 
3608             if (RXp_MATCH_COPIED(prog)) {
3609                 if (sublen > RXp_SUBLEN(prog))
3610                     RXp_SUBBEG(prog) =
3611                             (char*)saferealloc(RXp_SUBBEG(prog), sublen+1);
3612             }
3613             else
3614                 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1);
3615             Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char);
3616             RXp_SUBBEG(prog)[sublen] = '\0';
3617             RXp_SUBOFFSET(prog) = min;
3618             RXp_SUBLEN(prog) = sublen;
3619             RXp_MATCH_COPIED_on(prog);
3620         }
3621         RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog);
3622         if (RXp_SUBOFFSET(prog) && utf8_target) {
3623             /* Convert byte offset to chars.
3624              * XXX ideally should only compute this if @-/@+
3625              * has been seen, a la PL_sawampersand ??? */
3626 
3627             /* If there's a direct correspondence between the
3628              * string which we're matching and the original SV,
3629              * then we can use the utf8 len cache associated with
3630              * the SV. In particular, it means that under //g,
3631              * sv_pos_b2u() will use the previously cached
3632              * position to speed up working out the new length of
3633              * subcoffset, rather than counting from the start of
3634              * the string each time. This stops
3635              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3636              * from going quadratic */
3637             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3638                 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog),
3639                                                 SV_GMAGIC|SV_CONST_RETURN);
3640             else
3641                 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg,
3642                                     (U8*)(strbeg+RXp_SUBOFFSET(prog)));
3643         }
3644     }
3645     else {
3646         RXp_MATCH_COPY_FREE(prog);
3647         RXp_SUBBEG(prog) = strbeg;
3648         RXp_SUBOFFSET(prog) = 0;
3649         RXp_SUBCOFFSET(prog) = 0;
3650         RXp_SUBLEN(prog) = strend - strbeg;
3651     }
3652 }
3653 
3654 
3655 
3656 
3657 /*
3658  - regexec_flags - match a regexp against a string
3659  */
3660 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3661 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3662               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3663 /* stringarg: the point in the string at which to begin matching */
3664 /* strend:    pointer to null at end of string */
3665 /* strbeg:    real beginning of string */
3666 /* minend:    end of match must be >= minend bytes after stringarg. */
3667 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3668  *            itself is accessed via the pointers above */
3669 /* data:      May be used for some additional optimizations.
3670               Currently unused. */
3671 /* flags:     For optimizations. See REXEC_* in regexp.h */
3672 
3673 {
3674     struct regexp *const prog = ReANY(rx);
3675     char *s;
3676     regnode *c;
3677     char *startpos;
3678     SSize_t minlen;		/* must match at least this many chars */
3679     SSize_t dontbother = 0;	/* how many characters not to try at end */
3680     const bool utf8_target = cBOOL(DO_UTF8(sv));
3681     I32 multiline;
3682     RXi_GET_DECL(prog,progi);
3683     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3684     regmatch_info *const reginfo = &reginfo_buf;
3685     regexp_paren_pair *swap = NULL;
3686     I32 oldsave;
3687     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3688 
3689     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3690     PERL_UNUSED_ARG(data);
3691 
3692     /* Be paranoid... */
3693     if (prog == NULL) {
3694         Perl_croak(aTHX_ "NULL regexp parameter");
3695     }
3696 
3697     DEBUG_EXECUTE_r(
3698         debug_start_match(rx, utf8_target, stringarg, strend,
3699         "Matching");
3700     );
3701 
3702     startpos = stringarg;
3703 
3704     /* set these early as they may be used by the HOP macros below */
3705     reginfo->strbeg = strbeg;
3706     reginfo->strend = strend;
3707     reginfo->is_utf8_target = cBOOL(utf8_target);
3708 
3709     if (prog->intflags & PREGf_GPOS_SEEN) {
3710         MAGIC *mg;
3711 
3712         /* set reginfo->ganch, the position where \G can match */
3713 
3714         reginfo->ganch =
3715             (flags & REXEC_IGNOREPOS)
3716             ? stringarg /* use start pos rather than pos() */
3717             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3718               /* Defined pos(): */
3719             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3720             : strbeg; /* pos() not defined; use start of string */
3721 
3722         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3723             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3724 
3725         /* in the presence of \G, we may need to start looking earlier in
3726          * the string than the suggested start point of stringarg:
3727          * if prog->gofs is set, then that's a known, fixed minimum
3728          * offset, such as
3729          * /..\G/:   gofs = 2
3730          * /ab|c\G/: gofs = 1
3731          * or if the minimum offset isn't known, then we have to go back
3732          * to the start of the string, e.g. /w+\G/
3733          */
3734 
3735         if (prog->intflags & PREGf_ANCH_GPOS) {
3736             if (prog->gofs) {
3737                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3738                 if (!startpos ||
3739                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3740                 {
3741                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3742                             "fail: ganch-gofs before earliest possible start\n"));
3743                     return 0;
3744                 }
3745             }
3746             else
3747                 startpos = reginfo->ganch;
3748         }
3749         else if (prog->gofs) {
3750             startpos = HOPBACKc(startpos, prog->gofs);
3751             if (!startpos)
3752                 startpos = strbeg;
3753         }
3754         else if (prog->intflags & PREGf_GPOS_FLOAT)
3755             startpos = strbeg;
3756     }
3757 
3758     minlen = prog->minlen;
3759     if ((startpos + minlen) > strend || startpos < strbeg) {
3760         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3761                         "Regex match can't succeed, so not even tried\n"));
3762         return 0;
3763     }
3764 
3765     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3766      * which will call destuctors to reset PL_regmatch_state, free higher
3767      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3768      * regmatch_info_aux_eval */
3769 
3770     oldsave = PL_savestack_ix;
3771 
3772     s = startpos;
3773 
3774     if ((prog->extflags & RXf_USE_INTUIT)
3775         && !(flags & REXEC_CHECKED))
3776     {
3777         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3778                                     flags, NULL);
3779         if (!s)
3780             return 0;
3781 
3782         if (prog->extflags & RXf_CHECK_ALL) {
3783             /* we can match based purely on the result of INTUIT.
3784              * Set up captures etc just for $& and $-[0]
3785              * (an intuit-only match wont have $1,$2,..) */
3786             assert(!prog->nparens);
3787 
3788             /* s/// doesn't like it if $& is earlier than where we asked it to
3789              * start searching (which can happen on something like /.\G/) */
3790             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3791                     && (s < stringarg))
3792             {
3793                 /* this should only be possible under \G */
3794                 assert(prog->intflags & PREGf_GPOS_SEEN);
3795                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3796                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3797                 goto phooey;
3798             }
3799 
3800             /* match via INTUIT shouldn't have any captures.
3801              * Let @-, @+, $^N know */
3802             RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0;
3803             RXp_MATCH_UTF8_set(prog, utf8_target);
3804             SSize_t match_start = s - strbeg;
3805             SSize_t match_end = utf8_target
3806                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3807                 : s - strbeg + prog->minlenret;
3808             CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end);
3809             if ( !(flags & REXEC_NOT_FIRST) )
3810                 S_reg_set_capture_string(aTHX_ rx,
3811                                         strbeg, strend,
3812                                         sv, flags, utf8_target);
3813 
3814             return 1;
3815         }
3816     }
3817 
3818     multiline = prog->extflags & RXf_PMf_MULTILINE;
3819 
3820     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3821         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3822                               "String too short [regexec_flags]...\n"));
3823         goto phooey;
3824     }
3825 
3826     /* Check validity of program. */
3827     if (UCHARAT(progi->program) != REG_MAGIC) {
3828         Perl_croak(aTHX_ "corrupted regexp program");
3829     }
3830 
3831     RXp_MATCH_TAINTED_off(prog);
3832     RXp_MATCH_UTF8_set(prog, utf8_target);
3833 
3834     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3835     reginfo->intuit = 0;
3836     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3837     reginfo->warned = FALSE;
3838     reginfo->sv = sv;
3839     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3840     /* see how far we have to get to not match where we matched before */
3841     reginfo->till = stringarg + minend;
3842 
3843     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3844         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3845            S_cleanup_regmatch_info_aux has executed (registered by
3846            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3847            magic belonging to this SV.
3848            Not newSVsv, either, as it does not COW.
3849         */
3850         reginfo->sv = newSV_type(SVt_NULL);
3851         SvSetSV_nosteal(reginfo->sv, sv);
3852         SAVEFREESV(reginfo->sv);
3853     }
3854 
3855     /* reserve next 2 or 3 slots in PL_regmatch_state:
3856      * slot N+0: may currently be in use: skip it
3857      * slot N+1: use for regmatch_info_aux struct
3858      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3859      * slot N+3: ready for use by regmatch()
3860      */
3861 
3862     {
3863         regmatch_state *old_regmatch_state;
3864         regmatch_slab  *old_regmatch_slab;
3865         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3866 
3867         /* on first ever match, allocate first slab */
3868         if (!PL_regmatch_slab) {
3869             Newx(PL_regmatch_slab, 1, regmatch_slab);
3870             PL_regmatch_slab->prev = NULL;
3871             PL_regmatch_slab->next = NULL;
3872             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3873         }
3874 
3875         old_regmatch_state = PL_regmatch_state;
3876         old_regmatch_slab  = PL_regmatch_slab;
3877 
3878         for (i=0; i <= max; i++) {
3879             if (i == 1)
3880                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3881             else if (i ==2)
3882                 reginfo->info_aux_eval =
3883                 reginfo->info_aux->info_aux_eval =
3884                             &(PL_regmatch_state->u.info_aux_eval);
3885 
3886             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3887                 PL_regmatch_state = S_push_slab(aTHX);
3888         }
3889 
3890         /* note initial PL_regmatch_state position; at end of match we'll
3891          * pop back to there and free any higher slabs */
3892 
3893         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3894         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3895         reginfo->info_aux->poscache = NULL;
3896 
3897         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3898 
3899         if ((prog->extflags & RXf_EVAL_SEEN))
3900             S_setup_eval_state(aTHX_ reginfo);
3901         else
3902             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3903     }
3904 
3905     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3906         /* We have to be careful. If the previous successful match
3907            was from this regex we don't want a subsequent partially
3908            successful match to clobber the old results.
3909            So when we detect this possibility we add a swap buffer
3910            to the re, and switch the buffer each match. If we fail,
3911            we switch it back; otherwise we leave it swapped.
3912         */
3913         swap = RXp_OFFSp(prog);
3914         /* avoid leak if we die, or clean up anyway if match completes */
3915         SAVEFREEPV(swap);
3916         Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair);
3917         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3918             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3919             0,
3920             PTR2UV(prog),
3921             PTR2UV(swap),
3922             PTR2UV(RXp_OFFSp(prog))
3923         ));
3924     }
3925 
3926     if (prog->recurse_locinput)
3927         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3928 
3929     /* Simplest case: anchored match (but not \G) need be tried only once,
3930      * or with MBOL, only at the beginning of each line.
3931      *
3932      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3933      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3934      * match at the start of the string then it won't match anywhere else
3935      * either; while with /.*.../, if it doesn't match at the beginning,
3936      * the earliest it could match is at the start of the next line */
3937 
3938     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3939         char *end;
3940 
3941         if (regtry(reginfo, &s))
3942             goto got_it;
3943 
3944         if (!(prog->intflags & PREGf_ANCH_MBOL))
3945             goto phooey;
3946 
3947         /* didn't match at start, try at other newline positions */
3948 
3949         if (minlen)
3950             dontbother = minlen - 1;
3951         end = HOP3c(strend, -dontbother, strbeg) - 1;
3952 
3953         /* skip to next newline */
3954 
3955         while (s <= end) { /* note it could be possible to match at the end of the string */
3956             /* NB: newlines are the same in unicode as they are in latin */
3957             if (*s++ != '\n')
3958                 continue;
3959             if (prog->check_substr || prog->check_utf8) {
3960             /* note that with PREGf_IMPLICIT, intuit can only fail
3961              * or return the start position, so it's of limited utility.
3962              * Nevertheless, I made the decision that the potential for
3963              * quick fail was still worth it - DAPM */
3964                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3965                 if (!s)
3966                     goto phooey;
3967             }
3968             if (regtry(reginfo, &s))
3969                 goto got_it;
3970         }
3971         goto phooey;
3972     } /* end anchored search */
3973 
3974     /* anchored \G match */
3975     if (prog->intflags & PREGf_ANCH_GPOS)
3976     {
3977         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3978         assert(prog->intflags & PREGf_GPOS_SEEN);
3979         /* For anchored \G, the only position it can match from is
3980          * (ganch-gofs); we already set startpos to this above; if intuit
3981          * moved us on from there, we can't possibly succeed */
3982         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3983         if (s == startpos && regtry(reginfo, &s))
3984             goto got_it;
3985         goto phooey;
3986     }
3987 
3988     /* Messy cases:  unanchored match. */
3989 
3990     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3991         /* we have /x+whatever/ */
3992         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3993         char ch;
3994 #ifdef DEBUGGING
3995         int did_match = 0;
3996 #endif
3997         if (utf8_target) {
3998             if (! prog->anchored_utf8) {
3999                 to_utf8_substr(prog);
4000             }
4001             ch = SvPVX_const(prog->anchored_utf8)[0];
4002             REXEC_FBC_UTF8_SCAN(
4003                 if (*s == ch) {
4004                     DEBUG_EXECUTE_r( did_match = 1 );
4005                     if (regtry(reginfo, &s)) goto got_it;
4006                     s += UTF8_SAFE_SKIP(s, strend);
4007                     while (s < strend && *s == ch)
4008                         s += UTF8SKIP(s);
4009                 }
4010             );
4011 
4012         }
4013         else {
4014             if (! prog->anchored_substr) {
4015                 if (! to_byte_substr(prog)) {
4016                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4017                 }
4018             }
4019             ch = SvPVX_const(prog->anchored_substr)[0];
4020             REXEC_FBC_NON_UTF8_SCAN(
4021                 if (*s == ch) {
4022                     DEBUG_EXECUTE_r( did_match = 1 );
4023                     if (regtry(reginfo, &s)) goto got_it;
4024                     s++;
4025                     while (s < strend && *s == ch)
4026                         s++;
4027                 }
4028             );
4029         }
4030         DEBUG_EXECUTE_r(if (!did_match)
4031                 Perl_re_printf( aTHX_
4032                                   "Did not find anchored character...\n")
4033                );
4034     }
4035     else if (prog->anchored_substr != NULL
4036               || prog->anchored_utf8 != NULL
4037               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
4038                   && prog->float_max_offset < strend - s)) {
4039         SV *must;
4040         SSize_t back_max;
4041         SSize_t back_min;
4042         char *last;
4043         char *last1;		/* Last position checked before */
4044 #ifdef DEBUGGING
4045         int did_match = 0;
4046 #endif
4047         if (prog->anchored_substr || prog->anchored_utf8) {
4048             if (utf8_target) {
4049                 if (! prog->anchored_utf8) {
4050                     to_utf8_substr(prog);
4051                 }
4052                 must = prog->anchored_utf8;
4053             }
4054             else {
4055                 if (! prog->anchored_substr) {
4056                     if (! to_byte_substr(prog)) {
4057                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4058                     }
4059                 }
4060                 must = prog->anchored_substr;
4061             }
4062             back_max = back_min = prog->anchored_offset;
4063         } else {
4064             if (utf8_target) {
4065                 if (! prog->float_utf8) {
4066                     to_utf8_substr(prog);
4067                 }
4068                 must = prog->float_utf8;
4069             }
4070             else {
4071                 if (! prog->float_substr) {
4072                     if (! to_byte_substr(prog)) {
4073                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4074                     }
4075                 }
4076                 must = prog->float_substr;
4077             }
4078             back_max = prog->float_max_offset;
4079             back_min = prog->float_min_offset;
4080         }
4081 
4082         if (back_min<0) {
4083             last = strend;
4084         } else {
4085             last = HOP3c(strend,	/* Cannot start after this */
4086                   -(SSize_t)(CHR_SVLEN(must)
4087                          - (SvTAIL(must) != 0) + back_min), strbeg);
4088         }
4089         if (s > reginfo->strbeg)
4090             last1 = HOPc(s, -1);
4091         else
4092             last1 = s - 1;	/* bogus */
4093 
4094         /* XXXX check_substr already used to find "s", can optimize if
4095            check_substr==must. */
4096         dontbother = 0;
4097         strend = HOPc(strend, -dontbother);
4098         while ( (s <= last) &&
4099                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
4100                                   (unsigned char*)strend, must,
4101                                   multiline ? FBMrf_MULTILINE : 0)) ) {
4102             DEBUG_EXECUTE_r( did_match = 1 );
4103             if (HOPc(s, -back_max) > last1) {
4104                 last1 = HOPc(s, -back_min);
4105                 s = HOPc(s, -back_max);
4106             }
4107             else {
4108                 char * const t = (last1 >= reginfo->strbeg)
4109                                     ? HOPc(last1, 1) : last1 + 1;
4110 
4111                 last1 = HOPc(s, -back_min);
4112                 s = t;
4113             }
4114             if (utf8_target) {
4115                 while (s <= last1) {
4116                     if (regtry(reginfo, &s))
4117                         goto got_it;
4118                     if (s >= last1) {
4119                         s++; /* to break out of outer loop */
4120                         break;
4121                     }
4122                     s += UTF8SKIP(s);
4123                 }
4124             }
4125             else {
4126                 while (s <= last1) {
4127                     if (regtry(reginfo, &s))
4128                         goto got_it;
4129                     s++;
4130                 }
4131             }
4132         }
4133         DEBUG_EXECUTE_r(if (!did_match) {
4134             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4135                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4136             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
4137                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
4138                                ? "anchored" : "floating"),
4139                 quoted, RE_SV_TAIL(must));
4140         });
4141         goto phooey;
4142     }
4143     else if ( (c = progi->regstclass) ) {
4144         if (minlen) {
4145             const OPCODE op = OP(progi->regstclass);
4146             /* don't bother with what can't match */
4147             if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4148                 strend = HOPc(strend, -(minlen - 1));
4149         }
4150         DEBUG_EXECUTE_r({
4151             SV * const prop = sv_newmortal();
4152             regprop(prog, prop, c, reginfo, NULL);
4153             {
4154                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4155                     s,strend-s,PL_dump_re_max_len);
4156                 Perl_re_printf( aTHX_
4157                     "Matching stclass %.*s against %s (%d bytes)\n",
4158                     (int)SvCUR(prop), SvPVX_const(prop),
4159                      quoted, (int)(strend - s));
4160             }
4161         });
4162         if (find_byclass(prog, c, s, strend, reginfo))
4163             goto got_it;
4164         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4165     }
4166     else {
4167         dontbother = 0;
4168         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4169             /* Trim the end. */
4170             char *last= NULL;
4171             SV* float_real;
4172             STRLEN len;
4173             const char *little;
4174 
4175             if (utf8_target) {
4176                 if (! prog->float_utf8) {
4177                     to_utf8_substr(prog);
4178                 }
4179                 float_real = prog->float_utf8;
4180             }
4181             else {
4182                 if (! prog->float_substr) {
4183                     if (! to_byte_substr(prog)) {
4184                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4185                     }
4186                 }
4187                 float_real = prog->float_substr;
4188             }
4189 
4190             little = SvPV_const(float_real, len);
4191             if (SvTAIL(float_real)) {
4192                     /* This means that float_real contains an artificial \n on
4193                      * the end due to the presence of something like this:
4194                      * /foo$/ where we can match both "foo" and "foo\n" at the
4195                      * end of the string.  So we have to compare the end of the
4196                      * string first against the float_real without the \n and
4197                      * then against the full float_real with the string.  We
4198                      * have to watch out for cases where the string might be
4199                      * smaller than the float_real or the float_real without
4200                      * the \n. */
4201                     char *checkpos= strend - len;
4202                     DEBUG_OPTIMISE_r(
4203                         Perl_re_printf( aTHX_
4204                             "%sChecking for float_real.%s\n",
4205                             PL_colors[4], PL_colors[5]));
4206                     if (checkpos + 1 < strbeg) {
4207                         /* can't match, even if we remove the trailing \n
4208                          * string is too short to match */
4209                         DEBUG_EXECUTE_r(
4210                             Perl_re_printf( aTHX_
4211                                 "%sString shorter than required trailing substring, cannot match.%s\n",
4212                                 PL_colors[4], PL_colors[5]));
4213                         goto phooey;
4214                     } else if (memEQ(checkpos + 1, little, len - 1)) {
4215                         /* can match, the end of the string matches without the
4216                          * "\n" */
4217                         last = checkpos + 1;
4218                     } else if (checkpos < strbeg) {
4219                         /* cant match, string is too short when the "\n" is
4220                          * included */
4221                         DEBUG_EXECUTE_r(
4222                             Perl_re_printf( aTHX_
4223                                 "%sString does not contain required trailing substring, cannot match.%s\n",
4224                                 PL_colors[4], PL_colors[5]));
4225                         goto phooey;
4226                     } else if (!multiline) {
4227                         /* non multiline match, so compare with the "\n" at the
4228                          * end of the string */
4229                         if (memEQ(checkpos, little, len)) {
4230                             last= checkpos;
4231                         } else {
4232                             DEBUG_EXECUTE_r(
4233                                 Perl_re_printf( aTHX_
4234                                     "%sString does not contain required trailing substring, cannot match.%s\n",
4235                                     PL_colors[4], PL_colors[5]));
4236                             goto phooey;
4237                         }
4238                     } else {
4239                         /* multiline match, so we have to search for a place
4240                          * where the full string is located */
4241                         goto find_last;
4242                     }
4243             } else {
4244                   find_last:
4245                     if (len)
4246                         last = rninstr(s, strend, little, little + len);
4247                     else
4248                         last = strend;	/* matching "$" */
4249             }
4250             if (!last) {
4251                 /* at one point this block contained a comment which was
4252                  * probably incorrect, which said that this was a "should not
4253                  * happen" case.  Even if it was true when it was written I am
4254                  * pretty sure it is not anymore, so I have removed the comment
4255                  * and replaced it with this one. Yves */
4256                 DEBUG_EXECUTE_r(
4257                     Perl_re_printf( aTHX_
4258                         "%sString does not contain required substring, cannot match.%s\n",
4259                         PL_colors[4], PL_colors[5]
4260                     ));
4261                 goto phooey;
4262             }
4263             dontbother = strend - last + prog->float_min_offset;
4264         }
4265         if (minlen && (dontbother < minlen))
4266             dontbother = minlen - 1;
4267         strend -= dontbother; 		   /* this one's always in bytes! */
4268         /* We don't know much -- general case. */
4269         if (utf8_target) {
4270             for (;;) {
4271                 if (regtry(reginfo, &s))
4272                     goto got_it;
4273                 if (s >= strend)
4274                     break;
4275                 s += UTF8SKIP(s);
4276             };
4277         }
4278         else {
4279             do {
4280                 if (regtry(reginfo, &s))
4281                     goto got_it;
4282             } while (s++ < strend);
4283         }
4284     }
4285 
4286     /* Failure. */
4287     goto phooey;
4288 
4289   got_it:
4290     /* s/// doesn't like it if $& is earlier than where we asked it to
4291      * start searching (which can happen on something like /.\G/) */
4292     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4293             && (RXp_OFFS_START(prog,0) < stringarg - strbeg))
4294     {
4295         /* this should only be possible under \G */
4296         assert(prog->intflags & PREGf_GPOS_SEEN);
4297         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4298             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4299         goto phooey;
4300     }
4301 
4302     /* clean up; this will trigger destructors that will free all slabs
4303      * above the current one, and cleanup the regmatch_info_aux
4304      * and regmatch_info_aux_eval sructs */
4305 
4306     LEAVE_SCOPE(oldsave);
4307 
4308     if (RXp_PAREN_NAMES(prog))
4309         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4310 
4311     /* make sure $`, $&, $', and $digit will work later */
4312     if ( !(flags & REXEC_NOT_FIRST) )
4313         S_reg_set_capture_string(aTHX_ rx,
4314                                     strbeg, reginfo->strend,
4315                                     sv, flags, utf8_target);
4316 
4317     return 1;
4318 
4319   phooey:
4320     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4321                           PL_colors[4], PL_colors[5]));
4322 
4323     if (swap) {
4324         /* we failed :-( roll it back.
4325          * Since the swap buffer will be freed on scope exit which follows
4326          * shortly, restore the old captures by copying 'swap's original
4327          * data to the new offs buffer
4328          */
4329         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4330             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4331             0,
4332             PTR2UV(prog),
4333             PTR2UV(RXp_OFFSp(prog)),
4334             PTR2UV(swap)
4335         ));
4336 
4337         Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair);
4338     }
4339 
4340     /* clean up; this will trigger destructors that will free all slabs
4341      * above the current one, and cleanup the regmatch_info_aux
4342      * and regmatch_info_aux_eval sructs */
4343 
4344     LEAVE_SCOPE(oldsave);
4345 
4346     return 0;
4347 }
4348 
4349 
4350 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4351  * Do inc before dec, in case old and new rex are the same */
4352 #define SET_reg_curpm(Re2)                          \
4353     if (reginfo->info_aux_eval) {                   \
4354         (void)ReREFCNT_inc(Re2);		    \
4355         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
4356         PM_SETRE((PL_reg_curpm), (Re2));	    \
4357     }
4358 
4359 
4360 /*
4361  - regtry - try match at specific point
4362 
4363  NOTE: *startpos may be modifed by regtry() to signal to the caller
4364        that the next match should start at a specific position in the
4365        string. The macro LAST_REGTRY_SKIPPED_FORWARD(reginfo) can be
4366        used to detect when this has happened.
4367 
4368  */
4369 STATIC bool			/* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4370 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4371 {
4372     CHECKPOINT lastcp;
4373     REGEXP *const rx = reginfo->prog;
4374     regexp *const prog = ReANY(rx);
4375     SSize_t result;
4376 #ifdef DEBUGGING
4377     U32 depth = 0; /* used by REGCP_SET */
4378 #endif
4379     RXi_GET_DECL(prog,progi);
4380     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4381 
4382     PERL_ARGS_ASSERT_REGTRY;
4383 
4384     reginfo->cutpoint=NULL;
4385 
4386     RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg;
4387     RXp_LASTPAREN(prog) = 0;
4388     RXp_LASTCLOSEPAREN(prog) = 0;
4389 
4390     /* XXXX What this code is doing here?!!!  There should be no need
4391        to do this again and again, RXp_LASTPAREN(prog) should take care of
4392        this!  --ilya*/
4393 
4394     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4395      * Actually, the code in regcppop() (which Ilya may be meaning by
4396      * RXp_LASTPAREN(prog)), is not needed at all by the test suite
4397      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4398      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4399      * Meanwhile, this code *is* needed for the
4400      * above-mentioned test suite tests to succeed.  The common theme
4401      * on those tests seems to be returning null fields from matches.
4402      * --jhi updated by dapm */
4403 
4404     /* After encountering a variant of the issue mentioned above I think
4405      * the point Ilya was making is that if we properly unwind whenever
4406      * we set lastparen to a smaller value then we should not need to do
4407      * this every time, only when needed. So if we have tests that fail if
4408      * we remove this, then it suggests somewhere else we are improperly
4409      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4410      * places it is called, and related regcp() routines. - Yves */
4411 #if 1
4412     if (prog->nparens) {
4413         regexp_paren_pair *pp = RXp_OFFSp(prog);
4414         I32 i;
4415         for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) {
4416             ++pp;
4417             pp->start = -1;
4418             pp->end = -1;
4419         }
4420     }
4421 #endif
4422     REGCP_SET(lastcp);
4423     result = regmatch(reginfo, *startposp, progi->program + 1);
4424     if (result != -1) {
4425         RXp_OFFSp(prog)[0].end = result;
4426         return 1;
4427     }
4428     if (reginfo->cutpoint)
4429         *startposp= reginfo->cutpoint;
4430     REGCP_UNWIND(lastcp);
4431     return 0;
4432 }
4433 
4434 /* this is used to determine how far from the left messages like
4435    'failed...' are printed in regexec.c. It should be set such that
4436    messages are inline with the regop output that created them.
4437 */
4438 #define REPORT_CODE_OFF 29
4439 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4440 
4441 #ifdef PERL_RE_BUILD_DEBUG
4442 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4443 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4444 {
4445     va_list ap;
4446     int result;
4447     PerlIO *f= Perl_debug_log;
4448     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4449     va_start(ap, depth);
4450     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4451     result = PerlIO_vprintf(f, fmt, ap);
4452     va_end(ap);
4453     return result;
4454 }
4455 #endif /* DEBUGGING */
4456 
4457 /* grab a new slab and return the first slot in it */
4458 
4459 STATIC regmatch_state *
S_push_slab(pTHX)4460 S_push_slab(pTHX)
4461 {
4462     regmatch_slab *s = PL_regmatch_slab->next;
4463     if (!s) {
4464         Newx(s, 1, regmatch_slab);
4465         s->prev = PL_regmatch_slab;
4466         s->next = NULL;
4467         PL_regmatch_slab->next = s;
4468     }
4469     PL_regmatch_slab = s;
4470     return SLAB_FIRST(s);
4471 }
4472 
4473 #ifdef DEBUGGING
4474 
4475 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4476 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4477     const char *start, const char *end, const char *blurb)
4478 {
4479     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4480 
4481     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4482 
4483     if (!PL_colorset)
4484             reginitcolors();
4485     {
4486         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4487             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4488 
4489         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4490             start, end - start, PL_dump_re_max_len);
4491 
4492         Perl_re_printf( aTHX_
4493             "%s%s REx%s %s against %s\n",
4494                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4495 
4496         if (utf8_target||utf8_pat)
4497             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4498                 utf8_pat ? "pattern" : "",
4499                 utf8_pat && utf8_target ? " and " : "",
4500                 utf8_target ? "string" : ""
4501             );
4502     }
4503 }
4504 
4505 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4506 S_dump_exec_pos(pTHX_ const char *locinput,
4507                       const regnode *scan,
4508                       const char *loc_regeol,
4509                       const char *loc_bostr,
4510                       const char *loc_reg_starttry,
4511                       const bool utf8_target,
4512                       const U32 depth
4513                 )
4514 {
4515     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4516     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4517     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4518     /* The part of the string before starttry has one color
4519        (pref0_len chars), between starttry and current
4520        position another one (pref_len - pref0_len chars),
4521        after the current position the third one.
4522        We assume that pref0_len <= pref_len, otherwise we
4523        decrease pref0_len.  */
4524     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4525         ? (5 + taill) - l : locinput - loc_bostr;
4526     int pref0_len;
4527 
4528     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4529 
4530     if (utf8_target) {
4531         while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4532             pref_len++;
4533         }
4534     }
4535     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4536     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4537         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4538               ? (5 + taill) - pref_len : loc_regeol - locinput);
4539     if (utf8_target) {
4540         while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4541             l--;
4542         }
4543     }
4544     if (pref0_len < 0)
4545         pref0_len = 0;
4546     if (pref0_len > pref_len)
4547         pref0_len = pref_len;
4548     {
4549         const int is_uni = utf8_target ? 1 : 0;
4550 
4551         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4552             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4553 
4554         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4555                     (locinput - pref_len + pref0_len),
4556                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4557 
4558         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4559                     locinput, loc_regeol - locinput, 10, 0, 1);
4560 
4561         const STRLEN tlen=len0+len1+len2;
4562         Perl_re_printf( aTHX_
4563                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4564                     (IV)(locinput - loc_bostr),
4565                     len0, s0,
4566                     len1, s1,
4567                     (docolor ? "" : "> <"),
4568                     len2, s2,
4569                     (int)(tlen > 19 ? 0 :  19 - tlen),
4570                     "",
4571                     (UV)depth);
4572     }
4573 }
4574 
4575 #endif
4576 
4577 /* reg_check_named_buff_matched()
4578  * Checks to see if a named buffer has matched. The data array of
4579  * buffer numbers corresponding to the buffer is expected to reside
4580  * in the regexp->data->data array in the slot stored in the ARG1u() of
4581  * node involved. Note that this routine doesn't actually care about the
4582  * name, that information is not preserved from compilation to execution.
4583  * Returns the index of the leftmost defined buffer with the given name
4584  * or 0 if non of the buffers matched.
4585  */
4586 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4587 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4588 {
4589     I32 n;
4590     RXi_GET_DECL(rex,rexi);
4591     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
4592     I32 *nums=(I32*)SvPVX(sv_dat);
4593 
4594     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4595 
4596     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4597         if ((I32)RXp_LASTPAREN(rex) >= nums[n] &&
4598             RXp_OFFS_END(rex,nums[n]) != -1)
4599         {
4600             return nums[n];
4601         }
4602     }
4603     return 0;
4604 }
4605 
4606 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4607 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4608                           struct next_matchable_info * m,
4609                           regmatch_info *reginfo)
4610 {
4611     /* This function determines various characteristics about every possible
4612      * initial match of the passed-in EXACTish <text_node>, and stores them in
4613      * <*m>.
4614      *
4615      * That includes a match string and a parallel mask, such that if you AND
4616      * the target string with the mask and compare with the match string,
4617      * you'll have a pretty good idea, perhaps even perfect, if that portion of
4618      * the target matches or not.
4619      *
4620      * The motivation behind this function is to allow the caller to set up
4621      * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4622      * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4623      * beginning oF B, which is the passed in <text_node>  That's where this
4624      * function comes in.  The values it returns can quickly be used to rule
4625      * out many, or all, cases of possible matches not actually being the
4626      * beginning of B, <text_node>.  It is also used in regrepeat() where we
4627      * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4628      * determine where the span of 'A's stop.
4629      *
4630      * If <text_node> is of type EXACT, there is only one possible character
4631      * that can match its first character, and so the situation is quite
4632      * simple.  But things can get much more complicated if folding is
4633      * involved.  It may be that the first character of an EXACTFish node
4634      * doesn't participate in any possible fold, e.g., punctuation, so it can
4635      * be matched only by itself.  The vast majority of characters that are in
4636      * folds match just two things, their lower and upper-case equivalents.
4637      * But not all are like that; some have multiple possible matches, or match
4638      * sequences of more than one character.  This function sorts all that out.
4639      *
4640      * It returns information about all possibilities of what the first
4641      * character(s) of <text_node> could look like.  Again, if <text_node> is a
4642      * plain EXACT node, that's just the actual first bytes of the first
4643      * character; but otherwise it is the bytes, that when masked, match all
4644      * possible combinations of all the initial bytes of all the characters
4645      * that could match, folded.  (Actually, this is a slight over promise.  It
4646      * handles only up to the initial 5 bytes, which is enough for all Unicode
4647      * characters, but not for all non-Unicode ones.)
4648      *
4649      * Here's an example to clarify.  Suppose the first character of
4650      * <text_node> is the letter 'C', and we are under /i matching.  That means
4651      * 'c' also matches.  The representations of these two characters differ in
4652      * just one bit, so the mask would be a zero in that position and ones in
4653      * the other 7.  And the returned string would be the AND of these two
4654      * characters, and would be one byte long, since these characters are each
4655      * a single byte.  ANDing the target <text_node> with this mask will yield
4656      * the returned string if and only if <text_node> begins with one of these
4657      * two characters.  So, the function would also return that the definitive
4658      * length matched is 1 byte.
4659      *
4660      * Now, suppose instead of the letter 'C',  <text_node> begins with the
4661      * letter 'F'.  The situation is much more complicated because there are
4662      * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4663      * begins with 'f', and hence could match.  We add these into the returned
4664      * string and mask, but the result isn't definitive; the caller has to
4665      * check further if its AND and compare pass.  But the failure of that
4666      * compare will quickly rule out most possible inputs.
4667      *
4668      * Much of this could be done in regcomp.c at compile time, except for
4669      * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4670      * could be used for one or the other eventualities.
4671      *
4672      * If this function determines that no possible character in the target
4673      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4674      * situation occurs if the first character in <text_node> requires UTF-8 to
4675      * represent, and the target string isn't in UTF-8.)
4676      *
4677      * Some analysis is in GH #18414, located at the time of this writing at:
4678      * https://github.com/Perl/perl5/issues/18414
4679      */
4680 
4681     const bool utf8_target = reginfo->is_utf8_target;
4682     bool utf8_pat = reginfo->is_utf8_pat;
4683 
4684     PERL_UINT_FAST8_T i;
4685 
4686     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4687      */
4688     U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4689     U8 lengths[MAX_MATCHES] = { 0 };
4690 
4691     U8 index_of_longest = 0;
4692 
4693     U8 *pat = (U8*)STRING(text_node);
4694     Size_t pat_len = STR_LEN(text_node);
4695     U8 op = OP(text_node);
4696 
4697     U8 byte_mask[5]  = {0};
4698     U8 byte_anded[5] = {0};
4699 
4700     /* There are some folds in Unicode to multiple characters.  This will hold
4701      * such characters that could fold to the beginning of 'text_node' */
4702     UV multi_fold_from = 0;
4703 
4704     /* We may have to create a modified copy of the pattern */
4705     U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4706 
4707     m->max_length = 0;
4708     m->min_length = 255;
4709     m->count = 0;
4710 
4711     /* Even if the first character in the node can match something in Latin1,
4712      * if there is anything in the node that can't, the match must fail */
4713     if (! utf8_target && isEXACT_REQ8(op)) {
4714         return FALSE;
4715     }
4716 
4717 /* Define a temporary op for use in this function, using an existing one that
4718  * should never be a real op during execution */
4719 #define TURKISH  PSEUDO
4720 
4721     /* What to do about these two nodes had to be deferred to runtime (which is
4722      * now).  If the extra information we now have so indicates, turn them into
4723      * EXACTFU nodes */
4724     if (   (op == EXACTF && utf8_target)
4725         || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4726     {
4727         if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) {
4728             op = TURKISH;
4729         }
4730         else {
4731             op = EXACTFU;
4732         }
4733 
4734         /* And certain situations are better handled if we create a modified
4735          * version of the pattern */
4736         if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4737                            specific problematic characters */
4738             if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4739 
4740                 /* The node could start with characters that are the first ones
4741                  * of a multi-character fold. */
4742                 multi_fold_from
4743                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4744                 if (multi_fold_from) {
4745 
4746                     /* Here, they do form a sequence that matches the fold of a
4747                      * single character.  That single character then is a
4748                      * possible match.  Below we will look again at this, but
4749                      * the code below is expecting every character in the
4750                      * pattern to be folded, which the input isn't required to
4751                      * be in this case.  So, just fold the single character,
4752                      * and the result will be in the expected form. */
4753                     _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4754                                        FOLD_FLAGS_FULL);
4755                     pat = mod_pat;
4756                 }
4757                          /* Turkish has a couple extra possibilities. */
4758                 else if (   UNLIKELY(op == TURKISH)
4759                          &&  pat_len >= 3
4760                          &&  isALPHA_FOLD_EQ(pat[0], 'f')
4761                          && (   memBEGINs(pat + 1, pat_len - 1,
4762                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4763                              || (   pat_len >= 4
4764                                  && isALPHA_FOLD_EQ(pat[1], 'f')
4765                                  && memBEGINs(pat + 2, pat_len - 2,
4766                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4767                 ))) {
4768                     /* The macros for finding a multi-char fold don't include
4769                      * the Turkish possibilities, in which U+130 folds to 'i'.
4770                      * Hard-code these.  It's very unlikely that Unicode will
4771                      * ever add any others.  */
4772                     if (pat[1] == 'f') {
4773                         pat_len = 3;
4774                         Copy("ffi", mod_pat, pat_len, U8);
4775                     }
4776                     else {
4777                         pat_len = 2;
4778                         Copy("fi", mod_pat, pat_len, U8);
4779                     }
4780                     pat = mod_pat;
4781                 }
4782                 else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4783                          &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4784                          &&  LIKELY(memNEs(pat, pat_len,
4785                                            LATIN_SMALL_LETTER_SHARP_S_UTF8))
4786                          && (LIKELY(op != TURKISH || *pat != 'I')))
4787                 {
4788                     /* For all cases of things between 0-255, except the ones
4789                      * in the conditional above, the fold is just the lower
4790                      * case, which is faster than the more general case. */
4791                     mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4792                                                                      pat[1]));
4793                     pat_len = 1;
4794                     pat = mod_pat;
4795                     utf8_pat = FALSE;
4796                 }
4797                 else {  /* Code point above 255, or needs special handling */
4798                     _to_utf8_fold_flags(pat, pat + pat_len,
4799                                         mod_pat, &pat_len,
4800                                         FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4801                     pat = mod_pat;
4802                 }
4803             }
4804         }
4805         else if /* Below is not a UTF-8 pattern; there's a somewhat different
4806                    set of problematic characters */
4807                 ((multi_fold_from
4808                           = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4809         {
4810             /* We may have to canonicalize a multi-char fold, as in the UTF-8
4811              * case */
4812             _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4813                                FOLD_FLAGS_FULL);
4814             pat = mod_pat;
4815         }
4816         else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4817             mod_pat[0] = mod_pat[1] = 's';
4818             pat_len = 2;
4819             utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4820                                        chars, and speeds copying */
4821             pat = mod_pat;
4822         }
4823         else if (LIKELY(op != TURKISH || *pat != 'I')) {
4824             mod_pat[0] = toLOWER_L1(*pat);
4825             pat_len = 1;
4826             pat = mod_pat;
4827         }
4828     }
4829     else if /* Below isn't a node that we convert to UTF-8 */
4830             (     utf8_target
4831              && ! utf8_pat
4832              &&   op == EXACTFAA_NO_TRIE
4833              &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4834     {
4835         /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4836          * did this at compile time when the pattern was UTF-8 , but otherwise
4837          * we couldn't do it earlier, because it requires a UTF-8 target for
4838          * this match to be legal. */
4839         pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4840         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4841              LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4842         pat = mod_pat;
4843         utf8_pat = TRUE;
4844     }
4845 
4846     /* Here, we have taken care of the initial work for a few very problematic
4847      * situations, possibly creating a modified pattern.
4848      *
4849      * Now ready for the general case.  We build up all the possible things
4850      * that could match the first character of the pattern into the elements of
4851      * 'matches[]'
4852      *
4853      * Everything generally matches at least itself.  But if there is a
4854      * UTF8ness mismatch, we have to convert to that of the target string. */
4855     if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4856         matches[0][0] = pat[0];
4857         lengths[0] = 1;
4858         m->count++;
4859     }
4860     else if (utf8_target) {
4861         if (utf8_pat) {
4862             lengths[0] = UTF8SKIP(pat);
4863             Copy(pat, matches[0], lengths[0], U8);
4864             m->count++;
4865         }
4866         else {  /* target is UTF-8, pattern isn't */
4867             matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4868             matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4869             lengths[0] = 2;
4870             m->count++;
4871         }
4872     }
4873     else if (! utf8_pat) {  /* Neither is UTF-8 */
4874         matches[0][0] = pat[0];
4875         lengths[0] = 1;
4876         m->count++;
4877     }
4878     else     /* target isn't UTF-8; pattern is.  No match possible unless the
4879                 pattern's first character can fit in a byte */
4880          if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4881     {
4882         matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4883         lengths[0] = 1;
4884         m->count++;
4885     }
4886 
4887     /* Here we have taken care of any necessary node-type changes */
4888 
4889     if (m->count) {
4890         m->max_length = lengths[0];
4891         m->min_length = lengths[0];
4892     }
4893 
4894     /* For non-folding nodes, there are no other possible candidate matches,
4895      * but for foldable ones, we have to look further. */
4896     if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4897         UV folded;  /* The first character in the pattern, folded */
4898         U32 first_fold_from;    /* A character that folds to it */
4899         const U32 * remaining_fold_froms;   /* The remaining characters that
4900                                                fold to it, if any */
4901         Size_t folds_to_count;  /* The total number of characters that fold to
4902                                    'folded' */
4903 
4904         /* If the node begins with a sequence of more than one character that
4905          * together form the fold of a single character, it is called a
4906          * 'multi-character fold', and the normal functions don't handle this
4907          * case.  We set 'multi_fold_from' to the single folded-from character,
4908          * which is handled in an extra iteration below */
4909         if (utf8_pat) {
4910             folded = valid_utf8_to_uvchr(pat, NULL);
4911             multi_fold_from
4912                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4913         }
4914         else {
4915             folded = *pat;
4916 
4917             /* This may generate illegal combinations for things like EXACTF,
4918              * but rather than repeat the logic and exclude them here, all such
4919              * illegalities are checked for and skipped below in the loop */
4920             multi_fold_from
4921                         = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4922         }
4923 
4924         /* Everything matches at least itself; initialize to that because the
4925          * only the branches below that set it are the ones where the number
4926          * isn't 1. */
4927         folds_to_count = 1;
4928 
4929         /* There are a few special cases for locale-dependent nodes, where the
4930          * run-time context was needed before we could know what matched */
4931         if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4932             first_fold_from = PL_fold_locale[folded];
4933         }
4934         else if (   op == EXACTFL && utf8_target && utf8_pat
4935                  && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4936                                             LATIN_SMALL_LETTER_LONG_S_UTF8))
4937         {
4938             first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4939         }
4940         else if (UNLIKELY(    op == TURKISH
4941                           && (   isALPHA_FOLD_EQ(folded, 'i')
4942                               || inRANGE(folded,
4943                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4944                                          LATIN_SMALL_LETTER_DOTLESS_I))))
4945         {   /* Turkish folding requires special handling */
4946             if (folded == 'i')
4947                 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4948             else if (folded == 'I')
4949                 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4950             else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4951                 first_fold_from = 'i';
4952             else first_fold_from = 'I';
4953         }
4954         else {
4955             /* Here, isn't a special case: use the generic function to
4956              * calculate what folds to this */
4957           redo_multi:
4958             /* Look up what code points (besides itself) fold to 'folded';
4959              * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4960             folds_to_count = _inverse_folds(folded, &first_fold_from,
4961                                                        &remaining_fold_froms);
4962         }
4963 
4964         /* Add each character that folds to 'folded' to the list of them,
4965          * subject to limitations based on the node type and target UTF8ness.
4966          * If there was a character that folded to multiple characters, do an
4967          * extra iteration for it.  (Note the extra iteration if there is a
4968          * multi-character fold) */
4969         for (i = 0; i < folds_to_count
4970                       + UNLIKELY(multi_fold_from != 0); i++)
4971         {
4972             UV fold_from = 0;
4973 
4974             if (i >= folds_to_count) {  /* Final iteration: handle the
4975                                            multi-char */
4976                 fold_from = multi_fold_from;
4977             }
4978             else if (i == 0) {
4979                 fold_from = first_fold_from;
4980             }
4981             else if (i < folds_to_count) {
4982                 fold_from = remaining_fold_froms[i-1];
4983             }
4984 
4985             if (folded == fold_from) {  /* We already added the character
4986                                            itself */
4987                 continue;
4988             }
4989 
4990             /* EXACTF doesn't have any non-ascii folds */
4991             if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4992                 continue;
4993             }
4994 
4995             /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4996              * */
4997             if (    isASCII(folded) != isASCII(fold_from)
4998                 &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4999 
5000             {
5001                 continue;
5002             }
5003 
5004             /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
5005              * locale, but those have been converted to EXACTFU above) */
5006             if (   op == EXACTFL
5007                 && (folded < 256) != (fold_from < 256))
5008             {
5009                 continue;
5010             }
5011 
5012             /* If this triggers, it likely is because of the unlikely case
5013              * where a new Unicode standard has changed what MAX_MATCHES should
5014              * be set to */
5015             assert(m->count < MAX_MATCHES);
5016 
5017             /* Add this character to the list of possible matches */
5018             if (utf8_target) {
5019                 uvchr_to_utf8(matches[(U8) m->count], fold_from);
5020                 lengths[m->count] = UVCHR_SKIP(fold_from);
5021                 m->count++;
5022             }
5023             else { /* Non-UTF8 target: no code point above 255 can appear in it
5024                     */
5025                 if (fold_from > 255) {
5026                     continue;
5027                 }
5028 
5029                 matches[m->count][0] = fold_from;
5030                 lengths[m->count] = 1;
5031                 m->count++;
5032             }
5033 
5034             /* Update min and mlengths */
5035             if (m->min_length > lengths[m->count-1]) {
5036                 m->min_length = lengths[m->count-1];
5037             }
5038 
5039             if (m->max_length < lengths[m->count-1]) {
5040                 index_of_longest = m->count - 1;
5041                 m->max_length = lengths[index_of_longest];
5042             }
5043         } /* looped through each potential fold */
5044 
5045         /* If there is something that folded to an initial multi-character
5046          * fold, repeat, using it.  This catches some edge cases.  An example
5047          * of one is /ss/i when UTF-8 encoded.  The function
5048          * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
5049          * (LATIN SMALL SHARP S).  If it returned a list of characters, this
5050          * code wouldn't be needed.  But since it doesn't, we have to look what
5051          * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
5052          * */
5053         if (multi_fold_from) {
5054             folded = multi_fold_from;
5055             multi_fold_from = 0;
5056             goto redo_multi;
5057         }
5058     } /* End of finding things that participate in this fold */
5059 
5060     if (m->count == 0) {    /* If nothing found, can't match */
5061         m->min_length = 0;
5062         return FALSE;
5063     }
5064 
5065     /* Have calculated all possible matches.  Now calculate the mask and AND
5066      * values */
5067     m->initial_exact = 0;
5068     m->initial_definitive = 0;
5069 
5070     {
5071         unsigned int mask_ones = 0;
5072         unsigned int possible_ones = 0;
5073         U8 j;
5074 
5075         /* For each byte that is in all possible matches ... */
5076         for (j = 0; j < MIN(m->min_length, 5); j++) {
5077 
5078             /* Initialize the accumulator for this byte */
5079             byte_mask[j] = 0xFF;
5080             byte_anded[j] = matches[0][j];
5081 
5082             /* Then the rest of the rows (folds).  The mask is based on, like,
5083              * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
5084              * where they differ. */
5085             for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
5086                 byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
5087                 byte_anded[j] &= matches[i][j];
5088             }
5089 
5090             /* Keep track of the number of initial mask bytes that are all one
5091              * bits.  The code calling this can use this number to know that
5092              * a string that matches this number of bytes in the pattern is an
5093              * exact match of that pattern for this number of bytes.  But also
5094              * counted are the number of initial bytes that in total have a
5095              * single zero bit.  If a string matches those, masked, it must be
5096              * one of two possibilites, both of which this function has
5097              * determined are legal.  (But if that single 0 is one of the
5098              * initial bits for masking a UTF-8 start byte, that could
5099              * incorrectly lead to different length strings appearing to be
5100              * equivalent, so only do this optimization when the matchables are
5101              * all the same length.  This was uncovered by testing
5102              * /\x{029E}/i.) */
5103             if (m->min_length == m->max_length) {
5104                 mask_ones += PL_bitcount[byte_mask[j]];
5105                 possible_ones += 8;
5106                 if (mask_ones + 1 >= possible_ones) {
5107                     m->initial_definitive++;
5108                     if (mask_ones >= possible_ones) {
5109                         m->initial_exact++;
5110                     }
5111                 }
5112             }
5113         }
5114     }
5115 
5116     /* The first byte is separate for speed */
5117     m->first_byte_mask = byte_mask[0];
5118     m->first_byte_anded = byte_anded[0];
5119 
5120     /* Then pack up to the next 4 bytes into a word */
5121     m->mask32 = m->anded32 = 0;
5122     for (i = 1; i < MIN(m->min_length, 5); i++) {
5123         U8 which = i;
5124         U8 shift = (which - 1) * 8;
5125         m->mask32  |= (U32) byte_mask[i]  << shift;
5126         m->anded32 |= (U32) byte_anded[i] << shift;
5127     }
5128 
5129     /* Finally, take the match strings and place them sequentially into a
5130      * one-dimensional array.  (This is done to save significant space in the
5131      * structure.) Sort so the longest (presumably the least likely) is last.
5132      * XXX When this gets moved to regcomp, may want to fully sort shortest
5133      * first, but above we generally used the folded code point first, and
5134      * those tend to be no longer than their upper case values, so this is
5135      * already pretty well sorted by size.
5136      *
5137      * If the asserts fail, it's most likely because a new version of the
5138      * Unicode standard requires more space; simply increase the declaration
5139      * size. */
5140     {
5141         U8 cur_pos = 0;
5142         U8 output_index = 0;
5143 
5144         if (m->count > 1) { /* No need to sort a single entry */
5145             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5146 
5147                 /* Keep the same order for all but the longest.  (If the
5148                  * asserts fail, it could be because m->matches is declared too
5149                  * short, either because of a new Unicode release, or an
5150                  * overlooked test case, or it could be a bug.) */
5151                 if (i != index_of_longest) {
5152                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5153                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5154                     cur_pos += lengths[i];
5155                     m->lengths[output_index++] = lengths[i];
5156                 }
5157             }
5158         }
5159 
5160         assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5161         Copy(matches[index_of_longest], m->matches + cur_pos,
5162              lengths[index_of_longest], U8);
5163 
5164         /* Place the longest match last */
5165         m->lengths[output_index] = lengths[index_of_longest];
5166     }
5167 
5168 
5169     return TRUE;
5170 }
5171 
5172 PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5173 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5174 S_test_EXACTISH_ST(const char * loc,
5175                    struct next_matchable_info info)
5176 {
5177     /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5178      * bytes starting at 'loc' can match based on 'next_matchable_info' */
5179 
5180     U32 input32 = 0;
5181 
5182     /* Check the first byte */
5183     if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5184         return FALSE;
5185 
5186     /* Pack the next up-to-4 bytes into a 32 bit word */
5187     switch (info.min_length) {
5188         default:
5189             input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5190             /* FALLTHROUGH */
5191         case 4:
5192             input32 |= (U8) loc[3] << 2 * 8;
5193             /* FALLTHROUGH */
5194         case 3:
5195             input32 |= (U8) loc[2] << 1 * 8;
5196             /* FALLTHROUGH */
5197         case 2:
5198             input32 |= (U8) loc[1];
5199             break;
5200         case 1:
5201             return TRUE;    /* We already tested and passed the 0th byte */
5202         case 0:
5203             ASSUME(0);
5204     }
5205 
5206     /* And AND that with the mask and compare that with the assembled ANDED
5207      * values */
5208     return (input32 & info.mask32) == info.anded32;
5209 }
5210 
5211 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5212 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5213 {
5214     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5215      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5216 
5217     PERL_ARGS_ASSERT_ISGCB;
5218 
5219     switch (GCB_table[before][after]) {
5220         case GCB_BREAKABLE:
5221             return TRUE;
5222 
5223         case GCB_NOBREAK:
5224             return FALSE;
5225 
5226         case GCB_RI_then_RI:
5227             {
5228                 int RI_count = 1;
5229                 U8 * temp_pos = (U8 *) curpos;
5230 
5231                 /* Do not break within emoji flag sequences. That is, do not
5232                  * break between regional indicator (RI) symbols if there is an
5233                  * odd number of RI characters before the break point.
5234                  *  GB12   sot (RI RI)* RI × RI
5235                  *  GB13 [^RI] (RI RI)* RI × RI */
5236 
5237                 while (backup_one_GCB(strbeg,
5238                                     &temp_pos,
5239                                     utf8_target) == GCB_Regional_Indicator)
5240                 {
5241                     RI_count++;
5242                 }
5243 
5244                 return RI_count % 2 != 1;
5245             }
5246 
5247         case GCB_EX_then_EM:
5248 
5249             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
5250             {
5251                 U8 * temp_pos = (U8 *) curpos;
5252                 GCB_enum prev;
5253 
5254                 do {
5255                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5256                 }
5257                 while (prev == GCB_Extend);
5258 
5259                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5260             }
5261 
5262         case GCB_Maybe_Emoji_NonBreak:
5263 
5264             {
5265 
5266             /* Do not break within emoji modifier sequences or emoji zwj sequences.
5267               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5268               */
5269                 U8 * temp_pos = (U8 *) curpos;
5270                 GCB_enum prev;
5271 
5272                 do {
5273                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5274                 }
5275                 while (prev == GCB_Extend);
5276 
5277                 return prev != GCB_ExtPict_XX;
5278             }
5279 
5280         default:
5281             break;
5282     }
5283 
5284 #ifdef DEBUGGING
5285     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5286                                   before, after, GCB_table[before][after]);
5287     assert(0);
5288 #endif
5289     return TRUE;
5290 }
5291 
5292 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5293 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5294 {
5295     GCB_enum gcb;
5296 
5297     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5298 
5299     if (*curpos < strbeg) {
5300         return GCB_EDGE;
5301     }
5302 
5303     if (utf8_target) {
5304         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5305         U8 * prev_prev_char_pos;
5306 
5307         if (! prev_char_pos) {
5308             return GCB_EDGE;
5309         }
5310 
5311         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5312             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5313             *curpos = prev_char_pos;
5314             prev_char_pos = prev_prev_char_pos;
5315         }
5316         else {
5317             *curpos = (U8 *) strbeg;
5318             return GCB_EDGE;
5319         }
5320     }
5321     else {
5322         if (*curpos - 2 < strbeg) {
5323             *curpos = (U8 *) strbeg;
5324             return GCB_EDGE;
5325         }
5326         (*curpos)--;
5327         gcb = getGCB_VAL_CP(*(*curpos - 1));
5328     }
5329 
5330     return gcb;
5331 }
5332 
5333 /* Combining marks attach to most classes that precede them, but this defines
5334  * the exceptions (from TR14) */
5335 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5336                                      || prev == LB_Mandatory_Break      \
5337                                      || prev == LB_Carriage_Return      \
5338                                      || prev == LB_Line_Feed            \
5339                                      || prev == LB_Next_Line            \
5340                                      || prev == LB_Space                \
5341                                      || prev == LB_ZWSpace))
5342 
5343 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5344 S_isLB(pTHX_ LB_enum before,
5345              LB_enum after,
5346              const U8 * const strbeg,
5347              const U8 * const curpos,
5348              const U8 * const strend,
5349              const bool utf8_target)
5350 {
5351     U8 * temp_pos = (U8 *) curpos;
5352     LB_enum prev = before;
5353 
5354     /* Is the boundary between 'before' and 'after' line-breakable?
5355      * Most of this is just a table lookup of a generated table from Unicode
5356      * rules.  But some rules require context to decide, and so have to be
5357      * implemented in code */
5358 
5359     PERL_ARGS_ASSERT_ISLB;
5360 
5361     /* Rule numbers in the comments below are as of Unicode 9.0 */
5362 
5363   redo:
5364     before = prev;
5365     switch (LB_table[before][after]) {
5366         case LB_BREAKABLE:
5367             return TRUE;
5368 
5369         case LB_NOBREAK:
5370         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5371             return FALSE;
5372 
5373         case LB_SP_foo + LB_BREAKABLE:
5374         case LB_SP_foo + LB_NOBREAK:
5375         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5376 
5377             /* When we have something following a SP, we have to look at the
5378              * context in order to know what to do.
5379              *
5380              * SP SP should not reach here because LB7: Do not break before
5381              * spaces.  (For two spaces in a row there is nothing that
5382              * overrides that) */
5383             assert(after != LB_Space);
5384 
5385             /* Here we have a space followed by a non-space.  Mostly this is a
5386              * case of LB18: "Break after spaces".  But there are complications
5387              * as the handling of spaces is somewhat tricky.  They are in a
5388              * number of rules, which have to be applied in priority order, but
5389              * something earlier in the string can cause a rule to be skipped
5390              * and a lower priority rule invoked.  A prime example is LB7 which
5391              * says don't break before a space.  But rule LB8 (lower priority)
5392              * says that the first break opportunity after a ZW is after any
5393              * span of spaces immediately after it.  If a ZW comes before a SP
5394              * in the input, rule LB8 applies, and not LB7.  Other such rules
5395              * involve combining marks which are rules 9 and 10, but they may
5396              * override higher priority rules if they come earlier in the
5397              * string.  Since we're doing random access into the middle of the
5398              * string, we have to look for rules that should get applied based
5399              * on both string position and priority.  Combining marks do not
5400              * attach to either ZW nor SP, so we don't have to consider them
5401              * until later.
5402              *
5403              * To check for LB8, we have to find the first non-space character
5404              * before this span of spaces */
5405             do {
5406                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5407             }
5408             while (prev == LB_Space);
5409 
5410             /* LB8 Break before any character following a zero-width space,
5411              * even if one or more spaces intervene.
5412              *      ZW SP* ÷
5413              * So if we have a ZW just before this span, and to get here this
5414              * is the final space in the span. */
5415             if (prev == LB_ZWSpace) {
5416                 return TRUE;
5417             }
5418 
5419             /* Here, not ZW SP+.  There are several rules that have higher
5420              * priority than LB18 and can be resolved now, as they don't depend
5421              * on anything earlier in the string (except ZW, which we have
5422              * already handled).  One of these rules is LB11 Do not break
5423              * before Word joiner, but we have specially encoded that in the
5424              * lookup table so it is caught by the single test below which
5425              * catches the other ones. */
5426             if (LB_table[LB_Space][after] - LB_SP_foo
5427                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5428             {
5429                 return FALSE;
5430             }
5431 
5432             /* If we get here, we have to XXX consider combining marks. */
5433             if (prev == LB_Combining_Mark) {
5434 
5435                 /* What happens with these depends on the character they
5436                  * follow.  */
5437                 do {
5438                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5439                 }
5440                 while (prev == LB_Combining_Mark);
5441 
5442                 /* Most times these attach to and inherit the characteristics
5443                  * of that character, but not always, and when not, they are to
5444                  * be treated as AL by rule LB10. */
5445                 if (! LB_CM_ATTACHES_TO(prev)) {
5446                     prev = LB_Alphabetic;
5447                 }
5448             }
5449 
5450             /* Here, we have the character preceding the span of spaces all set
5451              * up.  We follow LB18: "Break after spaces" unless the table shows
5452              * that is overridden */
5453             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5454 
5455         case LB_CM_ZWJ_foo:
5456 
5457             /* We don't know how to treat the CM except by looking at the first
5458              * non-CM character preceding it.  ZWJ is treated as CM */
5459             do {
5460                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5461             }
5462             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5463 
5464             /* Here, 'prev' is that first earlier non-CM character.  If the CM
5465              * attaches to it, then it inherits the behavior of 'prev'.  If it
5466              * doesn't attach, it is to be treated as an AL */
5467             if (! LB_CM_ATTACHES_TO(prev)) {
5468                 prev = LB_Alphabetic;
5469             }
5470 
5471             goto redo;
5472 
5473         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5474         case LB_HY_or_BA_then_foo + LB_NOBREAK:
5475 
5476             /* LB21a Don't break after Hebrew + Hyphen.
5477              * HL (HY | BA) × */
5478 
5479             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5480                                                           == LB_Hebrew_Letter)
5481             {
5482                 return FALSE;
5483             }
5484 
5485             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5486 
5487         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5488         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5489 
5490             /* LB25a (PR | PO) × ( OP | HY )? NU */
5491             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5492                 return FALSE;
5493             }
5494 
5495             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5496                                                                 == LB_BREAKABLE;
5497 
5498         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5499         case LB_SY_or_IS_then_various + LB_NOBREAK:
5500         {
5501             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5502 
5503             LB_enum temp = prev;
5504             do {
5505                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5506             }
5507             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5508             if (temp == LB_Numeric) {
5509                 return FALSE;
5510             }
5511 
5512             return LB_table[prev][after] - LB_SY_or_IS_then_various
5513                                                                == LB_BREAKABLE;
5514         }
5515 
5516         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5517         case LB_various_then_PO_or_PR + LB_NOBREAK:
5518         {
5519             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5520 
5521             LB_enum temp = prev;
5522             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5523             {
5524                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5525             }
5526             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5527                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5528             }
5529             if (temp == LB_Numeric) {
5530                 return FALSE;
5531             }
5532             return LB_various_then_PO_or_PR;
5533         }
5534 
5535         case LB_RI_then_RI + LB_NOBREAK:
5536         case LB_RI_then_RI + LB_BREAKABLE:
5537             {
5538                 int RI_count = 1;
5539 
5540                 /* LB30a Break between two regional indicator symbols if and
5541                  * only if there are an even number of regional indicators
5542                  * preceding the position of the break.
5543                  *
5544                  *    sot (RI RI)* RI × RI
5545                  *  [^RI] (RI RI)* RI × RI */
5546 
5547                 while (backup_one_LB(strbeg,
5548                                      &temp_pos,
5549                                      utf8_target) == LB_Regional_Indicator)
5550                 {
5551                     RI_count++;
5552                 }
5553 
5554                 return RI_count % 2 == 0;
5555             }
5556 
5557         default:
5558             break;
5559     }
5560 
5561 #ifdef DEBUGGING
5562     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5563                                   before, after, LB_table[before][after]);
5564     assert(0);
5565 #endif
5566     return TRUE;
5567 }
5568 
5569 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5570 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5571 {
5572 
5573     LB_enum lb;
5574 
5575     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5576 
5577     if (*curpos >= strend) {
5578         return LB_EDGE;
5579     }
5580 
5581     if (utf8_target) {
5582         *curpos += UTF8SKIP(*curpos);
5583         if (*curpos >= strend) {
5584             return LB_EDGE;
5585         }
5586         lb = getLB_VAL_UTF8(*curpos, strend);
5587     }
5588     else {
5589         (*curpos)++;
5590         if (*curpos >= strend) {
5591             return LB_EDGE;
5592         }
5593         lb = getLB_VAL_CP(**curpos);
5594     }
5595 
5596     return lb;
5597 }
5598 
5599 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5600 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5601 {
5602     LB_enum lb;
5603 
5604     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5605 
5606     if (*curpos < strbeg) {
5607         return LB_EDGE;
5608     }
5609 
5610     if (utf8_target) {
5611         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5612         U8 * prev_prev_char_pos;
5613 
5614         if (! prev_char_pos) {
5615             return LB_EDGE;
5616         }
5617 
5618         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5619             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5620             *curpos = prev_char_pos;
5621             prev_char_pos = prev_prev_char_pos;
5622         }
5623         else {
5624             *curpos = (U8 *) strbeg;
5625             return LB_EDGE;
5626         }
5627     }
5628     else {
5629         if (*curpos - 2 < strbeg) {
5630             *curpos = (U8 *) strbeg;
5631             return LB_EDGE;
5632         }
5633         (*curpos)--;
5634         lb = getLB_VAL_CP(*(*curpos - 1));
5635     }
5636 
5637     return lb;
5638 }
5639 
5640 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5641 S_isSB(pTHX_ SB_enum before,
5642              SB_enum after,
5643              const U8 * const strbeg,
5644              const U8 * const curpos,
5645              const U8 * const strend,
5646              const bool utf8_target)
5647 {
5648     /* returns a boolean indicating if there is a Sentence Boundary Break
5649      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5650 
5651     U8 * lpos = (U8 *) curpos;
5652     bool has_para_sep = FALSE;
5653     bool has_sp = FALSE;
5654 
5655     PERL_ARGS_ASSERT_ISSB;
5656 
5657     /* Break at the start and end of text.
5658         SB1.  sot  ÷
5659         SB2.  ÷  eot
5660       But unstated in Unicode is don't break if the text is empty */
5661     if (before == SB_EDGE || after == SB_EDGE) {
5662         return before != after;
5663     }
5664 
5665     /* SB 3: Do not break within CRLF. */
5666     if (before == SB_CR && after == SB_LF) {
5667         return FALSE;
5668     }
5669 
5670     /* Break after paragraph separators.  CR and LF are considered
5671      * so because Unicode views text as like word processing text where there
5672      * are no newlines except between paragraphs, and the word processor takes
5673      * care of wrapping without there being hard line-breaks in the text *./
5674        SB4.  Sep | CR | LF  ÷ */
5675     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5676         return TRUE;
5677     }
5678 
5679     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5680      * (See Section 6.2, Replacing Ignore Rules.)
5681         SB5.  X (Extend | Format)*  →  X */
5682     if (after == SB_Extend || after == SB_Format) {
5683 
5684         /* Implied is that these characters attach to everything
5685          * immediately prior to them except for those separator-type
5686          * characters.  And the rules earlier have already handled the case
5687          * when one of those immediately precedes the extend char */
5688         return FALSE;
5689     }
5690 
5691     if (before == SB_Extend || before == SB_Format) {
5692         U8 * temp_pos = lpos;
5693         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5694         if (   backup != SB_EDGE
5695             && backup != SB_Sep
5696             && backup != SB_CR
5697             && backup != SB_LF)
5698         {
5699             before = backup;
5700             lpos = temp_pos;
5701         }
5702 
5703         /* Here, both 'before' and 'backup' are these types; implied is that we
5704          * don't break between them */
5705         if (backup == SB_Extend || backup == SB_Format) {
5706             return FALSE;
5707         }
5708     }
5709 
5710     /* Do not break after ambiguous terminators like period, if they are
5711      * immediately followed by a number or lowercase letter, if they are
5712      * between uppercase letters, if the first following letter (optionally
5713      * after certain punctuation) is lowercase, or if they are followed by
5714      * "continuation" punctuation such as comma, colon, or semicolon. For
5715      * example, a period may be an abbreviation or numeric period, and thus may
5716      * not mark the end of a sentence.
5717 
5718      * SB6. ATerm  ×  Numeric */
5719     if (before == SB_ATerm && after == SB_Numeric) {
5720         return FALSE;
5721     }
5722 
5723     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5724     if (before == SB_ATerm && after == SB_Upper) {
5725         U8 * temp_pos = lpos;
5726         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5727         if (backup == SB_Upper || backup == SB_Lower) {
5728             return FALSE;
5729         }
5730     }
5731 
5732     /* The remaining rules that aren't the final one, all require an STerm or
5733      * an ATerm after having backed up over some Close* Sp*, and in one case an
5734      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5735      * So do that backup now, setting flags if either Sp or a paragraph
5736      * separator are found */
5737 
5738     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5739         has_para_sep = TRUE;
5740         before = backup_one_SB(strbeg, &lpos, utf8_target);
5741     }
5742 
5743     if (before == SB_Sp) {
5744         has_sp = TRUE;
5745         do {
5746             before = backup_one_SB(strbeg, &lpos, utf8_target);
5747         }
5748         while (before == SB_Sp);
5749     }
5750 
5751     while (before == SB_Close) {
5752         before = backup_one_SB(strbeg, &lpos, utf8_target);
5753     }
5754 
5755     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5756      * most cases an STerm */
5757     if (before == SB_STerm || before == SB_ATerm) {
5758 
5759         /* So, here the lhs matches
5760          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5761          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5762          * The rules that apply here are:
5763          *
5764          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5765                                            | LF | STerm | ATerm) )* Lower
5766            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5767            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5768            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5769            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5770          */
5771 
5772         /* And all but SB11 forbid having seen a paragraph separator */
5773         if (! has_para_sep) {
5774             if (before == SB_ATerm) {          /* SB8 */
5775                 U8 * rpos = (U8 *) curpos;
5776                 SB_enum later = after;
5777 
5778                 while (    later != SB_OLetter
5779                         && later != SB_Upper
5780                         && later != SB_Lower
5781                         && later != SB_Sep
5782                         && later != SB_CR
5783                         && later != SB_LF
5784                         && later != SB_STerm
5785                         && later != SB_ATerm
5786                         && later != SB_EDGE)
5787                 {
5788                     later = advance_one_SB(&rpos, strend, utf8_target);
5789                 }
5790                 if (later == SB_Lower) {
5791                     return FALSE;
5792                 }
5793             }
5794 
5795             if (   after == SB_SContinue    /* SB8a */
5796                 || after == SB_STerm
5797                 || after == SB_ATerm)
5798             {
5799                 return FALSE;
5800             }
5801 
5802             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5803                 if (   after == SB_Close
5804                     || after == SB_Sp
5805                     || after == SB_Sep
5806                     || after == SB_CR
5807                     || after == SB_LF)
5808                 {
5809                     return FALSE;
5810                 }
5811             }
5812 
5813             /* SB10.  This and SB9 could probably be combined some way, but khw
5814              * has decided to follow the Unicode rule book precisely for
5815              * simplified maintenance */
5816             if (   after == SB_Sp
5817                 || after == SB_Sep
5818                 || after == SB_CR
5819                 || after == SB_LF)
5820             {
5821                 return FALSE;
5822             }
5823         }
5824 
5825         /* SB11.  */
5826         return TRUE;
5827     }
5828 
5829     /* Otherwise, do not break.
5830     SB12.  Any  ×  Any */
5831 
5832     return FALSE;
5833 }
5834 
5835 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5836 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5837 {
5838     SB_enum sb;
5839 
5840     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5841 
5842     if (*curpos >= strend) {
5843         return SB_EDGE;
5844     }
5845 
5846     if (utf8_target) {
5847         do {
5848             *curpos += UTF8SKIP(*curpos);
5849             if (*curpos >= strend) {
5850                 return SB_EDGE;
5851             }
5852             sb = getSB_VAL_UTF8(*curpos, strend);
5853         } while (sb == SB_Extend || sb == SB_Format);
5854     }
5855     else {
5856         do {
5857             (*curpos)++;
5858             if (*curpos >= strend) {
5859                 return SB_EDGE;
5860             }
5861             sb = getSB_VAL_CP(**curpos);
5862         } while (sb == SB_Extend || sb == SB_Format);
5863     }
5864 
5865     return sb;
5866 }
5867 
5868 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5869 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5870 {
5871     SB_enum sb;
5872 
5873     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5874 
5875     if (*curpos < strbeg) {
5876         return SB_EDGE;
5877     }
5878 
5879     if (utf8_target) {
5880         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5881         if (! prev_char_pos) {
5882             return SB_EDGE;
5883         }
5884 
5885         /* Back up over Extend and Format.  curpos is always just to the right
5886          * of the character whose value we are getting */
5887         do {
5888             U8 * prev_prev_char_pos;
5889             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5890                                                                       strbeg)))
5891             {
5892                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5893                 *curpos = prev_char_pos;
5894                 prev_char_pos = prev_prev_char_pos;
5895             }
5896             else {
5897                 *curpos = (U8 *) strbeg;
5898                 return SB_EDGE;
5899             }
5900         } while (sb == SB_Extend || sb == SB_Format);
5901     }
5902     else {
5903         do {
5904             if (*curpos - 2 < strbeg) {
5905                 *curpos = (U8 *) strbeg;
5906                 return SB_EDGE;
5907             }
5908             (*curpos)--;
5909             sb = getSB_VAL_CP(*(*curpos - 1));
5910         } while (sb == SB_Extend || sb == SB_Format);
5911     }
5912 
5913     return sb;
5914 }
5915 
5916 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5917 S_isWB(pTHX_ WB_enum previous,
5918              WB_enum before,
5919              WB_enum after,
5920              const U8 * const strbeg,
5921              const U8 * const curpos,
5922              const U8 * const strend,
5923              const bool utf8_target)
5924 {
5925     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5926      *  a Unicode word break, using their published algorithm, but tailored for
5927      *  Perl by treating spans of white space as one unit.  Context may be
5928      *  needed to make this determination.  If the value for the character
5929      *  before 'before' is known, it is passed as 'previous'; otherwise that
5930      *  should be set to WB_UNKNOWN.  The other input parameters give the
5931      *  boundaries and current position in the matching of the string.  That
5932      *  is, 'curpos' marks the position where the character whose wb value is
5933      *  'after' begins.  See https://www.unicode.org/reports/tr29/ */
5934 
5935     U8 * before_pos = (U8 *) curpos;
5936     U8 * after_pos = (U8 *) curpos;
5937     WB_enum prev = before;
5938     WB_enum next;
5939 
5940     PERL_ARGS_ASSERT_ISWB;
5941 
5942     /* Rule numbers in the comments below are as of Unicode 9.0 */
5943 
5944   redo:
5945     before = prev;
5946     switch (WB_table[before][after]) {
5947         case WB_BREAKABLE:
5948             return TRUE;
5949 
5950         case WB_NOBREAK:
5951             return FALSE;
5952 
5953         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5954             next = advance_one_WB(&after_pos, strend, utf8_target,
5955                                  FALSE /* Don't skip Extend nor Format */ );
5956             /* A space immediately preceding an Extend or Format is attached
5957              * to by them, and hence gets separated from previous spaces.
5958              * Otherwise don't break between horizontal white space */
5959             return next == WB_Extend || next == WB_Format;
5960 
5961         /* WB4 Ignore Format and Extend characters, except when they appear at
5962          * the beginning of a region of text.  This code currently isn't
5963          * general purpose, but it works as the rules are currently and likely
5964          * to be laid out.  The reason it works is that when 'they appear at
5965          * the beginning of a region of text', the rule is to break before
5966          * them, just like any other character.  Therefore, the default rule
5967          * applies and we don't have to look in more depth.  Should this ever
5968          * change, we would have to have 2 'case' statements, like in the rules
5969          * below, and backup a single character (not spacing over the extend
5970          * ones) and then see if that is one of the region-end characters and
5971          * go from there */
5972         case WB_Ex_or_FO_or_ZWJ_then_foo:
5973             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5974             goto redo;
5975 
5976         case WB_DQ_then_HL + WB_BREAKABLE:
5977         case WB_DQ_then_HL + WB_NOBREAK:
5978 
5979             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5980 
5981             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5982                                                             == WB_Hebrew_Letter)
5983             {
5984                 return FALSE;
5985             }
5986 
5987              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5988 
5989         case WB_HL_then_DQ + WB_BREAKABLE:
5990         case WB_HL_then_DQ + WB_NOBREAK:
5991 
5992             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5993 
5994             if (advance_one_WB(&after_pos, strend, utf8_target,
5995                                        TRUE /* Do skip Extend and Format */ )
5996                                                             == WB_Hebrew_Letter)
5997             {
5998                 return FALSE;
5999             }
6000 
6001             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
6002 
6003         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
6004         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
6005 
6006             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
6007              *       | Single_Quote) (ALetter | Hebrew_Letter) */
6008 
6009             next = advance_one_WB(&after_pos, strend, utf8_target,
6010                                        TRUE /* Do skip Extend and Format */ );
6011 
6012             if (next == WB_ALetter || next == WB_Hebrew_Letter)
6013             {
6014                 return FALSE;
6015             }
6016 
6017             return WB_table[before][after]
6018                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
6019 
6020         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
6021         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
6022 
6023             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
6024              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
6025 
6026             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
6027             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
6028             {
6029                 return FALSE;
6030             }
6031 
6032             return WB_table[before][after]
6033                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
6034 
6035         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
6036         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
6037 
6038             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
6039              * */
6040 
6041             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
6042                                                             == WB_Numeric)
6043             {
6044                 return FALSE;
6045             }
6046 
6047             return WB_table[before][after]
6048                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
6049 
6050         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
6051         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
6052 
6053             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
6054 
6055             if (advance_one_WB(&after_pos, strend, utf8_target,
6056                                        TRUE /* Do skip Extend and Format */ )
6057                                                             == WB_Numeric)
6058             {
6059                 return FALSE;
6060             }
6061 
6062             return WB_table[before][after]
6063                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
6064 
6065         case WB_RI_then_RI + WB_NOBREAK:
6066         case WB_RI_then_RI + WB_BREAKABLE:
6067             {
6068                 int RI_count = 1;
6069 
6070                 /* Do not break within emoji flag sequences. That is, do not
6071                  * break between regional indicator (RI) symbols if there is an
6072                  * odd number of RI characters before the potential break
6073                  * point.
6074                  *
6075                  * WB15   sot (RI RI)* RI × RI
6076                  * WB16 [^RI] (RI RI)* RI × RI */
6077 
6078                 while (backup_one_WB(&previous,
6079                                      strbeg,
6080                                      &before_pos,
6081                                      utf8_target) == WB_Regional_Indicator)
6082                 {
6083                     RI_count++;
6084                 }
6085 
6086                 return RI_count % 2 != 1;
6087             }
6088 
6089         default:
6090             break;
6091     }
6092 
6093 #ifdef DEBUGGING
6094     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
6095                                   before, after, WB_table[before][after]);
6096     assert(0);
6097 #endif
6098     return TRUE;
6099 }
6100 
6101 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)6102 S_advance_one_WB(pTHX_ U8 ** curpos,
6103                        const U8 * const strend,
6104                        const bool utf8_target,
6105                        const bool skip_Extend_Format)
6106 {
6107     WB_enum wb;
6108 
6109     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6110 
6111     if (*curpos >= strend) {
6112         return WB_EDGE;
6113     }
6114 
6115     if (utf8_target) {
6116 
6117         /* Advance over Extend and Format */
6118         do {
6119             *curpos += UTF8SKIP(*curpos);
6120             if (*curpos >= strend) {
6121                 return WB_EDGE;
6122             }
6123             wb = getWB_VAL_UTF8(*curpos, strend);
6124         } while (    skip_Extend_Format
6125                  && (wb == WB_Extend || wb == WB_Format));
6126     }
6127     else {
6128         do {
6129             (*curpos)++;
6130             if (*curpos >= strend) {
6131                 return WB_EDGE;
6132             }
6133             wb = getWB_VAL_CP(**curpos);
6134         } while (    skip_Extend_Format
6135                  && (wb == WB_Extend || wb == WB_Format));
6136     }
6137 
6138     return wb;
6139 }
6140 
6141 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)6142 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6143 {
6144     WB_enum wb;
6145 
6146     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6147 
6148     /* If we know what the previous character's break value is, don't have
6149         * to look it up */
6150     if (*previous != WB_UNKNOWN) {
6151         wb = *previous;
6152 
6153         /* But we need to move backwards by one */
6154         if (utf8_target) {
6155             *curpos = reghopmaybe3(*curpos, -1, strbeg);
6156             if (! *curpos) {
6157                 *previous = WB_EDGE;
6158                 *curpos = (U8 *) strbeg;
6159             }
6160             else {
6161                 *previous = WB_UNKNOWN;
6162             }
6163         }
6164         else {
6165             (*curpos)--;
6166             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6167         }
6168 
6169         /* And we always back up over these three types */
6170         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6171             return wb;
6172         }
6173     }
6174 
6175     if (*curpos < strbeg) {
6176         return WB_EDGE;
6177     }
6178 
6179     if (utf8_target) {
6180         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6181         if (! prev_char_pos) {
6182             return WB_EDGE;
6183         }
6184 
6185         /* Back up over Extend and Format.  curpos is always just to the right
6186          * of the character whose value we are getting */
6187         do {
6188             U8 * prev_prev_char_pos;
6189             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6190                                                    -1,
6191                                                    strbeg)))
6192             {
6193                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6194                 *curpos = prev_char_pos;
6195                 prev_char_pos = prev_prev_char_pos;
6196             }
6197             else {
6198                 *curpos = (U8 *) strbeg;
6199                 return WB_EDGE;
6200             }
6201         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6202     }
6203     else {
6204         do {
6205             if (*curpos - 2 < strbeg) {
6206                 *curpos = (U8 *) strbeg;
6207                 return WB_EDGE;
6208             }
6209             (*curpos)--;
6210             wb = getWB_VAL_CP(*(*curpos - 1));
6211         } while (wb == WB_Extend || wb == WB_Format);
6212     }
6213 
6214     return wb;
6215 }
6216 
6217 /* Macros for regmatch(), using its internal variables */
6218 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6219 #define NEXTCHR_IS_EOS (nextbyte < 0)
6220 
6221 #define SET_nextchr \
6222     nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6223 
6224 #define SET_locinput(p) \
6225     locinput = (p);  \
6226     SET_nextchr
6227 
6228 #define sayYES goto yes
6229 #define sayNO goto no
6230 #define sayNO_SILENT goto no_silent
6231 
6232 /* we don't use STMT_START/END here because it leads to
6233    "unreachable code" warnings, which are bogus, but distracting. */
6234 #define CACHEsayNO \
6235     if (ST.cache_mask) \
6236        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6237     sayNO
6238 
6239 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6240 (                                                           \
6241     (   ( st )                                         ) && \
6242     (   ( st )->u.eval.close_paren                     ) && \
6243     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6244 )
6245 
6246 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6247 (                                                           \
6248     (   ( st )                                         ) && \
6249     (   ( st )->u.eval.close_paren                     ) && \
6250     (   ( expr )                                       ) && \
6251     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6252 )
6253 
6254 
6255 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6256     (st)->u.eval.close_paren = ( (expr) + 1 )
6257 
6258 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6259     (st)->u.eval.close_paren = 0
6260 
6261 /* push a new state then goto it */
6262 
6263 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6264     pushinput = input; \
6265     pusheol = eol; \
6266     pushsr0 = sr0; \
6267     scan = node; \
6268     st->resume_state = state; \
6269     goto push_state;
6270 
6271 /* push a new state with success backtracking, then goto it */
6272 
6273 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6274     pushinput = input; \
6275     pusheol = eol;     \
6276     pushsr0 = sr0; \
6277     scan = node; \
6278     st->resume_state = state; \
6279     goto push_yes_state;
6280 
6281 #define DEBUG_STATE_pp(pp)                                  \
6282     DEBUG_STATE_r({                                         \
6283         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6284         Perl_re_printf( aTHX_                               \
6285             "%*s" pp " %s%s%s%s%s\n",                       \
6286             INDENT_CHARS(depth), "",                        \
6287             REGNODE_NAME(st->resume_state),                  \
6288             ((st==yes_state||st==mark_state) ? "[" : ""),   \
6289             ((st==yes_state) ? "Y" : ""),                   \
6290             ((st==mark_state) ? "M" : ""),                  \
6291             ((st==yes_state||st==mark_state) ? "]" : "")    \
6292         );                                                  \
6293     });
6294 
6295 /*
6296 
6297 regmatch() - main matching routine
6298 
6299 This is basically one big switch statement in a loop. We execute an op,
6300 set 'next' to point the next op, and continue. If we come to a point which
6301 we may need to backtrack to on failure such as (A|B|C), we push a
6302 backtrack state onto the backtrack stack. On failure, we pop the top
6303 state, and re-enter the loop at the state indicated. If there are no more
6304 states to pop, we return failure.
6305 
6306 Sometimes we also need to backtrack on success; for example /A+/, where
6307 after successfully matching one A, we need to go back and try to
6308 match another one; similarly for lookahead assertions: if the assertion
6309 completes successfully, we backtrack to the state just before the assertion
6310 and then carry on.  In these cases, the pushed state is marked as
6311 'backtrack on success too'. This marking is in fact done by a chain of
6312 pointers, each pointing to the previous 'yes' state. On success, we pop to
6313 the nearest yes state, discarding any intermediate failure-only states.
6314 Sometimes a yes state is pushed just to force some cleanup code to be
6315 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6316 it to free the inner regex.
6317 
6318 Note that failure backtracking rewinds the cursor position, while
6319 success backtracking leaves it alone.
6320 
6321 A pattern is complete when the END op is executed, while a subpattern
6322 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6323 ops trigger the "pop to last yes state if any, otherwise return true"
6324 behaviour.
6325 
6326 A common convention in this function is to use A and B to refer to the two
6327 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6328 the subpattern to be matched possibly multiple times, while B is the entire
6329 rest of the pattern. Variable and state names reflect this convention.
6330 
6331 The states in the main switch are the union of ops and failure/success of
6332 substates associated with that op.  For example, IFMATCH is the op
6333 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6334 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6335 successfully matched A and IFMATCH_A_fail is a state saying that we have
6336 just failed to match A. Resume states always come in pairs. The backtrack
6337 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6338 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6339 on success or failure.
6340 
6341 The struct that holds a backtracking state is actually a big union, with
6342 one variant for each major type of op. The variable st points to the
6343 top-most backtrack struct. To make the code clearer, within each
6344 block of code we #define ST to alias the relevant union.
6345 
6346 Here's a concrete example of a (vastly oversimplified) IFMATCH
6347 implementation:
6348 
6349     switch (state) {
6350     ....
6351 
6352 #define ST st->u.ifmatch
6353 
6354     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6355         ST.foo = ...; // some state we wish to save
6356         ...
6357         // push a yes backtrack state with a resume value of
6358         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6359         // first node of A:
6360         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6361         // NOTREACHED
6362 
6363     case IFMATCH_A: // we have successfully executed A; now continue with B
6364         next = B;
6365         bar = ST.foo; // do something with the preserved value
6366         break;
6367 
6368     case IFMATCH_A_fail: // A failed, so the assertion failed
6369         ...;   // do some housekeeping, then ...
6370         sayNO; // propagate the failure
6371 
6372 #undef ST
6373 
6374     ...
6375     }
6376 
6377 For any old-timers reading this who are familiar with the old recursive
6378 approach, the code above is equivalent to:
6379 
6380     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6381     {
6382         int foo = ...
6383         ...
6384         if (regmatch(A)) {
6385             next = B;
6386             bar = foo;
6387             break;
6388         }
6389         ...;   // do some housekeeping, then ...
6390         sayNO; // propagate the failure
6391     }
6392 
6393 The topmost backtrack state, pointed to by st, is usually free. If you
6394 want to claim it, populate any ST.foo fields in it with values you wish to
6395 save, then do one of
6396 
6397         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6398         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6399 
6400 which sets that backtrack state's resume value to 'resume_state', pushes a
6401 new free entry to the top of the backtrack stack, then goes to 'node'.
6402 On backtracking, the free slot is popped, and the saved state becomes the
6403 new free state. An ST.foo field in this new top state can be temporarily
6404 accessed to retrieve values, but once the main loop is re-entered, it
6405 becomes available for reuse.
6406 
6407 Note that the depth of the backtrack stack constantly increases during the
6408 left-to-right execution of the pattern, rather than going up and down with
6409 the pattern nesting. For example the stack is at its maximum at Z at the
6410 end of the pattern, rather than at X in the following:
6411 
6412     /(((X)+)+)+....(Y)+....Z/
6413 
6414 The only exceptions to this are lookahead/behind assertions and the cut,
6415 (?>A), which pop all the backtrack states associated with A before
6416 continuing.
6417 
6418 Backtrack state structs are allocated in slabs of about 4K in size.
6419 PL_regmatch_state and st always point to the currently active state,
6420 and PL_regmatch_slab points to the slab currently containing
6421 PL_regmatch_state.  The first time regmatch() is called, the first slab is
6422 allocated, and is never freed until interpreter destruction. When the slab
6423 is full, a new one is allocated and chained to the end. At exit from
6424 regmatch(), slabs allocated since entry are freed.
6425 
6426 In order to work with variable length lookbehinds, an upper limit is placed on
6427 lookbehinds which is set to where the match position is at the end of where the
6428 lookbehind would get to.  Nothing in the lookbehind should match above that,
6429 except we should be able to look beyond if for things like \b, which need the
6430 next character in the string to be able to determine if this is a boundary or
6431 not.  We also can't match the end of string/line unless we are also at the end
6432 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6433 that match a width, we have to add a condition that they are within the legal
6434 bounds of our window into the string.
6435 
6436 */
6437 
6438 /* returns -1 on failure, $+[0] on success */
6439 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6440 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6441 {
6442     const bool utf8_target = reginfo->is_utf8_target;
6443     const U32 uniflags = UTF8_ALLOW_DEFAULT;
6444     REGEXP *rex_sv = reginfo->prog;
6445     regexp *rex = ReANY(rex_sv);
6446     RXi_GET_DECL(rex,rexi);
6447     /* the current state. This is a cached copy of PL_regmatch_state */
6448     regmatch_state *st;
6449     /* cache heavy used fields of st in registers */
6450     regnode *scan;
6451     regnode *next;
6452     U32 n = 0;	/* general value; init to avoid compiler warning */
6453     U32 utmp = 0;  /* tmp variable - valid for at most one opcode */
6454     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6455     SSize_t endref = 0; /* offset of end of backref when ln is start */
6456     char *locinput = startpos;
6457     char *loceol = reginfo->strend;
6458     char *pushinput; /* where to continue after a PUSH */
6459     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6460     U8   *pushsr0;   /* save starting pos of script run */
6461     PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6462                                      at EOS */
6463 
6464     bool result = 0;	    /* return value of S_regmatch */
6465     U32 depth = 0;            /* depth of backtrack stack */
6466     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6467     const U32 max_nochange_depth =
6468         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6469         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6470     regmatch_state *yes_state = NULL; /* state to pop to on success of
6471                                                             subpattern */
6472     /* mark_state piggy backs on the yes_state logic so that when we unwind
6473        the stack on success we can update the mark_state as we go */
6474     regmatch_state *mark_state = NULL; /* last mark state we have seen */
6475     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6476     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6477     U32 state_num;
6478     bool no_final = 0;      /* prevent failure from backtracking? */
6479     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6480     char *startpoint = locinput;
6481     SV *popmark = NULL;     /* are we looking for a mark? */
6482     SV *sv_commit = NULL;   /* last mark name seen in failure */
6483     SV *sv_yes_mark = NULL; /* last mark name we have seen
6484                                during a successful match */
6485     U32 lastopen = 0;       /* last open we saw */
6486     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6487     SV* const oreplsv = GvSVn(PL_replgv);
6488     /* these three flags are set by various ops to signal information to
6489      * the very next op. They have a useful lifetime of exactly one loop
6490      * iteration, and are not preserved or restored by state pushes/pops
6491      */
6492     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
6493     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
6494     int logical = 0;	    /* the following EVAL is:
6495                                 0: (?{...})
6496                                 1: (?(?{...})X|Y)
6497                                 2: (??{...})
6498                                or the following IFMATCH/UNLESSM is:
6499                                 false: plain (?=foo)
6500                                 true:  used as a condition: (?(?=foo))
6501                             */
6502     PAD* last_pad = NULL;
6503     dMULTICALL;
6504     U8 gimme = G_SCALAR;
6505     CV *caller_cv = NULL;	/* who called us */
6506     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
6507     U32 maxopenparen = 0;       /* max '(' index seen so far */
6508     int to_complement;  /* Invert the result? */
6509     char_class_number_ classnum;
6510     bool is_utf8_pat = reginfo->is_utf8_pat;
6511     bool match = FALSE;
6512     I32 orig_savestack_ix = PL_savestack_ix;
6513     U8 * script_run_begin = NULL;
6514     char *match_end= NULL; /* where a match MUST end to be considered successful */
6515     bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6516     re_fold_t folder = NULL;  /* used by various EXACTish regops */
6517     const U8 * fold_array = NULL; /* used by various EXACTish regops */
6518 
6519 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6520 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6521 #  define SOLARIS_BAD_OPTIMIZER
6522     const U32 *pl_charclass_dup = PL_charclass;
6523 #  define PL_charclass pl_charclass_dup
6524 #endif
6525 
6526 #ifdef DEBUGGING
6527     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6528 #endif
6529 
6530     /* protect against undef(*^R) */
6531     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6532 
6533     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6534     multicall_oldcatch = 0;
6535     PERL_UNUSED_VAR(multicall_cop);
6536 
6537     PERL_ARGS_ASSERT_REGMATCH;
6538 
6539     st = PL_regmatch_state;
6540 
6541     /* Note that nextbyte is a byte even in UTF */
6542     SET_nextchr;
6543     scan = prog;
6544 
6545     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6546             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6547             Perl_re_printf( aTHX_ "regmatch start\n" );
6548     }));
6549 
6550     while (scan != NULL) {
6551         next = scan + NEXT_OFF(scan);
6552         if (next == scan)
6553             next = NULL;
6554         state_num = OP(scan);
6555 
6556       reenter_switch:
6557         DEBUG_EXECUTE_r(
6558             if (state_num <= REGNODE_MAX) {
6559                 SV * const prop = sv_newmortal();
6560                 regnode *rnext = regnext(scan);
6561 
6562                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6563                 regprop(rex, prop, scan, reginfo, NULL);
6564                 Perl_re_printf( aTHX_
6565                     "%*s%" IVdf ":%s(%" IVdf ")\n",
6566                     INDENT_CHARS(depth), "",
6567                     (IV)(scan - rexi->program),
6568                     SvPVX_const(prop),
6569                     (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6570                         0 : (IV)(rnext - rexi->program));
6571             }
6572         );
6573 
6574         to_complement = 0;
6575 
6576         SET_nextchr;
6577         assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6578 
6579         switch (state_num) {
6580             SV * anyofh_list;
6581 
6582         case SBOL: /*  /^../ and /\A../  */
6583             if (locinput == reginfo->strbeg)
6584                 break;
6585             sayNO;
6586 
6587         case MBOL: /*  /^../m  */
6588             if (locinput == reginfo->strbeg ||
6589                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6590             {
6591                 break;
6592             }
6593             sayNO;
6594 
6595         case GPOS: /*  \G  */
6596             if (locinput == reginfo->ganch)
6597                 break;
6598             sayNO;
6599 
6600         case KEEPS: /*   \K  */
6601             /* update the startpoint */
6602             st->u.keeper.val = RXp_OFFS_START(rex,0);
6603             RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg;
6604             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6605                             script_run_begin);
6606             NOT_REACHED; /* NOTREACHED */
6607 
6608         case KEEPS_next_fail:
6609             /* rollback the start point change */
6610             RXp_OFFSp(rex)[0].start = st->u.keeper.val;
6611             sayNO_SILENT;
6612             NOT_REACHED; /* NOTREACHED */
6613 
6614         case MEOL: /* /..$/m  */
6615             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6616                 sayNO;
6617             break;
6618 
6619         case SEOL: /* /..$/  */
6620             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6621                 sayNO;
6622             if (reginfo->strend - locinput > 1)
6623                 sayNO;
6624             break;
6625 
6626         case EOS: /*  \z  */
6627             if (!NEXTCHR_IS_EOS)
6628                 sayNO;
6629             break;
6630 
6631         case SANY: /*  /./s  */
6632             if (NEXTCHR_IS_EOS || locinput >= loceol)
6633                 sayNO;
6634             goto increment_locinput;
6635 
6636         case REG_ANY: /*  /./  */
6637             if (   NEXTCHR_IS_EOS
6638                 || locinput >= loceol
6639                 || nextbyte == '\n')
6640             {
6641                 sayNO;
6642             }
6643             goto increment_locinput;
6644 
6645 
6646 #undef  ST
6647 #define ST st->u.trie
6648         case TRIEC: /* (ab|cd) with known charclass */
6649             /* In this case the charclass data is available inline so
6650                we can fail fast without a lot of extra overhead.
6651              */
6652             if ( !   NEXTCHR_IS_EOS
6653                 &&   locinput < loceol
6654                 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6655             {
6656                 DEBUG_EXECUTE_r(
6657                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6658                               depth, PL_colors[4], PL_colors[5])
6659                 );
6660                 sayNO_SILENT;
6661                 NOT_REACHED; /* NOTREACHED */
6662             }
6663             /* FALLTHROUGH */
6664         case TRIE:  /* (ab|cd)  */
6665             /* the basic plan of execution of the trie is:
6666              * At the beginning, run though all the states, and
6667              * find the longest-matching word. Also remember the position
6668              * of the shortest matching word. For example, this pattern:
6669              *    1  2 3 4    5
6670              *    ab|a|x|abcd|abc
6671              * when matched against the string "abcde", will generate
6672              * accept states for all words except 3, with the longest
6673              * matching word being 4, and the shortest being 2 (with
6674              * the position being after char 1 of the string).
6675              *
6676              * Then for each matching word, in word order (i.e. 1,2,4,5),
6677              * we run the remainder of the pattern; on each try setting
6678              * the current position to the character following the word,
6679              * returning to try the next word on failure.
6680              *
6681              * We avoid having to build a list of words at runtime by
6682              * using a compile-time structure, wordinfo[].prev, which
6683              * gives, for each word, the previous accepting word (if any).
6684              * In the case above it would contain the mappings 1->2, 2->0,
6685              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6686              * the longest word (4 above), a list of all words, by
6687              * following the list of prev pointers; this gives us the
6688              * unordered list 4,5,1,2. Then given the current word we have
6689              * just tried, we can go through the list and find the
6690              * next-biggest word to try (so if we just failed on word 2,
6691              * the next in the list is 4).
6692              *
6693              * Since at runtime we don't record the matching position in
6694              * the string for each word, we have to work that out for
6695              * each word we're about to process. The wordinfo table holds
6696              * the character length of each word; given that we recorded
6697              * at the start: the position of the shortest word and its
6698              * length in chars, we just need to move the pointer the
6699              * difference between the two char lengths. Depending on
6700              * Unicode status and folding, that's cheap or expensive.
6701              *
6702              * This algorithm is optimised for the case where are only a
6703              * small number of accept states, i.e. 0,1, or maybe 2.
6704              * With lots of accepts states, and having to try all of them,
6705              * it becomes quadratic on number of accept states to find all
6706              * the next words.
6707              */
6708 
6709             {
6710                 /* what type of TRIE am I? (utf8 makes this contextual) */
6711                 DECL_TRIE_TYPE(scan);
6712 
6713                 /* what trie are we using right now */
6714                 reg_trie_data * const trie
6715                     = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ];
6716                 ST.before_paren = trie->before_paren;
6717                 ST.after_paren = trie->after_paren;
6718                 assert(ST.before_paren<=rex->nparens);
6719                 assert(ST.after_paren<=rex->nparens);
6720 
6721                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]);
6722                 U32 state = trie->startstate;
6723 
6724                 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) {
6725                     CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6726                     if (utf8_target
6727                         && ! NEXTCHR_IS_EOS
6728                         && UTF8_IS_ABOVE_LATIN1(nextbyte)
6729                         && FLAGS(scan) == EXACTL)
6730                     {
6731                         /* We only output for EXACTL, as we let the folder
6732                          * output this message for EXACTFLU8 to avoid
6733                          * duplication */
6734                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6735                                                                reginfo->strend);
6736                     }
6737                 }
6738                 if (   trie->bitmap
6739                     && (     NEXTCHR_IS_EOS
6740                         ||   locinput >= loceol
6741                         || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6742                 {
6743                     if (trie->states[ state ].wordnum) {
6744                          DEBUG_EXECUTE_r(
6745                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6746                                           depth, PL_colors[4], PL_colors[5])
6747                         );
6748                         if (!trie->jump)
6749                             break;
6750                     } else {
6751                         DEBUG_EXECUTE_r(
6752                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6753                                           depth, PL_colors[4], PL_colors[5])
6754                         );
6755                         sayNO_SILENT;
6756                    }
6757                 }
6758 
6759             {
6760                 U8 *uc = ( U8* )locinput;
6761 
6762                 STRLEN len = 0;
6763                 STRLEN foldlen = 0;
6764                 U8 *uscan = (U8*)NULL;
6765                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6766                 U32 charcount = 0; /* how many input chars we have matched */
6767                 U32 accepted = 0; /* have we seen any accepting states? */
6768 
6769                 ST.jump = trie->jump;
6770                 ST.j_before_paren = trie->j_before_paren;
6771                 ST.j_after_paren= trie->j_after_paren;
6772                 ST.me = scan;
6773                 ST.firstpos = NULL;
6774                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6775                 ST.nextword = 0;
6776 
6777                 /* fully traverse the TRIE; note the position of the
6778                    shortest accept state and the wordnum of the longest
6779                    accept state */
6780 
6781                 while ( state && uc <= (U8*)(loceol) ) {
6782                     U32 base = trie->states[ state ].trans.base;
6783                     UV uvc = 0;
6784                     U16 charid = 0;
6785                     U16 wordnum;
6786                     wordnum = trie->states[ state ].wordnum;
6787 
6788                     if (wordnum) { /* it's an accept state */
6789                         if (!accepted) {
6790                             accepted = 1;
6791                             /* record first match position */
6792                             if (ST.longfold) {
6793                                 ST.firstpos = (U8*)locinput;
6794                                 ST.firstchars = 0;
6795                             }
6796                             else {
6797                                 ST.firstpos = uc;
6798                                 ST.firstchars = charcount;
6799                             }
6800                         }
6801                         if (!ST.nextword || wordnum < ST.nextword)
6802                             ST.nextword = wordnum;
6803                         ST.topword = wordnum;
6804                     }
6805 
6806                     DEBUG_TRIE_EXECUTE_r({
6807                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6808                                 /* HERE */
6809                                 PerlIO_printf( Perl_debug_log,
6810                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6811                                     INDENT_CHARS(depth), "", PL_colors[4],
6812                                     (UV)state, (accepted ? 'Y' : 'N'));
6813                     });
6814 
6815                     /* read a char and goto next state */
6816                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6817                         I32 offset;
6818                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6819                                              (U8 *) loceol, uscan,
6820                                              len, uvc, charid, foldlen,
6821                                              foldbuf, uniflags);
6822                         charcount++;
6823                         if (foldlen>0)
6824                             ST.longfold = TRUE;
6825                         if (charid &&
6826                              ( ((offset =
6827                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6828 
6829                              && ((U32)offset < trie->lasttrans)
6830                              && trie->trans[offset].check == state)
6831                         {
6832                             state = trie->trans[offset].next;
6833                         }
6834                         else {
6835                             state = 0;
6836                         }
6837                         uc += len;
6838 
6839                     }
6840                     else {
6841                         state = 0;
6842                     }
6843                     DEBUG_TRIE_EXECUTE_r(
6844                         Perl_re_printf( aTHX_
6845                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6846                             charid, uvc, (UV)state, PL_colors[5] );
6847                     );
6848                 }
6849                 if (!accepted)
6850                    sayNO;
6851 
6852                 /* calculate total number of accept states */
6853                 {
6854                     U16 w = ST.topword;
6855                     accepted = 0;
6856                     while (w) {
6857                         w = trie->wordinfo[w].prev;
6858                         accepted++;
6859                     }
6860                     ST.accepted = accepted;
6861                 }
6862 
6863                 DEBUG_EXECUTE_r(
6864                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6865                         depth,
6866                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6867                 );
6868                 goto trie_first_try; /* jump into the fail handler */
6869             }}
6870             NOT_REACHED; /* NOTREACHED */
6871 
6872         case TRIE_next_fail: /* we failed - try next alternative */
6873         {
6874             U8 *uc;
6875             if (RE_PESSIMISTIC_PARENS) {
6876                 REGCP_UNWIND(ST.lastcp);
6877                 regcppop(rex,&maxopenparen);
6878             }
6879             if ( ST.jump ) {
6880                 /* undo any captures done in the tail part of a branch,
6881                  * e.g.
6882                  *    /(?:X(.)(.)|Y(.)).../
6883                  * where the trie just matches X then calls out to do the
6884                  * rest of the branch */
6885                 REGCP_UNWIND(ST.cp);
6886                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6887                 if (ST.after_paren) {
6888                     assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
6889                     CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail");
6890                 }
6891             }
6892             if (!--ST.accepted) {
6893                 DEBUG_EXECUTE_r({
6894                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6895                         depth,
6896                         PL_colors[4],
6897                         PL_colors[5] );
6898                 });
6899                 sayNO_SILENT;
6900             }
6901             {
6902                 /* Find next-highest word to process.  Note that this code
6903                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6904                 U16 min = 0;
6905                 U16 word;
6906                 U16 const nextword = ST.nextword;
6907                 reg_trie_wordinfo * const wordinfo
6908                     = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo;
6909                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6910                     if (word > nextword && (!min || word < min))
6911                         min = word;
6912                 }
6913                 ST.nextword = min;
6914             }
6915 
6916           trie_first_try:
6917             if (do_cutgroup) {
6918                 do_cutgroup = 0;
6919                 no_final = 0;
6920             }
6921 
6922             if ( ST.jump ) {
6923                 ST.lastparen = RXp_LASTPAREN(rex);
6924                 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
6925                 REGCP_SET(ST.cp);
6926             }
6927 
6928             /* find start char of end of current word */
6929             {
6930                 U32 chars; /* how many chars to skip */
6931                 reg_trie_data * const trie
6932                     = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
6933 
6934                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6935                             >=  ST.firstchars);
6936                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6937                             - ST.firstchars;
6938                 uc = ST.firstpos;
6939 
6940                 if (ST.longfold) {
6941                     /* the hard option - fold each char in turn and find
6942                      * its folded length (which may be different */
6943                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6944                     STRLEN foldlen;
6945                     STRLEN len;
6946                     UV uvc;
6947                     U8 *uscan;
6948 
6949                     while (chars) {
6950                         if (utf8_target) {
6951                             /* XXX This assumes the length is well-formed, as
6952                              * does the UTF8SKIP below */
6953                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6954                                                     uniflags);
6955                             uc += len;
6956                         }
6957                         else {
6958                             uvc = *uc;
6959                             uc++;
6960                         }
6961                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6962                         uscan = foldbuf;
6963                         while (foldlen) {
6964                             if (!--chars)
6965                                 break;
6966                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6967                                                  uniflags);
6968                             uscan += len;
6969                             foldlen -= len;
6970                         }
6971                     }
6972                 }
6973                 else {
6974                     if (utf8_target)
6975                         uc = utf8_hop(uc, chars);
6976                     else
6977                         uc += chars;
6978                 }
6979             }
6980             if (ST.jump && ST.jump[ST.nextword]) {
6981                 scan = ST.me + ST.jump[ST.nextword];
6982                 ST.before_paren = ST.j_before_paren[ST.nextword];
6983                 assert(ST.before_paren <= rex->nparens);
6984                 ST.after_paren = ST.j_after_paren[ST.nextword];
6985                 assert(ST.after_paren <= rex->nparens);
6986             } else {
6987                 scan = ST.me + NEXT_OFF(ST.me);
6988             }
6989 
6990 
6991             DEBUG_EXECUTE_r({
6992                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6993                     depth,
6994                     PL_colors[4],
6995                     ST.nextword,
6996                     PL_colors[5]
6997                     );
6998             });
6999 
7000             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
7001                 if (RE_PESSIMISTIC_PARENS) {
7002                     (void)regcppush(rex, 0, maxopenparen);
7003                     REGCP_SET(ST.lastcp);
7004                 }
7005                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
7006                                 script_run_begin);
7007                 NOT_REACHED; /* NOTREACHED */
7008             }
7009             /* only one choice left - just continue */
7010             DEBUG_EXECUTE_r({
7011                 AV *const trie_words
7012                     = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]);
7013                 SV ** const tmp = trie_words
7014                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
7015                 SV *sv= tmp ? sv_newmortal() : NULL;
7016 
7017                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
7018                     depth, PL_colors[4],
7019                     ST.nextword,
7020                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
7021                             PL_colors[0], PL_colors[1],
7022                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
7023                         )
7024                     : "not compiled under -Dr",
7025                     PL_colors[5] );
7026             });
7027 
7028             locinput = (char*)uc;
7029             continue; /* execute rest of RE */
7030             /* NOTREACHED */
7031         }
7032 #undef  ST
7033 
7034         case LEXACT_REQ8:
7035             if (! utf8_target) {
7036                 sayNO;
7037             }
7038             /* FALLTHROUGH */
7039 
7040         case LEXACT:
7041         {
7042             char *s;
7043 
7044             s = STRINGl(scan);
7045             ln = STR_LENl(scan);
7046             goto join_short_long_exact;
7047 
7048         case EXACTL:             /*  /abc/l       */
7049             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7050 
7051             /* Complete checking would involve going through every character
7052              * matched by the string to see if any is above latin1.  But the
7053              * comparison otherwise might very well be a fast assembly
7054              * language routine, and I (khw) don't think slowing things down
7055              * just to check for this warning is worth it.  So this just checks
7056              * the first character */
7057             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
7058                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
7059             }
7060             goto do_exact;
7061         case EXACT_REQ8:
7062             if (! utf8_target) {
7063                 sayNO;
7064             }
7065             /* FALLTHROUGH */
7066 
7067         case EXACT:             /*  /abc/        */
7068           do_exact:
7069             s = STRINGs(scan);
7070             ln = STR_LENs(scan);
7071 
7072           join_short_long_exact:
7073             if (utf8_target != is_utf8_pat) {
7074                 /* The target and the pattern have differing utf8ness. */
7075                 char *l = locinput;
7076                 const char * const e = s + ln;
7077 
7078                 if (utf8_target) {
7079                     /* The target is utf8, the pattern is not utf8.
7080                      * Above-Latin1 code points can't match the pattern;
7081                      * invariants match exactly, and the other Latin1 ones need
7082                      * to be downgraded to a single byte in order to do the
7083                      * comparison.  (If we could be confident that the target
7084                      * is not malformed, this could be refactored to have fewer
7085                      * tests by just assuming that if the first bytes match, it
7086                      * is an invariant, but there are tests in the test suite
7087                      * dealing with (??{...}) which violate this) */
7088                     while (s < e) {
7089                         if (   l >= loceol
7090                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
7091                         {
7092                             sayNO;
7093                         }
7094                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
7095                             if (*l != *s) {
7096                                 sayNO;
7097                             }
7098                             l++;
7099                         }
7100                         else {
7101                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
7102                             {
7103                                 sayNO;
7104                             }
7105                             l += 2;
7106                         }
7107                         s++;
7108                     }
7109                 }
7110                 else {
7111                     /* The target is not utf8, the pattern is utf8. */
7112                     while (s < e) {
7113                         if (   l >= loceol
7114                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
7115                         {
7116                             sayNO;
7117                         }
7118                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
7119                             if (*s != *l) {
7120                                 sayNO;
7121                             }
7122                             s++;
7123                         }
7124                         else {
7125                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7126                             {
7127                                 sayNO;
7128                             }
7129                             s += 2;
7130                         }
7131                         l++;
7132                     }
7133                 }
7134                 locinput = l;
7135             }
7136             else {
7137                 /* The target and the pattern have the same utf8ness. */
7138                 /* Inline the first character, for speed. */
7139                 if (   loceol - locinput < ln
7140                     || UCHARAT(s) != nextbyte
7141                     || (ln > 1 && memNE(s, locinput, ln)))
7142                 {
7143                     sayNO;
7144                 }
7145                 locinput += ln;
7146             }
7147             break;
7148             }
7149 
7150         case EXACTFL:            /*  /abc/il      */
7151           {
7152             const char * s;
7153             U32 fold_utf8_flags;
7154 
7155             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7156             folder = Perl_foldEQ_locale;
7157             fold_array = PL_fold_locale;
7158             fold_utf8_flags = FOLDEQ_LOCALE;
7159             goto do_exactf;
7160 
7161         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
7162                                       is effectively /u; hence to match, target
7163                                       must be UTF-8. */
7164             if (! utf8_target) {
7165                 sayNO;
7166             }
7167             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7168                                              | FOLDEQ_S2_FOLDS_SANE;
7169             folder = S_foldEQ_latin1_s2_folded;
7170             fold_array = PL_fold_latin1;
7171             goto do_exactf;
7172 
7173         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
7174             if (! utf8_target) {
7175                 sayNO;
7176             }
7177             assert(is_utf8_pat);
7178             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7179 #ifdef DEBUGGING
7180             /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7181              * In theory neither of these variables should be used in this mode. */
7182             folder = NULL;
7183             fold_array = NULL;
7184 #endif
7185             goto do_exactf;
7186 
7187         case EXACTFUP:          /*  /foo/iu, and something is problematic in
7188                                     'foo' so can't take shortcuts. */
7189             assert(! is_utf8_pat);
7190             folder = Perl_foldEQ_latin1;
7191             fold_array = PL_fold_latin1;
7192             fold_utf8_flags = 0;
7193             goto do_exactf;
7194 
7195         case EXACTFU:            /*  /abc/iu      */
7196             folder = S_foldEQ_latin1_s2_folded;
7197             fold_array = PL_fold_latin1;
7198             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7199             goto do_exactf;
7200 
7201         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7202                                    patterns */
7203             assert(! is_utf8_pat);
7204             /* FALLTHROUGH */
7205         case EXACTFAA:            /*  /abc/iaa     */
7206             folder = S_foldEQ_latin1_s2_folded;
7207             fold_array = PL_fold_latin1;
7208             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7209             if (is_utf8_pat || ! utf8_target) {
7210 
7211                 /* The possible presence of a MICRO SIGN in the pattern forbids
7212                  * us to view a non-UTF-8 pattern as folded when there is a
7213                  * UTF-8 target */
7214                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7215                                   |FOLDEQ_S2_FOLDS_SANE;
7216             }
7217             goto do_exactf;
7218 
7219 
7220         case EXACTF:             /*  /abc/i    This node only generated for
7221                                                non-utf8 patterns */
7222             assert(! is_utf8_pat);
7223             folder = Perl_foldEQ;
7224             fold_array = PL_fold;
7225             fold_utf8_flags = 0;
7226 
7227           do_exactf:
7228             s = STRINGs(scan);
7229             ln = STR_LENs(scan);
7230 
7231             if (   utf8_target
7232                 || is_utf8_pat
7233                 || state_num == EXACTFUP
7234                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7235             {
7236               /* Either target or the pattern are utf8, or has the issue where
7237                * the fold lengths may differ. */
7238                 const char * const l = locinput;
7239                 char *e = loceol;
7240 
7241                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7242                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7243                 {
7244                     sayNO;
7245                 }
7246                 locinput = e;
7247                 break;
7248             }
7249 
7250             /* Neither the target nor the pattern are utf8 */
7251             assert(fold_array);
7252             if (UCHARAT(s) != nextbyte
7253                 && !NEXTCHR_IS_EOS
7254                 && UCHARAT(s) != fold_array[nextbyte])
7255             {
7256                 sayNO;
7257             }
7258             if (loceol - locinput < ln)
7259                 sayNO;
7260             assert(folder);
7261             if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7262                 sayNO;
7263             locinput += ln;
7264             break;
7265         }
7266 
7267         case NBOUNDL: /*  /\B/l  */
7268             to_complement = 1;
7269             /* FALLTHROUGH */
7270 
7271         case BOUNDL:  /*  /\b/l  */
7272         {
7273             bool b1, b2;
7274             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7275 
7276             if (FLAGS(scan) != TRADITIONAL_BOUND) {
7277                 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7278                 goto boundu;
7279             }
7280 
7281             if (utf8_target) {
7282                 if (locinput == reginfo->strbeg)
7283                     b1 = isWORDCHAR_LC('\n');
7284                 else {
7285                     U8 *p = reghop3((U8*)locinput, -1,
7286                                     (U8*)(reginfo->strbeg));
7287                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7288                 }
7289                 b2 = (NEXTCHR_IS_EOS)
7290                     ? isWORDCHAR_LC('\n')
7291                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7292                                               (U8*) reginfo->strend);
7293             }
7294             else { /* Here the string isn't utf8 */
7295                 b1 = (locinput == reginfo->strbeg)
7296                      ? isWORDCHAR_LC('\n')
7297                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
7298                 b2 = (NEXTCHR_IS_EOS)
7299                     ? isWORDCHAR_LC('\n')
7300                     : isWORDCHAR_LC(nextbyte);
7301             }
7302             if (to_complement ^ (b1 == b2)) {
7303                 sayNO;
7304             }
7305             break;
7306         }
7307 
7308         case NBOUND:  /*  /\B/   */
7309             to_complement = 1;
7310             /* FALLTHROUGH */
7311 
7312         case BOUND:   /*  /\b/   */
7313             if (utf8_target) {
7314                 goto bound_utf8;
7315             }
7316             goto bound_ascii_match_only;
7317 
7318         case NBOUNDA: /*  /\B/a  */
7319             to_complement = 1;
7320             /* FALLTHROUGH */
7321 
7322         case BOUNDA:  /*  /\b/a  */
7323         {
7324             bool b1, b2;
7325 
7326           bound_ascii_match_only:
7327             /* Here the string isn't utf8, or is utf8 and only ascii characters
7328              * are to match \w.  In the latter case looking at the byte just
7329              * prior to the current one may be just the final byte of a
7330              * multi-byte character.  This is ok.  There are two cases:
7331              * 1) it is a single byte character, and then the test is doing
7332              *    just what it's supposed to.
7333              * 2) it is a multi-byte character, in which case the final byte is
7334              *    never mistakable for ASCII, and so the test will say it is
7335              *    not a word character, which is the correct answer. */
7336             b1 = (locinput == reginfo->strbeg)
7337                  ? isWORDCHAR_A('\n')
7338                  : isWORDCHAR_A(UCHARAT(locinput - 1));
7339             b2 = (NEXTCHR_IS_EOS)
7340                 ? isWORDCHAR_A('\n')
7341                 : isWORDCHAR_A(nextbyte);
7342             if (to_complement ^ (b1 == b2)) {
7343                 sayNO;
7344             }
7345             break;
7346         }
7347 
7348         case NBOUNDU: /*  /\B/u  */
7349             to_complement = 1;
7350             /* FALLTHROUGH */
7351 
7352         case BOUNDU:  /*  /\b/u  */
7353 
7354           boundu:
7355             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7356                 match = FALSE;
7357             }
7358             else if (utf8_target) {
7359               bound_utf8:
7360                 switch((bound_type) FLAGS(scan)) {
7361                     case TRADITIONAL_BOUND:
7362                     {
7363                         bool b1, b2;
7364                         if (locinput == reginfo->strbeg) {
7365                             b1 = 0 /* isWORDCHAR_L1('\n') */;
7366                         }
7367                         else {
7368                             U8 *p = reghop3((U8*)locinput, -1,
7369                                             (U8*)(reginfo->strbeg));
7370 
7371                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7372                         }
7373                         b2 = (NEXTCHR_IS_EOS)
7374                             ? 0 /* isWORDCHAR_L1('\n') */
7375                             : isWORDCHAR_utf8_safe((U8*)locinput,
7376                                                    (U8*) reginfo->strend);
7377                         match = cBOOL(b1 != b2);
7378                         break;
7379                     }
7380                     case GCB_BOUND:
7381                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7382                             match = TRUE; /* GCB always matches at begin and
7383                                              end */
7384                         }
7385                         else {
7386                             /* Find the gcb values of previous and current
7387                              * chars, then see if is a break point */
7388                             match = isGCB(getGCB_VAL_UTF8(
7389                                                 reghop3((U8*)locinput,
7390                                                         -1,
7391                                                         (U8*)(reginfo->strbeg)),
7392                                                 (U8*) reginfo->strend),
7393                                           getGCB_VAL_UTF8((U8*) locinput,
7394                                                         (U8*) reginfo->strend),
7395                                           (U8*) reginfo->strbeg,
7396                                           (U8*) locinput,
7397                                           utf8_target);
7398                         }
7399                         break;
7400 
7401                     case LB_BOUND:
7402                         if (locinput == reginfo->strbeg) {
7403                             match = FALSE;
7404                         }
7405                         else if (NEXTCHR_IS_EOS) {
7406                             match = TRUE;
7407                         }
7408                         else {
7409                             match = isLB(getLB_VAL_UTF8(
7410                                                 reghop3((U8*)locinput,
7411                                                         -1,
7412                                                         (U8*)(reginfo->strbeg)),
7413                                                 (U8*) reginfo->strend),
7414                                           getLB_VAL_UTF8((U8*) locinput,
7415                                                         (U8*) reginfo->strend),
7416                                           (U8*) reginfo->strbeg,
7417                                           (U8*) locinput,
7418                                           (U8*) reginfo->strend,
7419                                           utf8_target);
7420                         }
7421                         break;
7422 
7423                     case SB_BOUND: /* Always matches at begin and end */
7424                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7425                             match = TRUE;
7426                         }
7427                         else {
7428                             match = isSB(getSB_VAL_UTF8(
7429                                                 reghop3((U8*)locinput,
7430                                                         -1,
7431                                                         (U8*)(reginfo->strbeg)),
7432                                                 (U8*) reginfo->strend),
7433                                           getSB_VAL_UTF8((U8*) locinput,
7434                                                         (U8*) reginfo->strend),
7435                                           (U8*) reginfo->strbeg,
7436                                           (U8*) locinput,
7437                                           (U8*) reginfo->strend,
7438                                           utf8_target);
7439                         }
7440                         break;
7441 
7442                     case WB_BOUND:
7443                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7444                             match = TRUE;
7445                         }
7446                         else {
7447                             match = isWB(WB_UNKNOWN,
7448                                          getWB_VAL_UTF8(
7449                                                 reghop3((U8*)locinput,
7450                                                         -1,
7451                                                         (U8*)(reginfo->strbeg)),
7452                                                 (U8*) reginfo->strend),
7453                                           getWB_VAL_UTF8((U8*) locinput,
7454                                                         (U8*) reginfo->strend),
7455                                           (U8*) reginfo->strbeg,
7456                                           (U8*) locinput,
7457                                           (U8*) reginfo->strend,
7458                                           utf8_target);
7459                         }
7460                         break;
7461                 }
7462             }
7463             else {  /* Not utf8 target */
7464                 switch((bound_type) FLAGS(scan)) {
7465                     case TRADITIONAL_BOUND:
7466                     {
7467                         bool b1, b2;
7468                         b1 = (locinput == reginfo->strbeg)
7469                             ? 0 /* isWORDCHAR_L1('\n') */
7470                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
7471                         b2 = (NEXTCHR_IS_EOS)
7472                             ? 0 /* isWORDCHAR_L1('\n') */
7473                             : isWORDCHAR_L1(nextbyte);
7474                         match = cBOOL(b1 != b2);
7475                         break;
7476                     }
7477 
7478                     case GCB_BOUND:
7479                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7480                             match = TRUE; /* GCB always matches at begin and
7481                                              end */
7482                         }
7483                         else {  /* Only CR-LF combo isn't a GCB in 0-255
7484                                    range */
7485                             match =    UCHARAT(locinput - 1) != '\r'
7486                                     || UCHARAT(locinput) != '\n';
7487                         }
7488                         break;
7489 
7490                     case LB_BOUND:
7491                         if (locinput == reginfo->strbeg) {
7492                             match = FALSE;
7493                         }
7494                         else if (NEXTCHR_IS_EOS) {
7495                             match = TRUE;
7496                         }
7497                         else {
7498                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7499                                          getLB_VAL_CP(UCHARAT(locinput)),
7500                                          (U8*) reginfo->strbeg,
7501                                          (U8*) locinput,
7502                                          (U8*) reginfo->strend,
7503                                          utf8_target);
7504                         }
7505                         break;
7506 
7507                     case SB_BOUND: /* Always matches at begin and end */
7508                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7509                             match = TRUE;
7510                         }
7511                         else {
7512                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7513                                          getSB_VAL_CP(UCHARAT(locinput)),
7514                                          (U8*) reginfo->strbeg,
7515                                          (U8*) locinput,
7516                                          (U8*) reginfo->strend,
7517                                          utf8_target);
7518                         }
7519                         break;
7520 
7521                     case WB_BOUND:
7522                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7523                             match = TRUE;
7524                         }
7525                         else {
7526                             match = isWB(WB_UNKNOWN,
7527                                          getWB_VAL_CP(UCHARAT(locinput -1)),
7528                                          getWB_VAL_CP(UCHARAT(locinput)),
7529                                          (U8*) reginfo->strbeg,
7530                                          (U8*) locinput,
7531                                          (U8*) reginfo->strend,
7532                                          utf8_target);
7533                         }
7534                         break;
7535                 }
7536             }
7537 
7538             if (to_complement ^ ! match) {
7539                 sayNO;
7540             }
7541             break;
7542 
7543         case ANYOFPOSIXL:
7544         case ANYOFL:  /*  /[abc]/l      */
7545             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7546             CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7547 
7548             /* FALLTHROUGH */
7549         case ANYOFD:  /*   /[abc]/d       */
7550         case ANYOF:  /*   /[abc]/       */
7551             if (NEXTCHR_IS_EOS || locinput >= loceol)
7552                 sayNO;
7553             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7554                 && ! ANYOF_FLAGS(scan)
7555                 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7556             {
7557                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7558                     sayNO;
7559                 }
7560                 locinput++;
7561             }
7562             else {
7563                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7564                                                                    utf8_target))
7565                 {
7566                     sayNO;
7567                 }
7568                 goto increment_locinput;
7569             }
7570             break;
7571 
7572         case ANYOFM:
7573             if (   NEXTCHR_IS_EOS
7574                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan)
7575                 || locinput >= loceol)
7576             {
7577                 sayNO;
7578             }
7579             locinput++; /* ANYOFM is always single byte */
7580             break;
7581 
7582         case NANYOFM:
7583             if (   NEXTCHR_IS_EOS
7584                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan)
7585                 || locinput >= loceol)
7586             {
7587                 sayNO;
7588             }
7589             goto increment_locinput;
7590             break;
7591 
7592         case ANYOFH:
7593             if (   ! utf8_target
7594                 ||   NEXTCHR_IS_EOS
7595                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7596                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7597                 || ! _invlist_contains_cp(anyofh_list,
7598                                           utf8_to_uvchr_buf((U8 *) locinput,
7599                                                             (U8 *) loceol,
7600                                                             NULL)))
7601             {
7602                 sayNO;
7603             }
7604             goto increment_locinput;
7605             break;
7606 
7607         case ANYOFHb:
7608             if (   ! utf8_target
7609                 ||   NEXTCHR_IS_EOS
7610                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
7611                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7612                 || ! _invlist_contains_cp(anyofh_list,
7613                                           utf8_to_uvchr_buf((U8 *) locinput,
7614                                                             (U8 *) loceol,
7615                                                             NULL)))
7616             {
7617                 sayNO;
7618             }
7619             goto increment_locinput;
7620             break;
7621 
7622         case ANYOFHbbm:
7623             if (   ! utf8_target
7624                 ||   NEXTCHR_IS_EOS
7625                 ||   ANYOF_FLAGS(scan) != (U8) locinput[0]
7626                 ||   locinput >= reginfo->strend
7627                 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7628                                    (U8) locinput[1] & UTF_CONTINUATION_MASK))
7629             {
7630                 sayNO;
7631             }
7632             goto increment_locinput;
7633             break;
7634 
7635         case ANYOFHr:
7636             if (   ! utf8_target
7637                 ||   NEXTCHR_IS_EOS
7638                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7639                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7640                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7641                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7642                 || ! _invlist_contains_cp(anyofh_list,
7643                                           utf8_to_uvchr_buf((U8 *) locinput,
7644                                                             (U8 *) loceol,
7645                                                             NULL)))
7646             {
7647                 sayNO;
7648             }
7649             goto increment_locinput;
7650             break;
7651 
7652         case ANYOFHs:
7653             if (   ! utf8_target
7654                 ||   NEXTCHR_IS_EOS
7655                 ||   loceol - locinput < FLAGS(scan)
7656                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7657                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7658                 || ! _invlist_contains_cp(anyofh_list,
7659                                           utf8_to_uvchr_buf((U8 *) locinput,
7660                                                             (U8 *) loceol,
7661                                                             NULL)))
7662             {
7663                 sayNO;
7664             }
7665             goto increment_locinput;
7666             break;
7667 
7668         case ANYOFR:
7669             if (NEXTCHR_IS_EOS) {
7670                 sayNO;
7671             }
7672 
7673             if (utf8_target) {
7674                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7675                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7676                                                 (U8 *) reginfo->strend,
7677                                                 NULL),
7678                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7679                 {
7680                     sayNO;
7681                 }
7682             }
7683             else {
7684                 if (! withinCOUNT((U8) *locinput,
7685                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7686                 {
7687                     sayNO;
7688                 }
7689             }
7690             goto increment_locinput;
7691             break;
7692 
7693         case ANYOFRb:
7694             if (NEXTCHR_IS_EOS) {
7695                 sayNO;
7696             }
7697 
7698             if (utf8_target) {
7699                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
7700                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7701                                                 (U8 *) reginfo->strend,
7702                                                 NULL),
7703                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
7704                 {
7705                     sayNO;
7706                 }
7707             }
7708             else {
7709                 if (! withinCOUNT((U8) *locinput,
7710                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7711                 {
7712                     sayNO;
7713                 }
7714             }
7715             goto increment_locinput;
7716             break;
7717 
7718         /* The argument (FLAGS) to all the POSIX node types is the class number
7719          * */
7720 
7721         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7722             to_complement = 1;
7723             /* FALLTHROUGH */
7724 
7725         case POSIXL:    /* \w or [:punct:] etc. under /l */
7726             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7727             if (NEXTCHR_IS_EOS || locinput >= loceol)
7728                 sayNO;
7729 
7730             /* Use isFOO_lc() for characters within Latin1.  (Note that
7731              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7732              * wouldn't be invariant) */
7733             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7734                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7735                     sayNO;
7736                 }
7737 
7738                 locinput++;
7739                 break;
7740             }
7741 
7742             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7743                 /* An above Latin-1 code point, or malformed */
7744                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7745                                                        reginfo->strend);
7746                 goto utf8_posix_above_latin1;
7747             }
7748 
7749             /* Here is a UTF-8 variant code point below 256 and the target is
7750              * UTF-8 */
7751             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7752                                             EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7753                                             *(locinput + 1))))))
7754             {
7755                 sayNO;
7756             }
7757 
7758             goto increment_locinput;
7759 
7760         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7761             to_complement = 1;
7762             /* FALLTHROUGH */
7763 
7764         case POSIXD:    /* \w or [:punct:] etc. under /d */
7765             if (utf8_target) {
7766                 goto utf8_posix;
7767             }
7768             goto posixa;
7769 
7770         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7771 
7772             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7773                 sayNO;
7774             }
7775 
7776             /* All UTF-8 variants match */
7777             if (! UTF8_IS_INVARIANT(nextbyte)) {
7778                 goto increment_locinput;
7779             }
7780 
7781             to_complement = 1;
7782             goto join_nposixa;
7783 
7784         case POSIXA:    /* \w or [:punct:] etc. under /a */
7785 
7786           posixa:
7787             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7788              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7789              * character is a single byte */
7790 
7791             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7792                 sayNO;
7793             }
7794 
7795           join_nposixa:
7796 
7797             if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7798                                                                 FLAGS(scan)))))
7799             {
7800                 sayNO;
7801             }
7802 
7803             /* Here we are either not in utf8, or we matched a utf8-invariant,
7804              * so the next char is the next byte */
7805             locinput++;
7806             break;
7807 
7808         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7809             to_complement = 1;
7810             /* FALLTHROUGH */
7811 
7812         case POSIXU:    /* \w or [:punct:] etc. under /u */
7813           utf8_posix:
7814             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7815                 sayNO;
7816             }
7817 
7818             /* Use generic_isCC_() for characters within Latin1.  (Note that
7819              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7820              * wouldn't be invariant) */
7821             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7822                 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7823                                                            FLAGS(scan)))))
7824                 {
7825                     sayNO;
7826                 }
7827                 locinput++;
7828             }
7829             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7830                 if (! (to_complement
7831                        ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7832                                                                *(locinput + 1)),
7833                                              FLAGS(scan)))))
7834                 {
7835                     sayNO;
7836                 }
7837                 locinput += 2;
7838             }
7839             else {  /* Handle above Latin-1 code points */
7840               utf8_posix_above_latin1:
7841                 classnum = (char_class_number_) FLAGS(scan);
7842                 switch (classnum) {
7843                     default:
7844                         if (! (to_complement
7845                            ^ cBOOL(_invlist_contains_cp(
7846                                       PL_XPosix_ptrs[classnum],
7847                                       utf8_to_uvchr_buf((U8 *) locinput,
7848                                                         (U8 *) reginfo->strend,
7849                                                         NULL)))))
7850                         {
7851                             sayNO;
7852                         }
7853                         break;
7854                     case CC_ENUM_SPACE_:
7855                         if (! (to_complement
7856                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7857                         {
7858                             sayNO;
7859                         }
7860                         break;
7861                     case CC_ENUM_BLANK_:
7862                         if (! (to_complement
7863                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7864                         {
7865                             sayNO;
7866                         }
7867                         break;
7868                     case CC_ENUM_XDIGIT_:
7869                         if (! (to_complement
7870                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7871                         {
7872                             sayNO;
7873                         }
7874                         break;
7875                     case CC_ENUM_VERTSPACE_:
7876                         if (! (to_complement
7877                                         ^ cBOOL(is_VERTWS_high(locinput))))
7878                         {
7879                             sayNO;
7880                         }
7881                         break;
7882                     case CC_ENUM_CNTRL_:    /* These can't match above Latin1 */
7883                     case CC_ENUM_ASCII_:
7884                         if (! to_complement) {
7885                             sayNO;
7886                         }
7887                         break;
7888                 }
7889                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7890             }
7891             break;
7892 
7893         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7894                        a Unicode extended Grapheme Cluster */
7895             if (NEXTCHR_IS_EOS || locinput >= loceol)
7896                 sayNO;
7897             if  (! utf8_target) {
7898 
7899                 /* Match either CR LF  or '.', as all the other possibilities
7900                  * require utf8 */
7901                 locinput++;	    /* Match the . or CR */
7902                 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7903                                        match the LF */
7904                     && locinput <  loceol
7905                     && UCHARAT(locinput) == '\n')
7906                 {
7907                     locinput++;
7908                 }
7909             }
7910             else {
7911 
7912                 /* Get the gcb type for the current character */
7913                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7914                                                        (U8*) reginfo->strend);
7915 
7916                 /* Then scan through the input until we get to the first
7917                  * character whose type is supposed to be a gcb with the
7918                  * current character.  (There is always a break at the
7919                  * end-of-input) */
7920                 locinput += UTF8SKIP(locinput);
7921                 while (locinput < loceol) {
7922                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7923                                                          (U8*) reginfo->strend);
7924                     if (isGCB(prev_gcb, cur_gcb,
7925                               (U8*) reginfo->strbeg, (U8*) locinput,
7926                               utf8_target))
7927                     {
7928                         break;
7929                     }
7930 
7931                     prev_gcb = cur_gcb;
7932                     locinput += UTF8SKIP(locinput);
7933                 }
7934 
7935 
7936             }
7937             break;
7938 
7939         case REFFLN:  /*  /\g{name}/il  */
7940         {   /* The capture buffer cases.  The ones beginning with N for the
7941                named buffers just convert to the equivalent numbered and
7942                pretend they were called as the corresponding numbered buffer
7943                op.  */
7944             /* don't initialize these in the declaration, it makes C++
7945                unhappy */
7946             const char *s;
7947             char type;
7948             re_fold_t folder;
7949             const U8 *fold_array;
7950             UV utf8_fold_flags;
7951 
7952             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7953             folder = Perl_foldEQ_locale;
7954             fold_array = PL_fold_locale;
7955             type = REFFL;
7956             utf8_fold_flags = FOLDEQ_LOCALE;
7957             goto do_nref;
7958 
7959         case REFFAN:  /*  /\g{name}/iaa  */
7960             folder = Perl_foldEQ_latin1;
7961             fold_array = PL_fold_latin1;
7962             type = REFFA;
7963             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7964             goto do_nref;
7965 
7966         case REFFUN:  /*  /\g{name}/iu  */
7967             folder = Perl_foldEQ_latin1;
7968             fold_array = PL_fold_latin1;
7969             type = REFFU;
7970             utf8_fold_flags = 0;
7971             goto do_nref;
7972 
7973         case REFFN:  /*  /\g{name}/i  */
7974             folder = Perl_foldEQ;
7975             fold_array = PL_fold;
7976             type = REFF;
7977             utf8_fold_flags = 0;
7978             goto do_nref;
7979 
7980         case REFN:  /*  /\g{name}/   */
7981             type = REF;
7982             folder = NULL;
7983             fold_array = NULL;
7984             utf8_fold_flags = 0;
7985           do_nref:
7986 
7987             /* For the named back references, find the corresponding buffer
7988              * number */
7989             n = reg_check_named_buff_matched(rex,scan);
7990 
7991             if ( ! n ) {
7992                 sayNO;
7993             }
7994             goto do_nref_ref_common;
7995 
7996         case REFFL:  /*  /\1/il  */
7997             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7998             folder = Perl_foldEQ_locale;
7999             fold_array = PL_fold_locale;
8000             utf8_fold_flags = FOLDEQ_LOCALE;
8001             goto do_ref;
8002 
8003         case REFFA:  /*  /\1/iaa  */
8004             folder = Perl_foldEQ_latin1;
8005             fold_array = PL_fold_latin1;
8006             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8007             goto do_ref;
8008 
8009         case REFFU:  /*  /\1/iu  */
8010             folder = Perl_foldEQ_latin1;
8011             fold_array = PL_fold_latin1;
8012             utf8_fold_flags = 0;
8013             goto do_ref;
8014 
8015         case REFF:  /*  /\1/i  */
8016             folder = Perl_foldEQ;
8017             fold_array = PL_fold;
8018             utf8_fold_flags = 0;
8019             goto do_ref;
8020 
8021 #undef  ST
8022 #define ST st->u.backref
8023         case REF:  /*  /\1/    */
8024             folder = NULL;
8025             fold_array = NULL;
8026             utf8_fold_flags = 0;
8027 
8028           do_ref:
8029             type = OP(scan);
8030             n = ARG1u(scan);  /* which paren pair */
8031             if (rex->logical_to_parno) {
8032                 n = rex->logical_to_parno[n];
8033                 do {
8034                     if ( RXp_LASTPAREN(rex) < n ||
8035                          RXp_OFFS_START(rex,n) == -1 ||
8036                          RXp_OFFS_END(rex,n) == -1
8037                     ) {
8038                         n = rex->parno_to_logical_next[n];
8039                     }
8040                     else {
8041                         break;
8042                     }
8043                 } while(n);
8044 
8045                 if (!n) /* this means there is nothing that matched */
8046                     sayNO;
8047             }
8048 
8049           do_nref_ref_common:
8050             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8051             if (RXp_LASTPAREN(rex) < n)
8052                 sayNO;
8053 
8054             ln = RXp_OFFSp(rex)[n].start;
8055             endref = RXp_OFFSp(rex)[n].end;
8056             if (ln == -1 || endref == -1)
8057                 sayNO;			/* Do not match unless seen CLOSEn. */
8058 
8059             if (ln == endref)
8060                 goto ref_yes;
8061 
8062             s = reginfo->strbeg + ln;
8063             if (type != REF	/* REF can do byte comparison */
8064                 && (utf8_target || type == REFFU || type == REFFL))
8065             {
8066                 char * limit = loceol;
8067 
8068                 /* This call case insensitively compares the entire buffer
8069                     * at s, with the current input starting at locinput, but
8070                     * not going off the end given by loceol, and
8071                     * returns in <limit> upon success, how much of the
8072                     * current input was matched */
8073                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
8074                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
8075                 {
8076                     sayNO;
8077                 }
8078                 locinput = limit;
8079                 goto ref_yes;
8080             }
8081 
8082             /* Not utf8:  Inline the first character, for speed. */
8083             if ( ! NEXTCHR_IS_EOS
8084                 && locinput < loceol
8085                 && UCHARAT(s) != nextbyte
8086                 && (   type == REF
8087                     || UCHARAT(s) != fold_array[nextbyte]))
8088             {
8089                 sayNO;
8090             }
8091             ln = endref - ln;
8092             if (locinput + ln > loceol)
8093                 sayNO;
8094             if (ln > 1 && (type == REF
8095                            ? memNE(s, locinput, ln)
8096                            : ! folder(aTHX_ locinput, s, ln)))
8097                 sayNO;
8098             locinput += ln;
8099         }
8100         ref_yes:
8101             if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */
8102                 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen);
8103                 REGCP_SET(ST.lastcp);
8104                 PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
8105                                 script_run_begin);
8106             }
8107             break;
8108             NOT_REACHED; /* NOTREACHED */
8109 
8110         case REF_next:
8111             sayYES;
8112             break;
8113 
8114         case REF_next_fail:
8115             REGCP_UNWIND(ST.lastcp);
8116             regcppop(rex, &maxopenparen);
8117             sayNO;
8118             break;
8119 
8120         case NOTHING: /* null op; e.g. the 'nothing' following
8121                        * the '*' in m{(a+|b)*}' */
8122             break;
8123         case TAIL: /* placeholder while compiling (A|B|C) */
8124             break;
8125 
8126 #undef  ST
8127 #define ST st->u.eval
8128 #define CUR_EVAL cur_eval->u.eval
8129 
8130         {
8131             SV *ret;
8132             REGEXP *re_sv;
8133             regexp *re;
8134             regexp_internal *rei;
8135             regnode *startpoint;
8136             U32 arg;
8137 
8138         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
8139             arg = ARG1u(scan);
8140             if (cur_eval && cur_eval->locinput == locinput) {
8141                 if ( ++nochange_depth > max_nochange_depth )
8142                     Perl_croak(aTHX_
8143                         "Pattern subroutine nesting without pos change"
8144                         " exceeded limit in regex");
8145             } else {
8146                 nochange_depth = 0;
8147             }
8148             re_sv = rex_sv;
8149             re = rex;
8150             rei = rexi;
8151             startpoint = scan + ARG2i(scan);
8152             EVAL_CLOSE_PAREN_SET( st, arg );
8153             /* Detect infinite recursion
8154              *
8155              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
8156              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
8157              * So we track the position in the string we are at each time
8158              * we recurse and if we try to enter the same routine twice from
8159              * the same position we throw an error.
8160              */
8161             if ( rex->recurse_locinput[arg] == locinput ) {
8162                 /* FIXME: we should show the regop that is failing as part
8163                  * of the error message. */
8164                 Perl_croak(aTHX_ "Infinite recursion in regex");
8165             } else {
8166                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8167                 rex->recurse_locinput[arg]= locinput;
8168 
8169                 DEBUG_r({
8170                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8171                     DEBUG_STACK_r({
8172                         Perl_re_exec_indentf( aTHX_
8173                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8174                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8175                         );
8176                     });
8177                 });
8178             }
8179 
8180             /* Save all the positions seen so far. */
8181             ST.cp = regcppush(rex, 0, maxopenparen);
8182             REGCP_SET(ST.lastcp);
8183 
8184             /* and then jump to the code we share with EVAL */
8185             goto eval_recurse_doit;
8186             /* NOTREACHED */
8187 
8188         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
8189             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8190                 if ( ++nochange_depth > max_nochange_depth )
8191                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8192             } else {
8193                 nochange_depth = 0;
8194             }
8195             {
8196                 /* execute the code in the {...} */
8197 
8198                 dSP;
8199                 IV before;
8200                 OP * const oop = PL_op;
8201                 COP * const ocurcop = PL_curcop;
8202                 OP *nop;
8203                 CV *newcv;
8204 
8205                 /* save *all* paren positions */
8206                 ST.cp = regcppush(rex, 0, maxopenparen);
8207                 REGCP_SET(ST.lastcp);
8208 
8209                 if (!caller_cv)
8210                     caller_cv = find_runcv(NULL);
8211 
8212                 n = ARG1u(scan);
8213 
8214                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8215                     newcv = (ReANY(
8216                                     (REGEXP*)(rexi->data->data[n])
8217                             ))->qr_anoncv;
8218                     nop = (OP*)rexi->data->data[n+1];
8219                 }
8220                 else if (rexi->data->what[n] == 'l') { /* literal code */
8221                     newcv = caller_cv;
8222                     nop = (OP*)rexi->data->data[n];
8223                     assert(CvDEPTH(newcv));
8224                 }
8225                 else {
8226                     /* literal with own CV */
8227                     assert(rexi->data->what[n] == 'L');
8228                     newcv = rex->qr_anoncv;
8229                     nop = (OP*)rexi->data->data[n];
8230                 }
8231 
8232                 /* Some notes about MULTICALL and the context and save stacks.
8233                  *
8234                  * In something like
8235                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8236                  * since codeblocks don't introduce a new scope (so that
8237                  * local() etc accumulate), at the end of a successful
8238                  * match there will be a SAVEt_CLEARSV on the savestack
8239                  * for each of $x, $y, $z. If the three code blocks above
8240                  * happen to have come from different CVs (e.g. via
8241                  * embedded qr//s), then we must ensure that during any
8242                  * savestack unwinding, PL_comppad always points to the
8243                  * right pad at each moment. We achieve this by
8244                  * interleaving SAVEt_COMPPAD's on the savestack whenever
8245                  * there is a change of pad.
8246                  * In theory whenever we call a code block, we should
8247                  * push a CXt_SUB context, then pop it on return from
8248                  * that code block. This causes a bit of an issue in that
8249                  * normally popping a context also clears the savestack
8250                  * back to cx->blk_oldsaveix, but here we specifically
8251                  * don't want to clear the save stack on exit from the
8252                  * code block.
8253                  * Also for efficiency we don't want to keep pushing and
8254                  * popping the single SUB context as we backtrack etc.
8255                  * So instead, we push a single context the first time
8256                  * we need, it, then hang onto it until the end of this
8257                  * function. Whenever we encounter a new code block, we
8258                  * update the CV etc if that's changed. During the times
8259                  * in this function where we're not executing a code
8260                  * block, having the SUB context still there is a bit
8261                  * naughty - but we hope that no-one notices.
8262                  * When the SUB context is initially pushed, we fake up
8263                  * cx->blk_oldsaveix to be as if we'd pushed this context
8264                  * on first entry to S_regmatch rather than at some random
8265                  * point during the regexe execution. That way if we
8266                  * croak, popping the context stack will ensure that
8267                  * *everything* SAVEd by this function is undone and then
8268                  * the context popped, rather than e.g., popping the
8269                  * context (and restoring the original PL_comppad) then
8270                  * popping more of the savestack and restoring a bad
8271                  * PL_comppad.
8272                  */
8273 
8274                 /* If this is the first EVAL, push a MULTICALL. On
8275                  * subsequent calls, if we're executing a different CV, or
8276                  * if PL_comppad has got messed up from backtracking
8277                  * through SAVECOMPPADs, then refresh the context.
8278                  */
8279                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8280                 {
8281                     U8 flags = (CXp_SUB_RE |
8282                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8283                     SAVECOMPPAD();
8284                     if (last_pushed_cv) {
8285                         CHANGE_MULTICALL_FLAGS(newcv, flags);
8286                     }
8287                     else {
8288                         PUSH_MULTICALL_FLAGS(newcv, flags);
8289                     }
8290                     /* see notes above */
8291                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8292 
8293                     last_pushed_cv = newcv;
8294                 }
8295                 else {
8296                     /* these assignments are just to silence compiler
8297                      * warnings */
8298                     multicall_cop = NULL;
8299                 }
8300                 last_pad = PL_comppad;
8301 
8302                 /* the initial nextstate you would normally execute
8303                  * at the start of an eval (which would cause error
8304                  * messages to come from the eval), may be optimised
8305                  * away from the execution path in the regex code blocks;
8306                  * so manually set PL_curcop to it initially */
8307                 {
8308                     OP *o = cUNOPx(nop)->op_first;
8309                     assert(o->op_type == OP_NULL);
8310                     if (o->op_targ == OP_SCOPE) {
8311                         o = cUNOPo->op_first;
8312                     }
8313                     else {
8314                         assert(o->op_targ == OP_LEAVE);
8315                         o = cUNOPo->op_first;
8316                         assert(o->op_type == OP_ENTER);
8317                         o = OpSIBLING(o);
8318                     }
8319 
8320                     if (o->op_type != OP_STUB) {
8321                         assert(    o->op_type == OP_NEXTSTATE
8322                                 || o->op_type == OP_DBSTATE
8323                                 || (o->op_type == OP_NULL
8324                                     &&  (  o->op_targ == OP_NEXTSTATE
8325                                         || o->op_targ == OP_DBSTATE
8326                                         )
8327                                     )
8328                         );
8329                         PL_curcop = (COP*)o;
8330                     }
8331                 }
8332                 nop = nop->op_next;
8333 
8334                 DEBUG_STATE_r( Perl_re_printf( aTHX_
8335                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8336 
8337                 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg;
8338                 if (reginfo->info_aux_eval->pos_magic)
8339                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8340                                   reginfo->sv, reginfo->strbeg,
8341                                   locinput - reginfo->strbeg);
8342 
8343                 if (sv_yes_mark) {
8344                     SV *sv_mrk = get_sv("REGMARK", 1);
8345                     sv_setsv(sv_mrk, sv_yes_mark);
8346                 }
8347 
8348                 /* we don't use MULTICALL here as we want to call the
8349                  * first op of the block of interest, rather than the
8350                  * first op of the sub. Also, we don't want to free
8351                  * the savestack frame */
8352                 before = (IV)(SP-PL_stack_base);
8353                 PL_op = nop;
8354                 CALLRUNOPS(aTHX);			/* Scalar context. */
8355                 SPAGAIN;
8356                 if ((IV)(SP-PL_stack_base) == before)
8357                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8358                 else {
8359                     ret = POPs;
8360                     PUTBACK;
8361                 }
8362 
8363                 /* before restoring everything, evaluate the returned
8364                  * value, so that 'uninit' warnings don't use the wrong
8365                  * PL_op or pad. Also need to process any magic vars
8366                  * (e.g. $1) *before* parentheses are restored */
8367 
8368                 PL_op = NULL;
8369 
8370                 re_sv = NULL;
8371                 if (logical == 0) {       /* /(?{ ... })/ and /(*{ ... })/ */
8372                     SV *replsv = save_scalar(PL_replgv);
8373                     sv_setsv(replsv, ret); /* $^R */
8374                     SvSETMAGIC(replsv);
8375                 }
8376                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8377                     sw = cBOOL(SvTRUE_NN(ret));
8378                     logical = 0;
8379                 }
8380                 else {                   /*  /(??{ ... })  */
8381                     /*  if its overloaded, let the regex compiler handle
8382                      *  it; otherwise extract regex, or stringify  */
8383                     if (SvGMAGICAL(ret))
8384                         ret = sv_mortalcopy(ret);
8385                     if (!SvAMAGIC(ret)) {
8386                         SV *sv = ret;
8387                         if (SvROK(sv))
8388                             sv = SvRV(sv);
8389                         if (SvTYPE(sv) == SVt_REGEXP)
8390                             re_sv = (REGEXP*) sv;
8391                         else if (SvSMAGICAL(ret)) {
8392                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8393                             if (mg)
8394                                 re_sv = (REGEXP *) mg->mg_obj;
8395                         }
8396 
8397                         /* force any undef warnings here */
8398                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8399                             ret = sv_mortalcopy(ret);
8400                             (void) SvPV_force_nolen(ret);
8401                         }
8402                     }
8403 
8404                 }
8405 
8406                 /* *** Note that at this point we don't restore
8407                  * PL_comppad, (or pop the CxSUB) on the assumption it may
8408                  * be used again soon. This is safe as long as nothing
8409                  * in the regexp code uses the pad ! */
8410                 PL_op = oop;
8411                 PL_curcop = ocurcop;
8412                 regcp_restore(rex, ST.lastcp, &maxopenparen);
8413                 PL_curpm_under = PL_curpm;
8414                 PL_curpm = PL_reg_curpm;
8415 
8416                 if (logical != 2) {
8417                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8418                                     script_run_begin);
8419                     /* NOTREACHED */
8420                 }
8421             }
8422 
8423                 /* only /(??{ ... })/  from now on */
8424                 logical = 0;
8425                 {
8426                     /* extract RE object from returned value; compiling if
8427                      * necessary */
8428 
8429                     if (re_sv) {
8430                         re_sv = reg_temp_copy(NULL, re_sv);
8431                     }
8432                     else {
8433                         U32 pm_flags = 0;
8434 
8435                         if (SvUTF8(ret) && IN_BYTES) {
8436                             /* In use 'bytes': make a copy of the octet
8437                              * sequence, but without the flag on */
8438                             STRLEN len;
8439                             const char *const p = SvPV(ret, len);
8440                             ret = newSVpvn_flags(p, len, SVs_TEMP);
8441                         }
8442                         if (rex->intflags & PREGf_USE_RE_EVAL)
8443                             pm_flags |= PMf_USE_RE_EVAL;
8444 
8445                         /* if we got here, it should be an engine which
8446                          * supports compiling code blocks and stuff */
8447                         assert(rex->engine && rex->engine->op_comp);
8448                         assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME));
8449                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8450                                     rex->engine, NULL, NULL,
8451                                     /* copy /msixn etc to inner pattern */
8452                                     ARG2i(scan),
8453                                     pm_flags);
8454 
8455                         if (!(SvFLAGS(ret)
8456                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
8457                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8458                             /* This isn't a first class regexp. Instead, it's
8459                                caching a regexp onto an existing, Perl visible
8460                                scalar.  */
8461                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8462                         }
8463                     }
8464                     SAVEFREESV(re_sv);
8465                     re = ReANY(re_sv);
8466                 }
8467                 RXp_MATCH_COPIED_off(re);
8468                 RXp_SUBBEG(re) = RXp_SUBBEG(rex);
8469                 RXp_SUBLEN(re) = RXp_SUBLEN(rex);
8470                 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex);
8471                 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex);
8472                 RXp_LASTPAREN(re) = 0;
8473                 RXp_LASTCLOSEPAREN(re) = 0;
8474                 rei = RXi_GET(re);
8475                 DEBUG_EXECUTE_r(
8476                     debug_start_match(re_sv, utf8_target, locinput,
8477                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
8478                 );
8479                 startpoint = rei->program + 1;
8480                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8481                                              * close_paren only for GOSUB */
8482                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8483 
8484                 /* note we saved the paren state earlier:
8485                 ST.cp = regcppush(rex, 0, maxopenparen);
8486                 REGCP_SET(ST.lastcp);
8487                 */
8488                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8489                 maxopenparen = 0;
8490                 /* run the pattern returned from (??{...}) */
8491 
8492               eval_recurse_doit: /* Share code with GOSUB below this line
8493                             * At this point we expect the stack context to be
8494                             * set up correctly */
8495 
8496                 /* invalidate the S-L poscache. We're now executing a
8497                  * different set of WHILEM ops (and their associated
8498                  * indexes) against the same string, so the bits in the
8499                  * cache are meaningless. Setting maxiter to zero forces
8500                  * the cache to be invalidated and zeroed before reuse.
8501                  * XXX This is too dramatic a measure. Ideally we should
8502                  * save the old cache and restore when running the outer
8503                  * pattern again */
8504                 reginfo->poscache_maxiter = 0;
8505 
8506                 /* the new regexp might have a different is_utf8_pat than we do */
8507                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8508 
8509                 ST.prev_rex = rex_sv;
8510                 ST.prev_curlyx = cur_curlyx;
8511                 rex_sv = re_sv;
8512                 SET_reg_curpm(rex_sv);
8513                 rex = re;
8514                 rexi = rei;
8515                 cur_curlyx = NULL;
8516                 ST.B = next;
8517                 ST.prev_eval = cur_eval;
8518                 cur_eval = st;
8519                 /* now continue from first node in postoned RE */
8520                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8521                                     loceol, script_run_begin);
8522                 NOT_REACHED; /* NOTREACHED */
8523         }
8524 
8525         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8526             /* note: this is called twice; first after popping B, then A */
8527             DEBUG_STACK_r({
8528                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8529                     depth, cur_eval, ST.prev_eval);
8530             });
8531 
8532 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8533             if ( cur_eval && CUR_EVAL.close_paren ) {\
8534                 DEBUG_STACK_r({ \
8535                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8536                         depth,    \
8537                         CUR_EVAL.close_paren - 1,\
8538                         cur_eval, \
8539                         VAL);     \
8540                 });               \
8541                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8542             }
8543 
8544             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8545 
8546             rex_sv = ST.prev_rex;
8547             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8548             SET_reg_curpm(rex_sv);
8549             rex = ReANY(rex_sv);
8550             rexi = RXi_GET(rex);
8551             {
8552                 /* preserve $^R across LEAVE's. See Bug 121070. */
8553                 SV *save_sv= GvSV(PL_replgv);
8554                 SV *replsv;
8555                 SvREFCNT_inc(save_sv);
8556                 regcpblow(ST.cp); /* LEAVE in disguise */
8557                 /* don't move this initialization up */
8558                 replsv = GvSV(PL_replgv);
8559                 sv_setsv(replsv, save_sv);
8560                 SvSETMAGIC(replsv);
8561                 SvREFCNT_dec(save_sv);
8562             }
8563             cur_eval = ST.prev_eval;
8564             cur_curlyx = ST.prev_curlyx;
8565 
8566             /* Invalidate cache. See "invalidate" comment above. */
8567             reginfo->poscache_maxiter = 0;
8568             if ( nochange_depth )
8569                 nochange_depth--;
8570 
8571             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8572             sayYES;
8573 
8574 
8575         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8576             REGCP_UNWIND(ST.lastcp);
8577             regcppop(rex, &maxopenparen);
8578             sayNO;
8579 
8580         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8581             /* note: this is called twice; first after popping B, then A */
8582             DEBUG_STACK_r({
8583                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8584                     depth, cur_eval, ST.prev_eval);
8585             });
8586 
8587             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8588 
8589             rex_sv = ST.prev_rex;
8590             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8591             SET_reg_curpm(rex_sv);
8592             rex = ReANY(rex_sv);
8593             rexi = RXi_GET(rex);
8594 
8595             REGCP_UNWIND(ST.lastcp);
8596             regcppop(rex, &maxopenparen);
8597             cur_eval = ST.prev_eval;
8598             cur_curlyx = ST.prev_curlyx;
8599 
8600             /* Invalidate cache. See "invalidate" comment above. */
8601             reginfo->poscache_maxiter = 0;
8602             if ( nochange_depth )
8603                 nochange_depth--;
8604 
8605             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8606             sayNO_SILENT;
8607 #undef ST
8608 
8609         case OPEN: /*  (  */
8610             n = PARNO(scan);  /* which paren pair */
8611             RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg;
8612             if (n > maxopenparen)
8613                 maxopenparen = n;
8614             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8615                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8616                 depth,
8617                 PTR2UV(rex),
8618                 PTR2UV(RXp_OFFSp(rex)),
8619                 (UV)n,
8620                 (IV)RXp_OFFSp(rex)[n].start_tmp,
8621                 (UV)maxopenparen
8622             ));
8623             lastopen = n;
8624             break;
8625 
8626         case SROPEN: /*  (*SCRIPT_RUN:  */
8627             script_run_begin = (U8 *) locinput;
8628             break;
8629 
8630 
8631         case CLOSE:  /*  )  */
8632             n = PARNO(scan);  /* which paren pair */
8633             CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8634                              locinput - reginfo->strbeg);
8635             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8636                 goto fake_end;
8637 
8638             break;
8639 
8640         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8641 
8642             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8643             {
8644                 sayNO;
8645             }
8646 
8647             break;
8648 
8649 
8650         case ACCEPT:  /*  (*ACCEPT)  */
8651             is_accepted = true;
8652             if (FLAGS(scan))
8653                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
8654             utmp = ARG2u(scan);
8655 
8656             if ( utmp ) {
8657                 regnode *cursor;
8658                 for (
8659                     cursor = scan;
8660                     cursor && ( OP(cursor) != END );
8661                     cursor = (
8662                                REGNODE_TYPE( OP(cursor) ) == END
8663                                || REGNODE_TYPE( OP(cursor) ) == WHILEM
8664                              )
8665                              ? REGNODE_AFTER(cursor)
8666                              : regnext(cursor)
8667                 ){
8668                     if ( OP(cursor) != CLOSE )
8669                         continue;
8670 
8671                     n = PARNO(cursor);
8672 
8673                     if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8674                         continue;       /* so skip this one */
8675 
8676                     CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8677                                      locinput - reginfo->strbeg);
8678 
8679                     if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8680                         break;
8681                 }
8682             }
8683             goto fake_end;
8684             /* NOTREACHED */
8685 
8686         case GROUPP:  /*  (?(1))  */
8687             n = ARG1u(scan);  /* which paren pair */
8688             sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1);
8689             break;
8690 
8691         case GROUPPN:  /*  (?(<name>))  */
8692             /* reg_check_named_buff_matched returns 0 for no match */
8693             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8694             break;
8695 
8696         case INSUBP:   /*  (?(R))  */
8697             n = ARG1u(scan);
8698             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8699              * of SCAN is already set up as matches a eval.close_paren */
8700             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8701             break;
8702 
8703         case DEFINEP:  /*  (?(DEFINE))  */
8704             sw = 0;
8705             break;
8706 
8707         case IFTHEN:   /*  (?(cond)A|B)  */
8708             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8709             if (sw)
8710                 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8711             else {
8712                 next = scan + ARG1u(scan);
8713                 if (OP(next) == IFTHEN) /* Fake one. */
8714                     next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8715             }
8716             break;
8717 
8718         case LOGICAL:  /* modifier for EVAL and IFMATCH */
8719             logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */
8720             break;
8721 
8722 /*******************************************************************
8723 
8724 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8725 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8726 STAR/PLUS/CURLY/CURLYN are used instead.)
8727 
8728 A*B is compiled as <CURLYX><A><WHILEM><B>
8729 
8730 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8731 state, which contains the current count, initialised to -1. It also sets
8732 cur_curlyx to point to this state, with any previous value saved in the
8733 state block.
8734 
8735 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8736 since the pattern may possibly match zero times (i.e. it's a while {} loop
8737 rather than a do {} while loop).
8738 
8739 Each entry to WHILEM represents a successful match of A. The count in the
8740 CURLYX block is incremented, another WHILEM state is pushed, and execution
8741 passes to A or B depending on greediness and the current count.
8742 
8743 For example, if matching against the string a1a2a3b (where the aN are
8744 substrings that match /A/), then the match progresses as follows: (the
8745 pushed states are interspersed with the bits of strings matched so far):
8746 
8747     <CURLYX cnt=-1>
8748     <CURLYX cnt=0><WHILEM>
8749     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8750     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8751     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8752     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8753 
8754 (Contrast this with something like CURLYM, which maintains only a single
8755 backtrack state:
8756 
8757     <CURLYM cnt=0> a1
8758     a1 <CURLYM cnt=1> a2
8759     a1 a2 <CURLYM cnt=2> a3
8760     a1 a2 a3 <CURLYM cnt=3> b
8761 )
8762 
8763 Each WHILEM state block marks a point to backtrack to upon partial failure
8764 of A or B, and also contains some minor state data related to that
8765 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8766 overall state, such as the count, and pointers to the A and B ops.
8767 
8768 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8769 must always point to the *current* CURLYX block, the rules are:
8770 
8771 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8772 and set cur_curlyx to point the new block.
8773 
8774 When popping the CURLYX block after a successful or unsuccessful match,
8775 restore the previous cur_curlyx.
8776 
8777 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8778 to the outer one saved in the CURLYX block.
8779 
8780 When popping the WHILEM block after a successful or unsuccessful B match,
8781 restore the previous cur_curlyx.
8782 
8783 Here's an example for the pattern (AI* BI)*BO
8784 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8785 
8786 cur_
8787 curlyx backtrack stack
8788 ------ ---------------
8789 NULL
8790 CO     <CO prev=NULL> <WO>
8791 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8792 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8793 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8794 
8795 At this point the pattern succeeds, and we work back down the stack to
8796 clean up, restoring as we go:
8797 
8798 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8799 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8800 CO     <CO prev=NULL> <WO>
8801 NULL
8802 
8803 *******************************************************************/
8804 
8805 #define ST st->u.curlyx
8806 
8807         case CURLYX:    /* start of /A*B/  (for complex A) */
8808         {
8809             /* No need to save/restore up to this paren */
8810             I32 parenfloor = FLAGS(scan);
8811 
8812             assert(next); /* keep Coverity happy */
8813             if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8814                 next += ARG1u(next);
8815 
8816             /* XXXX Probably it is better to teach regpush to support
8817                parenfloor > maxopenparen ... */
8818             if (parenfloor > (I32)RXp_LASTPAREN(rex))
8819                 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */
8820 
8821             ST.prev_curlyx= cur_curlyx;
8822             cur_curlyx = st;
8823             ST.cp = PL_savestack_ix;
8824 
8825             /* these fields contain the state of the current curly.
8826              * they are accessed by subsequent WHILEMs */
8827             ST.parenfloor = parenfloor;
8828             ST.me = scan;
8829             ST.B = next;
8830             ST.minmod = minmod;
8831             minmod = 0;
8832             ST.count = -1;	/* this will be updated by WHILEM */
8833             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8834 
8835             PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8836                                 script_run_begin);
8837             NOT_REACHED; /* NOTREACHED */
8838         }
8839 
8840         case CURLYX_end: /* just finished matching all of A*B */
8841             cur_curlyx = ST.prev_curlyx;
8842             sayYES;
8843             NOT_REACHED; /* NOTREACHED */
8844 
8845         case CURLYX_end_fail: /* just failed to match all of A*B */
8846             regcpblow(ST.cp);
8847             cur_curlyx = ST.prev_curlyx;
8848             sayNO;
8849             NOT_REACHED; /* NOTREACHED */
8850 
8851 
8852 #undef ST
8853 #define ST st->u.whilem
8854 
8855         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8856         {
8857             /* see the discussion above about CURLYX/WHILEM */
8858             I32 n;
8859             int min, max;
8860             /* U16 first_paren, last_paren; */
8861             regnode *A;
8862 
8863             assert(cur_curlyx); /* keep Coverity happy */
8864 
8865             min = ARG1i(cur_curlyx->u.curlyx.me);
8866             max = ARG2i(cur_curlyx->u.curlyx.me);
8867             /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */
8868             /* last_paren = ARG3b(cur_curlyx->u.curlyx.me);  */
8869             A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8870             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8871             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8872             ST.cache_offset = 0;
8873             ST.cache_mask = 0;
8874 
8875             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8876                   depth, (long)n, min, max)
8877             );
8878 
8879             /* First just match a string of min A's. */
8880             if (n < min) {
8881                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8882                 cur_curlyx->u.curlyx.lastloc = locinput;
8883                 REGCP_SET(ST.lastcp);
8884 
8885                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8886                                 script_run_begin);
8887                 NOT_REACHED; /* NOTREACHED */
8888             }
8889 
8890             /* If degenerate A matches "", assume A done. */
8891 
8892             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8893                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8894                    depth)
8895                 );
8896                 goto do_whilem_B_max;
8897             }
8898 
8899             /* super-linear cache processing.
8900              *
8901              * The idea here is that for certain types of CURLYX/WHILEM -
8902              * principally those whose upper bound is infinity (and
8903              * excluding regexes that have things like \1 and other very
8904              * non-regular expressiony things), then if a pattern like
8905              * /....A*.../ fails and we backtrack to the WHILEM, then we
8906              * make a note that this particular WHILEM op was at string
8907              * position 47 (say) when the rest of pattern failed. Then, if
8908              * we ever find ourselves back at that WHILEM, and at string
8909              * position 47 again, we can just fail immediately rather than
8910              * running the rest of the pattern again.
8911              *
8912              * This is very handy when patterns start to go
8913              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8914              * with a combinatorial explosion of backtracking.
8915              *
8916              * The cache is implemented as a bit array, with one bit per
8917              * string byte position per WHILEM op (up to 16) - so its
8918              * between 0.25 and 2x the string size.
8919              *
8920              * To avoid allocating a poscache buffer every time, we do an
8921              * initially countdown; only after we have  executed a WHILEM
8922              * op (string-length x #WHILEMs) times do we allocate the
8923              * cache.
8924              *
8925              * The top 4 bits of FLAGS(scan) byte say how many different
8926              * relevant CURLLYX/WHILEM op pairs there are, while the
8927              * bottom 4-bits is the identifying index number of this
8928              * WHILEM.
8929              */
8930 
8931             if (FLAGS(scan)) {
8932 
8933                 if (!reginfo->poscache_maxiter) {
8934                     /* start the countdown: Postpone detection until we
8935                      * know the match is not *that* much linear. */
8936                     reginfo->poscache_maxiter
8937                         =    (reginfo->strend - reginfo->strbeg + 1)
8938                            * (FLAGS(scan)>>4);
8939                     /* possible overflow for long strings and many CURLYX's */
8940                     if (reginfo->poscache_maxiter < 0)
8941                         reginfo->poscache_maxiter = I32_MAX;
8942                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8943                 }
8944 
8945                 if (reginfo->poscache_iter-- == 0) {
8946                     /* initialise cache */
8947                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8948                     regmatch_info_aux *const aux = reginfo->info_aux;
8949                     if (aux->poscache) {
8950                         if ((SSize_t)reginfo->poscache_size < size) {
8951                             Renew(aux->poscache, size, char);
8952                             reginfo->poscache_size = size;
8953                         }
8954                         Zero(aux->poscache, size, char);
8955                     }
8956                     else {
8957                         reginfo->poscache_size = size;
8958                         Newxz(aux->poscache, size, char);
8959                     }
8960                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8961       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8962                               PL_colors[4], PL_colors[5])
8963                     );
8964                 }
8965 
8966                 if (reginfo->poscache_iter < 0) {
8967                     /* have we already failed at this position? */
8968                     SSize_t offset, mask;
8969 
8970                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8971                     offset  = (FLAGS(scan) & 0xf) - 1
8972                                 +   (locinput - reginfo->strbeg)
8973                                   * (FLAGS(scan)>>4);
8974                     mask    = 1 << (offset % 8);
8975                     offset /= 8;
8976                     if (reginfo->info_aux->poscache[offset] & mask) {
8977                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8978                             depth)
8979                         );
8980                         cur_curlyx->u.curlyx.count--;
8981                         sayNO; /* cache records failure */
8982                     }
8983                     ST.cache_offset = offset;
8984                     ST.cache_mask   = mask;
8985                 }
8986             }
8987 
8988             /* Prefer B over A for minimal matching. */
8989 
8990             if (cur_curlyx->u.curlyx.minmod) {
8991                 ST.save_curlyx = cur_curlyx;
8992                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8993                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8994                                     locinput, loceol, script_run_begin);
8995                 NOT_REACHED; /* NOTREACHED */
8996             }
8997 
8998             /* Prefer A over B for maximal matching. */
8999 
9000             if (n < max) { /* More greed allowed? */
9001                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
9002                             maxopenparen);
9003                 cur_curlyx->u.curlyx.lastloc = locinput;
9004                 REGCP_SET(ST.lastcp);
9005                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
9006                                 script_run_begin);
9007                 NOT_REACHED; /* NOTREACHED */
9008             }
9009             goto do_whilem_B_max;
9010         }
9011         NOT_REACHED; /* NOTREACHED */
9012 
9013         case WHILEM_B_min: /* just matched B in a minimal match */
9014         case WHILEM_B_max: /* just matched B in a maximal match */
9015             cur_curlyx = ST.save_curlyx;
9016             sayYES;
9017             NOT_REACHED; /* NOTREACHED */
9018 
9019         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
9020             cur_curlyx = ST.save_curlyx;
9021             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9022             cur_curlyx->u.curlyx.count--;
9023             CACHEsayNO;
9024             NOT_REACHED; /* NOTREACHED */
9025 
9026         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
9027             /* FALLTHROUGH */
9028         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
9029             REGCP_UNWIND(ST.lastcp);
9030             regcppop(rex, &maxopenparen);
9031             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9032             cur_curlyx->u.curlyx.count--;
9033             CACHEsayNO;
9034             NOT_REACHED; /* NOTREACHED */
9035 
9036         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
9037             REGCP_UNWIND(ST.lastcp);
9038             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9039             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
9040                 depth)
9041             );
9042 
9043           do_whilem_B_max:
9044             /* now try B */
9045             ST.save_curlyx = cur_curlyx;
9046             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
9047             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
9048                                 locinput, loceol, script_run_begin);
9049             NOT_REACHED; /* NOTREACHED */
9050 
9051         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
9052             cur_curlyx = ST.save_curlyx;
9053 
9054             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) {
9055                 /* Maximum greed exceeded */
9056                 cur_curlyx->u.curlyx.count--;
9057                 CACHEsayNO;
9058             }
9059 
9060             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
9061             );
9062             /* Try grabbing another A and see if it helps. */
9063             cur_curlyx->u.curlyx.lastloc = locinput;
9064             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
9065             REGCP_SET(ST.lastcp);
9066             PUSH_STATE_GOTO(WHILEM_A_min,
9067                 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
9068                 locinput, loceol, script_run_begin);
9069             NOT_REACHED; /* NOTREACHED */
9070 
9071 #undef  ST
9072 #define ST st->u.branch
9073 
9074         case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
9075             next = scan + ARG1u(scan);
9076             if (next == scan)
9077                 next = NULL;
9078             ST.before_paren = ARG2a(scan);
9079             ST.after_paren = ARG2b(scan);
9080             goto branch_logic;
9081             NOT_REACHED; /* NOTREACHED */
9082 
9083         case BRANCH:	    /*  /(...|A|...)/ */
9084             ST.before_paren = ARG1a(scan);
9085             ST.after_paren = ARG1b(scan);
9086           branch_logic:
9087             scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
9088             assert(scan);
9089             ST.lastparen = RXp_LASTPAREN(rex);
9090             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9091             ST.next_branch = next;
9092             REGCP_SET(ST.cp);
9093             if (RE_PESSIMISTIC_PARENS) {
9094                 regcppush(rex, 0, maxopenparen);
9095                 REGCP_SET(ST.lastcp);
9096             }
9097 
9098             /* Now go into the branch */
9099             if (has_cutgroup) {
9100                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9101                                     script_run_begin);
9102             } else {
9103                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9104                                 script_run_begin);
9105             }
9106             NOT_REACHED; /* NOTREACHED */
9107 
9108         case CUTGROUP:  /*  /(*THEN)/  */
9109             sv_yes_mark = st->u.mark.mark_name = FLAGS(scan)
9110                 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ])
9111                 : NULL;
9112             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
9113                             script_run_begin);
9114             NOT_REACHED; /* NOTREACHED */
9115 
9116         case CUTGROUP_next_fail:
9117             do_cutgroup = 1;
9118             no_final = 1;
9119             if (st->u.mark.mark_name)
9120                 sv_commit = st->u.mark.mark_name;
9121             sayNO;
9122             NOT_REACHED; /* NOTREACHED */
9123 
9124         case BRANCH_next:
9125             sayYES;
9126             NOT_REACHED; /* NOTREACHED */
9127 
9128         case BRANCH_next_fail: /* that branch failed; try the next, if any */
9129             if (do_cutgroup) {
9130                 do_cutgroup = 0;
9131                 no_final = 0;
9132             }
9133             if (RE_PESSIMISTIC_PARENS) {
9134                 REGCP_UNWIND(ST.lastcp);
9135                 regcppop(rex,&maxopenparen);
9136             }
9137             REGCP_UNWIND(ST.cp);
9138             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9139             CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail");
9140             scan = ST.next_branch;
9141             /* no more branches? */
9142             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
9143                 DEBUG_EXECUTE_r({
9144                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
9145                         depth,
9146                         PL_colors[4],
9147                         PL_colors[5] );
9148                 });
9149                 sayNO_SILENT;
9150             }
9151             continue; /* execute next BRANCH[J] op */
9152             /* NOTREACHED */
9153 
9154         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
9155             minmod = 1;
9156             break;
9157 
9158 #undef  ST
9159 #define ST st->u.curlym
9160 
9161         case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
9162 
9163             /* This is an optimisation of CURLYX that enables us to push
9164              * only a single backtracking state, no matter how many matches
9165              * there are in {m,n}. It relies on the pattern being constant
9166              * length, with no parens to influence future backrefs
9167              */
9168 
9169             ST.me = scan;
9170             scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
9171 
9172             ST.lastparen      = RXp_LASTPAREN(rex);
9173             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9174 
9175             /* if paren positive, emulate an OPEN/CLOSE around A */
9176             if (FLAGS(ST.me)) {
9177                 U32 paren = FLAGS(ST.me);
9178                 lastopen = paren;
9179                 if (paren > maxopenparen)
9180                     maxopenparen = paren;
9181                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
9182             }
9183             ST.A = scan;
9184             ST.B = next;
9185             ST.alen = 0;
9186             ST.count = 0;
9187             ST.minmod = minmod;
9188             minmod = 0;
9189             ST.Binfo.count = -1;
9190             REGCP_SET(ST.cp);
9191 
9192             if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */
9193                 goto curlym_do_B;
9194 
9195           curlym_do_A: /* execute the A in /A{m,n}B/  */
9196             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9197                                 script_run_begin);
9198             NOT_REACHED; /* NOTREACHED */
9199 
9200         case CURLYM_A: /* we've just matched an A */
9201             ST.count++;
9202             /* after first match, determine A's length: u.curlym.alen */
9203             if (ST.count == 1) {
9204                 if (reginfo->is_utf8_target) {
9205                     char *s = st->locinput;
9206                     while (s < locinput) {
9207                         ST.alen++;
9208                         s += UTF8SKIP(s);
9209                     }
9210                 }
9211                 else {
9212                     ST.alen = locinput - st->locinput;
9213                 }
9214                 if (ST.alen == 0)
9215                     ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me);
9216             }
9217             DEBUG_EXECUTE_r(
9218                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9219                           depth, (IV) ST.count, (IV)ST.alen)
9220             );
9221 
9222             if (FLAGS(ST.me)) {
9223                 /* emulate CLOSE: mark current A as captured */
9224                 U32 paren = (U32)FLAGS(ST.me);
9225                 CLOSE_CAPTURE(rex, paren,
9226                     HOPc(locinput, -ST.alen) - reginfo->strbeg,
9227                     locinput - reginfo->strbeg);
9228             }
9229 
9230             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9231                 goto fake_end;
9232 
9233 
9234             if (!is_accepted) {
9235                 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me));
9236                 if ( max == REG_INFTY || ST.count < max )
9237                     goto curlym_do_A; /* try to match another A */
9238             }
9239             goto curlym_do_B; /* try to match B */
9240 
9241         case CURLYM_A_fail: /* just failed to match an A */
9242             REGCP_UNWIND(ST.cp);
9243 
9244 
9245             if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/
9246                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9247                 sayNO;
9248 
9249           curlym_do_B: /* execute the B in /A{m,n}B/  */
9250             if (is_accepted)
9251                 goto curlym_close_B;
9252 
9253             if (ST.Binfo.count < 0) {
9254                 /* calculate possible match of 1st char following curly */
9255                 assert(ST.B);
9256                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9257                     regnode *text_node = ST.B;
9258                     if (! HAS_TEXT(text_node))
9259                         FIND_NEXT_IMPT(text_node);
9260                     if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9261                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9262                                                         &ST.Binfo, reginfo))
9263                         {
9264                             sayNO;
9265                         }
9266                     }
9267                 }
9268             }
9269 
9270             DEBUG_EXECUTE_r(
9271                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
9272                     depth, (IV)ST.count)
9273             );
9274             if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9275                 assert(ST.Binfo.count > 0);
9276 
9277                 /* Do a quick test to hopefully rule out most non-matches */
9278                 if (     locinput + ST.Binfo.min_length > loceol
9279                     || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9280                 {
9281                     DEBUG_OPTIMISE_r(
9282                         Perl_re_exec_indentf( aTHX_
9283                             "CURLYM Fast bail next target=0x%X anded==0x%X"
9284                                                                 " mask=0x%X\n",
9285                             depth,
9286                             (int) nextbyte, ST.Binfo.first_byte_anded,
9287                                             ST.Binfo.first_byte_mask)
9288                     );
9289                     state_num = CURLYM_B_fail;
9290                     goto reenter_switch;
9291                 }
9292             }
9293 
9294           curlym_close_B:
9295             if (FLAGS(ST.me)) {
9296                 /* emulate CLOSE: mark current A as captured */
9297                 U32 paren = (U32)FLAGS(ST.me);
9298                 if (ST.count || is_accepted) {
9299                     CLOSE_CAPTURE(rex, paren,
9300                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
9301                         locinput - reginfo->strbeg);
9302                 }
9303                 else
9304                     RXp_OFFSp(rex)[paren].end = -1;
9305 
9306                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9307                 {
9308                     if (ST.count || is_accepted)
9309                         goto fake_end;
9310                     else
9311                         sayNO;
9312                 }
9313             }
9314 
9315             if (is_accepted)
9316                 goto fake_end;
9317 
9318             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9319                             script_run_begin);
9320             NOT_REACHED; /* NOTREACHED */
9321 
9322         case CURLYM_B_fail: /* just failed to match a B */
9323             REGCP_UNWIND(ST.cp);
9324             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9325             if (ST.minmod) {
9326                 I32 max = ARG2i(ST.me);
9327                 if (max != REG_INFTY && ST.count == max)
9328                     sayNO;
9329                 goto curlym_do_A; /* try to match a further A */
9330             }
9331             /* backtrack one A */
9332             if (ST.count == ARG1i(ST.me) /* min */)
9333                 sayNO;
9334             ST.count--;
9335             SET_locinput(HOPc(locinput, -ST.alen));
9336             goto curlym_do_B; /* try to match B */
9337 
9338 #undef ST
9339 #define ST st->u.curly
9340 
9341 #define CURLY_SETPAREN(paren, success)                                      \
9342     if (paren) {                                                            \
9343         if (success) {                                                      \
9344             CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \
9345                                  locinput - reginfo->strbeg);               \
9346         }                                                                   \
9347         else {                                                              \
9348             RXp_OFFSp(rex)[paren].end = -1;                                 \
9349             RXp_LASTPAREN(rex)  = ST.lastparen;                             \
9350             RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen;                    \
9351         }                                                                   \
9352     }
9353 
9354         case STAR:		/*  /A*B/ where A is width 1 char */
9355             ST.paren = 0;
9356             ST.min = 0;
9357             ST.max = REG_INFTY;
9358             scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9359             goto repeat;
9360 
9361         case PLUS:		/*  /A+B/ where A is width 1 char */
9362             ST.paren = 0;
9363             ST.min = 1;
9364             ST.max = REG_INFTY;
9365             scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9366             goto repeat;
9367 
9368         case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
9369             ST.paren = FLAGS(scan);     /* Which paren to set */
9370             ST.lastparen      = RXp_LASTPAREN(rex);
9371             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9372             if (ST.paren > maxopenparen)
9373                 maxopenparen = ST.paren;
9374             ST.min = ARG1i(scan);  /* min to match */
9375             ST.max = ARG2i(scan);  /* max to match */
9376             scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9377 
9378             /* handle the single-char capture called as a GOSUB etc */
9379             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9380             {
9381                 char *li = locinput;
9382                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9383                     sayNO;
9384                 SET_locinput(li);
9385                 goto fake_end;
9386             }
9387 
9388             goto repeat;
9389 
9390         case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
9391             ST.paren = 0;
9392             ST.min = ARG1i(scan);  /* min to match */
9393             ST.max = ARG2i(scan);  /* max to match */
9394             scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9395           repeat:
9396             /*
9397             * Lookahead to avoid useless match attempts
9398             * when we know what character comes next.
9399             *
9400             * Used to only do .*x and .*?x, but now it allows
9401             * for )'s, ('s and (?{ ... })'s to be in the way
9402             * of the quantifier and the EXACT-like node.  -- japhy
9403             */
9404 
9405             assert(ST.min <= ST.max);
9406             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9407                 ST.Binfo.count = 0;
9408             }
9409             else {
9410                 regnode *text_node = next;
9411 
9412                 if (! HAS_TEXT(text_node))
9413                     FIND_NEXT_IMPT(text_node);
9414 
9415                 if (! HAS_TEXT(text_node))
9416                     ST.Binfo.count = 0;
9417                 else {
9418                     if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9419                         ST.Binfo.count = 0;
9420                     }
9421                     else {
9422                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9423                                                         &ST.Binfo, reginfo))
9424                         {
9425                             sayNO;
9426                         }
9427                     }
9428                 }
9429             }
9430 
9431             ST.A = scan;
9432             ST.B = next;
9433             if (minmod) {
9434                 char *li = locinput;
9435                 minmod = 0;
9436                 if (ST.min &&
9437                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9438                             < ST.min)
9439                     sayNO;
9440                 SET_locinput(li);
9441                 ST.count = ST.min;
9442                 REGCP_SET(ST.cp);
9443 
9444                 if (ST.Binfo.count <= 0)
9445                     goto curly_try_B_min;
9446 
9447                 ST.oldloc = locinput;
9448 
9449                 /* set ST.maxpos to the furthest point along the
9450                  * string that could possibly match, i.e., that a match could
9451                  * start at. */
9452                 if  (ST.max == REG_INFTY) {
9453                     ST.maxpos = loceol - 1;
9454                     if (utf8_target)
9455                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9456                             ST.maxpos--;
9457                 }
9458                 else if (utf8_target) {
9459                     int m = ST.max - ST.min;
9460                     for (ST.maxpos = locinput;
9461                          m >0 && ST.maxpos <  loceol; m--)
9462                         ST.maxpos += UTF8SKIP(ST.maxpos);
9463                 }
9464                 else {
9465                     ST.maxpos = locinput + ST.max - ST.min;
9466                     if (ST.maxpos >=  loceol)
9467                         ST.maxpos =  loceol - 1;
9468                 }
9469                 goto curly_try_B_min_known;
9470 
9471             }
9472             else {
9473                 /* avoid taking address of locinput, so it can remain
9474                  * a register var */
9475                 char *li = locinput;
9476                 if (ST.max)
9477                     ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9478                 else
9479                     ST.count = 0;
9480                 if (ST.count < ST.min)
9481                     sayNO;
9482                 SET_locinput(li);
9483                 if ((ST.count > ST.min)
9484                     && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9485                 {
9486                     /* A{m,n} must come at the end of the string, there's
9487                      * no point in backing off ... */
9488                     ST.min = ST.count;
9489                     /* ...except that $ and \Z can match before *and* after
9490                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9491                        We may back off by one in this case. */
9492                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9493                         ST.min--;
9494                 }
9495                 REGCP_SET(ST.cp);
9496                 goto curly_try_B_max;
9497             }
9498             NOT_REACHED; /* NOTREACHED */
9499 
9500         case CURLY_B_min_fail:
9501             /* failed to find B in a non-greedy match. */
9502             if (RE_PESSIMISTIC_PARENS) {
9503                 REGCP_UNWIND(ST.lastcp);
9504                 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9505             }
9506             REGCP_UNWIND(ST.cp);
9507             if (ST.paren) {
9508                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9509             }
9510 
9511             if (ST.Binfo.count == 0) {
9512                 /* failed -- move forward one */
9513                 char *li = locinput;
9514                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9515                     sayNO;
9516                 }
9517                 locinput = li;
9518                 ST.count++;
9519                 if (!(   ST.count <= ST.max
9520                         /* count overflow ? */
9521                      || (ST.max == REG_INFTY && ST.count > 0))
9522                 )
9523                     sayNO;
9524             }
9525             else {
9526                 int n;
9527                 /* Couldn't or didn't -- move forward. */
9528                 ST.oldloc = locinput;
9529                 if (utf8_target)
9530                     locinput += UTF8SKIP(locinput);
9531                 else
9532                     locinput++;
9533                 ST.count++;
9534 
9535               curly_try_B_min_known:
9536                 /* find the next place where 'B' could work, then call B */
9537                 if (locinput + ST.Binfo.initial_exact < loceol) {
9538                     if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9539 
9540                         /* Here, the mask is all 1's for the entire length of
9541                          * any possible match.  (That actually means that there
9542                          * is only one possible match.)  Look for the next
9543                          * occurrence */
9544                         locinput = ninstr(locinput, loceol,
9545                                         (char *) ST.Binfo.matches,
9546                                         (char *) ST.Binfo.matches
9547                                                     + ST.Binfo.initial_exact);
9548                         if (locinput == NULL) {
9549                             sayNO;
9550                         }
9551                     }
9552                     else do {
9553                         /* If the first byte(s) of the mask are all ones, it
9554                          * means those bytes must match identically, so can use
9555                          * ninstr() to find the next possible matchpoint */
9556                         if (ST.Binfo.initial_exact > 0) {
9557                             locinput = ninstr(locinput, loceol,
9558                                               (char *) ST.Binfo.matches,
9559                                               (char *) ST.Binfo.matches
9560                                                      + ST.Binfo.initial_exact);
9561                         }
9562                         else { /* Otherwise find the next byte that matches,
9563                                   masked */
9564                             locinput = (char *) find_next_masked(
9565                                                 (U8 *) locinput, (U8 *) loceol,
9566                                                 ST.Binfo.first_byte_anded,
9567                                                 ST.Binfo.first_byte_mask);
9568                             /* Advance to the end of a multi-byte character */
9569                             if (utf8_target) {
9570                                 while (   locinput < loceol
9571                                     && UTF8_IS_CONTINUATION(*locinput))
9572                                 {
9573                                     locinput++;
9574                                 }
9575                             }
9576                         }
9577                         if (   locinput == NULL
9578                             || locinput + ST.Binfo.min_length > loceol)
9579                         {
9580                             sayNO;
9581                         }
9582 
9583                         /* Here, we have found a possible match point; if can't
9584                          * rule it out, quit the loop so can check fully */
9585                         if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9586                             break;
9587                         }
9588 
9589                         locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9590 
9591                     } while (locinput <= ST.maxpos);
9592                 }
9593 
9594                 if (locinput > ST.maxpos)
9595                     sayNO;
9596 
9597                 n = (utf8_target)
9598                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9599                     : (STRLEN) (locinput - ST.oldloc);
9600 
9601 
9602                 /* Here is at the beginning of a character that meets the mask
9603                  * criteria.  Need to make sure that some real possibility */
9604 
9605                 if (n) {
9606                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9607                      * at what may be the beginning of b; check that everything
9608                      * between oldloc and locinput matches */
9609                     char *li = ST.oldloc;
9610                     ST.count += n;
9611                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9612                         sayNO;
9613                     assert(n == REG_INFTY || locinput == li);
9614                 }
9615             }
9616 
9617           curly_try_B_min:
9618             if (RE_PESSIMISTIC_PARENS) {
9619                 (void)regcppush(rex, 0, maxopenparen);
9620                 REGCP_SET(ST.lastcp);
9621             }
9622             CURLY_SETPAREN(ST.paren, ST.count);
9623             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9624                             script_run_begin);
9625             NOT_REACHED; /* NOTREACHED */
9626 
9627 
9628           curly_try_B_max:
9629             /* a successful greedy match: now try to match B */
9630             if (        ST.Binfo.count <= 0
9631                 || (    ST.Binfo.count > 0
9632                     &&  locinput + ST.Binfo.min_length <= loceol
9633                     &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9634             {
9635                 if (RE_PESSIMISTIC_PARENS) {
9636                     (void)regcppush(rex, 0, maxopenparen);
9637                     REGCP_SET(ST.lastcp);
9638                 }
9639                 CURLY_SETPAREN(ST.paren, ST.count);
9640                 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9641                                 script_run_begin);
9642                 NOT_REACHED; /* NOTREACHED */
9643             }
9644             goto CURLY_B_all_failed;
9645             NOT_REACHED; /* NOTREACHED */
9646 
9647         case CURLY_B_max_fail:
9648             /* failed to find B in a greedy match */
9649 
9650             if (RE_PESSIMISTIC_PARENS) {
9651                 REGCP_UNWIND(ST.lastcp);
9652                 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9653             }
9654           CURLY_B_all_failed:
9655             REGCP_UNWIND(ST.cp);
9656             if (ST.paren) {
9657                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9658             }
9659             /*  back up. */
9660             if (--ST.count < ST.min)
9661                 sayNO;
9662             locinput = HOPc(locinput, -1);
9663             goto curly_try_B_max;
9664 
9665 #undef ST
9666 
9667         case END: /*  last op of main pattern  */
9668           fake_end:
9669             if (cur_eval) {
9670                 /* we've just finished A in /(??{A})B/; now continue with B */
9671                 is_accepted= false;
9672                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9673                 st->u.eval.prev_rex = rex_sv;		/* inner */
9674 
9675                 /* Save *all* the positions. */
9676                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9677                 rex_sv = CUR_EVAL.prev_rex;
9678                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9679                 SET_reg_curpm(rex_sv);
9680                 rex = ReANY(rex_sv);
9681                 rexi = RXi_GET(rex);
9682 
9683                 st->u.eval.prev_curlyx = cur_curlyx;
9684                 cur_curlyx = CUR_EVAL.prev_curlyx;
9685 
9686                 REGCP_SET(st->u.eval.lastcp);
9687 
9688                 /* Restore parens of the outer rex without popping the
9689                  * savestack */
9690                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9691 
9692                 st->u.eval.prev_eval = cur_eval;
9693                 cur_eval = CUR_EVAL.prev_eval;
9694                 DEBUG_EXECUTE_r(
9695                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9696                                       depth, cur_eval););
9697                 if ( nochange_depth )
9698                     nochange_depth--;
9699 
9700                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9701 
9702                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9703                                     st->u.eval.prev_eval->u.eval.B,
9704                                     locinput, loceol, script_run_begin);
9705             }
9706 
9707             if (locinput < reginfo->till) {
9708                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9709                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9710                                       PL_colors[4],
9711                                       (long)(locinput - startpos),
9712                                       (long)(reginfo->till - startpos),
9713                                       PL_colors[5]));
9714 
9715                 sayNO_SILENT;		/* Cannot match: too short. */
9716             }
9717             sayYES;			/* Success! */
9718 
9719         case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9720                                 matches end at the right spot, required for
9721                                 variable length matches. */
9722             if (match_end && locinput != match_end)
9723             {
9724                 DEBUG_EXECUTE_r(
9725                 Perl_re_exec_indentf( aTHX_
9726                     "%sLOOKBEHIND_END: subpattern failed...%s\n",
9727                     depth, PL_colors[4], PL_colors[5]));
9728                 sayNO;            /* Variable length match didn't line up */
9729             }
9730             /* FALLTHROUGH */
9731 
9732         case SUCCEED: /* successful SUSPEND/CURLYM and
9733                                             *lookahead* IFMATCH/UNLESSM*/
9734             DEBUG_EXECUTE_r(
9735             Perl_re_exec_indentf( aTHX_
9736                 "%sSUCCEED: subpattern success...%s\n",
9737                 depth, PL_colors[4], PL_colors[5]));
9738             sayYES;			/* Success! */
9739 
9740 #undef  ST
9741 #define ST st->u.ifmatch
9742 
9743         case SUSPEND:	/* (?>A) */
9744             ST.wanted = 1;
9745             ST.start = locinput;
9746             ST.end = loceol;
9747             ST.count = 1;
9748             goto do_ifmatch;
9749 
9750         case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9751             ST.wanted = 0;
9752             goto ifmatch_trivial_fail_test;
9753 
9754         case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9755             ST.wanted = 1;
9756           ifmatch_trivial_fail_test:
9757             ST.prev_match_end= match_end;
9758             ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be
9759                                               lookbehind count, requires
9760                                               non-zero flags */
9761             if (! FLAGS(scan)) {    /* 'flags' zero means lookahed */
9762 
9763                 /* Lookahead starts here and ends at the normal place */
9764                 ST.start = locinput;
9765                 ST.end = loceol;
9766                 match_end = NULL;
9767             }
9768             else {
9769                 PERL_UINT_FAST8_T back_count = FLAGS(scan);
9770                 char * s;
9771                 match_end = locinput;
9772 
9773                 /* Lookbehind can look beyond the current position */
9774                 ST.end = loceol;
9775 
9776                 /* ... and starts at the first place in the input that is in
9777                  * the range of the possible start positions */
9778                 for (; ST.count > 0; ST.count--, back_count--) {
9779                     s = HOPBACKc(locinput, back_count);
9780                     if (s) {
9781                         ST.start = s;
9782                         goto do_ifmatch;
9783                     }
9784                 }
9785 
9786                 /* If the lookbehind doesn't start in the actual string, is a
9787                  * trivial match failure */
9788                 match_end = ST.prev_match_end;
9789                 if (logical) {
9790                     logical = 0;
9791                     sw = 1 - cBOOL(ST.wanted);
9792                 }
9793                 else if (ST.wanted)
9794                     sayNO;
9795 
9796                 /* Here, we didn't want it to match, so is actually success */
9797                 next = scan + ARG1u(scan);
9798                 if (next == scan)
9799                     next = NULL;
9800                 break;
9801             }
9802 
9803           do_ifmatch:
9804             ST.me = scan;
9805             ST.logical = logical;
9806             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9807 
9808             /* execute body of (?...A) */
9809             PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9810                                 ST.end, script_run_begin);
9811             NOT_REACHED; /* NOTREACHED */
9812 
9813         {
9814             bool matched;
9815 
9816         case IFMATCH_A_fail: /* body of (?...A) failed */
9817             if (! ST.logical && ST.count > 1) {
9818 
9819                 /* It isn't a real failure until we've tried all starting
9820                  * positions.  Move to the next starting position and retry */
9821                 ST.count--;
9822                 ST.start = HOPc(ST.start, 1);
9823                 scan = ST.me;
9824                 logical = ST.logical;
9825                 goto do_ifmatch;
9826             }
9827 
9828             /* Here, all starting positions have been tried. */
9829             matched = FALSE;
9830             goto ifmatch_done;
9831 
9832         case IFMATCH_A: /* body of (?...A) succeeded */
9833             matched = TRUE;
9834           ifmatch_done:
9835             sw = matched == ST.wanted;
9836             match_end = ST.prev_match_end;
9837             if (! ST.logical && !sw) {
9838                 sayNO;
9839             }
9840 
9841             if (OP(ST.me) != SUSPEND) {
9842                 /* restore old position except for (?>...) */
9843                 locinput = st->locinput;
9844                 loceol = st->loceol;
9845                 script_run_begin = st->sr0;
9846             }
9847             scan = ST.me + ARG1u(ST.me);
9848             if (scan == ST.me)
9849                 scan = NULL;
9850             continue; /* execute B */
9851         }
9852 
9853 #undef ST
9854 
9855         case LONGJMP: /*  alternative with many branches compiles to
9856                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9857             next = scan + ARG1u(scan);
9858             if (next == scan)
9859                 next = NULL;
9860             break;
9861 
9862         case COMMIT:  /*  (*COMMIT)  */
9863             reginfo->cutpoint = loceol;
9864             /* FALLTHROUGH */
9865 
9866         case PRUNE:   /*  (*PRUNE)   */
9867             if (FLAGS(scan))
9868                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9869             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9870                             script_run_begin);
9871             NOT_REACHED; /* NOTREACHED */
9872 
9873         case COMMIT_next_fail:
9874             no_final = 1;
9875             /* FALLTHROUGH */
9876             sayNO;
9877             NOT_REACHED; /* NOTREACHED */
9878 
9879         case OPFAIL:   /* (*FAIL)  */
9880             if (FLAGS(scan))
9881                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9882             if (logical) {
9883                 /* deal with (?(?!)X|Y) properly,
9884                  * make sure we trigger the no branch
9885                  * of the trailing IFTHEN structure*/
9886                 sw= 0;
9887                 break;
9888             } else {
9889                 sayNO;
9890             }
9891             NOT_REACHED; /* NOTREACHED */
9892 
9893 #define ST st->u.mark
9894         case MARKPOINT: /*  (*MARK:foo)  */
9895             ST.prev_mark = mark_state;
9896             ST.mark_name = sv_commit = sv_yes_mark
9897                 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9898             mark_state = st;
9899             ST.mark_loc = locinput;
9900             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9901                                 script_run_begin);
9902             NOT_REACHED; /* NOTREACHED */
9903 
9904         case MARKPOINT_next:
9905             mark_state = ST.prev_mark;
9906             sayYES;
9907             NOT_REACHED; /* NOTREACHED */
9908 
9909         case MARKPOINT_next_fail:
9910             if (popmark && sv_eq(ST.mark_name,popmark))
9911             {
9912                 if (ST.mark_loc > startpoint)
9913                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9914                 popmark = NULL; /* we found our mark */
9915                 sv_commit = ST.mark_name;
9916 
9917                 DEBUG_EXECUTE_r({
9918                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9919                             depth,
9920                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9921                 });
9922             }
9923             mark_state = ST.prev_mark;
9924             sv_yes_mark = mark_state ?
9925                 mark_state->u.mark.mark_name : NULL;
9926             sayNO;
9927             NOT_REACHED; /* NOTREACHED */
9928 
9929         case SKIP:  /*  (*SKIP)  */
9930             if (!FLAGS(scan)) {
9931                 /* (*SKIP) : if we fail we cut here*/
9932                 ST.mark_name = NULL;
9933                 ST.mark_loc = locinput;
9934                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9935                                 script_run_begin);
9936             } else {
9937                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9938                    otherwise do nothing.  Meaning we need to scan
9939                  */
9940                 regmatch_state *cur = mark_state;
9941                 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9942 
9943                 while (cur) {
9944                     if ( sv_eq( cur->u.mark.mark_name,
9945                                 find ) )
9946                     {
9947                         ST.mark_name = find;
9948                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9949                                          script_run_begin);
9950                     }
9951                     cur = cur->u.mark.prev_mark;
9952                 }
9953             }
9954             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9955             break;
9956 
9957         case SKIP_next_fail:
9958             if (ST.mark_name) {
9959                 /* (*CUT:NAME) - Set up to search for the name as we
9960                    collapse the stack*/
9961                 popmark = ST.mark_name;
9962             } else {
9963                 /* (*CUT) - No name, we cut here.*/
9964                 if (ST.mark_loc > startpoint)
9965                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9966                 /* but we set sv_commit to latest mark_name if there
9967                    is one so they can test to see how things lead to this
9968                    cut */
9969                 if (mark_state)
9970                     sv_commit=mark_state->u.mark.mark_name;
9971             }
9972             no_final = 1;
9973             sayNO;
9974             NOT_REACHED; /* NOTREACHED */
9975 #undef ST
9976 
9977         case LNBREAK: /* \R */
9978             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9979                 locinput += n;
9980             } else
9981                 sayNO;
9982             break;
9983 
9984         default:
9985             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9986                           PTR2UV(scan), OP(scan));
9987             Perl_croak(aTHX_ "regexp memory corruption");
9988 
9989         /* this is a point to jump to in order to increment
9990          * locinput by one character */
9991           increment_locinput:
9992             assert(!NEXTCHR_IS_EOS);
9993             if (utf8_target) {
9994                 locinput += PL_utf8skip[nextbyte];
9995                 /* locinput is allowed to go 1 char off the end (signifying
9996                  * EOS), but not 2+ */
9997                 if (locinput >  loceol)
9998                     sayNO;
9999             }
10000             else
10001                 locinput++;
10002             break;
10003 
10004         } /* end switch */
10005 
10006         /* switch break jumps here */
10007         scan = next; /* prepare to execute the next op and ... */
10008         continue;    /* ... jump back to the top, reusing st */
10009         /* NOTREACHED */
10010 
10011       push_yes_state:
10012         /* push a state that backtracks on success */
10013         st->u.yes.prev_yes_state = yes_state;
10014         yes_state = st;
10015         /* FALLTHROUGH */
10016       push_state:
10017         /* push a new regex state, then continue at scan  */
10018         {
10019             regmatch_state *newst;
10020             DECLARE_AND_GET_RE_DEBUG_FLAGS;
10021 
10022             DEBUG_r( /* DEBUG_STACK_r */
10023               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
10024                 regmatch_state *cur = st;
10025                 regmatch_state *curyes = yes_state;
10026                 U32 i;
10027                 regmatch_slab *slab = PL_regmatch_slab;
10028                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
10029                     if (cur < SLAB_FIRST(slab)) {
10030                         slab = slab->prev;
10031                         cur = SLAB_LAST(slab);
10032                     }
10033                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
10034                         depth,
10035                         i ? "    " : "push",
10036                         depth - i, REGNODE_NAME(cur->resume_state),
10037                         (curyes == cur) ? "yes" : ""
10038                     );
10039                     if (curyes == cur)
10040                         curyes = cur->u.yes.prev_yes_state;
10041                 }
10042             } else {
10043                 DEBUG_STATE_pp("push")
10044             });
10045             depth++;
10046             st->locinput = locinput;
10047             st->loceol = loceol;
10048             st->sr0 = script_run_begin;
10049             newst = st+1;
10050             if (newst >  SLAB_LAST(PL_regmatch_slab))
10051                 newst = S_push_slab(aTHX);
10052             PL_regmatch_state = newst;
10053 
10054             locinput = pushinput;
10055             loceol = pusheol;
10056             script_run_begin = pushsr0;
10057             st = newst;
10058             continue;
10059             /* NOTREACHED */
10060         }
10061     }
10062 #ifdef SOLARIS_BAD_OPTIMIZER
10063 #  undef PL_charclass
10064 #endif
10065 
10066     /*
10067     * We get here only if there's trouble -- normally "case END" is
10068     * the terminating point.
10069     */
10070     Perl_croak(aTHX_ "corrupted regexp pointers");
10071     NOT_REACHED; /* NOTREACHED */
10072 
10073   yes:
10074     if (yes_state) {
10075         /* we have successfully completed a subexpression, but we must now
10076          * pop to the state marked by yes_state and continue from there */
10077         assert(st != yes_state);
10078 #ifdef DEBUGGING
10079         while (st != yes_state) {
10080             st--;
10081             if (st < SLAB_FIRST(PL_regmatch_slab)) {
10082                 PL_regmatch_slab = PL_regmatch_slab->prev;
10083                 st = SLAB_LAST(PL_regmatch_slab);
10084             }
10085             DEBUG_STATE_r({
10086                 if (no_final) {
10087                     DEBUG_STATE_pp("pop (no final)");
10088                 } else {
10089                     DEBUG_STATE_pp("pop (yes)");
10090                 }
10091             });
10092             depth--;
10093         }
10094 #else
10095         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
10096             || yes_state > SLAB_LAST(PL_regmatch_slab))
10097         {
10098             /* not in this slab, pop slab */
10099             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
10100             PL_regmatch_slab = PL_regmatch_slab->prev;
10101             st = SLAB_LAST(PL_regmatch_slab);
10102         }
10103         depth -= (st - yes_state);
10104 #endif
10105         st = yes_state;
10106         yes_state = st->u.yes.prev_yes_state;
10107         PL_regmatch_state = st;
10108 
10109         if (no_final) {
10110             locinput= st->locinput;
10111             loceol= st->loceol;
10112             script_run_begin = st->sr0;
10113         }
10114         state_num = st->resume_state + no_final;
10115         goto reenter_switch;
10116     }
10117 
10118     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
10119                           PL_colors[4], PL_colors[5]));
10120 
10121     if (reginfo->info_aux_eval) {
10122         /* each successfully executed (?{...}) block does the equivalent of
10123          *   local $^R = do {...}
10124          * When popping the save stack, all these locals would be undone;
10125          * bypass this by setting the outermost saved $^R to the latest
10126          * value */
10127         /* I don't know if this is needed or works properly now.
10128          * see code related to PL_replgv elsewhere in this file.
10129          * Yves
10130          */
10131         if (oreplsv != GvSV(PL_replgv)) {
10132             sv_setsv(oreplsv, GvSV(PL_replgv));
10133             SvSETMAGIC(oreplsv);
10134         }
10135     }
10136     result = 1;
10137     goto final_exit;
10138 
10139   no:
10140     DEBUG_EXECUTE_r(
10141         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
10142             depth,
10143             PL_colors[4], PL_colors[5])
10144         );
10145 
10146   no_silent:
10147     if (no_final) {
10148         if (yes_state) {
10149             goto yes;
10150         } else {
10151             goto final_exit;
10152         }
10153     }
10154     if (depth) {
10155         /* there's a previous state to backtrack to */
10156         st--;
10157         if (st < SLAB_FIRST(PL_regmatch_slab)) {
10158             PL_regmatch_slab = PL_regmatch_slab->prev;
10159             st = SLAB_LAST(PL_regmatch_slab);
10160         }
10161         PL_regmatch_state = st;
10162         locinput= st->locinput;
10163         loceol= st->loceol;
10164         script_run_begin = st->sr0;
10165 
10166         DEBUG_STATE_pp("pop");
10167         depth--;
10168         if (yes_state == st)
10169             yes_state = st->u.yes.prev_yes_state;
10170 
10171         state_num = st->resume_state + 1; /* failure = success + 1 */
10172         PERL_ASYNC_CHECK();
10173         goto reenter_switch;
10174     }
10175     result = 0;
10176 
10177   final_exit:
10178     if (rex->intflags & PREGf_VERBARG_SEEN) {
10179         SV *sv_err = get_sv("REGERROR", 1);
10180         SV *sv_mrk = get_sv("REGMARK", 1);
10181         if (result) {
10182             sv_commit = &PL_sv_no;
10183             if (!sv_yes_mark)
10184                 sv_yes_mark = &PL_sv_yes;
10185         } else {
10186             if (!sv_commit)
10187                 sv_commit = &PL_sv_yes;
10188             sv_yes_mark = &PL_sv_no;
10189         }
10190         assert(sv_err);
10191         assert(sv_mrk);
10192         sv_setsv(sv_err, sv_commit);
10193         sv_setsv(sv_mrk, sv_yes_mark);
10194     }
10195 
10196 
10197     if (last_pushed_cv) {
10198         dSP;
10199         /* see "Some notes about MULTICALL" above */
10200         POP_MULTICALL;
10201         PERL_UNUSED_VAR(SP);
10202     }
10203     else
10204         LEAVE_SCOPE(orig_savestack_ix);
10205 
10206     assert(!result ||  locinput - reginfo->strbeg >= 0);
10207     return result ?  locinput - reginfo->strbeg : -1;
10208 }
10209 
10210 /*
10211  - regrepeat - repeatedly match something simple, report how many
10212  *
10213  * What 'simple' means is a node which can be the operand of a quantifier like
10214  * '+', or {1,3}
10215  *
10216  * startposp - pointer to a pointer to the start position.  This is updated
10217  *             to point to the byte following the highest successful
10218  *             match.
10219  * p         - the regnode to be repeatedly matched against.
10220  * loceol    - pointer to the end position beyond which we aren't supposed to
10221  *             look.
10222  * reginfo   - struct holding match state, such as utf8_target
10223  * max       - maximum number of things to match.
10224  * depth     - (for debugging) backtracking depth.
10225  */
10226 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max comma_pDEPTH)10227 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10228             char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH)
10229 {
10230     char *scan;     /* Pointer to current position in target string */
10231     I32 c;
10232     char *this_eol = loceol;   /* potentially adjusted version. */
10233     I32 hardcount = 0;  /* How many matches so far */
10234     bool utf8_target = reginfo->is_utf8_target;
10235     unsigned int to_complement = 0;  /* Invert the result? */
10236     char_class_number_ classnum;
10237 
10238     PERL_ARGS_ASSERT_REGREPEAT;
10239 
10240     /* This routine is structured so that we switch on the input OP.  Each OP
10241      * case: statement contains a loop to repeatedly apply the OP, advancing
10242      * the input until it fails, or reaches the end of the input, or until it
10243      * reaches the upper limit of matches. */
10244 
10245     scan = *startposp;
10246     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
10247                                max */
10248         max = I32_MAX;
10249     else if (! utf8_target && this_eol - scan > max)
10250         this_eol = scan + max;
10251 
10252     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10253      * down to the maximum of how far we should go in it (but leaving it set to
10254      * the real end if the maximum permissible would take us beyond that).
10255      * This allows us to make the loop exit condition that we haven't gone past
10256      * <this_eol> to also mean that we haven't exceeded the max permissible
10257      * count, saving a test each time through the loop.  But it assumes that
10258      * the OP matches a single byte, which is true for most of the OPs below
10259      * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
10260      * have this characteristic have to compensate.
10261      *
10262      * There is no such adjustment for UTF-8 targets, since the number of bytes
10263      * per character can vary.  OPs will have to test both that the count is
10264      * less than the max permissible (using <hardcount> to keep track), and
10265      * that we are still within the bounds of the string (using <this_eol>.  A
10266      * few OPs match a single byte no matter what the encoding.  They can omit
10267      * the max test if, for the UTF-8 case, they do the adjustment that was
10268      * skipped above.
10269      *
10270      * Thus, the code above sets things up for the common case; and exceptional
10271      * cases need extra work; the common case is to make sure <scan> doesn't go
10272      * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10273      * count doesn't exceed the maximum permissible */
10274 
10275     switch (with_t_UTF8ness(OP(p), utf8_target)) {
10276         SV * anyofh_list;
10277 
10278       case REG_ANY_t8:
10279         while (scan < this_eol && hardcount < max && *scan != '\n') {
10280             scan += UTF8SKIP(scan);
10281             hardcount++;
10282         }
10283         break;
10284 
10285       case REG_ANY_tb:
10286         scan = (char *) memchr(scan, '\n', this_eol - scan);
10287         if (! scan) {
10288             scan = this_eol;
10289         }
10290         break;
10291 
10292       case SANY_t8:
10293         while (scan < this_eol && hardcount < max) {
10294             scan += UTF8SKIP(scan);
10295             hardcount++;
10296         }
10297         break;
10298 
10299       case SANY_tb:
10300         scan = this_eol;
10301         break;
10302 
10303       case EXACT_REQ8_tb:
10304       case LEXACT_REQ8_tb:
10305       case EXACTFU_REQ8_tb:
10306         break;
10307 
10308       case EXACTL_t8:
10309         if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10310             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10311         }
10312         /* FALLTHROUGH */
10313 
10314       case EXACTL_tb:
10315       case EXACTFL_t8:
10316       case EXACTFL_tb:
10317       case EXACTFLU8_t8:
10318       case EXACTFLU8_tb:
10319         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10320         /* FALLTHROUGH */
10321 
10322       case EXACT_REQ8_t8:
10323       case LEXACT_REQ8_t8:
10324       case EXACTFU_REQ8_t8:
10325       case LEXACT_t8:
10326       case LEXACT_tb:
10327       case EXACT_t8:
10328       case EXACT_tb:
10329       case EXACTF_t8:
10330       case EXACTF_tb:
10331       case EXACTFAA_NO_TRIE_t8:
10332       case EXACTFAA_NO_TRIE_tb:
10333       case EXACTFAA_t8:
10334       case EXACTFAA_tb:
10335       case EXACTFU_t8:
10336       case EXACTFU_tb:
10337       case EXACTFUP_t8:
10338       case EXACTFUP_tb:
10339 
10340       {
10341         struct next_matchable_info Binfo;
10342         PERL_UINT_FAST8_T definitive_len;
10343 
10344         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10345 
10346         /* Set up termination info, and quit if we can rule out that we've
10347          * gotten a match of the termination criteria */
10348         if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10349             ||   scan + Binfo.min_length > this_eol
10350             || ! S_test_EXACTISH_ST(scan, Binfo))
10351         {
10352             break;
10353         }
10354 
10355         definitive_len = Binfo.initial_definitive;
10356 
10357         /* Here there are potential matches, and the first byte(s) matched our
10358          * filter
10359          *
10360          * If we got a definitive match of some initial bytes, there is no
10361          * possibility of false positives as far as it got */
10362         if (definitive_len > 0) {
10363 
10364             /* If as far as it got is the maximum possible, there were no false
10365              * positives at all.  Since we have everything set up, see how many
10366              * repeats there are. */
10367             if (definitive_len >= Binfo.max_length) {
10368 
10369                 /* We've already found one match */
10370                 scan += definitive_len;
10371                 hardcount++;
10372 
10373                 /* If want more than the one match, and there is room for more,
10374                  * see if there are any */
10375                 if (hardcount < max && scan + definitive_len <= this_eol) {
10376 
10377                     /* If the character is only a single byte long, just span
10378                      * all such bytes. */
10379                     if (definitive_len == 1) {
10380                         const char * orig_scan = scan;
10381 
10382                         if (this_eol - (scan - hardcount) > max) {
10383                             this_eol = scan - hardcount + max;
10384                         }
10385 
10386                         /* Use different routines depending on whether it's an
10387                          * exact match or matches with a mask */
10388                         if (Binfo.initial_exact == 1) {
10389                             scan = (char *) find_span_end((U8 *) scan,
10390                                                           (U8 *) this_eol,
10391                                                           Binfo.matches[0]);
10392                         }
10393                         else {
10394                             scan = (char *) find_span_end_mask(
10395                                                        (U8 *) scan,
10396                                                        (U8 *) this_eol,
10397                                                        Binfo.first_byte_anded,
10398                                                        Binfo.first_byte_mask);
10399                         }
10400 
10401                         hardcount += scan - orig_scan;
10402                     }
10403                     else { /* Here, the full character definitive match is more
10404                               than one byte */
10405                         while (   hardcount < max
10406                                && scan + definitive_len <= this_eol
10407                                && S_test_EXACTISH_ST(scan, Binfo))
10408                         {
10409                                 scan += definitive_len;
10410                                 hardcount++;
10411                         }
10412                     }
10413                 }
10414 
10415                 break;
10416             }   /* End of a full character is definitively matched */
10417 
10418             /* Here, an initial portion of the character matched definitively,
10419              * and the rest matched as well, but could have false positives */
10420 
10421             do {
10422                 int i;
10423                 U8 * matches = Binfo.matches;
10424 
10425                 /* The first bytes were definitive.  Look at the remaining */
10426                 for (i = 0; i < Binfo.count; i++) {
10427                     if (memEQ(scan + definitive_len,
10428                               matches + definitive_len,
10429                               Binfo.lengths[i] - definitive_len))
10430                     {
10431                         goto found_a_completion;
10432                     }
10433 
10434                     matches += Binfo.lengths[i];
10435                 }
10436 
10437                 /* Didn't find anything to complete our initial match.  Stop
10438                  * here */
10439                 break;
10440 
10441               found_a_completion:
10442 
10443                 /* Here, matched a full character, Include it in the result,
10444                  * and then look to see if the next char matches */
10445                 hardcount++;
10446                 scan += Binfo.lengths[i];
10447 
10448             } while (   hardcount < max
10449                      && scan + definitive_len < this_eol
10450                      && S_test_EXACTISH_ST(scan, Binfo));
10451 
10452             /* Here, have advanced as far as possible */
10453             break;
10454         } /* End of found some initial bytes that definitively matched */
10455 
10456         /* Here, we can't rule out that we have found the beginning of 'B', but
10457          * there were no initial bytes that could rule out anything
10458          * definitively. Use brute force to examine all the possibilities */
10459         while (scan < this_eol && hardcount < max) {
10460             int i;
10461             U8 * matches = Binfo.matches;
10462 
10463             for (i = 0; i < Binfo.count; i++) {
10464                 if (memEQ(scan, matches, Binfo.lengths[i])) {
10465                     goto found1;
10466                 }
10467 
10468                 matches += Binfo.lengths[i];
10469             }
10470 
10471             break;
10472 
10473           found1:
10474             hardcount++;
10475             scan += Binfo.lengths[i];
10476         }
10477 
10478         break;
10479       }
10480 
10481       case ANYOFPOSIXL_t8:
10482       case ANYOFL_t8:
10483         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10484         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10485 
10486         /* FALLTHROUGH */
10487       case ANYOFD_t8:
10488       case ANYOF_t8:
10489         while (   hardcount < max
10490                && scan < this_eol
10491                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10492         {
10493             scan += UTF8SKIP(scan);
10494             hardcount++;
10495         }
10496         break;
10497 
10498       case ANYOFPOSIXL_tb:
10499       case ANYOFL_tb:
10500         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10501         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10502         /* FALLTHROUGH */
10503 
10504       case ANYOFD_tb:
10505       case ANYOF_tb:
10506         if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10507             while (   scan < this_eol
10508                    && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10509                 scan++;
10510         }
10511         else {
10512             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10513                 scan++;
10514         }
10515         break;
10516 
10517       case ANYOFM_t8:
10518         if (this_eol - scan > max) {
10519 
10520             /* We didn't adjust <this_eol> at the beginning of this routine
10521              * because is UTF-8, but it is actually ok to do so, since here, to
10522              * match, 1 char == 1 byte. */
10523             this_eol = scan + max;
10524         }
10525         /* FALLTHROUGH */
10526 
10527       case ANYOFM_tb:
10528         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10529                                            (U8) ARG1u(p), FLAGS(p));
10530         break;
10531 
10532       case NANYOFM_t8:
10533         while (     hardcount < max
10534                &&   scan < this_eol
10535                &&  (*scan & FLAGS(p)) != ARG1u(p))
10536         {
10537             scan += UTF8SKIP(scan);
10538             hardcount++;
10539         }
10540         break;
10541 
10542       case NANYOFM_tb:
10543         scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10544                                          (U8) ARG1u(p), FLAGS(p));
10545         break;
10546 
10547       case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10548       case ANYOFHb_tb:
10549       case ANYOFHbbm_tb:
10550       case ANYOFHr_tb:
10551       case ANYOFHs_tb:
10552         break;
10553 
10554       case ANYOFH_t8:
10555         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10556         while (  hardcount < max
10557                && scan < this_eol
10558                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10559                && _invlist_contains_cp(anyofh_list,
10560                                              utf8_to_uvchr_buf((U8 *) scan,
10561                                                                (U8 *) this_eol,
10562                                                                NULL)))
10563         {
10564             scan += UTF8SKIP(scan);
10565             hardcount++;
10566         }
10567         break;
10568 
10569       case ANYOFHb_t8:
10570         /* we know the first byte must be the FLAGS field */
10571         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10572         while (   hardcount < max
10573                && scan < this_eol
10574                && (U8) *scan == ANYOF_FLAGS(p)
10575                && _invlist_contains_cp(anyofh_list,
10576                                              utf8_to_uvchr_buf((U8 *) scan,
10577                                                                (U8 *) this_eol,
10578                                                                NULL)))
10579         {
10580             scan += UTF8SKIP(scan);
10581             hardcount++;
10582         }
10583         break;
10584 
10585       case ANYOFHbbm_t8:
10586         while (   hardcount < max
10587                && scan + 1 < this_eol
10588                && (U8) *scan == ANYOF_FLAGS(p)
10589                && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10590                                 (U8) scan[1] & UTF_CONTINUATION_MASK))
10591         {
10592             scan += 2;  /* This node only matces 2-byte UTF-8 */
10593             hardcount++;
10594         }
10595         break;
10596 
10597       case ANYOFHr_t8:
10598         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10599         while (  hardcount < max
10600                && scan < this_eol
10601                && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10602                           LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10603                           HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10604                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10605                && _invlist_contains_cp(anyofh_list,
10606                                              utf8_to_uvchr_buf((U8 *) scan,
10607                                                                (U8 *) this_eol,
10608                                                                NULL)))
10609         {
10610             scan += UTF8SKIP(scan);
10611             hardcount++;
10612         }
10613         break;
10614 
10615       case ANYOFHs_t8:
10616         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10617         while (   hardcount < max
10618                && scan + FLAGS(p) < this_eol
10619                && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10620                && _invlist_contains_cp(anyofh_list,
10621                                              utf8_to_uvchr_buf((U8 *) scan,
10622                                                                (U8 *) this_eol,
10623                                                                NULL)))
10624         {
10625             scan += UTF8SKIP(scan);
10626             hardcount++;
10627         }
10628         break;
10629 
10630       case ANYOFR_t8:
10631         while (   hardcount < max
10632                && scan < this_eol
10633                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10634                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10635                                             (U8 *) this_eol,
10636                                             NULL),
10637                               ANYOFRbase(p), ANYOFRdelta(p)))
10638         {
10639             scan += UTF8SKIP(scan);
10640             hardcount++;
10641         }
10642         break;
10643 
10644       case ANYOFR_tb:
10645         while (   hardcount < max
10646                && scan < this_eol
10647                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10648         {
10649             scan++;
10650             hardcount++;
10651         }
10652         break;
10653 
10654       case ANYOFRb_t8:
10655         while (   hardcount < max
10656                && scan < this_eol
10657                && (U8) *scan == ANYOF_FLAGS(p)
10658                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10659                                             (U8 *) this_eol,
10660                                             NULL),
10661                               ANYOFRbase(p), ANYOFRdelta(p)))
10662         {
10663             scan += UTF8SKIP(scan);
10664             hardcount++;
10665         }
10666         break;
10667 
10668       case ANYOFRb_tb:
10669         while (   hardcount < max
10670                && scan < this_eol
10671                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10672         {
10673             scan++;
10674             hardcount++;
10675         }
10676         break;
10677 
10678     /* The argument (FLAGS) to all the POSIX node types is the class number */
10679 
10680       case NPOSIXL_tb:
10681         to_complement = 1;
10682         /* FALLTHROUGH */
10683 
10684       case POSIXL_tb:
10685         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10686         while (   scan < this_eol
10687                && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10688         {
10689             scan++;
10690         }
10691         break;
10692 
10693       case NPOSIXL_t8:
10694         to_complement = 1;
10695         /* FALLTHROUGH */
10696 
10697       case POSIXL_t8:
10698         while (   hardcount < max && scan < this_eol
10699                && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10700                                                       (U8 *) scan,
10701                                                       (U8 *) this_eol)))
10702         {
10703             scan += UTF8SKIP(scan);
10704             hardcount++;
10705         }
10706         break;
10707 
10708       case POSIXD_tb:
10709         /* FALLTHROUGH */
10710 
10711       case POSIXA_t8:
10712         if (this_eol - scan > max) {
10713 
10714             /* We didn't adjust <this_eol> at the beginning of this routine
10715              * because is UTF-8, but it is actually ok to do so, since here, to
10716              * match, 1 char == 1 byte. */
10717             this_eol = scan + max;
10718         }
10719         /* FALLTHROUGH */
10720 
10721       case POSIXA_tb:
10722         while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10723             scan++;
10724         }
10725         break;
10726 
10727       case NPOSIXD_tb:
10728         /* FALLTHROUGH */
10729 
10730       case NPOSIXA_tb:
10731         while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10732             scan++;
10733         }
10734         break;
10735 
10736       case NPOSIXA_t8:
10737 
10738         /* The complement of something that matches only ASCII matches all
10739          * non-ASCII, plus everything in ASCII that isn't in the class. */
10740         while (   hardcount < max && scan < this_eol
10741                && (   ! isASCII_utf8_safe(scan, loceol)
10742                    || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10743             {
10744                 scan += UTF8SKIP(scan);
10745                 hardcount++;
10746             }
10747         break;
10748 
10749       case NPOSIXU_tb:
10750         to_complement = 1;
10751         /* FALLTHROUGH */
10752 
10753       case POSIXU_tb:
10754         while (   scan < this_eol
10755                && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10756         {
10757             scan++;
10758         }
10759         break;
10760 
10761       case NPOSIXU_t8:
10762       case NPOSIXD_t8:
10763         to_complement = 1;
10764         /* FALLTHROUGH */
10765 
10766       case POSIXD_t8:
10767       case POSIXU_t8:
10768         classnum = (char_class_number_) FLAGS(p);
10769         switch (classnum) {
10770           default:
10771             while (   hardcount < max && scan < this_eol
10772                    && to_complement
10773                     ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10774                        utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10775             {
10776                 scan += UTF8SKIP(scan);
10777                 hardcount++;
10778             }
10779             break;
10780 
10781             /* For the classes below, the knowledge of how to handle every code
10782              * point is compiled into Perl via a macro.  This code is written
10783              * for making the loops as tight as possible.  It could be
10784              * refactored to save space instead. */
10785 
10786           case CC_ENUM_SPACE_:
10787             while (   hardcount < max
10788                    && scan < this_eol
10789                    && (to_complement
10790                                    ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10791             {
10792                 scan += UTF8SKIP(scan);
10793                 hardcount++;
10794             }
10795             break;
10796           case CC_ENUM_BLANK_:
10797             while (   hardcount < max
10798                    && scan < this_eol
10799                    && (to_complement
10800                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10801             {
10802                 scan += UTF8SKIP(scan);
10803                 hardcount++;
10804             }
10805             break;
10806           case CC_ENUM_XDIGIT_:
10807             while (   hardcount < max
10808                    && scan < this_eol
10809                    && (to_complement
10810                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10811             {
10812                 scan += UTF8SKIP(scan);
10813                 hardcount++;
10814             }
10815             break;
10816           case CC_ENUM_VERTSPACE_:
10817             while (   hardcount < max
10818                    && scan < this_eol
10819                    && (to_complement
10820                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10821             {
10822                 scan += UTF8SKIP(scan);
10823                 hardcount++;
10824             }
10825             break;
10826           case CC_ENUM_CNTRL_:
10827             while (   hardcount < max
10828                    && scan < this_eol
10829                    && (to_complement
10830                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10831             {
10832                 scan += UTF8SKIP(scan);
10833                 hardcount++;
10834             }
10835             break;
10836         }
10837         break;
10838 
10839       case LNBREAK_t8:
10840         while (    hardcount < max && scan < this_eol
10841                && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10842         {
10843             scan += c;
10844             hardcount++;
10845         }
10846         break;
10847 
10848       case LNBREAK_tb:
10849         /* LNBREAK can match one or two latin chars, which is ok, but we have
10850          * to use hardcount in this situation, and throw away the adjustment to
10851          * <this_eol> done before the switch statement */
10852         while (
10853             hardcount < max && scan < loceol
10854             && (c = is_LNBREAK_latin1_safe(scan, loceol))
10855         ) {
10856             scan += c;
10857             hardcount++;
10858         }
10859         break;
10860 
10861     default:
10862         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10863                          " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10864         NOT_REACHED; /* NOTREACHED */
10865 
10866     }
10867 
10868     if (hardcount)
10869         c = hardcount;
10870     else
10871         c = scan - *startposp;
10872     *startposp = scan;
10873 
10874     DEBUG_r({
10875         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10876         DEBUG_EXECUTE_r({
10877             SV * const prop = sv_newmortal();
10878             regprop(prog, prop, p, reginfo, NULL);
10879             Perl_re_exec_indentf( aTHX_
10880                         "%s can match %" IVdf " times out of %" IVdf "...\n",
10881                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10882         });
10883     });
10884 
10885     return(c);
10886 }
10887 
10888 /*
10889  - reginclass - determine if a character falls into a character class
10890 
10891   n is the ANYOF-type regnode
10892   p is the target string
10893   p_end points to one byte beyond the end of the target string
10894   utf8_target tells whether p is in UTF-8.
10895 
10896   Returns true if matched; false otherwise.
10897 
10898   Note that this can be a synthetic start class, a combination of various
10899   nodes, so things you think might be mutually exclusive, such as locale,
10900   aren't.  It can match both locale and non-locale
10901 
10902  */
10903 
10904 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10905 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10906 {
10907     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10908                         ? 0
10909                         : ANYOF_FLAGS(n);
10910     bool match = FALSE;
10911     UV c = *p;
10912 
10913     PERL_ARGS_ASSERT_REGINCLASS;
10914 
10915     /* If c is not already the code point, get it.  Note that
10916      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10917     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10918         STRLEN c_len = 0;
10919         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10920         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10921         if (c_len == (STRLEN)-1) {
10922             _force_out_malformed_utf8_message(p, p_end,
10923                                               utf8n_flags,
10924                                               1 /* 1 means die */ );
10925             NOT_REACHED; /* NOTREACHED */
10926         }
10927         if (     c > 255
10928             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10929             && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10930         {
10931             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10932         }
10933     }
10934 
10935     /* If this character is potentially in the bitmap, check it */
10936     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10937         if (ANYOF_BITMAP_TEST(n, c))
10938             match = TRUE;
10939         else if (  (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10940                  && OP(n) == ANYOFD
10941                  && ! utf8_target
10942                  && ! isASCII(c))
10943         {
10944             match = TRUE;
10945         }
10946         else if (flags & ANYOF_LOCALE_FLAGS) {
10947             if (  (flags & ANYOFL_FOLD)
10948                 && c < 256
10949                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10950             {
10951                 match = TRUE;
10952             }
10953             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10954                      && c <= U8_MAX  /* param to isFOO_lc() */
10955             ) {
10956                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10957                  * if the class includes the Posix character class given by
10958                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10959                  * complemented Posix class given by int(bit/2), so the
10960                  * remainder modulo 2 tells us if to complement or not.
10961                  *
10962                  * Note that this code assumes that all the classes are closed
10963                  * under folding.  For example, if a character matches \w, then
10964                  * its fold does too; and vice versa.  This should be true for
10965                  * any well-behaved locale for all the currently defined Posix
10966                  * classes, except for :lower: and :upper:, which are handled
10967                  * by the pseudo-class :cased: which matches if either of the
10968                  * other two does.  To get rid of this assumption, an outer
10969                  * loop could be used below to iterate over both the source
10970                  * character, and its fold (if different) */
10971 
10972                 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10973 
10974                 do {
10975                     /* Find the next set bit indicating a class to try matching
10976                      * against */
10977                     U8 bit_pos = lsbit_pos32(posixl_bits);
10978 
10979                     if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10980                         match = TRUE;
10981                         break;
10982                     }
10983 
10984                     /* Remove this class from consideration; repeat */
10985                     POSIXL_CLEAR(posixl_bits, bit_pos);
10986                 } while(posixl_bits != 0);
10987             }
10988         }
10989     }
10990 
10991     /* If the bitmap didn't (or couldn't) match, and something outside the
10992      * bitmap could match, try that. */
10993     if (!match) {
10994         if (      c >= NUM_ANYOF_CODE_POINTS
10995             &&    ANYOF_ONLY_HAS_BITMAP(n)
10996             && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10997         {
10998             /* In this case, the ARG is set up so that the final bit indicates
10999              * whether it matches or not */
11000             match = ARG1u(n) & 1;
11001         }
11002         else
11003             /* Here, the main way it could match is if the code point is
11004              * outside the bitmap and an inversion list exists to handle such
11005              * things.  The other ways all occur when a flag is set to indicate
11006              * we need special handling.  That handling doesn't come in to
11007              * effect for ANYOFD nodes unless the target string is UTF-8 and
11008              * that matters to code point being matched. */
11009              if (    c >= NUM_ANYOF_CODE_POINTS
11010                  || (   (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
11011                      && (   UNLIKELY(OP(n) != ANYOFD)
11012                          || (utf8_target && ! isASCII_uvchr(c)
11013 #                               if NUM_ANYOF_CODE_POINTS > 256
11014                                                                && c < 256
11015 #                               endif
11016                                                                          ))))
11017         {
11018             /* Here, we have an inversion list for outside-the-bitmap code
11019              * points and/or the flag is set indicating special handling is
11020              * needed.  The flag is set when there is auxiliary data beyond the
11021              * normal inversion list, or if there is the possibility of a
11022              * special Turkic locale match, even without auxiliary data.
11023              *
11024              * First check if there is an inversion list and/or auxiliary data.
11025              * */
11026             if (ANYOF_HAS_AUX(n)) {
11027                 SV* only_utf8_locale = NULL;
11028 
11029                 /* This call will return in 'definition' the union of the base
11030                  * non-bitmap inversion list, if any, plus the deferred
11031                  * definitions of user-defined properties, if any.  It croaks
11032                  * if there is such a property but which still has no definition
11033                  * available */
11034                 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
11035                                                       &only_utf8_locale, NULL);
11036                 if (definition) {
11037                     /* Most likely is the outside-the-bitmap inversion list. */
11038                     if (_invlist_contains_cp(definition, c)) {
11039                         match = TRUE;
11040                     }
11041                     else /* Failing that, hardcode the two tests for a Turkic
11042                             locale */
11043                          if (   UNLIKELY(IN_UTF8_TURKIC_LOCALE)
11044                              && isALPHA_FOLD_EQ(*p, 'i'))
11045                     {
11046                         /* Turkish locales have these hard-coded rules
11047                          * overriding normal ones */
11048                         if (*p == 'i') {
11049                             if (_invlist_contains_cp(definition,
11050                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11051                             {
11052                                 match = TRUE;
11053                             }
11054                         }
11055                         else if (_invlist_contains_cp(definition,
11056                                                  LATIN_SMALL_LETTER_DOTLESS_I))
11057                         {
11058                             match = TRUE;
11059                         }
11060                     }
11061                 }
11062 
11063                 if (   UNLIKELY(only_utf8_locale)
11064                     && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
11065                     && ! match)
11066                 {
11067                     match = _invlist_contains_cp(only_utf8_locale, c);
11068                 }
11069             }
11070 
11071             /* In a Turkic locale under folding, hard-code the I i case pair
11072              * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
11073              * flag set unless [Ii] were match possibilities */
11074             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) {
11075                 if (utf8_target) {
11076                     if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11077                         if (ANYOF_BITMAP_TEST(n, 'i')) {
11078                             match = TRUE;
11079                         }
11080                     }
11081                     else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
11082                         if (ANYOF_BITMAP_TEST(n, 'I')) {
11083                             match = TRUE;
11084                         }
11085                     }
11086                 }
11087 
11088 #if NUM_ANYOF_CODE_POINTS > 256
11089                 /* Larger bitmap means these special cases aren't handled
11090                  * outside the bitmap above. */
11091                 if (*p == 'i') {
11092                     if (ANYOF_BITMAP_TEST(n,
11093                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11094                     {
11095                         match = TRUE;
11096                     }
11097                 }
11098                 else if (*p == 'I') {
11099                     if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
11100                         match = TRUE;
11101                     }
11102                 }
11103 #endif
11104             }
11105         }
11106 
11107         if (   UNICODE_IS_SUPER(c)
11108             && (flags & ANYOF_WARN_SUPER__shared)
11109             && OP(n) != ANYOFD
11110             && ckWARN_d(WARN_NON_UNICODE))
11111         {
11112             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
11113                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
11114         }
11115     }
11116 
11117 #if ANYOF_INVERT != 1
11118     /* Depending on compiler optimization cBOOL takes time, so if don't have to
11119      * use it, don't */
11120 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
11121 #endif
11122 
11123     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
11124     return (flags & ANYOF_INVERT) ^ match;
11125 }
11126 
11127 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)11128 S_reghop3(U8 *s, SSize_t off, const U8* lim)
11129 {
11130     /* return the position 'off' UTF-8 characters away from 's', forward if
11131      * 'off' >= 0, backwards if negative.  But don't go outside of position
11132      * 'lim', which better be < s  if off < 0 */
11133 
11134     PERL_ARGS_ASSERT_REGHOP3;
11135 
11136     if (off >= 0) {
11137         while (off-- && s < lim) {
11138             /* XXX could check well-formedness here */
11139             U8 *new_s = s + UTF8SKIP(s);
11140             if (new_s > lim) /* lim may be in the middle of a long character */
11141                 return s;
11142             s = new_s;
11143         }
11144     }
11145     else {
11146         while (off++ && s > lim) {
11147             s--;
11148             if (UTF8_IS_CONTINUED(*s)) {
11149                 while (s > lim && UTF8_IS_CONTINUATION(*s))
11150                     s--;
11151                 if (! UTF8_IS_START(*s)) {
11152                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11153                 }
11154             }
11155             /* XXX could check well-formedness here */
11156         }
11157     }
11158     return s;
11159 }
11160 
11161 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)11162 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
11163 {
11164     PERL_ARGS_ASSERT_REGHOP4;
11165 
11166     return utf8_hop_safe(s, off, llim, rlim);
11167 }
11168 
11169 /* like reghop3, but returns NULL on overrun, rather than returning last
11170  * char pos */
11171 
11172 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)11173 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
11174 {
11175     PERL_ARGS_ASSERT_REGHOPMAYBE3;
11176 
11177     if (off >= 0) {
11178         while (off-- && s < lim) {
11179             /* XXX could check well-formedness here */
11180             s += UTF8SKIP(s);
11181         }
11182         if (off >= 0)
11183             return NULL;
11184     }
11185     else {
11186         while (off++ && s > lim) {
11187             s--;
11188             if (UTF8_IS_CONTINUED(*s)) {
11189                 while (s > lim && UTF8_IS_CONTINUATION(*s))
11190                     s--;
11191                 if (! UTF8_IS_START(*s)) {
11192                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11193                 }
11194             }
11195             /* XXX could check well-formedness here */
11196         }
11197         if (off <= 0)
11198             return NULL;
11199     }
11200     return s;
11201 }
11202 
11203 
11204 /* when executing a regex that may have (?{}), extra stuff needs setting
11205    up that will be visible to the called code, even before the current
11206    match has finished. In particular:
11207 
11208    * $_ is localised to the SV currently being matched;
11209    * pos($_) is created if necessary, ready to be updated on each call-out
11210      to code;
11211    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11212      isn't set until the current pattern is successfully finished), so that
11213      $1 etc of the match-so-far can be seen;
11214    * save the old values of subbeg etc of the current regex, and  set then
11215      to the current string (again, this is normally only done at the end
11216      of execution)
11217 */
11218 
11219 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)11220 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11221 {
11222     MAGIC *mg;
11223     regexp *const rex = ReANY(reginfo->prog);
11224     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11225 
11226     eval_state->rex = rex;
11227     eval_state->sv  = reginfo->sv;
11228 
11229     if (reginfo->sv) {
11230         /* Make $_ available to executed code. */
11231         if (reginfo->sv != DEFSV) {
11232             SAVE_DEFSV;
11233             DEFSV_set(reginfo->sv);
11234         }
11235         /* will be dec'd by S_cleanup_regmatch_info_aux */
11236         SvREFCNT_inc_NN(reginfo->sv);
11237 
11238         if (!(mg = mg_find_mglob(reginfo->sv))) {
11239             /* prepare for quick setting of pos */
11240             mg = sv_magicext_mglob(reginfo->sv);
11241             mg->mg_len = -1;
11242         }
11243         eval_state->pos_magic = mg;
11244         eval_state->pos       = mg->mg_len;
11245         eval_state->pos_flags = mg->mg_flags;
11246     }
11247     else
11248         eval_state->pos_magic = NULL;
11249 
11250     if (!PL_reg_curpm) {
11251         /* PL_reg_curpm is a fake PMOP that we can attach the current
11252          * regex to and point PL_curpm at, so that $1 et al are visible
11253          * within a /(?{})/. It's just allocated once per interpreter the
11254          * first time its needed */
11255         Newxz(PL_reg_curpm, 1, PMOP);
11256 #ifdef USE_ITHREADS
11257         {
11258             SV* const repointer = &PL_sv_undef;
11259             /* this regexp is also owned by the new PL_reg_curpm, which
11260                will try to free it.  */
11261             av_push(PL_regex_padav, repointer);
11262             PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11263             PL_regex_pad = AvARRAY(PL_regex_padav);
11264         }
11265 #endif
11266     }
11267     SET_reg_curpm(reginfo->prog);
11268     eval_state->curpm = PL_curpm;
11269     PL_curpm_under = PL_curpm;
11270     PL_curpm = PL_reg_curpm;
11271     if (RXp_MATCH_COPIED(rex)) {
11272         /*  Here is a serious problem: we cannot rewrite subbeg,
11273             since it may be needed if this match fails.  Thus
11274             $` inside (?{}) could fail... */
11275         eval_state->subbeg     = RXp_SUBBEG(rex);
11276         eval_state->sublen     = RXp_SUBLEN(rex);
11277         eval_state->suboffset  = RXp_SUBOFFSET(rex);
11278         eval_state->subcoffset = RXp_SUBCOFFSET(rex);
11279 #ifdef PERL_ANY_COW
11280         eval_state->saved_copy = RXp_SAVED_COPY(rex);
11281 #endif
11282         RXp_MATCH_COPIED_off(rex);
11283     }
11284     else
11285         eval_state->subbeg = NULL;
11286     RXp_SUBBEG(rex) = (char *)reginfo->strbeg;
11287     RXp_SUBOFFSET(rex) = 0;
11288     RXp_SUBCOFFSET(rex) = 0;
11289     RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg;
11290 }
11291 
11292 
11293 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11294 
11295 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)11296 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11297 {
11298     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11299     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
11300     regmatch_slab *s;
11301 
11302     Safefree(aux->poscache);
11303 
11304     if (eval_state) {
11305 
11306         /* undo the effects of S_setup_eval_state() */
11307 
11308         if (eval_state->subbeg) {
11309             regexp * const rex = eval_state->rex;
11310             RXp_SUBBEG(rex) = eval_state->subbeg;
11311             RXp_SUBLEN(rex)     = eval_state->sublen;
11312             RXp_SUBOFFSET(rex)  = eval_state->suboffset;
11313             RXp_SUBCOFFSET(rex) = eval_state->subcoffset;
11314 #ifdef PERL_ANY_COW
11315             RXp_SAVED_COPY(rex) = eval_state->saved_copy;
11316 #endif
11317             RXp_MATCH_COPIED_on(rex);
11318         }
11319         if (eval_state->pos_magic)
11320         {
11321             eval_state->pos_magic->mg_len = eval_state->pos;
11322             eval_state->pos_magic->mg_flags =
11323                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11324                | (eval_state->pos_flags & MGf_BYTES);
11325         }
11326 
11327         PL_curpm = eval_state->curpm;
11328         SvREFCNT_dec(eval_state->sv);
11329     }
11330 
11331     PL_regmatch_state = aux->old_regmatch_state;
11332     PL_regmatch_slab  = aux->old_regmatch_slab;
11333 
11334     /* free all slabs above current one - this must be the last action
11335      * of this function, as aux and eval_state are allocated within
11336      * slabs and may be freed here */
11337 
11338     s = PL_regmatch_slab->next;
11339     if (s) {
11340         PL_regmatch_slab->next = NULL;
11341         while (s) {
11342             regmatch_slab * const osl = s;
11343             s = s->next;
11344             Safefree(osl);
11345         }
11346     }
11347 }
11348 
11349 
11350 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)11351 S_to_utf8_substr(pTHX_ regexp *prog)
11352 {
11353     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11354      * on the converted value */
11355 
11356     int i = 1;
11357 
11358     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11359 
11360     do {
11361         if (prog->substrs->data[i].substr
11362             && !prog->substrs->data[i].utf8_substr) {
11363             SV* const sv = newSVsv(prog->substrs->data[i].substr);
11364             prog->substrs->data[i].utf8_substr = sv;
11365             sv_utf8_upgrade(sv);
11366             if (SvVALID(prog->substrs->data[i].substr)) {
11367                 if (SvTAIL(prog->substrs->data[i].substr)) {
11368                     /* Trim the trailing \n that fbm_compile added last
11369                        time.  */
11370                     SvCUR_set(sv, SvCUR(sv) - 1);
11371                     /* Whilst this makes the SV technically "invalid" (as its
11372                        buffer is no longer followed by "\0") when fbm_compile()
11373                        adds the "\n" back, a "\0" is restored.  */
11374                     fbm_compile(sv, FBMcf_TAIL);
11375                 } else
11376                     fbm_compile(sv, 0);
11377             }
11378             if (prog->substrs->data[i].substr == prog->check_substr)
11379                 prog->check_utf8 = sv;
11380         }
11381     } while (i--);
11382 }
11383 
11384 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11385 S_to_byte_substr(pTHX_ regexp *prog)
11386 {
11387     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11388      * on the converted value; returns FALSE if can't be converted. */
11389 
11390     int i = 1;
11391 
11392     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11393 
11394     do {
11395         if (prog->substrs->data[i].utf8_substr
11396             && !prog->substrs->data[i].substr) {
11397             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11398             if (! sv_utf8_downgrade(sv, TRUE)) {
11399                 SvREFCNT_dec_NN(sv);
11400                 return FALSE;
11401             }
11402             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11403                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11404                     /* Trim the trailing \n that fbm_compile added last
11405                         time.  */
11406                     SvCUR_set(sv, SvCUR(sv) - 1);
11407                     fbm_compile(sv, FBMcf_TAIL);
11408                 } else
11409                     fbm_compile(sv, 0);
11410             }
11411             prog->substrs->data[i].substr = sv;
11412             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11413                 prog->check_substr = sv;
11414         }
11415     } while (i--);
11416 
11417     return TRUE;
11418 }
11419 
11420 #ifndef PERL_IN_XSUB_RE
11421 
11422 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11423 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11424 {
11425     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11426      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11427      * the larger string bounded by 'strbeg' and 'strend'.
11428      *
11429      * 'cp' needs to be assigned (if not, a future version of the Unicode
11430      * Standard could make it something that combines with adjacent characters,
11431      * so code using it would then break), and there has to be a GCB break
11432      * before and after the character. */
11433 
11434 
11435     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11436     const U8 * prev_cp_start;
11437 
11438     PERL_ARGS_ASSERT_IS_GRAPHEME;
11439 
11440     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11441         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11442     {
11443         /* These are considered graphemes */
11444         return TRUE;
11445     }
11446 
11447     /* Otherwise, unassigned code points are forbidden */
11448     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11449                                     _invlist_search(PL_Assigned_invlist, cp))))
11450     {
11451         return FALSE;
11452     }
11453 
11454     cp_gcb_val = getGCB_VAL_CP(cp);
11455 
11456     /* Find the GCB value of the previous code point in the input */
11457     prev_cp_start = utf8_hop_back(s, -1, strbeg);
11458     if (UNLIKELY(prev_cp_start == s)) {
11459         prev_cp_gcb_val = GCB_EDGE;
11460     }
11461     else {
11462         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11463     }
11464 
11465     /* And check that is a grapheme boundary */
11466     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11467                 TRUE /* is UTF-8 encoded */ ))
11468     {
11469         return FALSE;
11470     }
11471 
11472     /* Similarly verify there is a break between the current character and the
11473      * following one */
11474     s += UTF8SKIP(s);
11475     if (s >= strend) {
11476         next_cp_gcb_val = GCB_EDGE;
11477     }
11478     else {
11479         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11480     }
11481 
11482     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11483 }
11484 
11485 /*
11486 =for apidoc_section $unicode
11487 
11488 =for apidoc isSCRIPT_RUN
11489 
11490 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11491 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11492 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11493 two degenerate cases given below, this function returns TRUE iff all code
11494 points in it come from any combination of three "scripts" given by the Unicode
11495 "Script Extensions" property: Common, Inherited, and possibly one other.
11496 Additionally all decimal digits must come from the same consecutive sequence of
11497 10.
11498 
11499 For example, if all the characters in the sequence are Greek, or Common, or
11500 Inherited, this function will return TRUE, provided any decimal digits in it
11501 are from the same block of digits in Common.  (These are the ASCII digits
11502 "0".."9" and additionally a block for full width forms of these, and several
11503 others used in mathematical notation.)   For scripts (unlike Greek) that have
11504 their own digits defined this will accept either digits from that set or from
11505 one of the Common digit sets, but not a combination of the two.  Some scripts,
11506 such as Arabic, have more than one set of digits.  All digits must come from
11507 the same set for this function to return TRUE.
11508 
11509 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11510 contain the script found, using the C<SCX_enum> typedef.  Its value will be
11511 C<SCX_INVALID> if the function returns FALSE.
11512 
11513 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11514 will be C<SCX_INVALID>.
11515 
11516 If the sequence contains a single code point which is unassigned to a character
11517 in the version of Unicode being used, the function will return TRUE, and the
11518 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11519 in the input sequence will result in the function treating the input as not
11520 being a script run.
11521 
11522 The returned script will be C<SCX_Inherited> iff all the code points in it are
11523 from the Inherited script.
11524 
11525 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11526 it are from the Inherited or Common scripts.
11527 
11528 =cut
11529 
11530 */
11531 
11532 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11533 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11534 {
11535     /* Basically, it looks at each character in the sequence to see if the
11536      * above conditions are met; if not it fails.  It uses an inversion map to
11537      * find the enum corresponding to the script of each character.  But this
11538      * is complicated by the fact that a few code points can be in any of
11539      * several scripts.  The data has been constructed so that there are
11540      * additional enum values (all negative) for these situations.  The
11541      * absolute value of those is an index into another table which contains
11542      * pointers to auxiliary tables for each such situation.  Each aux array
11543      * lists all the scripts for the given situation.  There is another,
11544      * parallel, table that gives the number of entries in each aux table.
11545      * These are all defined in charclass_invlists.h */
11546 
11547     /* XXX Here are the additional things UTS 39 says could be done:
11548      *
11549      * Forbid sequences of the same nonspacing mark
11550      *
11551      * Check to see that all the characters are in the sets of exemplar
11552      * characters for at least one language in the Unicode Common Locale Data
11553      * Repository [CLDR]. */
11554 
11555 
11556     /* Things that match /\d/u */
11557     SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11558     UV * decimals_array = invlist_array(decimals_invlist);
11559 
11560     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11561      * not currently known) */
11562     UV zero_of_run = 0;
11563 
11564     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11565     SCX_enum script_of_char = SCX_INVALID;
11566 
11567     /* If the script remains not fully determined from iteration to iteration,
11568      * this is the current intersection of the possiblities.  */
11569     SCX_enum * intersection = NULL;
11570     PERL_UINT_FAST8_T intersection_len = 0;
11571 
11572     bool retval = TRUE;
11573     SCX_enum * ret_script = NULL;
11574 
11575     assert(send >= s);
11576 
11577     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11578 
11579     /* All code points in 0..255 are either Common or Latin, so must be a
11580      * script run.  We can return immediately unless we need to know which
11581      * script it is. */
11582     if (! utf8_target && LIKELY(send > s)) {
11583         if (ret_script == NULL) {
11584             return TRUE;
11585         }
11586 
11587         /* If any character is Latin, the run is Latin */
11588         while (s < send) {
11589             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11590                 *ret_script = SCX_Latin;
11591                 return TRUE;
11592             }
11593         }
11594 
11595         /* Here, all are Common */
11596         *ret_script = SCX_Common;
11597         return TRUE;
11598     }
11599 
11600     /* Look at each character in the sequence */
11601     while (s < send) {
11602         /* If the current character being examined is a digit, this is the code
11603          * point of the zero for its sequence of 10 */
11604         UV zero_of_char;
11605 
11606         UV cp;
11607 
11608         /* The code allows all scripts to use the ASCII digits.  This is
11609          * because they are in the Common script.  Hence any ASCII ones found
11610          * are ok, unless and until a digit from another set has already been
11611          * encountered.  digit ranges in Common are not similarly blessed) */
11612         if (UNLIKELY(isDIGIT(*s))) {
11613             if (UNLIKELY(script_of_run == SCX_Unknown)) {
11614                 retval = FALSE;
11615                 break;
11616             }
11617             if (zero_of_run) {
11618                 if (zero_of_run != '0') {
11619                     retval = FALSE;
11620                     break;
11621                 }
11622             }
11623             else {
11624                 zero_of_run = '0';
11625             }
11626             s++;
11627             continue;
11628         }
11629 
11630         /* Here, isn't an ASCII digit.  Find the code point of the character */
11631         if (! UTF8_IS_INVARIANT(*s)) {
11632             Size_t len;
11633             cp = valid_utf8_to_uvchr((U8 *) s, &len);
11634             s += len;
11635         }
11636         else {
11637             cp = *(s++);
11638         }
11639 
11640         /* If is within the range [+0 .. +9] of the script's zero, it also is a
11641          * digit in that script.  We can skip the rest of this code for this
11642          * character. */
11643         if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11644             continue;
11645         }
11646 
11647         /* Find the character's script.  The correct values are hard-coded here
11648          * for small-enough code points. */
11649         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11650                                unlikely to change */
11651             if (       cp > 255
11652                 || (   isALPHA_L1(cp)
11653                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
11654             {
11655                 script_of_char = SCX_Latin;
11656             }
11657             else {
11658                 script_of_char = SCX_Common;
11659             }
11660         }
11661         else {
11662             script_of_char = _Perl_SCX_invmap[
11663                                        _invlist_search(PL_SCX_invlist, cp)];
11664         }
11665 
11666         /* We arbitrarily accept a single unassigned character, but not in
11667          * combination with anything else, and not a run of them. */
11668         if (   UNLIKELY(script_of_run == SCX_Unknown)
11669             || UNLIKELY(   script_of_run != SCX_INVALID
11670                         && script_of_char == SCX_Unknown))
11671         {
11672             retval = FALSE;
11673             break;
11674         }
11675 
11676         /* For the first character, or the run is inherited, the run's script
11677          * is set to the char's */
11678         if (   UNLIKELY(script_of_run == SCX_INVALID)
11679             || UNLIKELY(script_of_run == SCX_Inherited))
11680         {
11681             script_of_run = script_of_char;
11682         }
11683 
11684         /* For the character's script to be Unknown, it must be the first
11685          * character in the sequence (for otherwise a test above would have
11686          * prevented us from reaching here), and we have set the run's script
11687          * to it.  Nothing further to be done for this character */
11688         if (UNLIKELY(script_of_char == SCX_Unknown)) {
11689             continue;
11690         }
11691 
11692         /* We accept 'inherited' script characters currently even at the
11693          * beginning.  (We know that no characters in Inherited are digits, or
11694          * we'd have to check for that) */
11695         if (UNLIKELY(script_of_char == SCX_Inherited)) {
11696             continue;
11697         }
11698 
11699         /* If the run so far is Common, and the new character isn't, change the
11700          * run's script to that of this character */
11701         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11702             script_of_run = script_of_char;
11703         }
11704 
11705         /* Now we can see if the script of the new character is the same as
11706          * that of the run */
11707         if (LIKELY(script_of_char == script_of_run)) {
11708             /* By far the most common case */
11709             goto scripts_match;
11710         }
11711 
11712         /* Here, the script of the run isn't Common.  But characters in Common
11713          * match any script */
11714         if (script_of_char == SCX_Common) {
11715             goto scripts_match;
11716         }
11717 
11718 #ifndef HAS_SCX_AUX_TABLES
11719 
11720         /* Too early a Unicode version to have a code point belonging to more
11721          * than one script, so, if the scripts don't exactly match, fail */
11722         PERL_UNUSED_VAR(intersection_len);
11723         retval = FALSE;
11724         break;
11725 
11726 #else
11727 
11728         /* Here there is no exact match between the character's script and the
11729          * run's.  And we've handled the special cases of scripts Unknown,
11730          * Inherited, and Common.
11731          *
11732          * Negative script numbers signify that the value may be any of several
11733          * scripts, and we need to look at auxiliary information to make our
11734          * determination.  But if both are non-negative, we can fail now */
11735         if (LIKELY(script_of_char >= 0)) {
11736             const SCX_enum * search_in;
11737             PERL_UINT_FAST8_T search_in_len;
11738             PERL_UINT_FAST8_T i;
11739 
11740             if (LIKELY(script_of_run >= 0)) {
11741                 retval = FALSE;
11742                 break;
11743             }
11744 
11745             /* Use the previously constructed set of possible scripts, if any.
11746              * */
11747             if (intersection) {
11748                 search_in = intersection;
11749                 search_in_len = intersection_len;
11750             }
11751             else {
11752                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11753                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11754             }
11755 
11756             for (i = 0; i < search_in_len; i++) {
11757                 if (search_in[i] == script_of_char) {
11758                     script_of_run = script_of_char;
11759                     goto scripts_match;
11760                 }
11761             }
11762 
11763             retval = FALSE;
11764             break;
11765         }
11766         else if (LIKELY(script_of_run >= 0)) {
11767             /* script of character could be one of several, but run is a single
11768              * script */
11769             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11770             const PERL_UINT_FAST8_T search_in_len
11771                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11772             PERL_UINT_FAST8_T i;
11773 
11774             for (i = 0; i < search_in_len; i++) {
11775                 if (search_in[i] == script_of_run) {
11776                     script_of_char = script_of_run;
11777                     goto scripts_match;
11778                 }
11779             }
11780 
11781             retval = FALSE;
11782             break;
11783         }
11784         else {
11785             /* Both run and char could be in one of several scripts.  If the
11786              * intersection is empty, then this character isn't in this script
11787              * run.  Otherwise, we need to calculate the intersection to use
11788              * for future iterations of the loop, unless we are already at the
11789              * final character */
11790             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11791             const PERL_UINT_FAST8_T char_len
11792                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11793             const SCX_enum * search_run;
11794             PERL_UINT_FAST8_T run_len;
11795 
11796             SCX_enum * new_overlap = NULL;
11797             PERL_UINT_FAST8_T i, j;
11798 
11799             if (intersection) {
11800                 search_run = intersection;
11801                 run_len = intersection_len;
11802             }
11803             else {
11804                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11805                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11806             }
11807 
11808             intersection_len = 0;
11809 
11810             for (i = 0; i < run_len; i++) {
11811                 for (j = 0; j < char_len; j++) {
11812                     if (search_run[i] == search_char[j]) {
11813 
11814                         /* Here, the script at i,j matches.  That means this
11815                          * character is in the run.  But continue on to find
11816                          * the complete intersection, for the next loop
11817                          * iteration, and for the digit check after it.
11818                          *
11819                          * On the first found common script, we malloc space
11820                          * for the intersection list for the worst case of the
11821                          * intersection, which is the minimum of the number of
11822                          * scripts remaining in each set. */
11823                         if (intersection_len == 0) {
11824                             Newx(new_overlap,
11825                                  MIN(run_len - i, char_len - j),
11826                                  SCX_enum);
11827                         }
11828                         new_overlap[intersection_len++] = search_run[i];
11829                     }
11830                 }
11831             }
11832 
11833             /* Here we've looked through everything.  If they have no scripts
11834              * in common, not a run */
11835             if (intersection_len == 0) {
11836                 retval = FALSE;
11837                 break;
11838             }
11839 
11840             /* If there is only a single script in common, set to that.
11841              * Otherwise, use the intersection going forward */
11842             Safefree(intersection);
11843             intersection = NULL;
11844             if (intersection_len == 1) {
11845                 script_of_run = script_of_char = new_overlap[0];
11846                 Safefree(new_overlap);
11847                 new_overlap = NULL;
11848             }
11849             else {
11850                 intersection = new_overlap;
11851             }
11852         }
11853 
11854 #endif
11855 
11856   scripts_match:
11857 
11858         /* Here, the script of the character is compatible with that of the
11859          * run.  That means that in most cases, it continues the script run.
11860          * Either it and the run match exactly, or one or both can be in any of
11861          * several scripts, and the intersection is not empty.  However, if the
11862          * character is a decimal digit, it could still mean failure if it is
11863          * from the wrong sequence of 10.  So, we need to look at if it's a
11864          * digit.  We've already handled the 10 digits [0-9], and the next
11865          * lowest one is this one: */
11866         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11867             continue;   /* Not a digit; this character is part of the run */
11868         }
11869 
11870         /* If we have a definitive '0' for the script of this character, we
11871          * know that for this to be a digit, it must be in the range of +0..+9
11872          * of that zero. */
11873         if (   script_of_char >= 0
11874             && (zero_of_char = script_zeros[script_of_char]))
11875         {
11876             if (! withinCOUNT(cp, zero_of_char, 9)) {
11877                 continue;   /* Not a digit; this character is part of the run
11878                              */
11879             }
11880 
11881         }
11882         else {  /* Need to look up if this character is a digit or not */
11883             SSize_t index_of_zero_of_char;
11884             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11885             if (     UNLIKELY(index_of_zero_of_char < 0)
11886                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11887             {
11888                 continue;   /* Not a digit; this character is part of the run.
11889                              */
11890             }
11891 
11892             zero_of_char = decimals_array[index_of_zero_of_char];
11893         }
11894 
11895         /* Here, the character is a decimal digit, and the zero of its sequence
11896          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11897          * they better be the same. */
11898         if (zero_of_run) {
11899             if (zero_of_run != zero_of_char) {
11900                 retval = FALSE;
11901                 break;
11902             }
11903         }
11904         else {  /* Otherwise we now have a zero for this run */
11905             zero_of_run = zero_of_char;
11906         }
11907     } /* end of looping through CLOSESR text */
11908 
11909     Safefree(intersection);
11910 
11911     if (ret_script != NULL) {
11912         if (retval) {
11913             *ret_script = script_of_run;
11914         }
11915         else {
11916             *ret_script = SCX_INVALID;
11917         }
11918     }
11919 
11920     return retval;
11921 }
11922 #endif /* ifndef PERL_IN_XSUB_RE */
11923 
11924 /* Buffer logic. */
11925 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)11926 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
11927                     const U32 flags)
11928 {
11929     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
11930 
11931     PERL_UNUSED_ARG(value);
11932 
11933     if (flags & RXapif_FETCH) {
11934         return reg_named_buff_fetch(rx, key, flags);
11935     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
11936         Perl_croak_no_modify();
11937         return NULL;
11938     } else if (flags & RXapif_EXISTS) {
11939         return reg_named_buff_exists(rx, key, flags)
11940             ? &PL_sv_yes
11941             : &PL_sv_no;
11942     } else if (flags & RXapif_REGNAMES) {
11943         return reg_named_buff_all(rx, flags);
11944     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
11945         return reg_named_buff_scalar(rx, flags);
11946     } else {
11947         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
11948         return NULL;
11949     }
11950 }
11951 
11952 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)11953 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
11954                          const U32 flags)
11955 {
11956     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
11957     PERL_UNUSED_ARG(lastkey);
11958 
11959     if (flags & RXapif_FIRSTKEY)
11960         return reg_named_buff_firstkey(rx, flags);
11961     else if (flags & RXapif_NEXTKEY)
11962         return reg_named_buff_nextkey(rx, flags);
11963     else {
11964         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
11965                                             (int)flags);
11966         return NULL;
11967     }
11968 }
11969 
11970 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)11971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
11972                           const U32 flags)
11973 {
11974     SV *ret;
11975     struct regexp *const rx = ReANY(r);
11976 
11977     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
11978 
11979     if (rx && RXp_PAREN_NAMES(rx)) {
11980         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
11981         if (he_str) {
11982             IV i;
11983             SV* sv_dat=HeVAL(he_str);
11984             I32 *nums=(I32*)SvPVX(sv_dat);
11985             AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
11986             for ( i=0; i<SvIVX(sv_dat); i++ ) {
11987                 if ((I32)(rx->nparens) >= nums[i]
11988                     && RXp_OFFS_VALID(rx,nums[i]))
11989                 {
11990                     ret = newSVpvs("");
11991                     Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE);
11992                     if (!retarray)
11993                         return ret;
11994                 } else {
11995                     if (retarray)
11996                         ret = newSV_type(SVt_NULL);
11997                 }
11998                 if (retarray)
11999                     av_push_simple(retarray, ret);
12000             }
12001             if (retarray)
12002                 return newRV_noinc(MUTABLE_SV(retarray));
12003         }
12004     }
12005     return NULL;
12006 }
12007 
12008 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)12009 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
12010                            const U32 flags)
12011 {
12012     struct regexp *const rx = ReANY(r);
12013 
12014     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
12015 
12016     if (rx && RXp_PAREN_NAMES(rx)) {
12017         if (flags & RXapif_ALL) {
12018             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
12019         } else {
12020             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
12021             if (sv) {
12022                 SvREFCNT_dec_NN(sv);
12023                 return TRUE;
12024             } else {
12025                 return FALSE;
12026             }
12027         }
12028     } else {
12029         return FALSE;
12030     }
12031 }
12032 
12033 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)12034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
12035 {
12036     struct regexp *const rx = ReANY(r);
12037 
12038     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
12039 
12040     if ( rx && RXp_PAREN_NAMES(rx) ) {
12041         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
12042 
12043         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
12044     } else {
12045         return FALSE;
12046     }
12047 }
12048 
12049 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)12050 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
12051 {
12052     struct regexp *const rx = ReANY(r);
12053     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12054 
12055     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
12056 
12057     if (rx && RXp_PAREN_NAMES(rx)) {
12058         HV *hv = RXp_PAREN_NAMES(rx);
12059         HE *temphe;
12060         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12061             IV i;
12062             IV parno = 0;
12063             SV* sv_dat = HeVAL(temphe);
12064             I32 *nums = (I32*)SvPVX(sv_dat);
12065             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12066                 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12067                     RXp_OFFS_VALID(rx,nums[i]))
12068                 {
12069                     parno = nums[i];
12070                     break;
12071                 }
12072             }
12073             if (parno || flags & RXapif_ALL) {
12074                 return newSVhek(HeKEY_hek(temphe));
12075             }
12076         }
12077     }
12078     return NULL;
12079 }
12080 
12081 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)12082 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
12083 {
12084     SV *ret;
12085     AV *av;
12086     SSize_t length;
12087     struct regexp *const rx = ReANY(r);
12088 
12089     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
12090 
12091     if (rx && RXp_PAREN_NAMES(rx)) {
12092         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
12093             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
12094         } else if (flags & RXapif_ONE) {
12095             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
12096             av = MUTABLE_AV(SvRV(ret));
12097             length = av_count(av);
12098             SvREFCNT_dec_NN(ret);
12099             return newSViv(length);
12100         } else {
12101             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
12102                                                 (int)flags);
12103             return NULL;
12104         }
12105     }
12106     return &PL_sv_undef;
12107 }
12108 
12109 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)12110 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
12111 {
12112     struct regexp *const rx = ReANY(r);
12113     AV *av = newAV();
12114 
12115     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
12116 
12117     if (rx && RXp_PAREN_NAMES(rx)) {
12118         HV *hv= RXp_PAREN_NAMES(rx);
12119         HE *temphe;
12120         (void)hv_iterinit(hv);
12121         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12122             IV i;
12123             IV parno = 0;
12124             SV* sv_dat = HeVAL(temphe);
12125             I32 *nums = (I32*)SvPVX(sv_dat);
12126             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12127                 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12128                     RXp_OFFS_VALID(rx,nums[i]))
12129                 {
12130                     parno = nums[i];
12131                     break;
12132                 }
12133             }
12134             if (parno || flags & RXapif_ALL) {
12135                 av_push_simple(av, newSVhek(HeKEY_hek(temphe)));
12136             }
12137         }
12138     }
12139 
12140     return newRV_noinc(MUTABLE_SV(av));
12141 }
12142 
12143 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re,const I32 paren,SV * const sv)12144 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren,
12145                              SV * const sv)
12146 {
12147     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
12148     Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0);
12149 }
12150 
12151 #ifndef PERL_IN_XSUB_RE
12152 
12153 void
Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re,const I32 paren,SV * const sv,U32 flags)12154 Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren,
12155                                    SV * const sv, U32 flags)
12156 {
12157     struct regexp *const rx = ReANY(re);
12158     char *s = NULL;
12159     SSize_t i,t = 0;
12160     SSize_t s1, t1;
12161     I32 n = paren;
12162     I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12163 
12164     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS;
12165 
12166     if (      n == RX_BUFF_IDX_CARET_PREMATCH
12167            || n == RX_BUFF_IDX_CARET_FULLMATCH
12168            || n == RX_BUFF_IDX_CARET_POSTMATCH
12169        )
12170     {
12171         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12172         if (!keepcopy) {
12173             /* on something like
12174              *    $r = qr/.../;
12175              *    /$qr/p;
12176              * the KEEPCOPY is set on the PMOP rather than the regex */
12177             if (PL_curpm && re == PM_GETRE(PL_curpm))
12178                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12179         }
12180         if (!keepcopy)
12181             goto ret_undef;
12182     }
12183 
12184     if (!RXp_SUBBEG(rx))
12185         goto ret_undef;
12186 
12187     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
12188         /* no need to distinguish between them any more */
12189         n = RX_BUFF_IDX_FULLMATCH;
12190 
12191     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
12192         && (i = RXp_OFFS_START(rx,0)) != -1)
12193     {
12194         /* $`, ${^PREMATCH} */
12195         s = RXp_SUBBEG(rx);
12196     }
12197     else
12198     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
12199         && (t = RXp_OFFS_END(rx,0)) != -1)
12200     {
12201         /* $', ${^POSTMATCH} */
12202         s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t;
12203         i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t;
12204     }
12205     else /* when flags is true we do an absolute lookup, and compare against rx->nparens */
12206     if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) {
12207         I32 *map = (!flags && n) ? rx->logical_to_parno : NULL;
12208         I32 true_parno = map ? map[n] : n;
12209         do {
12210             if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1)  &&
12211                 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1))
12212             {
12213                 /* $&, ${^MATCH}, $1 ... */
12214                 i = t1 - s1;
12215                 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx);
12216                 goto found_it;
12217             }
12218             else if (map) {
12219                 true_parno = rx->parno_to_logical_next[true_parno];
12220             }
12221             else {
12222                 break;
12223             }
12224         } while (true_parno);
12225         goto ret_undef;
12226     } else {
12227         goto ret_undef;
12228     }
12229 
12230   found_it:
12231     assert(s >= RXp_SUBBEG(rx));
12232     assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) );
12233     if (i >= 0) {
12234 #ifdef NO_TAINT_SUPPORT
12235         sv_setpvn(sv, s, i);
12236 #else
12237         const int oldtainted = TAINT_get;
12238         TAINT_NOT;
12239         sv_setpvn(sv, s, i);
12240         TAINT_set(oldtainted);
12241 #endif
12242         if (RXp_MATCH_UTF8(rx))
12243             SvUTF8_on(sv);
12244         else
12245             SvUTF8_off(sv);
12246         if (TAINTING_get) {
12247             if (RXp_MATCH_TAINTED(rx)) {
12248                 if (SvTYPE(sv) >= SVt_PVMG) {
12249                     MAGIC* const mg = SvMAGIC(sv);
12250                     MAGIC* mgt;
12251                     TAINT;
12252                     SvMAGIC_set(sv, mg->mg_moremagic);
12253                     SvTAINT(sv);
12254                     if ((mgt = SvMAGIC(sv))) {
12255                         mg->mg_moremagic = mgt;
12256                         SvMAGIC_set(sv, mg);
12257                     }
12258                 } else {
12259                     TAINT;
12260                     SvTAINT(sv);
12261                 }
12262             } else
12263                 SvTAINTED_off(sv);
12264         }
12265     } else {
12266       ret_undef:
12267         sv_set_undef(sv);
12268         return;
12269     }
12270 }
12271 
12272 #endif
12273 
12274 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)12275 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
12276                                                          SV const * const value)
12277 {
12278     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
12279 
12280     PERL_UNUSED_ARG(rx);
12281     PERL_UNUSED_ARG(paren);
12282     PERL_UNUSED_ARG(value);
12283 
12284     if (!PL_localizing)
12285         Perl_croak_no_modify();
12286 }
12287 
12288 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)12289 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
12290                               const I32 paren)
12291 {
12292     struct regexp *const rx = ReANY(r);
12293     I32 i,j;
12294     I32 s1, t1;
12295     I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12296 
12297     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
12298 
12299     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
12300         || paren == RX_BUFF_IDX_CARET_FULLMATCH
12301         || paren == RX_BUFF_IDX_CARET_POSTMATCH
12302     )
12303     {
12304         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12305         if (!keepcopy) {
12306             /* on something like
12307              *    $r = qr/.../;
12308              *    /$qr/p;
12309              * the KEEPCOPY is set on the PMOP rather than the regex */
12310             if (PL_curpm && r == PM_GETRE(PL_curpm))
12311                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12312         }
12313         if (!keepcopy)
12314             goto warn_undef;
12315     }
12316 
12317     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
12318     switch (paren) {
12319       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
12320       case RX_BUFF_IDX_PREMATCH:       /* $` */
12321         if ( (i = RXp_OFFS_START(rx,0)) != -1) {
12322             if (i > 0) {
12323                 s1 = 0;
12324                 t1 = i;
12325                 goto getlen;
12326             }
12327         }
12328         return 0;
12329 
12330       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
12331       case RX_BUFF_IDX_POSTMATCH:       /* $' */
12332         if ( (j = RXp_OFFS_END(rx,0)) != -1 ) {
12333             i = RXp_SUBLEN(rx) - j;
12334             if (i > 0) {
12335                 s1 = j;
12336                 t1 = RXp_SUBLEN(rx);
12337                 goto getlen;
12338             }
12339         }
12340         return 0;
12341 
12342       default: /* $& / ${^MATCH}, $1, $2, ... */
12343         if (paren <= logical_nparens) {
12344             I32 true_paren = rx->logical_to_parno
12345                              ? rx->logical_to_parno[paren]
12346                              : paren;
12347             do {
12348                 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) &&
12349                     ((t1 = RXp_OFFS_END(rx,true_paren)) != -1))
12350                 {
12351                     i = t1 - s1;
12352                     goto getlen;
12353                 } else if (rx->parno_to_logical_next) {
12354                     true_paren = rx->parno_to_logical_next[true_paren];
12355                 } else {
12356                     break;
12357                 }
12358             } while(true_paren);
12359         }
12360       warn_undef:
12361         if (ckWARN(WARN_UNINITIALIZED))
12362             report_uninit((const SV *)sv);
12363         return 0;
12364     }
12365   getlen:
12366     if (i > 0 && RXp_MATCH_UTF8(rx)) {
12367         const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1;
12368         const U8 *ep;
12369         STRLEN el;
12370 
12371         i = t1 - s1;
12372         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
12373             i = el;
12374     }
12375     return i;
12376 }
12377 
12378 /*
12379  * ex: set ts=8 sts=4 sw=4 et:
12380  */
12381