xref: /openbsd/gnu/usr.bin/perl/regexec.c (revision e0680481)
1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEX_ENGINE
75 #define PERL_IN_REGEXEC_C
76 #include "perl.h"
77 
78 #ifdef PERL_IN_XSUB_RE
79 #  include "re_comp.h"
80 #else
81 #  include "regcomp.h"
82 #endif
83 
84 #include "invlist_inline.h"
85 #include "unicode_constants.h"
86 
87 static const char b_utf8_locale_required[] =
88  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
89                                                 "  Assuming a UTF-8 locale";
90 
91 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
92     STMT_START {                                                            \
93         if (! IN_UTF8_CTYPE_LOCALE) {                                       \
94           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
95                                                 b_utf8_locale_required);    \
96         }                                                                   \
97     } STMT_END
98 
99 static const char sets_utf8_locale_required[] =
100       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
101 
102 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
103     STMT_START {                                                            \
104         if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
105           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
106                                              sets_utf8_locale_required);    \
107         }                                                                   \
108     } STMT_END
109 
110 #ifdef DEBUGGING
111 /* At least one required character in the target string is expressible only in
112  * UTF-8. */
113 static const char non_utf8_target_but_utf8_required[]
114                 = "Can't match, because target string needs to be in UTF-8\n";
115 #endif
116 
117 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
118     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
119     goto target;                                                         \
120 } STMT_END
121 
122 #ifndef STATIC
123 #define STATIC  static
124 #endif
125 
126 /*
127  * Forwards.
128  */
129 
130 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 
132 #define HOPc(pos,off) \
133         (char *)(reginfo->is_utf8_target \
134             ? reghop3((U8*)pos, off, \
135                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
136             : (U8*)(pos + off))
137 
138 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
139 #define HOPBACK3(pos, off, lim) \
140         (reginfo->is_utf8_target                          \
141             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
142             : (pos - off >= lim)	                         \
143                 ? (U8*)pos - off		                 \
144                 : NULL)
145 
146 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 
148 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
149 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 
151 /* lim must be +ve. Returns NULL on overshoot */
152 #define HOPMAYBE3(pos,off,lim) \
153         (reginfo->is_utf8_target                        \
154             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
155             : ((U8*)pos + off <= lim)                   \
156                 ? (U8*)pos + off                        \
157                 : NULL)
158 
159 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
160  * off must be >=0; args should be vars rather than expressions */
161 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
162     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
163     : (U8*)((pos + off) > lim ? lim : (pos + off)))
164 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 
166 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
167     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168     : (U8*)(pos + off))
169 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 
171 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
172 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 
174 /* for use after a quantifier and before an EXACT-like node -- japhy */
175 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176  *
177  * NOTE that *nothing* that affects backtracking should be in here, specifically
178  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
179  * node that is in between two EXACT like nodes when ascertaining what the required
180  * "follow" character is. This should probably be moved to regex compile time
181  * although it may be done at run time because of the REF possibility - more
182  * investigation required. -- demerphq
183 */
184 #define JUMPABLE(rn) (                                                             \
185     OP(rn) == OPEN ||                                                              \
186     (OP(rn) == CLOSE &&                                                            \
187      !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) ||                                 \
188     OP(rn) == EVAL ||                                                              \
189     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
190     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
191     OP(rn) == KEEPS ||                                                             \
192     (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0)                                  \
193 )
194 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195 
196 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
197 
198 /*
199   Search for mandatory following text node; for lookahead, the text must
200   follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step.
201 */
202 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
203     while (JUMPABLE(rn)) { \
204         const OPCODE type = OP(rn); \
205         if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
206             rn = REGNODE_AFTER_opcode(rn,type); \
207         else if (type == PLUS) \
208             rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
209         else if (type == IFMATCH) \
210             rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \
211         else rn += NEXT_OFF(rn); \
212     } \
213 } STMT_END
214 
215 #define SLAB_FIRST(s) (&(s)->states[0])
216 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 
218 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
219 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
220 static regmatch_state * S_push_slab(pTHX);
221 
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225  * are needed for the regexp context stack bookkeeping. */
226 
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen comma_pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH)
229 {
230     const int retval = PL_savestack_ix;
231     /* Number of bytes about to be stored in the stack */
232     const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor);
233     /* Number of savestack[] entries to be filled by the paren data */
234     /* Rounding is performed in case we are few elements short */
235     const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack);
236     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
237     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
238 
239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
240 
241     PERL_ARGS_ASSERT_REGCPPUSH;
242 
243     if (paren_elems_to_push < 0)
244         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i",
245                    (int)paren_elems_to_push, (int)maxopenparen,
246                    (int)parenfloor);
247 
248     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
249         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
250                    " out of range (%lu-%ld)",
251                    total_elems,
252                    (unsigned long)maxopenparen,
253                    (long)parenfloor);
254 
255     DEBUG_BUFFERS_r(
256         if ((int)maxopenparen > (int)parenfloor)
257             Perl_re_exec_indentf( aTHX_
258                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
259                 depth,
260                 PTR2UV(rex),
261                 PTR2UV(RXp_OFFSp(rex))
262             );
263     );
264 
265     SSGROW(total_elems + REGCP_FRAME_ELEMS);
266     assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS));
267 
268     /* memcpy the offs inside the stack - it's faster than for loop */
269     memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push);
270     PL_savestack_ix += paren_elems_to_push;
271 
272     DEBUG_BUFFERS_r({
273 	I32 p;
274         for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) {
275             Perl_re_exec_indentf(aTHX_
276                 "    \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n",
277                 depth,
278                 (UV)p,
279                 (IV)RXp_OFFSp(rex)[p].start,
280                 (IV)RXp_OFFSp(rex)[p].start_tmp,
281                 (IV)RXp_OFFSp(rex)[p].end
282             );
283         }
284     });
285 
286 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
287     SSPUSHINT(maxopenparen);
288     SSPUSHINT(RXp_LASTPAREN(rex));
289     SSPUSHINT(RXp_LASTCLOSEPAREN(rex));
290     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
291 
292 
293     DEBUG_BUFFERS_r({
294         Perl_re_exec_indentf(aTHX_
295                 "finished regcppush returning %" IVdf " cur: %" IVdf "\n",
296                 depth, retval, PL_savestack_ix);
297     });
298 
299     return retval;
300 }
301 
302 /* These are needed since we do not localize EVAL nodes: */
303 #define REGCP_SET(cp)                                           \
304     DEBUG_STATE_r(                                              \
305         Perl_re_exec_indentf( aTHX_                             \
306             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
307             depth, (IV)PL_savestack_ix                          \
308         )                                                       \
309     );                                                          \
310     cp = PL_savestack_ix
311 
312 #define REGCP_UNWIND(cp)                                        \
313     DEBUG_STATE_r(                                              \
314         if (cp != PL_savestack_ix)                              \
315             Perl_re_exec_indentf( aTHX_                         \
316                 "Clearing an EVAL scope, savestack=%"           \
317                 IVdf "..%" IVdf "\n",                           \
318                 depth, (IV)(cp), (IV)PL_savestack_ix            \
319             )                                                   \
320     );                                                          \
321     regcpblow(cp)
322 
323 /* set the start and end positions of capture ix */
324 #define CLOSE_ANY_CAPTURE(rex, ix, s, e)                                    \
325     RXp_OFFSp(rex)[(ix)].start = (s);                                       \
326     RXp_OFFSp(rex)[(ix)].end = (e)
327 
328 #define CLOSE_CAPTURE(rex, ix, s, e)                                        \
329     CLOSE_ANY_CAPTURE(rex, ix, s, e);                                       \
330     if (ix > RXp_LASTPAREN(rex))                                            \
331         RXp_LASTPAREN(rex) = (ix);                                          \
332     RXp_LASTCLOSEPAREN(rex) = (ix);                                         \
333     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                             \
334         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \
335         depth,                                                              \
336         PTR2UV(rex),                                                        \
337         PTR2UV(RXp_OFFSp(rex)),                                             \
338         (UV)(ix),                                                           \
339         (IV)RXp_OFFSp(rex)[ix].start,                                       \
340         (IV)RXp_OFFSp(rex)[ix].end,                                         \
341         (UV)RXp_LASTPAREN(rex)                                              \
342     ))
343 
344 /* the lp and lcp args match the relevant members of the
345  * regexp structure, but in practice they should all be U16
346  * instead as we have a hard limit of U16_MAX parens. See
347  * line 4003 or so of regcomp.c where we parse OPEN parens
348  * of various types. */
349 PERL_STATIC_INLINE void
S_unwind_paren(pTHX_ regexp * rex,U32 lp,U32 lcp comma_pDEPTH)350 S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) {
351     PERL_ARGS_ASSERT_UNWIND_PAREN;
352     U32 n;
353     DECLARE_AND_GET_RE_DEBUG_FLAGS;
354     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
355         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf
356         ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n",
357         depth,
358         PTR2UV(rex),
359         PTR2UV(RXp_OFFSp(rex)),
360         (UV)(lp),
361         (UV)(RXp_LASTPAREN(rex)),
362         (UV)(lcp)
363     ));
364     for (n = RXp_LASTPAREN(rex); n > lp; n--) {
365         RXp_OFFSp(rex)[n].end = -1;
366     }
367     RXp_LASTPAREN(rex) = n;
368     RXp_LASTCLOSEPAREN(rex) = lcp;
369 }
370 #define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp)
371 
372 PERL_STATIC_INLINE void
S_capture_clear(pTHX_ regexp * rex,U16 from_ix,U16 to_ix,const char * str comma_pDEPTH)373 S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) {
374     PERL_ARGS_ASSERT_CAPTURE_CLEAR;
375     PERL_UNUSED_ARG(str); /* only used for debugging */
376     U16 my_ix;
377     DECLARE_AND_GET_RE_DEBUG_FLAGS;
378     for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) {
379         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
380                 "CAPTURE_CLEAR %s \\%" IVdf ": "
381                 "%" IVdf "(%" IVdf ") .. %" IVdf
382                 " => "
383                 "%" IVdf "(%" IVdf ") .. %" IVdf
384                 "\n",
385             depth, str, (IV)my_ix,
386             (IV)RXp_OFFSp(rex)[my_ix].start,
387             (IV)RXp_OFFSp(rex)[my_ix].start_tmp,
388             (IV)RXp_OFFSp(rex)[my_ix].end,
389             (IV)-1, (IV)-1, (IV)-1));
390         RXp_OFFSp(rex)[my_ix].start = -1;
391         RXp_OFFSp(rex)[my_ix].start_tmp = -1;
392         RXp_OFFSp(rex)[my_ix].end = -1;
393     }
394 }
395 
396 #define CAPTURE_CLEAR(from_ix, to_ix, str) \
397     if (from_ix) capture_clear(rex,from_ix, to_ix, str)
398 
399 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p comma_pDEPTH)400 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH)
401 {
402     UV i;
403     U32 paren;
404     DECLARE_AND_GET_RE_DEBUG_FLAGS;
405 
406     PERL_ARGS_ASSERT_REGCPPOP;
407 
408 
409     DEBUG_BUFFERS_r({
410         Perl_re_exec_indentf(aTHX_
411                 "starting regcppop at %" IVdf "\n",
412                 depth, PL_savestack_ix);
413     });
414 
415     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
416     i = SSPOPUV;
417     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419     RXp_LASTCLOSEPAREN(rex) = SSPOPINT;
420     RXp_LASTPAREN(rex) = SSPOPINT;
421     *maxopenparen_p = SSPOPINT;
422 
423     i -= REGCP_OTHER_ELEMS;
424     /* Now restore the parentheses context. */
425     DEBUG_BUFFERS_r(
426         if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens)
427             Perl_re_exec_indentf( aTHX_
428                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
429                 depth,
430                 PTR2UV(rex),
431                 PTR2UV(RXp_OFFSp(rex))
432             );
433     );
434     /* substract remaining elements from the stack */
435     PL_savestack_ix -= i;
436 
437     /* static assert that offs struc size is not less than stack elem size */
438     STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack));
439 
440     /* calculate actual number of offs/capture groups stored */
441     /* by doing integer division (leaving potential alignment aside) */
442     i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex));
443 
444     /* calculate paren starting point */
445     /* i is our number of entries which we are subtracting from *maxopenparen_p */
446     /* and we are storing + 1 this to get the beginning */
447     paren = *maxopenparen_p - i + 1;
448 
449     /* restore them */
450     memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex)));
451 
452     DEBUG_BUFFERS_r(
453         for (; paren <= *maxopenparen_p; ++paren) {
454             Perl_re_exec_indentf(aTHX_
455                 "    \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n",
456                 depth,
457                 (UV)paren,
458                 (IV)RXp_OFFSp(rex)[paren].start,
459                 (IV)RXp_OFFSp(rex)[paren].start_tmp,
460                 (IV)RXp_OFFSp(rex)[paren].end,
461                 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : ""));
462         }
463     );
464 #if 1
465     /* It would seem that the similar code in regtry()
466      * already takes care of this, and in fact it is in
467      * a better location to since this code can #if 0-ed out
468      * but the code in regtry() is needed or otherwise tests
469      * requiring null fields (pat.t#187 and split.t#{13,14}
470      * (as of patchlevel 7877)  will fail.  Then again,
471      * this code seems to be necessary or otherwise
472      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473      * --jhi updated by dapm */
474     for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) {
475         if (i > *maxopenparen_p) {
476             RXp_OFFSp(rex)[i].start = -1;
477         }
478         RXp_OFFSp(rex)[i].end = -1;
479         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
480             "    \\%" UVuf ": %s   ..-1 undeffing (regcppop)\n",
481             depth,
482             (UV)i,
483             (i > *maxopenparen_p) ? "-1" : "  "
484         ));
485     }
486 #endif
487     DEBUG_BUFFERS_r({
488         Perl_re_exec_indentf(aTHX_
489                 "finished regcppop at %" IVdf "\n",
490                 depth, PL_savestack_ix);
491     });
492 }
493 
494 /* restore the parens and associated vars at savestack position ix,
495  * but without popping the stack */
496 
497 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p comma_pDEPTH)498 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH)
499 {
500     I32 tmpix = PL_savestack_ix;
501     PERL_ARGS_ASSERT_REGCP_RESTORE;
502 
503     PL_savestack_ix = ix;
504     regcppop(rex, maxopenparen_p);
505     PL_savestack_ix = tmpix;
506 }
507 
508 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
509 
510 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)511 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
512 {
513     /* Returns a boolean as to whether or not 'character' is a member of the
514      * Posix character class given by 'classnum' that should be equivalent to a
515      * value in the typedef 'char_class_number_'.
516      *
517      * Ideally this could be replaced by a just an array of function pointers
518      * to the C library functions that implement the macros this calls.
519      * However, to compile, the precise function signatures are required, and
520      * these may vary from platform to platform.  To avoid having to figure
521      * out what those all are on each platform, I (khw) am using this method,
522      * which adds an extra layer of function call overhead (unless the C
523      * optimizer strips it away).  But we don't particularly care about
524      * performance with locales anyway. */
525 
526     if (IN_UTF8_CTYPE_LOCALE) {
527         return cBOOL(generic_isCC_(character, classnum));
528     }
529 
530     switch ((char_class_number_) classnum) {
531         case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
532         case CC_ENUM_ALPHA_:        return    isU8_ALPHA_LC(character);
533         case CC_ENUM_ASCII_:        return    isU8_ASCII_LC(character);
534         case CC_ENUM_BLANK_:        return    isU8_BLANK_LC(character);
535         case CC_ENUM_CASED_:        return    isU8_CASED_LC(character);
536         case CC_ENUM_CNTRL_:        return    isU8_CNTRL_LC(character);
537         case CC_ENUM_DIGIT_:        return    isU8_DIGIT_LC(character);
538         case CC_ENUM_GRAPH_:        return    isU8_GRAPH_LC(character);
539         case CC_ENUM_LOWER_:        return    isU8_LOWER_LC(character);
540         case CC_ENUM_PRINT_:        return    isU8_PRINT_LC(character);
541         case CC_ENUM_PUNCT_:        return    isU8_PUNCT_LC(character);
542         case CC_ENUM_SPACE_:        return    isU8_SPACE_LC(character);
543         case CC_ENUM_UPPER_:        return    isU8_UPPER_LC(character);
544         case CC_ENUM_WORDCHAR_:     return isU8_WORDCHAR_LC(character);
545         case CC_ENUM_XDIGIT_:       return   isU8_XDIGIT_LC(character);
546         default:    /* VERTSPACE should never occur in locales */
547             break;
548     }
549 
550     Perl_croak(aTHX_
551                "panic: isFOO_lc() has an unexpected character class '%d'",
552                classnum);
553 
554     NOT_REACHED; /* NOTREACHED */
555     return FALSE;
556 }
557 
558 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(pTHX_ const char * s1,const char * s2,I32 len)559 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
560 {
561     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
562      * folded.  Works on all folds representable without UTF-8, except for
563      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
564      * check that the strings each have at least 'len' characters.
565      *
566      * There is almost an identical API function where s2 need not be folded:
567      * Perl_foldEQ_latin1() */
568 
569     const U8 *a = (const U8 *)s1;
570     const U8 *b = (const U8 *)s2;
571 
572     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
573 
574     assert(len >= 0);
575 
576     while (len--) {
577         assert(! isUPPER_L1(*b));
578         if (toLOWER_L1(*a) != *b) {
579             return 0;
580         }
581         a++, b++;
582     }
583     return 1;
584 }
585 
586 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)587 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
588 {
589     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
590      * 'character' is a member of the Posix character class given by 'classnum'
591      * that should be equivalent to a value in the typedef
592      * 'char_class_number_'.
593      *
594      * This just calls isFOO_lc on the code point for the character if it is in
595      * the range 0-255.  Outside that range, all characters use Unicode
596      * rules, ignoring any locale.  So use the Unicode function if this class
597      * requires an inversion list, and use the Unicode macro otherwise. */
598 
599 
600     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
601 
602     if (UTF8_IS_INVARIANT(*character)) {
603         return isFOO_lc(classnum, *character);
604     }
605     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
606         return isFOO_lc(classnum,
607                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
608     }
609 
610     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
611 
612     switch ((char_class_number_) classnum) {
613         case CC_ENUM_SPACE_:     return is_XPERLSPACE_high(character);
614         case CC_ENUM_BLANK_:     return is_HORIZWS_high(character);
615         case CC_ENUM_XDIGIT_:    return is_XDIGIT_high(character);
616         case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
617         default:
618             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
619                                         utf8_to_uvchr_buf(character, e, NULL));
620     }
621     NOT_REACHED; /* NOTREACHED */
622 }
623 
624 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)625 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
626 {
627     /* Returns the position of the first byte in the sequence between 's' and
628      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
629      * */
630 
631     PERL_ARGS_ASSERT_FIND_SPAN_END;
632 
633     assert(send >= s);
634 
635     if ((STRLEN) (send - s) >= PERL_WORDSIZE
636                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638     {
639         PERL_UINTMAX_T span_word;
640 
641         /* Process per-byte until reach word boundary.  XXX This loop could be
642          * eliminated if we knew that this platform had fast unaligned reads */
643         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
644             if (*s != span_byte) {
645                 return s;
646             }
647             s++;
648         }
649 
650         /* Create a word filled with the bytes we are spanning */
651         span_word = PERL_COUNT_MULTIPLIER * span_byte;
652 
653         /* Process per-word as long as we have at least a full word left */
654         do {
655 
656             /* Keep going if the whole word is composed of 'span_byte's */
657             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
658                 s += PERL_WORDSIZE;
659                 continue;
660             }
661 
662             /* Here, at least one byte in the word isn't 'span_byte'. */
663 
664 #ifdef EBCDIC
665 
666             break;
667 
668 #else
669 
670             /* This xor leaves 1 bits only in those non-matching bytes */
671             span_word ^= * (PERL_UINTMAX_T *) s;
672 
673             /* Make sure the upper bit of each non-matching byte is set.  This
674              * makes each such byte look like an ASCII platform variant byte */
675             span_word |= span_word << 1;
676             span_word |= span_word << 2;
677             span_word |= span_word << 4;
678 
679             /* That reduces the problem to what this function solves */
680             return s + variant_byte_number(span_word);
681 
682 #endif
683 
684         } while (s + PERL_WORDSIZE <= send);
685     }
686 
687     /* Process the straggler bytes beyond the final word boundary */
688     while (s < send) {
689         if (*s != span_byte) {
690             return s;
691         }
692         s++;
693     }
694 
695     return s;
696 }
697 
698 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)699 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
700 {
701     /* Returns the position of the first byte in the sequence between 's'
702      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
703      * returns 'send' if none found.  It uses word-level operations instead of
704      * byte to speed up the process */
705 
706     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
707 
708     assert(send >= s);
709     assert((byte & mask) == byte);
710 
711 #ifndef EBCDIC
712 
713     if ((STRLEN) (send - s) >= PERL_WORDSIZE
714                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
715                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
716     {
717         PERL_UINTMAX_T word, mask_word;
718 
719         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
720             if (((*s) & mask) == byte) {
721                 return s;
722             }
723             s++;
724         }
725 
726         word      = PERL_COUNT_MULTIPLIER * byte;
727         mask_word = PERL_COUNT_MULTIPLIER * mask;
728 
729         do {
730             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
731 
732             /* If 'masked' contains bytes with the bit pattern of 'byte' within
733              * it, xoring with 'word' will leave each of the 8 bits in such
734              * bytes be 0, and no byte containing any other bit pattern will be
735              * 0. */
736             masked ^= word;
737 
738             /* This causes the most significant bit to be set to 1 for any
739              * bytes in the word that aren't completely 0 */
740             masked |= masked << 1;
741             masked |= masked << 2;
742             masked |= masked << 4;
743 
744             /* The msbits are the same as what marks a byte as variant, so we
745              * can use this mask.  If all msbits are 1, the word doesn't
746              * contain 'byte' */
747             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
748                 s += PERL_WORDSIZE;
749                 continue;
750             }
751 
752             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
753              * and any that are, are 0.  Complement and re-AND to swap that */
754             masked = ~ masked;
755             masked &= PERL_VARIANTS_WORD_MASK;
756 
757             /* This reduces the problem to that solved by this function */
758             s += variant_byte_number(masked);
759             return s;
760 
761         } while (s + PERL_WORDSIZE <= send);
762     }
763 
764 #endif
765 
766     while (s < send) {
767         if (((*s) & mask) == byte) {
768             return s;
769         }
770         s++;
771     }
772 
773     return s;
774 }
775 
776 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)777 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
778 {
779     /* Returns the position of the first byte in the sequence between 's' and
780      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
781      * 'span_byte' should have been ANDed with 'mask' in the call of this
782      * function.  Returns 'send' if none found.  Works like find_span_end(),
783      * except for the AND */
784 
785     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
786 
787     assert(send >= s);
788     assert((span_byte & mask) == span_byte);
789 
790     if ((STRLEN) (send - s) >= PERL_WORDSIZE
791                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
792                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
793     {
794         PERL_UINTMAX_T span_word, mask_word;
795 
796         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
797             if (((*s) & mask) != span_byte) {
798                 return s;
799             }
800             s++;
801         }
802 
803         span_word = PERL_COUNT_MULTIPLIER * span_byte;
804         mask_word = PERL_COUNT_MULTIPLIER * mask;
805 
806         do {
807             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
808 
809             if (masked == span_word) {
810                 s += PERL_WORDSIZE;
811                 continue;
812             }
813 
814 #ifdef EBCDIC
815 
816             break;
817 
818 #else
819 
820             masked ^= span_word;
821             masked |= masked << 1;
822             masked |= masked << 2;
823             masked |= masked << 4;
824             return s + variant_byte_number(masked);
825 
826 #endif
827 
828         } while (s + PERL_WORDSIZE <= send);
829     }
830 
831     while (s < send) {
832         if (((*s) & mask) != span_byte) {
833             return s;
834         }
835         s++;
836     }
837 
838     return s;
839 }
840 
841 /*
842  * pregexec and friends
843  */
844 
845 #ifndef PERL_IN_XSUB_RE
846 /*
847  - pregexec - match a regexp against a string
848  */
849 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)850 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
851          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
852 /* stringarg: the point in the string at which to begin matching */
853 /* strend:    pointer to null at end of string */
854 /* strbeg:    real beginning of string */
855 /* minend:    end of match must be >= minend bytes after stringarg. */
856 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
857  *            itself is accessed via the pointers above */
858 /* nosave:    For optimizations. */
859 {
860     PERL_ARGS_ASSERT_PREGEXEC;
861 
862     return
863         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
864                       nosave ? 0 : REXEC_COPY_STR);
865 }
866 #endif
867 
868 
869 
870 /* re_intuit_start():
871  *
872  * Based on some optimiser hints, try to find the earliest position in the
873  * string where the regex could match.
874  *
875  *   rx:     the regex to match against
876  *   sv:     the SV being matched: only used for utf8 flag; the string
877  *           itself is accessed via the pointers below. Note that on
878  *           something like an overloaded SV, SvPOK(sv) may be false
879  *           and the string pointers may point to something unrelated to
880  *           the SV itself.
881  *   strbeg: real beginning of string
882  *   strpos: the point in the string at which to begin matching
883  *   strend: pointer to the byte following the last char of the string
884  *   flags   currently unused; set to 0
885  *   data:   currently unused; set to NULL
886  *
887  * The basic idea of re_intuit_start() is to use some known information
888  * about the pattern, namely:
889  *
890  *   a) the longest known anchored substring (i.e. one that's at a
891  *      constant offset from the beginning of the pattern; but not
892  *      necessarily at a fixed offset from the beginning of the
893  *      string);
894  *   b) the longest floating substring (i.e. one that's not at a constant
895  *      offset from the beginning of the pattern);
896  *   c) Whether the pattern is anchored to the string; either
897  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
898  *      or anchored to pos(): /\G/;
899  *   d) A start class: a real or synthetic character class which
900  *      represents which characters are legal at the start of the pattern;
901  *
902  * to either quickly reject the match, or to find the earliest position
903  * within the string at which the pattern might match, thus avoiding
904  * running the full NFA engine at those earlier locations, only to
905  * eventually fail and retry further along.
906  *
907  * Returns NULL if the pattern can't match, or returns the address within
908  * the string which is the earliest place the match could occur.
909  *
910  * The longest of the anchored and floating substrings is called 'check'
911  * and is checked first. The other is called 'other' and is checked
912  * second. The 'other' substring may not be present.  For example,
913  *
914  *    /(abc|xyz)ABC\d{0,3}DEFG/
915  *
916  * will have
917  *
918  *   check substr (float)    = "DEFG", offset 6..9 chars
919  *   other substr (anchored) = "ABC",  offset 3..3 chars
920  *   stclass = [ax]
921  *
922  * Be aware that during the course of this function, sometimes 'anchored'
923  * refers to a substring being anchored relative to the start of the
924  * pattern, and sometimes to the pattern itself being anchored relative to
925  * the string. For example:
926  *
927  *   /\dabc/:   "abc" is anchored to the pattern;
928  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
929  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
930  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
931  *                    but the pattern is anchored to the string.
932  */
933 
934 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)935 Perl_re_intuit_start(pTHX_
936                     REGEXP * const rx,
937                     SV *sv,
938                     const char * const strbeg,
939                     char *strpos,
940                     char *strend,
941                     const U32 flags,
942                     re_scream_pos_data *data)
943 {
944     struct regexp *const prog = ReANY(rx);
945     SSize_t start_shift = prog->check_offset_min;
946     /* Should be nonnegative! */
947     SSize_t end_shift   = 0;
948     /* current lowest pos in string where the regex can start matching */
949     char *rx_origin = strpos;
950     SV *check;
951     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
952     U8   other_ix = 1 - prog->substrs->check_ix;
953     bool ml_anch = 0;
954     char *other_last = strpos;/* latest pos 'other' substr already checked to */
955     char *check_at = NULL;		/* check substr found at this pos */
956     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
957     RXi_GET_DECL(prog,progi);
958     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
959     regmatch_info *const reginfo = &reginfo_buf;
960     DECLARE_AND_GET_RE_DEBUG_FLAGS;
961 
962     PERL_ARGS_ASSERT_RE_INTUIT_START;
963     PERL_UNUSED_ARG(flags);
964     PERL_UNUSED_ARG(data);
965 
966     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
967                 "Intuit: trying to determine minimum start position...\n"));
968 
969     /* for now, assume that all substr offsets are positive. If at some point
970      * in the future someone wants to do clever things with lookbehind and
971      * -ve offsets, they'll need to fix up any code in this function
972      * which uses these offsets. See the thread beginning
973      * <20140113145929.GF27210@iabyn.com>
974      */
975     assert(prog->substrs->data[0].min_offset >= 0);
976     assert(prog->substrs->data[0].max_offset >= 0);
977     assert(prog->substrs->data[1].min_offset >= 0);
978     assert(prog->substrs->data[1].max_offset >= 0);
979     assert(prog->substrs->data[2].min_offset >= 0);
980     assert(prog->substrs->data[2].max_offset >= 0);
981 
982     /* for now, assume that if both present, that the floating substring
983      * doesn't start before the anchored substring.
984      * If you break this assumption (e.g. doing better optimisations
985      * with lookahead/behind), then you'll need to audit the code in this
986      * function carefully first
987      */
988     assert(
989             ! (  (prog->anchored_utf8 || prog->anchored_substr)
990               && (prog->float_utf8    || prog->float_substr))
991            || (prog->float_min_offset >= prog->anchored_offset));
992 
993     /* byte rather than char calculation for efficiency. It fails
994      * to quickly reject some cases that can't match, but will reject
995      * them later after doing full char arithmetic */
996     if (prog->minlen > strend - strpos) {
997         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998                               "  String too short...\n"));
999         goto fail;
1000     }
1001 
1002     RXp_MATCH_UTF8_set(prog, utf8_target);
1003     reginfo->is_utf8_target = cBOOL(utf8_target);
1004     reginfo->info_aux = NULL;
1005     reginfo->strbeg = strbeg;
1006     reginfo->strend = strend;
1007     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1008     reginfo->intuit = 1;
1009     /* not actually used within intuit, but zero for safety anyway */
1010     reginfo->poscache_maxiter = 0;
1011 
1012     if (utf8_target) {
1013         if ((!prog->anchored_utf8 && prog->anchored_substr)
1014                 || (!prog->float_utf8 && prog->float_substr))
1015             to_utf8_substr(prog);
1016         check = prog->check_utf8;
1017     } else {
1018         if (!prog->check_substr && prog->check_utf8) {
1019             if (! to_byte_substr(prog)) {
1020                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1021             }
1022         }
1023         check = prog->check_substr;
1024     }
1025 
1026     /* dump the various substring data */
1027     DEBUG_OPTIMISE_MORE_r({
1028         int i;
1029         for (i=0; i<=2; i++) {
1030             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1031                                   : prog->substrs->data[i].substr);
1032             if (!sv)
1033                 continue;
1034 
1035             Perl_re_printf( aTHX_
1036                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1037                 " useful=%" IVdf " utf8=%d [%s]\n",
1038                 i,
1039                 (IV)prog->substrs->data[i].min_offset,
1040                 (IV)prog->substrs->data[i].max_offset,
1041                 (IV)prog->substrs->data[i].end_shift,
1042                 BmUSEFUL(sv),
1043                 utf8_target ? 1 : 0,
1044                 SvPEEK(sv));
1045         }
1046     });
1047 
1048     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1049 
1050         /* ml_anch: check after \n?
1051          *
1052          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1053          * with /.*.../, these flags will have been added by the
1054          * compiler:
1055          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1056          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1057          */
1058         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1059                    && !(prog->intflags & PREGf_IMPLICIT);
1060 
1061         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1062             /* we are only allowed to match at BOS or \G */
1063 
1064             /* trivially reject if there's a BOS anchor and we're not at BOS.
1065              *
1066              * Note that we don't try to do a similar quick reject for
1067              * \G, since generally the caller will have calculated strpos
1068              * based on pos() and gofs, so the string is already correctly
1069              * anchored by definition; and handling the exceptions would
1070              * be too fiddly (e.g. REXEC_IGNOREPOS).
1071              */
1072             if (   strpos != strbeg
1073                 && (prog->intflags & PREGf_ANCH_SBOL))
1074             {
1075                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1076                                 "  Not at start...\n"));
1077                 goto fail;
1078             }
1079 
1080             /* in the presence of an anchor, the anchored (relative to the
1081              * start of the regex) substr must also be anchored relative
1082              * to strpos. So quickly reject if substr isn't found there.
1083              * This works for \G too, because the caller will already have
1084              * subtracted gofs from pos, and gofs is the offset from the
1085              * \G to the start of the regex. For example, in /.abc\Gdef/,
1086              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1087              * caller will have set strpos=pos()-4; we look for the substr
1088              * at position pos()-4+1, which lines up with the "a" */
1089 
1090             if (prog->check_offset_min == prog->check_offset_max) {
1091                 /* Substring at constant offset from beg-of-str... */
1092                 SSize_t slen = SvCUR(check);
1093                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1094 
1095                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1097                     (IV)prog->check_offset_min));
1098 
1099                 if (SvTAIL(check)) {
1100                     /* In this case, the regex is anchored at the end too.
1101                      * Unless it's a multiline match, the lengths must match
1102                      * exactly, give or take a \n.  NB: slen >= 1 since
1103                      * the last char of check is \n */
1104                     if (!multiline
1105                         && (   strend - s > slen
1106                             || strend - s < slen - 1
1107                             || (strend - s == slen && strend[-1] != '\n')))
1108                     {
1109                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1110                                             "  String too long...\n"));
1111                         goto fail_finish;
1112                     }
1113                     /* Now should match s[0..slen-2] */
1114                     slen--;
1115                 }
1116                 if (slen && (strend - s < slen
1117                     || *SvPVX_const(check) != *s
1118                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1119                 {
1120                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                                     "  String not equal...\n"));
1122                     goto fail_finish;
1123                 }
1124 
1125                 check_at = s;
1126                 goto success_at_start;
1127             }
1128         }
1129     }
1130 
1131     end_shift = prog->check_end_shift;
1132 
1133 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1134     if (end_shift < 0)
1135         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1136                    (IV)end_shift, RX_PRECOMP(rx));
1137 #endif
1138 
1139   restart:
1140 
1141     /* This is the (re)entry point of the main loop in this function.
1142      * The goal of this loop is to:
1143      * 1) find the "check" substring in the region rx_origin..strend
1144      *    (adjusted by start_shift / end_shift). If not found, reject
1145      *    immediately.
1146      * 2) If it exists, look for the "other" substr too if defined; for
1147      *    example, if the check substr maps to the anchored substr, then
1148      *    check the floating substr, and vice-versa. If not found, go
1149      *    back to (1) with rx_origin suitably incremented.
1150      * 3) If we find an rx_origin position that doesn't contradict
1151      *    either of the substrings, then check the possible additional
1152      *    constraints on rx_origin of /^.../m or a known start class.
1153      *    If these fail, then depending on which constraints fail, jump
1154      *    back to here, or to various other re-entry points further along
1155      *    that skip some of the first steps.
1156      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1157      *    substring. If the start position was determined to be at the
1158      *    beginning of the string  - so, not rejected, but not optimised,
1159      *    since we have to run regmatch from position 0 - decrement the
1160      *    BmUSEFUL() count. Otherwise increment it.
1161      */
1162 
1163 
1164     /* first, look for the 'check' substring */
1165 
1166     {
1167         U8* start_point;
1168         U8* end_point;
1169 
1170         DEBUG_OPTIMISE_MORE_r({
1171             Perl_re_printf( aTHX_
1172                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1173                 " Start shift: %" IVdf " End shift %" IVdf
1174                 " Real end Shift: %" IVdf "\n",
1175                 (IV)(rx_origin - strbeg),
1176                 (IV)prog->check_offset_min,
1177                 (IV)start_shift,
1178                 (IV)end_shift,
1179                 (IV)prog->check_end_shift);
1180         });
1181 
1182         end_point = HOPBACK3(strend, end_shift, rx_origin);
1183         if (!end_point)
1184             goto fail_finish;
1185         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1186         if (!start_point)
1187             goto fail_finish;
1188 
1189 
1190         /* If the regex is absolutely anchored to either the start of the
1191          * string (SBOL) or to pos() (ANCH_GPOS), then
1192          * check_offset_max represents an upper bound on the string where
1193          * the substr could start. For the ANCH_GPOS case, we assume that
1194          * the caller of intuit will have already set strpos to
1195          * pos()-gofs, so in this case strpos + offset_max will still be
1196          * an upper bound on the substr.
1197          */
1198         if (!ml_anch
1199             && prog->intflags & PREGf_ANCH
1200             && prog->check_offset_max != SSize_t_MAX)
1201         {
1202             SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1203             const char * const anchor =
1204                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1205             SSize_t targ_len = (char*)end_point - anchor;
1206 
1207             if (check_len > targ_len) {
1208                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209                               "Target string too short to match required substring...\n"));
1210                 goto fail_finish;
1211             }
1212 
1213             /* do a bytes rather than chars comparison. It's conservative;
1214              * so it skips doing the HOP if the result can't possibly end
1215              * up earlier than the old value of end_point.
1216              */
1217             assert(anchor + check_len <= (char *)end_point);
1218             if (prog->check_offset_max + check_len < targ_len) {
1219                 end_point = HOP3lim((U8*)anchor,
1220                                 prog->check_offset_max,
1221                                 end_point - check_len
1222                             )
1223                             + check_len;
1224                 if (end_point < start_point)
1225                     goto fail_finish;
1226             }
1227         }
1228 
1229         check_at = fbm_instr( start_point, end_point,
1230                       check, multiline ? FBMrf_MULTILINE : 0);
1231 
1232         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1234             (IV)((char*)start_point - strbeg),
1235             (IV)((char*)end_point   - strbeg),
1236             (IV)(check_at ? check_at - strbeg : -1)
1237         ));
1238 
1239         /* Update the count-of-usability, remove useless subpatterns,
1240             unshift s.  */
1241 
1242         DEBUG_EXECUTE_r({
1243             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1244                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1245             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1246                               (check_at ? "Found" : "Did not find"),
1247                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1248                     ? "anchored" : "floating"),
1249                 quoted,
1250                 RE_SV_TAIL(check),
1251                 (check_at ? " at offset " : "...\n") );
1252         });
1253 
1254         if (!check_at)
1255             goto fail_finish;
1256         /* set rx_origin to the minimum position where the regex could start
1257          * matching, given the constraint of the just-matched check substring.
1258          * But don't set it lower than previously.
1259          */
1260 
1261         if (check_at - rx_origin > prog->check_offset_max)
1262             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1263         /* Finish the diagnostic message */
1264         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265             "%ld (rx_origin now %" IVdf ")...\n",
1266             (long)(check_at - strbeg),
1267             (IV)(rx_origin - strbeg)
1268         ));
1269     }
1270 
1271 
1272     /* now look for the 'other' substring if defined */
1273 
1274     if (prog->substrs->data[other_ix].utf8_substr
1275         || prog->substrs->data[other_ix].substr)
1276     {
1277         /* Take into account the "other" substring. */
1278         char *last, *last1;
1279         char *s;
1280         SV* must;
1281         struct reg_substr_datum *other;
1282 
1283       do_other_substr:
1284         other = &prog->substrs->data[other_ix];
1285         if (!utf8_target && !other->substr) {
1286             if (!to_byte_substr(prog)) {
1287                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1288             }
1289         }
1290 
1291         /* if "other" is anchored:
1292          * we've previously found a floating substr starting at check_at.
1293          * This means that the regex origin must lie somewhere
1294          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1295          * and max:                 HOP3(check_at, -check_offset_min)
1296          * (except that min will be >= strpos)
1297          * So the fixed  substr must lie somewhere between
1298          *  HOP3(min, anchored_offset)
1299          *  HOP3(max, anchored_offset) + SvCUR(substr)
1300          */
1301 
1302         /* if "other" is floating
1303          * Calculate last1, the absolute latest point where the
1304          * floating substr could start in the string, ignoring any
1305          * constraints from the earlier fixed match. It is calculated
1306          * as follows:
1307          *
1308          * strend - prog->minlen (in chars) is the absolute latest
1309          * position within the string where the origin of the regex
1310          * could appear. The latest start point for the floating
1311          * substr is float_min_offset(*) on from the start of the
1312          * regex.  last1 simply combines thee two offsets.
1313          *
1314          * (*) You might think the latest start point should be
1315          * float_max_offset from the regex origin, and technically
1316          * you'd be correct. However, consider
1317          *    /a\d{2,4}bcd\w/
1318          * Here, float min, max are 3,5 and minlen is 7.
1319          * This can match either
1320          *    /a\d\dbcd\w/
1321          *    /a\d\d\dbcd\w/
1322          *    /a\d\d\d\dbcd\w/
1323          * In the first case, the regex matches minlen chars; in the
1324          * second, minlen+1, in the third, minlen+2.
1325          * In the first case, the floating offset is 3 (which equals
1326          * float_min), in the second, 4, and in the third, 5 (which
1327          * equals float_max). In all cases, the floating string bcd
1328          * can never start more than 4 chars from the end of the
1329          * string, which equals minlen - float_min. As the substring
1330          * starts to match more than float_min from the start of the
1331          * regex, it makes the regex match more than minlen chars,
1332          * and the two cancel each other out. So we can always use
1333          * float_min - minlen, rather than float_max - minlen for the
1334          * latest position in the string.
1335          *
1336          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1337          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1338          */
1339 
1340         assert(prog->minlen >= other->min_offset);
1341         last1 = HOP3c(strend,
1342                         other->min_offset - prog->minlen, strbeg);
1343 
1344         if (other_ix) {/* i.e. if (other-is-float) */
1345             /* last is the latest point where the floating substr could
1346              * start, *given* any constraints from the earlier fixed
1347              * match. This constraint is that the floating string starts
1348              * <= float_max_offset chars from the regex origin (rx_origin).
1349              * If this value is less than last1, use it instead.
1350              */
1351             assert(rx_origin <= last1);
1352             last =
1353                 /* this condition handles the offset==infinity case, and
1354                  * is a short-cut otherwise. Although it's comparing a
1355                  * byte offset to a char length, it does so in a safe way,
1356                  * since 1 char always occupies 1 or more bytes,
1357                  * so if a string range is  (last1 - rx_origin) bytes,
1358                  * it will be less than or equal to  (last1 - rx_origin)
1359                  * chars; meaning it errs towards doing the accurate HOP3
1360                  * rather than just using last1 as a short-cut */
1361                 (last1 - rx_origin) < other->max_offset
1362                     ? last1
1363                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1364         }
1365         else {
1366             assert(strpos + start_shift <= check_at);
1367             last = HOP4c(check_at, other->min_offset - start_shift,
1368                         strbeg, strend);
1369         }
1370 
1371         s = HOP3c(rx_origin, other->min_offset, strend);
1372         if (s < other_last)	/* These positions already checked */
1373             s = other_last;
1374 
1375         must = utf8_target ? other->utf8_substr : other->substr;
1376         assert(SvPOK(must));
1377         {
1378             char *from = s;
1379             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1380 
1381             if (to > strend)
1382                 to = strend;
1383             if (from > to) {
1384                 s = NULL;
1385                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1386                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1387                     (IV)(from - strbeg),
1388                     (IV)(to   - strbeg)
1389                 ));
1390             }
1391             else {
1392                 s = fbm_instr(
1393                     (unsigned char*)from,
1394                     (unsigned char*)to,
1395                     must,
1396                     multiline ? FBMrf_MULTILINE : 0
1397                 );
1398                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1399                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1400                     (IV)(from - strbeg),
1401                     (IV)(to   - strbeg),
1402                     (IV)(s ? s - strbeg : -1)
1403                 ));
1404             }
1405         }
1406 
1407         DEBUG_EXECUTE_r({
1408             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1409                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1410             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1411                 s ? "Found" : "Contradicts",
1412                 other_ix ? "floating" : "anchored",
1413                 quoted, RE_SV_TAIL(must));
1414         });
1415 
1416 
1417         if (!s) {
1418             /* last1 is latest possible substr location. If we didn't
1419              * find it before there, we never will */
1420             if (last >= last1) {
1421                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                                         "; giving up...\n"));
1423                 goto fail_finish;
1424             }
1425 
1426             /* try to find the check substr again at a later
1427              * position. Maybe next time we'll find the "other" substr
1428              * in range too */
1429             other_last = HOP3c(last, 1, strend) /* highest failure */;
1430             rx_origin =
1431                 other_ix /* i.e. if other-is-float */
1432                     ? HOP3c(rx_origin, 1, strend)
1433                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1434             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1435                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1436                 (other_ix ? "floating" : "anchored"),
1437                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1438                 (IV)(rx_origin - strbeg)
1439             ));
1440             goto restart;
1441         }
1442         else {
1443             if (other_ix) { /* if (other-is-float) */
1444                 /* other_last is set to s, not s+1, since its possible for
1445                  * a floating substr to fail first time, then succeed
1446                  * second time at the same floating position; e.g.:
1447                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1448                  * The first time round, anchored and float match at
1449                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1450                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1451                  */
1452                 other_last = s;
1453             }
1454             else {
1455                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1456                 other_last = HOP3c(s, 1, strend);
1457             }
1458             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1459                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1460                   (long)(s - strbeg),
1461                 (IV)(rx_origin - strbeg)
1462               ));
1463 
1464         }
1465     }
1466     else {
1467         DEBUG_OPTIMISE_MORE_r(
1468             Perl_re_printf( aTHX_
1469                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1470                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1471                 " strend:%" IVdf "\n",
1472                 (IV)prog->check_offset_min,
1473                 (IV)prog->check_offset_max,
1474                 (IV)(check_at-strbeg),
1475                 (IV)(rx_origin-strbeg),
1476                 (IV)(rx_origin-check_at),
1477                 (IV)(strend-strbeg)
1478             )
1479         );
1480     }
1481 
1482   postprocess_substr_matches:
1483 
1484     /* handle the extra constraint of /^.../m if present */
1485 
1486     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1487         char *s;
1488 
1489         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1490                         "  looking for /^/m anchor"));
1491 
1492         /* we have failed the constraint of a \n before rx_origin.
1493          * Find the next \n, if any, even if it's beyond the current
1494          * anchored and/or floating substrings. Whether we should be
1495          * scanning ahead for the next \n or the next substr is debatable.
1496          * On the one hand you'd expect rare substrings to appear less
1497          * often than \n's. On the other hand, searching for \n means
1498          * we're effectively flipping between check_substr and "\n" on each
1499          * iteration as the current "rarest" candidate string, which
1500          * means for example that we'll quickly reject the whole string if
1501          * hasn't got a \n, rather than trying every substr position
1502          * first
1503          */
1504 
1505         s = HOP3c(strend, - prog->minlen, strpos);
1506         if (s <= rx_origin ||
1507             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1508         {
1509             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1510                             "  Did not find /%s^%s/m...\n",
1511                             PL_colors[0], PL_colors[1]));
1512             goto fail_finish;
1513         }
1514 
1515         /* earliest possible origin is 1 char after the \n.
1516          * (since *rx_origin == '\n', it's safe to ++ here rather than
1517          * HOP(rx_origin, 1)) */
1518         rx_origin++;
1519 
1520         if (prog->substrs->check_ix == 0  /* check is anchored */
1521             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1522         {
1523             /* Position contradicts check-string; either because
1524              * check was anchored (and thus has no wiggle room),
1525              * or check was float and rx_origin is above the float range */
1526             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1527                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1528                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1529             goto restart;
1530         }
1531 
1532         /* if we get here, the check substr must have been float,
1533          * is in range, and we may or may not have had an anchored
1534          * "other" substr which still contradicts */
1535         assert(prog->substrs->check_ix); /* check is float */
1536 
1537         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1538             /* whoops, the anchored "other" substr exists, so we still
1539              * contradict. On the other hand, the float "check" substr
1540              * didn't contradict, so just retry the anchored "other"
1541              * substr */
1542             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1543                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1544                 PL_colors[0], PL_colors[1],
1545                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1546                 (IV)(rx_origin - strbeg)
1547             ));
1548             goto do_other_substr;
1549         }
1550 
1551         /* success: we don't contradict the found floating substring
1552          * (and there's no anchored substr). */
1553         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554             "  Found /%s^%s/m with rx_origin %ld...\n",
1555             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556     }
1557     else {
1558         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1559             "  (multiline anchor test skipped)\n"));
1560     }
1561 
1562   success_at_start:
1563 
1564 
1565     /* if we have a starting character class, then test that extra constraint.
1566      * (trie stclasses are too expensive to use here, we are better off to
1567      * leave it to regmatch itself) */
1568 
1569     if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1570         const U8* const str = (U8*)STRING(progi->regstclass);
1571 
1572         /* XXX this value could be pre-computed */
1573         const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1574                     ?  (reginfo->is_utf8_pat
1575                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1576                         : (SSize_t)STR_LEN(progi->regstclass))
1577                     : 1);
1578         char * endpos;
1579         char *s;
1580         /* latest pos that a matching float substr constrains rx start to */
1581         char *rx_max_float = NULL;
1582 
1583         /* if the current rx_origin is anchored, either by satisfying an
1584          * anchored substring constraint, or a /^.../m constraint, then we
1585          * can reject the current origin if the start class isn't found
1586          * at the current position. If we have a float-only match, then
1587          * rx_origin is constrained to a range; so look for the start class
1588          * in that range. if neither, then look for the start class in the
1589          * whole rest of the string */
1590 
1591         /* XXX DAPM it's not clear what the minlen test is for, and why
1592          * it's not used in the floating case. Nothing in the test suite
1593          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1594          * Here are some old comments, which may or may not be correct:
1595          *
1596          *   minlen == 0 is possible if regstclass is \b or \B,
1597          *   and the fixed substr is ''$.
1598          *   Since minlen is already taken into account, rx_origin+1 is
1599          *   before strend; accidentally, minlen >= 1 guaranties no false
1600          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1601          *   0) below assumes that regstclass does not come from lookahead...
1602          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1603          *   This leaves EXACTF-ish only, which are dealt with in
1604          *   find_byclass().
1605          */
1606 
1607         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1608             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1609         else if (prog->float_substr || prog->float_utf8) {
1610             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1611             endpos = HOP3clim(rx_max_float, cl_l, strend);
1612         }
1613         else
1614             endpos= strend;
1615 
1616         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1617             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1618             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1619               (IV)start_shift, (IV)(check_at - strbeg),
1620               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1621 
1622         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1623                             reginfo);
1624         if (!s) {
1625             if (endpos == strend) {
1626                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1627                                 "  Could not match STCLASS...\n") );
1628                 goto fail;
1629             }
1630             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1631                                "  This position contradicts STCLASS...\n") );
1632             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1633                         && !(prog->intflags & PREGf_IMPLICIT))
1634                 goto fail;
1635 
1636             /* Contradict one of substrings */
1637             if (prog->anchored_substr || prog->anchored_utf8) {
1638                 if (prog->substrs->check_ix == 1) { /* check is float */
1639                     /* Have both, check_string is floating */
1640                     assert(rx_origin + start_shift <= check_at);
1641                     if (rx_origin + start_shift != check_at) {
1642                         /* not at latest position float substr could match:
1643                          * Recheck anchored substring, but not floating.
1644                          * The condition above is in bytes rather than
1645                          * chars for efficiency. It's conservative, in
1646                          * that it errs on the side of doing 'goto
1647                          * do_other_substr'. In this case, at worst,
1648                          * an extra anchored search may get done, but in
1649                          * practice the extra fbm_instr() is likely to
1650                          * get skipped anyway. */
1651                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1652                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1653                             (long)(other_last - strbeg),
1654                             (IV)(rx_origin - strbeg)
1655                         ));
1656                         goto do_other_substr;
1657                     }
1658                 }
1659             }
1660             else {
1661                 /* float-only */
1662 
1663                 if (ml_anch) {
1664                     /* In the presence of ml_anch, we might be able to
1665                      * find another \n without breaking the current float
1666                      * constraint. */
1667 
1668                     /* strictly speaking this should be HOP3c(..., 1, ...),
1669                      * but since we goto a block of code that's going to
1670                      * search for the next \n if any, its safe here */
1671                     rx_origin++;
1672                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1673                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1674                               PL_colors[0], PL_colors[1],
1675                               (long)(rx_origin - strbeg)) );
1676                     goto postprocess_substr_matches;
1677                 }
1678 
1679                 /* strictly speaking this can never be true; but might
1680                  * be if we ever allow intuit without substrings */
1681                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1682                     goto fail;
1683 
1684                 rx_origin = rx_max_float;
1685             }
1686 
1687             /* at this point, any matching substrings have been
1688              * contradicted. Start again... */
1689 
1690             rx_origin = HOP3c(rx_origin, 1, strend);
1691 
1692             /* uses bytes rather than char calculations for efficiency.
1693              * It's conservative: it errs on the side of doing 'goto restart',
1694              * where there is code that does a proper char-based test */
1695             if (rx_origin + start_shift + end_shift > strend) {
1696                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1697                                        "  Could not match STCLASS...\n") );
1698                 goto fail;
1699             }
1700             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1702                 (prog->substrs->check_ix ? "floating" : "anchored"),
1703                 (long)(rx_origin + start_shift - strbeg),
1704                 (IV)(rx_origin - strbeg)
1705             ));
1706             goto restart;
1707         }
1708 
1709         /* Success !!! */
1710 
1711         if (rx_origin != s) {
1712             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1713                         "  By STCLASS: moving %ld --> %ld\n",
1714                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1715                    );
1716         }
1717         else {
1718             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1719                                   "  Does not contradict STCLASS...\n");
1720                    );
1721         }
1722     }
1723 
1724     /* Decide whether using the substrings helped */
1725 
1726     if (rx_origin != strpos) {
1727         /* Fixed substring is found far enough so that the match
1728            cannot start at strpos. */
1729 
1730         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1731         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1732     }
1733     else {
1734         /* The found rx_origin position does not prohibit matching at
1735          * strpos, so calling intuit didn't gain us anything. Decrement
1736          * the BmUSEFUL() count on the check substring, and if we reach
1737          * zero, free it.  */
1738         if (!(prog->intflags & PREGf_NAUGHTY)
1739             && (utf8_target ? (
1740                 prog->check_utf8		/* Could be deleted already */
1741                 && --BmUSEFUL(prog->check_utf8) < 0
1742                 && (prog->check_utf8 == prog->float_utf8)
1743             ) : (
1744                 prog->check_substr		/* Could be deleted already */
1745                 && --BmUSEFUL(prog->check_substr) < 0
1746                 && (prog->check_substr == prog->float_substr)
1747             )))
1748         {
1749             /* If flags & SOMETHING - do not do it many times on the same match */
1750             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1751             /* XXX Does the destruction order has to change with utf8_target? */
1752             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1753             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1754             prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1755             prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1756             check = NULL;			/* abort */
1757             /* XXXX This is a remnant of the old implementation.  It
1758                     looks wasteful, since now INTUIT can use many
1759                     other heuristics. */
1760             prog->extflags &= ~RXf_USE_INTUIT;
1761         }
1762     }
1763 
1764     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1765             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1766              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1767 
1768     return rx_origin;
1769 
1770   fail_finish:				/* Substring not found */
1771     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1772         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1773   fail:
1774     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1775                           PL_colors[4], PL_colors[5]));
1776     return NULL;
1777 }
1778 
1779 
1780 #define DECL_TRIE_TYPE(scan) \
1781     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1782                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1783                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1784                     trie_type = ((FLAGS(scan) == EXACT)                             \
1785                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1786                                  : (FLAGS(scan) == EXACTL)                          \
1787                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1788                                     : (FLAGS(scan) == EXACTFAA)                     \
1789                                       ? (utf8_target                                \
1790                                          ? trie_utf8_exactfa_fold                   \
1791                                          : trie_latin_utf8_exactfa_fold)            \
1792                                       : (FLAGS(scan) == EXACTFLU8                   \
1793                                          ? (utf8_target                             \
1794                                            ? trie_flu8                              \
1795                                            : trie_flu8_latin)                       \
1796                                          : (utf8_target                             \
1797                                            ? trie_utf8_fold                         \
1798                                            : trie_latin_utf8_fold)))
1799 
1800 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1801  * 'foldbuf+sizeof(foldbuf)' */
1802 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1803 STMT_START {                                                                        \
1804     STRLEN skiplen;                                                                 \
1805     U8 flags = FOLD_FLAGS_FULL;                                                     \
1806     switch (trie_type) {                                                            \
1807     case trie_flu8:                                                                 \
1808         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1809         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1810             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1811         }                                                                           \
1812         goto do_trie_utf8_fold;                                                     \
1813     case trie_utf8_exactfa_fold:                                                    \
1814         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1815         /* FALLTHROUGH */                                                           \
1816     case trie_utf8_fold:                                                            \
1817       do_trie_utf8_fold:                                                            \
1818         if ( foldlen>0 ) {                                                          \
1819             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1820             foldlen -= len;                                                         \
1821             uscan += len;                                                           \
1822             len=0;                                                                  \
1823         } else {                                                                    \
1824             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1825                                                                             flags); \
1826             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1827             skiplen = UVCHR_SKIP( uvc );                                            \
1828             foldlen -= skiplen;                                                     \
1829             uscan = foldbuf + skiplen;                                              \
1830         }                                                                           \
1831         break;                                                                      \
1832     case trie_flu8_latin:                                                           \
1833         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1834         goto do_trie_latin_utf8_fold;                                               \
1835     case trie_latin_utf8_exactfa_fold:                                              \
1836         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1837         /* FALLTHROUGH */                                                           \
1838     case trie_latin_utf8_fold:                                                      \
1839       do_trie_latin_utf8_fold:                                                      \
1840         if ( foldlen>0 ) {                                                          \
1841             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1842             foldlen -= len;                                                         \
1843             uscan += len;                                                           \
1844             len=0;                                                                  \
1845         } else {                                                                    \
1846             len = 1;                                                                \
1847             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1848             skiplen = UVCHR_SKIP( uvc );                                            \
1849             foldlen -= skiplen;                                                     \
1850             uscan = foldbuf + skiplen;                                              \
1851         }                                                                           \
1852         break;                                                                      \
1853     case trie_utf8l:                                                                \
1854         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                         \
1855         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1856             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1857         }                                                                           \
1858         /* FALLTHROUGH */                                                           \
1859     case trie_utf8:                                                                 \
1860         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1861         break;                                                                      \
1862     case trie_plain:                                                                \
1863         uvc = (UV)*uc;                                                              \
1864         len = 1;                                                                    \
1865     }                                                                               \
1866     if (uvc < 256) {                                                                \
1867         charid = trie->charmap[ uvc ];                                              \
1868     }                                                                               \
1869     else {                                                                          \
1870         charid = 0;                                                                 \
1871         if (widecharmap) {                                                          \
1872             SV** const svpp = hv_fetch(widecharmap,                                 \
1873                         (char*)&uvc, sizeof(UV), 0);                                \
1874             if (svpp)                                                               \
1875                 charid = (U16)SvIV(*svpp);                                          \
1876         }                                                                           \
1877     }                                                                               \
1878 } STMT_END
1879 
1880 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1881     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1882                 startpos, doutf8, depth)
1883 
1884 #define GET_ANYOFH_INVLIST(prog, n)                                         \
1885                         GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1886 
1887 #define REXEC_FBC_UTF8_SCAN(CODE)                           \
1888     STMT_START {                                            \
1889         while (s < strend) {                                \
1890             CODE                                            \
1891             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1892         }                                                   \
1893     } STMT_END
1894 
1895 #define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1896     STMT_START {                                            \
1897         while (s < strend) {                                \
1898             CODE                                            \
1899             s++;                                            \
1900         }                                                   \
1901     } STMT_END
1902 
1903 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1904     STMT_START {                                            \
1905         while (s < strend) {                                \
1906             REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1907         }                                                   \
1908     } STMT_END
1909 
1910 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1911     STMT_START {                                            \
1912         while (s < strend) {                                \
1913             REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1914         }                                                   \
1915     } STMT_END
1916 
1917 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1918     if (COND) {                                                \
1919         FBC_CHECK_AND_TRY                                      \
1920         s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1921         previous_occurrence_end = s;                           \
1922     }                                                          \
1923     else {                                                     \
1924         s += UTF8SKIP(s);                                      \
1925     }
1926 
1927 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1928     if (COND) {                                                \
1929         FBC_CHECK_AND_TRY                                      \
1930         s++;                                                   \
1931         previous_occurrence_end = s;                           \
1932     }                                                          \
1933     else {                                                     \
1934         s++;                                                   \
1935     }
1936 
1937 /* We keep track of where the next character should start after an occurrence
1938  * of the one we're looking for.  Knowing that, we can see right away if the
1939  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1940  * don't accept the 2nd and succeeding adjacent occurrences */
1941 #define FBC_CHECK_AND_TRY                                           \
1942         if (   (   doevery                                          \
1943                 || s != previous_occurrence_end)                    \
1944             && (   reginfo->intuit                                  \
1945                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1946         {                                                           \
1947             goto got_it;                                            \
1948         }
1949 
1950 
1951 /* These differ from the above macros in that they call a function which
1952  * returns the next occurrence of the thing being looked for in 's'; and
1953  * 'strend' if there is no such occurrence.  'f' is something like fcn(a,b,c)
1954  * */
1955 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1956     while (s < strend) {                                    \
1957         s = (char *) (f);                                   \
1958         if (s >= strend) {                                  \
1959             break;                                          \
1960         }                                                   \
1961                                                             \
1962         FBC_CHECK_AND_TRY                                   \
1963         s += UTF8SKIP(s);                                   \
1964         previous_occurrence_end = s;                        \
1965     }
1966 
1967 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1968     while (s < strend) {                                    \
1969         s = (char *) (f);                                   \
1970         if (s >= strend) {                                  \
1971             break;                                          \
1972         }                                                   \
1973                                                             \
1974         FBC_CHECK_AND_TRY                                   \
1975         s++;                                                \
1976         previous_occurrence_end = s;                        \
1977     }
1978 
1979 /* This is like the above macro except the function returns NULL if there is no
1980  * occurrence, and there is a further condition that must be matched besides
1981  * the function */
1982 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
1983     while (s < strend) {                                    \
1984         s = (char *) (f);                                     \
1985         if (s == NULL) {                                    \
1986             s = (char *) strend;                            \
1987             break;                                          \
1988         }                                                   \
1989                                                             \
1990         if (COND) {                                         \
1991             FBC_CHECK_AND_TRY                               \
1992             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1993             previous_occurrence_end = s;                    \
1994         }                                                   \
1995         else {                                              \
1996             s += UTF8SKIP(s);                               \
1997         }                                                   \
1998     }
1999 
2000 /* This differs from the above macros in that it is passed a single byte that
2001  * is known to begin the next occurrence of the thing being looked for in 's'.
2002  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
2003  * at that position. */
2004 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
2005     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
2006                                               COND)
2007 
2008 /* This is like the function above, but takes an entire string to look for
2009  * instead of a single byte */
2010 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
2011     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
2012                                      ninstr(s, strend, substr, substr_end), \
2013                                      COND)
2014 
2015 /* The four macros below are slightly different versions of the same logic.
2016  *
2017  * The first is for /a and /aa when the target string is UTF-8.  This can only
2018  * match ascii, but it must advance based on UTF-8.   The other three handle
2019  * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
2020  * looking for the boundary (or non-boundary) between a word and non-word
2021  * character.  The utf8 and non-utf8 cases have the same logic, but the details
2022  * must be different.  Find the "wordness" of the character just prior to this
2023  * one, and compare it with the wordness of this one.  If they differ, we have
2024  * a boundary.  At the beginning of the string, pretend that the previous
2025  * character was a new-line.
2026  *
2027  * All these macros uncleanly have side-effects with each other and outside
2028  * variables.  So far it's been too much trouble to clean-up
2029  *
2030  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
2031  *               a word character or not.
2032  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
2033  *               word/non-word
2034  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
2035  *
2036  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
2037  * are looking for a boundary or for a non-boundary.  If we are looking for a
2038  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
2039  * see if this tentative match actually works, and if so, to quit the loop
2040  * here.  And vice-versa if we are looking for a non-boundary.
2041  *
2042  * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
2043  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
2044  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
2045  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2046  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2047  * complement.  But in that branch we complement tmp, meaning that at the
2048  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2049  * which means at the top of the loop in the next iteration, it is
2050  * TEST_NON_UTF8(s-1) */
2051 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2052     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2053     tmp = TEST_NON_UTF8(tmp);                                                  \
2054     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
2055         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2056             tmp = !tmp;                                                        \
2057             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2058         }                                                                      \
2059         else {                                                                 \
2060             IF_FAIL;                                                           \
2061         }                                                                      \
2062     );                                                                         \
2063 
2064 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2065  * TEST_UTF8 is a macro that for the same input code points returns identically
2066  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
2067  * end pointer as well) */
2068 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2069     if (s == reginfo->strbeg) {                                                \
2070         tmp = '\n';                                                            \
2071     }                                                                          \
2072     else { /* Back-up to the start of the previous character */                \
2073         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2074         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2075                                                        0, UTF8_ALLOW_DEFAULT); \
2076     }                                                                          \
2077     tmp = TEST_UV(tmp);                                                        \
2078     REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
2079         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2080             tmp = !tmp;                                                        \
2081             IF_SUCCESS;                                                        \
2082         }                                                                      \
2083         else {                                                                 \
2084             IF_FAIL;                                                           \
2085         }                                                                      \
2086     );
2087 
2088 /* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
2089  * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
2090  * set-up by the FBC_BOUND, etc macros below */
2091 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
2092     UTF8_CODE;                                                                 \
2093     /* Here, things have been set up by the previous code so that tmp is the   \
2094      * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
2095      * against the EOS, which we treat as a \n */                              \
2096     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2097         IF_SUCCESS;                                                            \
2098     }                                                                          \
2099     else {                                                                     \
2100         IF_FAIL;                                                               \
2101     }
2102 
2103 /* Same as the macro above, but the target isn't UTF-8 */
2104 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
2105     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2106     tmp = TEST_NON_UTF8(tmp);                                               \
2107     REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2108         if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2109             IF_SUCCESS;                                                     \
2110             tmp = !tmp;                                                     \
2111         }                                                                   \
2112         else {                                                              \
2113             IF_FAIL;                                                        \
2114         }                                                                   \
2115     );                                                                      \
2116     /* Here, things have been set up by the previous code so that tmp is    \
2117      * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2118      * matches against the EOS, which we treat as a \n */                   \
2119     if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2120         IF_SUCCESS;                                                         \
2121     }                                                                       \
2122     else {                                                                  \
2123         IF_FAIL;                                                            \
2124     }
2125 
2126 /* This is the macro to use when we want to see if something that looks like it
2127  * could match, actually does, and if so exits the loop.  It needs to be used
2128  * only for bounds checking macros, as it allows for matching beyond the end of
2129  * string (which should be zero length without having to look at the string
2130  * contents) */
2131 #define REXEC_FBC_TRYIT                                                     \
2132     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2133         goto got_it
2134 
2135 /* The only difference between the BOUND and NBOUND cases is that
2136  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2137  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2138  * with the other one being empty (PLACEHOLDER is defined as empty).
2139  *
2140  * The TEST_FOO parameters are for operating on different forms of input, but
2141  * all should be ones that return identically for the same underlying code
2142  * points */
2143 
2144 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2145     FBC_BOUND_COMMON_UTF8(                                                  \
2146           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2147           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2148 
2149 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2150     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2151 
2152 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2153     FBC_BOUND_COMMON_UTF8(                                                  \
2154                     FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2155                     TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2156 
2157 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2158     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2159 
2160 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2161     FBC_BOUND_COMMON_UTF8(                                                  \
2162               FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2163               TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2164 
2165 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2166     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2167 
2168 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2169     FBC_BOUND_COMMON_UTF8(                                                  \
2170             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2171             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2172 
2173 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2174     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2175 
2176 #ifdef DEBUGGING
2177 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2178 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2179   IV cp_out = _invlist_search(invlist, cp_in);
2180   assert(cp_out >= 0);
2181   return cp_out;
2182 }
2183 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2184         invmap[S_get_break_val_cp_checked(invlist, cp)]
2185 #else
2186 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2187         invmap[_invlist_search(invlist, cp)]
2188 #endif
2189 
2190 /* Takes a pointer to an inversion list, a pointer to its corresponding
2191  * inversion map, and a code point, and returns the code point's value
2192  * according to the two arrays.  It assumes that all code points have a value.
2193  * This is used as the base macro for macros for particular properties */
2194 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2195         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2196 
2197 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2198  * of a code point, returning the value for the first code point in the string.
2199  * And it takes the particular macro name that finds the desired value given a
2200  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2201 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2202              (__ASSERT_(pos < strend)                                          \
2203                  /* Note assumes is valid UTF-8 */                             \
2204              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2205 
2206 /* Returns the GCB value for the input code point */
2207 #define getGCB_VAL_CP(cp)                                                      \
2208           _generic_GET_BREAK_VAL_CP(                                           \
2209                                     PL_GCB_invlist,                            \
2210                                     _Perl_GCB_invmap,                          \
2211                                     (cp))
2212 
2213 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2214  * bounded by pos and strend */
2215 #define getGCB_VAL_UTF8(pos, strend)                                           \
2216     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2217 
2218 /* Returns the LB value for the input code point */
2219 #define getLB_VAL_CP(cp)                                                       \
2220           _generic_GET_BREAK_VAL_CP(                                           \
2221                                     PL_LB_invlist,                             \
2222                                     _Perl_LB_invmap,                           \
2223                                     (cp))
2224 
2225 /* Returns the LB value for the first code point in the UTF-8 encoded string
2226  * bounded by pos and strend */
2227 #define getLB_VAL_UTF8(pos, strend)                                            \
2228     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2229 
2230 
2231 /* Returns the SB value for the input code point */
2232 #define getSB_VAL_CP(cp)                                                       \
2233           _generic_GET_BREAK_VAL_CP(                                           \
2234                                     PL_SB_invlist,                             \
2235                                     _Perl_SB_invmap,                     \
2236                                     (cp))
2237 
2238 /* Returns the SB value for the first code point in the UTF-8 encoded string
2239  * bounded by pos and strend */
2240 #define getSB_VAL_UTF8(pos, strend)                                            \
2241     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2242 
2243 /* Returns the WB value for the input code point */
2244 #define getWB_VAL_CP(cp)                                                       \
2245           _generic_GET_BREAK_VAL_CP(                                           \
2246                                     PL_WB_invlist,                             \
2247                                     _Perl_WB_invmap,                         \
2248                                     (cp))
2249 
2250 /* Returns the WB value for the first code point in the UTF-8 encoded string
2251  * bounded by pos and strend */
2252 #define getWB_VAL_UTF8(pos, strend)                                            \
2253     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2254 
2255 /* We know what class REx starts with.  Try to find this position... */
2256 /* if reginfo->intuit, its a dryrun */
2257 /* annoyingly all the vars in this routine have different names from their counterparts
2258    in regmatch. /grrr */
2259 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2260 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2261     const char *strend, regmatch_info *reginfo)
2262 {
2263 
2264     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2265     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2266 
2267     char *pat_string;   /* The pattern's exactish string */
2268     char *pat_end;	    /* ptr to end char of pat_string */
2269     re_fold_t folder;	/* Function for computing non-utf8 folds */
2270     const U8 *fold_array;   /* array for folding ords < 256 */
2271     STRLEN ln;
2272     STRLEN lnc;
2273     U8 c1;
2274     U8 c2;
2275     char *e = NULL;
2276 
2277     /* In some cases we accept only the first occurrence of 'x' in a sequence of
2278      * them.  This variable points to just beyond the end of the previous
2279      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2280      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2281      * hop back.) */
2282     char * previous_occurrence_end = 0;
2283 
2284     I32 tmp;            /* Scratch variable */
2285     const bool utf8_target = reginfo->is_utf8_target;
2286     UV utf8_fold_flags = 0;
2287     const bool is_utf8_pat = reginfo->is_utf8_pat;
2288     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2289                                    with a result inverts that result, as 0^1 =
2290                                    1 and 1^1 = 0 */
2291     char_class_number_ classnum;
2292 
2293     RXi_GET_DECL(prog,progi);
2294 
2295     PERL_ARGS_ASSERT_FIND_BYCLASS;
2296 
2297     /* We know what class it must start with. The case statements below have
2298      * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2299      * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2300      * ('p8' and 'pb'. */
2301     switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2302         SV * anyofh_list;
2303 
2304       case ANYOFPOSIXL_t8_pb:
2305       case ANYOFPOSIXL_t8_p8:
2306       case ANYOFL_t8_pb:
2307       case ANYOFL_t8_p8:
2308         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2309         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2310 
2311         /* FALLTHROUGH */
2312 
2313       case ANYOFD_t8_pb:
2314       case ANYOFD_t8_p8:
2315       case ANYOF_t8_pb:
2316       case ANYOF_t8_p8:
2317         REXEC_FBC_UTF8_CLASS_SCAN(
2318                 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2319         break;
2320 
2321       case ANYOFPOSIXL_tb_pb:
2322       case ANYOFPOSIXL_tb_p8:
2323       case ANYOFL_tb_pb:
2324       case ANYOFL_tb_p8:
2325         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2326         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2327 
2328         /* FALLTHROUGH */
2329 
2330       case ANYOFD_tb_pb:
2331       case ANYOFD_tb_p8:
2332       case ANYOF_tb_pb:
2333       case ANYOF_tb_p8:
2334         if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2335             /* We know that s is in the bitmap range since the target isn't
2336              * UTF-8, so what happens for out-of-range values is not relevant,
2337              * so exclude that from the flags */
2338             REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2339         }
2340         else {
2341             REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2342                                                      0));
2343         }
2344         break;
2345 
2346       case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */
2347       case ANYOFM_tb_p8:
2348         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2349              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2350         break;
2351 
2352       case ANYOFM_t8_pb:
2353       case ANYOFM_t8_p8:
2354         /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2355          * we do anyway for performance reasons, as otherwise we would have to
2356          * examine all the continuation characters */
2357         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2358              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2359         break;
2360 
2361       case NANYOFM_tb_pb:
2362       case NANYOFM_tb_p8:
2363         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2364            find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2365         break;
2366 
2367       case NANYOFM_t8_pb:
2368       case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2369                                   variants. */
2370         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2371                         (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2372                                                     (U8) ARG1u(c), FLAGS(c)));
2373         break;
2374 
2375       /* These nodes all require at least one code point to be in UTF-8 to
2376        * match */
2377       case ANYOFH_tb_pb:
2378       case ANYOFH_tb_p8:
2379       case ANYOFHb_tb_pb:
2380       case ANYOFHb_tb_p8:
2381       case ANYOFHbbm_tb_pb:
2382       case ANYOFHbbm_tb_p8:
2383       case ANYOFHr_tb_pb:
2384       case ANYOFHr_tb_p8:
2385       case ANYOFHs_tb_pb:
2386       case ANYOFHs_tb_p8:
2387       case EXACTFLU8_tb_pb:
2388       case EXACTFLU8_tb_p8:
2389       case EXACTFU_REQ8_tb_pb:
2390       case EXACTFU_REQ8_tb_p8:
2391         break;
2392 
2393       case ANYOFH_t8_pb:
2394       case ANYOFH_t8_p8:
2395         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2396         REXEC_FBC_UTF8_CLASS_SCAN(
2397               (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2398                && _invlist_contains_cp(anyofh_list,
2399                                        utf8_to_uvchr_buf((U8 *) s,
2400                                                          (U8 *) strend,
2401                                                          NULL))));
2402         break;
2403 
2404       case ANYOFHb_t8_pb:
2405       case ANYOFHb_t8_p8:
2406         {
2407             /* We know what the first byte of any matched string should be. */
2408             U8 first_byte = FLAGS(c);
2409 
2410             anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2411             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2412                    _invlist_contains_cp(anyofh_list,
2413                                            utf8_to_uvchr_buf((U8 *) s,
2414                                                               (U8 *) strend,
2415                                                               NULL)));
2416         }
2417         break;
2418 
2419       case ANYOFHbbm_t8_pb:
2420       case ANYOFHbbm_t8_p8:
2421         {
2422             /* We know what the first byte of any matched string should be. */
2423             U8 first_byte = FLAGS(c);
2424 
2425             /* And a bitmap defines all the legal 2nd byte matches */
2426             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2427                                (    s < strend
2428                                 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2429                                             (U8) s[1] & UTF_CONTINUATION_MASK)));
2430         }
2431         break;
2432 
2433       case ANYOFHr_t8_pb:
2434       case ANYOFHr_t8_p8:
2435         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2436         REXEC_FBC_UTF8_CLASS_SCAN(
2437                     (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2438                                 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2439                                 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2440                    && _invlist_contains_cp(anyofh_list,
2441                                            utf8_to_uvchr_buf((U8 *) s,
2442                                                               (U8 *) strend,
2443                                                               NULL))));
2444         break;
2445 
2446       case ANYOFHs_t8_pb:
2447       case ANYOFHs_t8_p8:
2448         anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2449         REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2450                         ((struct regnode_anyofhs *) c)->string,
2451                         /* Note FLAGS is the string length in this regnode */
2452                         ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2453                         _invlist_contains_cp(anyofh_list,
2454                                              utf8_to_uvchr_buf((U8 *) s,
2455                                                                (U8 *) strend,
2456                                                                NULL)));
2457         break;
2458 
2459       case ANYOFR_tb_pb:
2460       case ANYOFR_tb_p8:
2461         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2462                                             ANYOFRbase(c), ANYOFRdelta(c)));
2463         break;
2464 
2465       case ANYOFR_t8_pb:
2466       case ANYOFR_t8_p8:
2467         REXEC_FBC_UTF8_CLASS_SCAN(
2468                             (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2469                              && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2470                                                               (U8 *) strend,
2471                                                               NULL),
2472                                             ANYOFRbase(c), ANYOFRdelta(c))));
2473         break;
2474 
2475       case ANYOFRb_tb_pb:
2476       case ANYOFRb_tb_p8:
2477         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2478                                             ANYOFRbase(c), ANYOFRdelta(c)));
2479         break;
2480 
2481       case ANYOFRb_t8_pb:
2482       case ANYOFRb_t8_p8:
2483         {   /* We know what the first byte of any matched string should be */
2484             U8 first_byte = FLAGS(c);
2485 
2486             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2487                                 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2488                                                               (U8 *) strend,
2489                                                               NULL),
2490                                             ANYOFRbase(c), ANYOFRdelta(c)));
2491         }
2492         break;
2493 
2494       case EXACTFAA_tb_pb:
2495 
2496         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2497          * which these functions don't handle anyway */
2498         fold_array = PL_fold_latin1;
2499         folder = S_foldEQ_latin1_s2_folded;
2500         goto do_exactf_non_utf8;
2501 
2502       case EXACTF_tb_pb:
2503         fold_array = PL_fold;
2504         folder = Perl_foldEQ;
2505         goto do_exactf_non_utf8;
2506 
2507       case EXACTFL_tb_pb:
2508         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2509 
2510         if (IN_UTF8_CTYPE_LOCALE) {
2511             utf8_fold_flags = FOLDEQ_LOCALE;
2512             goto do_exactf_utf8;
2513         }
2514 
2515         fold_array = PL_fold_locale;
2516         folder = Perl_foldEQ_locale;
2517         goto do_exactf_non_utf8;
2518 
2519       case EXACTFU_tb_pb:
2520         /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2521          * don't have to worry here about this single special case in the
2522          * Latin1 range */
2523         fold_array = PL_fold_latin1;
2524         folder = S_foldEQ_latin1_s2_folded;
2525 
2526         /* FALLTHROUGH */
2527 
2528        do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2529                               are no glitches with fold-length differences
2530                               between the target string and pattern */
2531 
2532         /* The idea in the non-utf8 EXACTF* cases is to first find the first
2533          * character of the EXACTF* node and then, if necessary,
2534          * case-insensitively compare the full text of the node.  c1 is the
2535          * first character.  c2 is its fold.  This logic will not work for
2536          * Unicode semantics and the german sharp ss, which hence should not be
2537          * compiled into a node that gets here. */
2538         pat_string = STRINGs(c);
2539         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2540 
2541         /* We know that we have to match at least 'ln' bytes (which is the same
2542          * as characters, since not utf8).  If we have to match 3 characters,
2543          * and there are only 2 available, we know without trying that it will
2544          * fail; so don't start a match past the required minimum number from
2545          * the far end */
2546         e = HOP3c(strend, -((SSize_t)ln), s);
2547         if (e < s)
2548             break;
2549 
2550         c1 = *pat_string;
2551         c2 = fold_array[c1];
2552         if (c1 == c2) { /* If char and fold are the same */
2553             while (s <= e) {
2554                 s = (char *) memchr(s, c1, e + 1 - s);
2555                 if (s == NULL) {
2556                     break;
2557                 }
2558 
2559                 /* Check that the rest of the node matches */
2560                 if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2561                     && (reginfo->intuit || regtry(reginfo, &s)) )
2562                 {
2563                     goto got_it;
2564                 }
2565                 s++;
2566             }
2567         }
2568         else {
2569             U8 bits_differing = c1 ^ c2;
2570 
2571             /* If the folds differ in one bit position only, we can mask to
2572              * match either of them, and can use this faster find method.  Both
2573              * ASCII and EBCDIC tend to have their case folds differ in only
2574              * one position, so this is very likely */
2575             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2576                 bits_differing = ~ bits_differing;
2577                 while (s <= e) {
2578                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2579                                         (c1 & bits_differing), bits_differing);
2580                     if (s > e) {
2581                         break;
2582                     }
2583 
2584                     if (   (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2585                         && (reginfo->intuit || regtry(reginfo, &s)) )
2586                     {
2587                         goto got_it;
2588                     }
2589                     s++;
2590                 }
2591             }
2592             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2593                        should actually happen only in EXACTFL nodes */
2594                 while (s <= e) {
2595                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2596                         && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2597                         && (reginfo->intuit || regtry(reginfo, &s)) )
2598                     {
2599                         goto got_it;
2600                     }
2601                     s++;
2602                 }
2603             }
2604         }
2605         break;
2606 
2607       case EXACTFAA_tb_p8:
2608       case EXACTFAA_t8_p8:
2609         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2610                          |FOLDEQ_S2_ALREADY_FOLDED
2611                          |FOLDEQ_S2_FOLDS_SANE;
2612         goto do_exactf_utf8;
2613 
2614       case EXACTFAA_NO_TRIE_tb_pb:
2615       case EXACTFAA_NO_TRIE_t8_pb:
2616       case EXACTFAA_t8_pb:
2617 
2618         /* Here, and elsewhere in this file, the reason we can't consider a
2619          * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2620          * is because any MICRO SIGN in the pattern won't be folded.  Since the
2621          * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2622          * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2623         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2624         goto do_exactf_utf8;
2625 
2626       case EXACTFL_tb_p8:
2627       case EXACTFL_t8_pb:
2628       case EXACTFL_t8_p8:
2629         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2630         utf8_fold_flags = FOLDEQ_LOCALE;
2631         goto do_exactf_utf8;
2632 
2633       case EXACTFLU8_t8_pb:
2634       case EXACTFLU8_t8_p8:
2635         utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2636                                          | FOLDEQ_S2_FOLDS_SANE;
2637         goto do_exactf_utf8;
2638 
2639       case EXACTFU_REQ8_t8_p8:
2640         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2641         goto do_exactf_utf8;
2642 
2643       case EXACTFU_tb_p8:
2644       case EXACTFU_t8_pb:
2645       case EXACTFU_t8_p8:
2646         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2647         goto do_exactf_utf8;
2648 
2649       /* The following are problematic even though pattern isn't UTF-8.  Use
2650        * full functionality normally not done except for UTF-8. */
2651       case EXACTF_t8_pb:
2652       case EXACTFUP_tb_pb:
2653       case EXACTFUP_t8_pb:
2654 
2655        do_exactf_utf8:
2656         {
2657             unsigned expansion;
2658 
2659             /* If one of the operands is in utf8, we can't use the simpler
2660              * folding above, due to the fact that many different characters
2661              * can have the same fold, or portion of a fold, or different-
2662              * length fold */
2663             pat_string = STRINGs(c);
2664             ln  = STR_LENs(c);	/* length to match in octets/bytes */
2665             pat_end = pat_string + ln;
2666             lnc = is_utf8_pat       /* length to match in characters */
2667                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2668                   : ln;
2669 
2670             /* We have 'lnc' characters to match in the pattern, but because of
2671              * multi-character folding, each character in the target can match
2672              * up to 3 characters (Unicode guarantees it will never exceed
2673              * this) if it is utf8-encoded; and up to 2 if not (based on the
2674              * fact that the Latin 1 folds are already determined, and the only
2675              * multi-char fold in that range is the sharp-s folding to 'ss'.
2676              * Thus, a pattern character can match as little as 1/3 of a string
2677              * character.  Adjust lnc accordingly, rounding up, so that if we
2678              * need to match at least 4+1/3 chars, that really is 5. */
2679             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2680             lnc = (lnc + expansion - 1) / expansion;
2681 
2682             /* As in the non-UTF8 case, if we have to match 3 characters, and
2683              * only 2 are left, it's guaranteed to fail, so don't start a match
2684              * that would require us to go beyond the end of the string */
2685             e = HOP3c(strend, -((SSize_t)lnc), s);
2686 
2687             /* XXX Note that we could recalculate e to stop the loop earlier,
2688              * as the worst case expansion above will rarely be met, and as we
2689              * go along we would usually find that e moves further to the left.
2690              * This would happen only after we reached the point in the loop
2691              * where if there were no expansion we should fail.  Unclear if
2692              * worth the expense */
2693 
2694             while (s <= e) {
2695                 char *my_strend= (char *)strend;
2696                 if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2697                                          pat_string, NULL, ln, is_utf8_pat,
2698                                          utf8_fold_flags)
2699                     && (reginfo->intuit || regtry(reginfo, &s)) )
2700                 {
2701                     goto got_it;
2702                 }
2703                 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2704             }
2705         }
2706         break;
2707 
2708       case BOUNDA_tb_pb:
2709       case BOUNDA_tb_p8:
2710       case BOUND_tb_pb:  /* /d without utf8 target is /a */
2711       case BOUND_tb_p8:
2712         /* regcomp.c makes sure that these only have the traditional \b
2713          * meaning. */
2714         assert(FLAGS(c) == TRADITIONAL_BOUND);
2715 
2716         FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2717         break;
2718 
2719       case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2720       case BOUNDA_t8_p8:
2721         /* regcomp.c makes sure that these only have the traditional \b
2722          * meaning. */
2723         assert(FLAGS(c) == TRADITIONAL_BOUND);
2724 
2725         FBC_BOUND_A_UTF8(isWORDCHAR_A);
2726         break;
2727 
2728       case NBOUNDA_tb_pb:
2729       case NBOUNDA_tb_p8:
2730       case NBOUND_tb_pb: /* /d without utf8 target is /a */
2731       case NBOUND_tb_p8:
2732         /* regcomp.c makes sure that these only have the traditional \b
2733          * meaning. */
2734         assert(FLAGS(c) == TRADITIONAL_BOUND);
2735 
2736         FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2737         break;
2738 
2739       case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2740       case NBOUNDA_t8_p8:
2741         /* regcomp.c makes sure that these only have the traditional \b
2742          * meaning. */
2743         assert(FLAGS(c) == TRADITIONAL_BOUND);
2744 
2745         FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2746         break;
2747 
2748       case NBOUNDU_tb_pb:
2749       case NBOUNDU_tb_p8:
2750         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2751             FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2752             break;
2753         }
2754 
2755         to_complement = 1;
2756         goto do_boundu_non_utf8;
2757 
2758       case NBOUNDL_tb_pb:
2759       case NBOUNDL_tb_p8:
2760         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2761         if (FLAGS(c) == TRADITIONAL_BOUND) {
2762             FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2763             break;
2764         }
2765 
2766         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2767 
2768         to_complement = 1;
2769         goto do_boundu_non_utf8;
2770 
2771       case BOUNDL_tb_pb:
2772       case BOUNDL_tb_p8:
2773         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2774         if (FLAGS(c) == TRADITIONAL_BOUND) {
2775             FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2776             break;
2777         }
2778 
2779         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2780 
2781         goto do_boundu_non_utf8;
2782 
2783       case BOUNDU_tb_pb:
2784       case BOUNDU_tb_p8:
2785         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2786             FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2787             break;
2788         }
2789 
2790       do_boundu_non_utf8:
2791         if (s == reginfo->strbeg) {
2792             if (reginfo->intuit || regtry(reginfo, &s))
2793             {
2794                 goto got_it;
2795             }
2796 
2797             /* Didn't match.  Try at the next position (if there is one) */
2798             s++;
2799             if (UNLIKELY(s >= reginfo->strend)) {
2800                 break;
2801             }
2802         }
2803 
2804         switch((bound_type) FLAGS(c)) {
2805           case TRADITIONAL_BOUND: /* Should have already been handled */
2806             assert(0);
2807             break;
2808 
2809           case GCB_BOUND:
2810             /* Not utf8.  Everything is a GCB except between CR and LF */
2811             while (s < strend) {
2812                 if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2813                                       || UCHARAT(s) != '\n'))
2814                     && (reginfo->intuit || regtry(reginfo, &s)))
2815                 {
2816                     goto got_it;
2817                 }
2818                 s++;
2819             }
2820 
2821             break;
2822 
2823           case LB_BOUND:
2824             {
2825                 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2826                 while (s < strend) {
2827                     LB_enum after = getLB_VAL_CP((U8) *s);
2828                     if (to_complement ^ isLB(before,
2829                                              after,
2830                                              (U8*) reginfo->strbeg,
2831                                              (U8*) s,
2832                                              (U8*) reginfo->strend,
2833                                              0 /* target not utf8 */ )
2834                         && (reginfo->intuit || regtry(reginfo, &s)))
2835                     {
2836                         goto got_it;
2837                     }
2838                     before = after;
2839                     s++;
2840                 }
2841             }
2842 
2843             break;
2844 
2845           case SB_BOUND:
2846             {
2847                 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2848                 while (s < strend) {
2849                     SB_enum after = getSB_VAL_CP((U8) *s);
2850                     if ((to_complement ^ isSB(before,
2851                                               after,
2852                                               (U8*) reginfo->strbeg,
2853                                               (U8*) s,
2854                                               (U8*) reginfo->strend,
2855                                              0 /* target not utf8 */ ))
2856                         && (reginfo->intuit || regtry(reginfo, &s)))
2857                     {
2858                         goto got_it;
2859                     }
2860                     before = after;
2861                     s++;
2862                 }
2863             }
2864 
2865             break;
2866 
2867           case WB_BOUND:
2868             {
2869                 WB_enum previous = WB_UNKNOWN;
2870                 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2871                 while (s < strend) {
2872                     WB_enum after = getWB_VAL_CP((U8) *s);
2873                     if ((to_complement ^ isWB(previous,
2874                                               before,
2875                                               after,
2876                                               (U8*) reginfo->strbeg,
2877                                               (U8*) s,
2878                                               (U8*) reginfo->strend,
2879                                                0 /* target not utf8 */ ))
2880                         && (reginfo->intuit || regtry(reginfo, &s)))
2881                     {
2882                         goto got_it;
2883                     }
2884                     previous = before;
2885                     before = after;
2886                     s++;
2887                 }
2888             }
2889         }
2890 
2891         /* Here are at the final position in the target string, which is a
2892          * boundary by definition, so matches, depending on other constraints.
2893          * */
2894         if (   reginfo->intuit
2895             || (s <= reginfo->strend && regtry(reginfo, &s)))
2896         {
2897             goto got_it;
2898         }
2899 
2900         break;
2901 
2902       case BOUNDL_t8_pb:
2903       case BOUNDL_t8_p8:
2904         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2905         if (FLAGS(c) == TRADITIONAL_BOUND) {
2906             FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2907                            isWORDCHAR_LC_utf8_safe);
2908             break;
2909         }
2910 
2911         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2912 
2913         to_complement = 1;
2914         goto do_boundu_utf8;
2915 
2916       case NBOUNDL_t8_pb:
2917       case NBOUNDL_t8_p8:
2918         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2919         if (FLAGS(c) == TRADITIONAL_BOUND) {
2920             FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2921                             isWORDCHAR_LC_utf8_safe);
2922             break;
2923         }
2924 
2925         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2926 
2927         to_complement = 1;
2928         goto do_boundu_utf8;
2929 
2930       case NBOUND_t8_pb:
2931       case NBOUND_t8_p8:
2932         /* regcomp.c makes sure that these only have the traditional \b
2933          * meaning. */
2934         assert(FLAGS(c) == TRADITIONAL_BOUND);
2935 
2936         /* FALLTHROUGH */
2937 
2938       case NBOUNDU_t8_pb:
2939       case NBOUNDU_t8_p8:
2940         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2941             FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2942                             isWORDCHAR_utf8_safe);
2943             break;
2944         }
2945 
2946         to_complement = 1;
2947         goto do_boundu_utf8;
2948 
2949       case BOUND_t8_pb:
2950       case BOUND_t8_p8:
2951         /* regcomp.c makes sure that these only have the traditional \b
2952          * meaning. */
2953         assert(FLAGS(c) == TRADITIONAL_BOUND);
2954 
2955         /* FALLTHROUGH */
2956 
2957       case BOUNDU_t8_pb:
2958       case BOUNDU_t8_p8:
2959         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2960             FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2961             break;
2962         }
2963 
2964       do_boundu_utf8:
2965         if (s == reginfo->strbeg) {
2966             if (reginfo->intuit || regtry(reginfo, &s))
2967             {
2968                 goto got_it;
2969             }
2970 
2971             /* Didn't match.  Try at the next position (if there is one) */
2972             s += UTF8_SAFE_SKIP(s, reginfo->strend);
2973             if (UNLIKELY(s >= reginfo->strend)) {
2974                 break;
2975             }
2976         }
2977 
2978         switch((bound_type) FLAGS(c)) {
2979           case TRADITIONAL_BOUND: /* Should have already been handled */
2980             assert(0);
2981             break;
2982 
2983           case GCB_BOUND:
2984             {
2985                 GCB_enum before = getGCB_VAL_UTF8(
2986                                            reghop3((U8*)s, -1,
2987                                                    (U8*)(reginfo->strbeg)),
2988                                            (U8*) reginfo->strend);
2989                 while (s < strend) {
2990                     GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2991                                                     (U8*) reginfo->strend);
2992                     if (   (to_complement ^ isGCB(before,
2993                                                   after,
2994                                                   (U8*) reginfo->strbeg,
2995                                                   (U8*) s,
2996                                                   1 /* target is utf8 */ ))
2997                         && (reginfo->intuit || regtry(reginfo, &s)))
2998                     {
2999                         goto got_it;
3000                     }
3001                     before = after;
3002                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3003                 }
3004             }
3005             break;
3006 
3007           case LB_BOUND:
3008             {
3009                 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
3010                                                         -1,
3011                                                         (U8*)(reginfo->strbeg)),
3012                                                    (U8*) reginfo->strend);
3013                 while (s < strend) {
3014                     LB_enum after = getLB_VAL_UTF8((U8*) s,
3015                                                    (U8*) reginfo->strend);
3016                     if (to_complement ^ isLB(before,
3017                                              after,
3018                                              (U8*) reginfo->strbeg,
3019                                              (U8*) s,
3020                                              (U8*) reginfo->strend,
3021                                              1 /* target is utf8 */ )
3022                         && (reginfo->intuit || regtry(reginfo, &s)))
3023                     {
3024                         goto got_it;
3025                     }
3026                     before = after;
3027                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3028                 }
3029             }
3030 
3031             break;
3032 
3033           case SB_BOUND:
3034             {
3035                 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
3036                                                     -1,
3037                                                     (U8*)(reginfo->strbeg)),
3038                                                   (U8*) reginfo->strend);
3039                 while (s < strend) {
3040                     SB_enum after = getSB_VAL_UTF8((U8*) s,
3041                                                      (U8*) reginfo->strend);
3042                     if ((to_complement ^ isSB(before,
3043                                               after,
3044                                               (U8*) reginfo->strbeg,
3045                                               (U8*) s,
3046                                               (U8*) reginfo->strend,
3047                                               1 /* target is utf8 */ ))
3048                         && (reginfo->intuit || regtry(reginfo, &s)))
3049                     {
3050                         goto got_it;
3051                     }
3052                     before = after;
3053                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3054                 }
3055             }
3056 
3057             break;
3058 
3059           case WB_BOUND:
3060             {
3061                 /* We are at a boundary between char_sub_0 and char_sub_1.
3062                  * We also keep track of the value for char_sub_-1 as we
3063                  * loop through the line.   Context may be needed to make a
3064                  * determination, and if so, this can save having to
3065                  * recalculate it */
3066                 WB_enum previous = WB_UNKNOWN;
3067                 WB_enum before = getWB_VAL_UTF8(
3068                                           reghop3((U8*)s,
3069                                                   -1,
3070                                                   (U8*)(reginfo->strbeg)),
3071                                           (U8*) reginfo->strend);
3072                 while (s < strend) {
3073                     WB_enum after = getWB_VAL_UTF8((U8*) s,
3074                                                     (U8*) reginfo->strend);
3075                     if ((to_complement ^ isWB(previous,
3076                                               before,
3077                                               after,
3078                                               (U8*) reginfo->strbeg,
3079                                               (U8*) s,
3080                                               (U8*) reginfo->strend,
3081                                               1 /* target is utf8 */ ))
3082                         && (reginfo->intuit || regtry(reginfo, &s)))
3083                     {
3084                         goto got_it;
3085                     }
3086                     previous = before;
3087                     before = after;
3088                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
3089                 }
3090             }
3091         }
3092 
3093         /* Here are at the final position in the target string, which is a
3094          * boundary by definition, so matches, depending on other constraints.
3095          * */
3096 
3097         if (   reginfo->intuit
3098             || (s <= reginfo->strend && regtry(reginfo, &s)))
3099         {
3100             goto got_it;
3101         }
3102         break;
3103 
3104       case LNBREAK_t8_pb:
3105       case LNBREAK_t8_p8:
3106         REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3107         break;
3108 
3109       case LNBREAK_tb_pb:
3110       case LNBREAK_tb_p8:
3111         REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3112         break;
3113 
3114       /* The argument to all the POSIX node types is the class number to pass
3115        * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3116 
3117       case NPOSIXL_t8_pb:
3118       case NPOSIXL_t8_p8:
3119         to_complement = 1;
3120         /* FALLTHROUGH */
3121 
3122       case POSIXL_t8_pb:
3123       case POSIXL_t8_p8:
3124         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3125         REXEC_FBC_UTF8_CLASS_SCAN(
3126             to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3127                                                           (U8 *) strend)));
3128         break;
3129 
3130       case NPOSIXL_tb_pb:
3131       case NPOSIXL_tb_p8:
3132         to_complement = 1;
3133         /* FALLTHROUGH */
3134 
3135       case POSIXL_tb_pb:
3136       case POSIXL_tb_p8:
3137         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3138         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3139                                 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3140         break;
3141 
3142       case NPOSIXA_t8_pb:
3143       case NPOSIXA_t8_p8:
3144         /* The complement of something that matches only ASCII matches all
3145          * non-ASCII, plus everything in ASCII that isn't in the class. */
3146         REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3147                                   || ! generic_isCC_A_(*s, FLAGS(c)));
3148         break;
3149 
3150       case POSIXA_t8_pb:
3151       case POSIXA_t8_p8:
3152         /* Don't need to worry about utf8, as it can match only a single
3153          * byte invariant character.  But we do anyway for performance reasons,
3154          * as otherwise we would have to examine all the continuation
3155          * characters */
3156         REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3157         break;
3158 
3159       case NPOSIXD_tb_pb:
3160       case NPOSIXD_tb_p8:
3161       case NPOSIXA_tb_pb:
3162       case NPOSIXA_tb_p8:
3163         to_complement = 1;
3164         /* FALLTHROUGH */
3165 
3166       case POSIXD_tb_pb:
3167       case POSIXD_tb_p8:
3168       case POSIXA_tb_pb:
3169       case POSIXA_tb_p8:
3170         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3171                         to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3172         break;
3173 
3174       case NPOSIXU_tb_pb:
3175       case NPOSIXU_tb_p8:
3176         to_complement = 1;
3177         /* FALLTHROUGH */
3178 
3179       case POSIXU_tb_pb:
3180       case POSIXU_tb_p8:
3181             REXEC_FBC_NON_UTF8_CLASS_SCAN(
3182                                  to_complement ^ cBOOL(generic_isCC_(*s,
3183                                                                     FLAGS(c))));
3184         break;
3185 
3186       case NPOSIXD_t8_pb:
3187       case NPOSIXD_t8_p8:
3188       case NPOSIXU_t8_pb:
3189       case NPOSIXU_t8_p8:
3190         to_complement = 1;
3191         /* FALLTHROUGH */
3192 
3193       case POSIXD_t8_pb:
3194       case POSIXD_t8_p8:
3195       case POSIXU_t8_pb:
3196       case POSIXU_t8_p8:
3197         classnum = (char_class_number_) FLAGS(c);
3198         switch (classnum) {
3199           default:
3200             REXEC_FBC_UTF8_CLASS_SCAN(
3201                         to_complement ^ cBOOL(_invlist_contains_cp(
3202                                                 PL_XPosix_ptrs[classnum],
3203                                                 utf8_to_uvchr_buf((U8 *) s,
3204                                                                 (U8 *) strend,
3205                                                                 NULL))));
3206             break;
3207 
3208           case CC_ENUM_SPACE_:
3209             REXEC_FBC_UTF8_CLASS_SCAN(
3210                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3211             break;
3212 
3213           case CC_ENUM_BLANK_:
3214             REXEC_FBC_UTF8_CLASS_SCAN(
3215                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3216             break;
3217 
3218           case CC_ENUM_XDIGIT_:
3219             REXEC_FBC_UTF8_CLASS_SCAN(
3220                         to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3221             break;
3222 
3223           case CC_ENUM_VERTSPACE_:
3224             REXEC_FBC_UTF8_CLASS_SCAN(
3225                         to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3226             break;
3227 
3228           case CC_ENUM_CNTRL_:
3229             REXEC_FBC_UTF8_CLASS_SCAN(
3230                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3231             break;
3232         }
3233         break;
3234 
3235       case AHOCORASICKC_tb_pb:
3236       case AHOCORASICKC_tb_p8:
3237       case AHOCORASICKC_t8_pb:
3238       case AHOCORASICKC_t8_p8:
3239       case AHOCORASICK_tb_pb:
3240       case AHOCORASICK_tb_p8:
3241       case AHOCORASICK_t8_pb:
3242       case AHOCORASICK_t8_p8:
3243         {
3244             DECL_TRIE_TYPE(c);
3245             /* what trie are we using right now */
3246             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ];
3247             reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3248             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3249 
3250             const char *last_start = strend - trie->minlen;
3251 #ifdef DEBUGGING
3252             const char *real_start = s;
3253 #endif
3254             STRLEN maxlen = trie->maxlen;
3255             SV *sv_points;
3256             U8 **points; /* map of where we were in the input string
3257                             when reading a given char. For ASCII this
3258                             is unnecessary overhead as the relationship
3259                             is always 1:1, but for Unicode, especially
3260                             case folded Unicode this is not true. */
3261             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3262             U8 *bitmap=NULL;
3263 
3264 
3265             DECLARE_AND_GET_RE_DEBUG_FLAGS;
3266 
3267             /* We can't just allocate points here. We need to wrap it in
3268              * an SV so it gets freed properly if there is a croak while
3269              * running the match */
3270             ENTER;
3271             SAVETMPS;
3272             sv_points=newSV(maxlen * sizeof(U8 *));
3273             SvCUR_set(sv_points,
3274                 maxlen * sizeof(U8 *));
3275             SvPOK_on(sv_points);
3276             sv_2mortal(sv_points);
3277             points=(U8**)SvPV_nolen(sv_points );
3278             if ( trie_type != trie_utf8_fold
3279                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
3280             {
3281                 if (trie->bitmap)
3282                     bitmap=(U8*)trie->bitmap;
3283                 else
3284                     bitmap=(U8*)ANYOF_BITMAP(c);
3285             }
3286             /* this is the Aho-Corasick algorithm modified a touch
3287                to include special handling for long "unknown char" sequences.
3288                The basic idea being that we use AC as long as we are dealing
3289                with a possible matching char, when we encounter an unknown char
3290                (and we have not encountered an accepting state) we scan forward
3291                until we find a legal starting char.
3292                AC matching is basically that of trie matching, except that when
3293                we encounter a failing transition, we fall back to the current
3294                states "fail state", and try the current char again, a process
3295                we repeat until we reach the root state, state 1, or a legal
3296                transition. If we fail on the root state then we can either
3297                terminate if we have reached an accepting state previously, or
3298                restart the entire process from the beginning if we have not.
3299 
3300              */
3301             while (s <= last_start) {
3302                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3303                 U8 *uc = (U8*)s;
3304                 U16 charid = 0;
3305                 U32 base = 1;
3306                 U32 state = 1;
3307                 UV uvc = 0;
3308                 STRLEN len = 0;
3309                 STRLEN foldlen = 0;
3310                 U8 *uscan = (U8*)NULL;
3311                 U8 *leftmost = NULL;
3312 #ifdef DEBUGGING
3313                 U32 accepted_word= 0;
3314 #endif
3315                 U32 pointpos = 0;
3316 
3317                 while ( state && uc <= (U8*)strend ) {
3318                     int failed=0;
3319                     U32 word = aho->states[ state ].wordnum;
3320 
3321                     if( state==1 ) {
3322                         if ( bitmap ) {
3323                             DEBUG_TRIE_EXECUTE_r(
3324                                 if (  uc <= (U8*)last_start
3325                                     && !BITMAP_TEST(bitmap,*uc) )
3326                                 {
3327                                     dump_exec_pos( (char *)uc, c, strend,
3328                                         real_start,
3329                                         (char *)uc, utf8_target, 0 );
3330                                     Perl_re_printf( aTHX_
3331                                         " Scanning for legal start char...\n");
3332                                 }
3333                             );
3334                             if (utf8_target) {
3335                                 while (  uc <= (U8*)last_start
3336                                        && !BITMAP_TEST(bitmap,*uc) )
3337                                 {
3338                                     uc += UTF8SKIP(uc);
3339                                 }
3340                             } else {
3341                                 while (  uc <= (U8*)last_start
3342                                        && ! BITMAP_TEST(bitmap,*uc) )
3343                                 {
3344                                     uc++;
3345                                 }
3346                             }
3347                             s= (char *)uc;
3348                         }
3349                         if (uc >(U8*)last_start) break;
3350                     }
3351 
3352                     if ( word ) {
3353                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3354                                                                     % maxlen ];
3355                         if (!leftmost || lpos < leftmost) {
3356                             DEBUG_r(accepted_word=word);
3357                             leftmost= lpos;
3358                         }
3359                         if (base==0) break;
3360 
3361                     }
3362                     points[pointpos++ % maxlen]= uc;
3363                     if (foldlen || uc < (U8*)strend) {
3364                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3365                                              (U8 *) strend, uscan, len, uvc,
3366                                              charid, foldlen, foldbuf,
3367                                              uniflags);
3368                         DEBUG_TRIE_EXECUTE_r({
3369                             dump_exec_pos( (char *)uc, c, strend,
3370                                         real_start, s, utf8_target, 0);
3371                             Perl_re_printf( aTHX_
3372                                 " Charid:%3u CP:%4" UVxf " ",
3373                                  charid, uvc);
3374                         });
3375                     }
3376                     else {
3377                         len = 0;
3378                         charid = 0;
3379                     }
3380 
3381 
3382                     do {
3383 #ifdef DEBUGGING
3384                         word = aho->states[ state ].wordnum;
3385 #endif
3386                         base = aho->states[ state ].trans.base;
3387 
3388                         DEBUG_TRIE_EXECUTE_r({
3389                             if (failed)
3390                                 dump_exec_pos((char *)uc, c, strend, real_start,
3391                                     s,   utf8_target, 0 );
3392                             Perl_re_printf( aTHX_
3393                                 "%sState: %4" UVxf ", word=%" UVxf,
3394                                 failed ? " Fail transition to " : "",
3395                                 (UV)state, (UV)word);
3396                         });
3397                         if ( base ) {
3398                             U32 tmp;
3399                             I32 offset;
3400                             if (charid &&
3401                                  ( ((offset = base + charid
3402                                     - 1 - trie->uniquecharcount)) >= 0)
3403                                  && ((U32)offset < trie->lasttrans)
3404                                  && trie->trans[offset].check == state
3405                                  && (tmp=trie->trans[offset].next))
3406                             {
3407                                 DEBUG_TRIE_EXECUTE_r(
3408                                     Perl_re_printf( aTHX_ " - legal\n"));
3409                                 state = tmp;
3410                                 break;
3411                             }
3412                             else {
3413                                 DEBUG_TRIE_EXECUTE_r(
3414                                     Perl_re_printf( aTHX_ " - fail\n"));
3415                                 failed = 1;
3416                                 state = aho->fail[state];
3417                             }
3418                         }
3419                         else {
3420                             /* we must be accepting here */
3421                             DEBUG_TRIE_EXECUTE_r(
3422                                     Perl_re_printf( aTHX_ " - accepting\n"));
3423                             failed = 1;
3424                             break;
3425                         }
3426                     } while(state);
3427                     uc += len;
3428                     if (failed) {
3429                         if (leftmost)
3430                             break;
3431                         if (!state) state = 1;
3432                     }
3433                 }
3434                 if ( aho->states[ state ].wordnum ) {
3435                     U8 *lpos = points[ (pointpos
3436                                       - trie->wordinfo[aho->states[ state ]
3437                                                     .wordnum].len) % maxlen ];
3438                     if (!leftmost || lpos < leftmost) {
3439                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3440                         leftmost = lpos;
3441                     }
3442                 }
3443                 if (leftmost) {
3444                     s = (char*)leftmost;
3445                     DEBUG_TRIE_EXECUTE_r({
3446                         Perl_re_printf( aTHX_  "Matches word #%" UVxf
3447                                         " at position %" IVdf ". Trying full"
3448                                         " pattern...\n",
3449                             (UV)accepted_word, (IV)(s - real_start)
3450                         );
3451                     });
3452                     if (reginfo->intuit || regtry(reginfo, &s)) {
3453                         FREETMPS;
3454                         LEAVE;
3455                         goto got_it;
3456                     }
3457                     if (s < reginfo->strend) {
3458                         s = HOPc(s,1);
3459                     }
3460                     DEBUG_TRIE_EXECUTE_r({
3461                         Perl_re_printf( aTHX_
3462                                        "Pattern failed. Looking for new start"
3463                                        " point...\n");
3464                     });
3465                 } else {
3466                     DEBUG_TRIE_EXECUTE_r(
3467                         Perl_re_printf( aTHX_ "No match.\n"));
3468                     break;
3469                 }
3470             }
3471             FREETMPS;
3472             LEAVE;
3473         }
3474         break;
3475 
3476       case EXACTFU_REQ8_t8_pb:
3477       case EXACTFUP_tb_p8:
3478       case EXACTFUP_t8_p8:
3479       case EXACTF_tb_p8:
3480       case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3481       case EXACTFAA_NO_TRIE_tb_p8:
3482       case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3483                                       patterns */
3484         assert(0);
3485 
3486       default:
3487         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3488     } /* End of switch on node type */
3489 
3490     return 0;
3491 
3492   got_it:
3493     return s;
3494 }
3495 
3496 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3497  * flags have same meanings as with regexec_flags() */
3498 
3499 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3500 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3501                             char *strbeg,
3502                             char *strend,
3503                             SV *sv,
3504                             U32 flags,
3505                             bool utf8_target)
3506 {
3507     struct regexp *const prog = ReANY(rx);
3508 
3509     if (flags & REXEC_COPY_STR) {
3510 #ifdef PERL_ANY_COW
3511         if (SvCANCOW(sv)) {
3512             DEBUG_C(Perl_re_printf( aTHX_
3513                               "Copy on write: regexp capture, type %d\n",
3514                                     (int) SvTYPE(sv)));
3515             /* Create a new COW SV to share the match string and store
3516              * in saved_copy, unless the current COW SV in saved_copy
3517              * is valid and suitable for our purpose */
3518             if ((   RXp_SAVED_COPY(prog)
3519                  && SvIsCOW(RXp_SAVED_COPY(prog))
3520                  && SvPOKp(RXp_SAVED_COPY(prog))
3521                  && SvIsCOW(sv)
3522                  && SvPOKp(sv)
3523                  && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog))))
3524             {
3525                 /* just reuse saved_copy SV */
3526                 if (RXp_MATCH_COPIED(prog)) {
3527                     Safefree(RXp_SUBBEG(prog));
3528                     RXp_MATCH_COPIED_off(prog);
3529                 }
3530             }
3531             else {
3532                 /* create new COW SV to share string */
3533                 RXp_MATCH_COPY_FREE(prog);
3534                 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv);
3535             }
3536             RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog));
3537             assert (SvPOKp(RXp_SAVED_COPY(prog)));
3538             RXp_SUBLEN(prog)  = strend - strbeg;
3539             RXp_SUBOFFSET(prog) = 0;
3540             RXp_SUBCOFFSET(prog) = 0;
3541         } else
3542 #endif
3543         {
3544             SSize_t min = 0;
3545             SSize_t max = strend - strbeg;
3546             SSize_t sublen;
3547 
3548             if (    (flags & REXEC_COPY_SKIP_POST)
3549                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3550                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3551             ) { /* don't copy $' part of string */
3552                 SSize_t offs_end;
3553                 U32 n = 0;
3554                 max = -1;
3555                 /* calculate the right-most part of the string covered
3556                  * by a capture. Due to lookahead, this may be to
3557                  * the right of $&, so we have to scan all captures */
3558                 while (n <= RXp_LASTPAREN(prog)) {
3559                     if ((offs_end = RXp_OFFS_END(prog,n)) > max)
3560                         max = offs_end;
3561                     n++;
3562                 }
3563                 if (max == -1)
3564                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3565                             ? RXp_OFFS_START(prog,0)
3566                             : 0;
3567                 assert(max >= 0 && max <= strend - strbeg);
3568             }
3569 
3570             if (    (flags & REXEC_COPY_SKIP_PRE)
3571                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3572                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3573             ) { /* don't copy $` part of string */
3574                 U32 n = 0;
3575                 min = max;
3576                 /* calculate the left-most part of the string covered
3577                  * by a capture. Due to lookbehind, this may be to
3578                  * the left of $&, so we have to scan all captures */
3579                 while (min && n <= RXp_LASTPAREN(prog)) {
3580                     I32 start = RXp_OFFS_START(prog,n);
3581                     if (   start != -1
3582                         && start < min)
3583                     {
3584                         min = start;
3585                     }
3586                     n++;
3587                 }
3588                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3589                     && min >  RXp_OFFS_END(prog,0)
3590                 )
3591                     min = RXp_OFFS_END(prog,0);
3592 
3593             }
3594 
3595             assert(min >= 0 && min <= max && min <= strend - strbeg);
3596             sublen = max - min;
3597 
3598             if (RXp_MATCH_COPIED(prog)) {
3599                 if (sublen > RXp_SUBLEN(prog))
3600                     RXp_SUBBEG(prog) =
3601                             (char*)saferealloc(RXp_SUBBEG(prog), sublen+1);
3602             }
3603             else
3604                 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1);
3605             Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char);
3606             RXp_SUBBEG(prog)[sublen] = '\0';
3607             RXp_SUBOFFSET(prog) = min;
3608             RXp_SUBLEN(prog) = sublen;
3609             RXp_MATCH_COPIED_on(prog);
3610         }
3611         RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog);
3612         if (RXp_SUBOFFSET(prog) && utf8_target) {
3613             /* Convert byte offset to chars.
3614              * XXX ideally should only compute this if @-/@+
3615              * has been seen, a la PL_sawampersand ??? */
3616 
3617             /* If there's a direct correspondence between the
3618              * string which we're matching and the original SV,
3619              * then we can use the utf8 len cache associated with
3620              * the SV. In particular, it means that under //g,
3621              * sv_pos_b2u() will use the previously cached
3622              * position to speed up working out the new length of
3623              * subcoffset, rather than counting from the start of
3624              * the string each time. This stops
3625              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3626              * from going quadratic */
3627             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3628                 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog),
3629                                                 SV_GMAGIC|SV_CONST_RETURN);
3630             else
3631                 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg,
3632                                     (U8*)(strbeg+RXp_SUBOFFSET(prog)));
3633         }
3634     }
3635     else {
3636         RXp_MATCH_COPY_FREE(prog);
3637         RXp_SUBBEG(prog) = strbeg;
3638         RXp_SUBOFFSET(prog) = 0;
3639         RXp_SUBCOFFSET(prog) = 0;
3640         RXp_SUBLEN(prog) = strend - strbeg;
3641     }
3642 }
3643 
3644 
3645 
3646 
3647 /*
3648  - regexec_flags - match a regexp against a string
3649  */
3650 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3651 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3652               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3653 /* stringarg: the point in the string at which to begin matching */
3654 /* strend:    pointer to null at end of string */
3655 /* strbeg:    real beginning of string */
3656 /* minend:    end of match must be >= minend bytes after stringarg. */
3657 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3658  *            itself is accessed via the pointers above */
3659 /* data:      May be used for some additional optimizations.
3660               Currently unused. */
3661 /* flags:     For optimizations. See REXEC_* in regexp.h */
3662 
3663 {
3664     struct regexp *const prog = ReANY(rx);
3665     char *s;
3666     regnode *c;
3667     char *startpos;
3668     SSize_t minlen;		/* must match at least this many chars */
3669     SSize_t dontbother = 0;	/* how many characters not to try at end */
3670     const bool utf8_target = cBOOL(DO_UTF8(sv));
3671     I32 multiline;
3672     RXi_GET_DECL(prog,progi);
3673     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3674     regmatch_info *const reginfo = &reginfo_buf;
3675     regexp_paren_pair *swap = NULL;
3676     I32 oldsave;
3677     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3678 
3679     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3680     PERL_UNUSED_ARG(data);
3681 
3682     /* Be paranoid... */
3683     if (prog == NULL) {
3684         Perl_croak(aTHX_ "NULL regexp parameter");
3685     }
3686 
3687     DEBUG_EXECUTE_r(
3688         debug_start_match(rx, utf8_target, stringarg, strend,
3689         "Matching");
3690     );
3691 
3692     startpos = stringarg;
3693 
3694     /* set these early as they may be used by the HOP macros below */
3695     reginfo->strbeg = strbeg;
3696     reginfo->strend = strend;
3697     reginfo->is_utf8_target = cBOOL(utf8_target);
3698 
3699     if (prog->intflags & PREGf_GPOS_SEEN) {
3700         MAGIC *mg;
3701 
3702         /* set reginfo->ganch, the position where \G can match */
3703 
3704         reginfo->ganch =
3705             (flags & REXEC_IGNOREPOS)
3706             ? stringarg /* use start pos rather than pos() */
3707             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3708               /* Defined pos(): */
3709             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3710             : strbeg; /* pos() not defined; use start of string */
3711 
3712         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3713             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3714 
3715         /* in the presence of \G, we may need to start looking earlier in
3716          * the string than the suggested start point of stringarg:
3717          * if prog->gofs is set, then that's a known, fixed minimum
3718          * offset, such as
3719          * /..\G/:   gofs = 2
3720          * /ab|c\G/: gofs = 1
3721          * or if the minimum offset isn't known, then we have to go back
3722          * to the start of the string, e.g. /w+\G/
3723          */
3724 
3725         if (prog->intflags & PREGf_ANCH_GPOS) {
3726             if (prog->gofs) {
3727                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3728                 if (!startpos ||
3729                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3730                 {
3731                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3732                             "fail: ganch-gofs before earliest possible start\n"));
3733                     return 0;
3734                 }
3735             }
3736             else
3737                 startpos = reginfo->ganch;
3738         }
3739         else if (prog->gofs) {
3740             startpos = HOPBACKc(startpos, prog->gofs);
3741             if (!startpos)
3742                 startpos = strbeg;
3743         }
3744         else if (prog->intflags & PREGf_GPOS_FLOAT)
3745             startpos = strbeg;
3746     }
3747 
3748     minlen = prog->minlen;
3749     if ((startpos + minlen) > strend || startpos < strbeg) {
3750         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3751                         "Regex match can't succeed, so not even tried\n"));
3752         return 0;
3753     }
3754 
3755     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3756      * which will call destuctors to reset PL_regmatch_state, free higher
3757      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3758      * regmatch_info_aux_eval */
3759 
3760     oldsave = PL_savestack_ix;
3761 
3762     s = startpos;
3763 
3764     if ((prog->extflags & RXf_USE_INTUIT)
3765         && !(flags & REXEC_CHECKED))
3766     {
3767         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3768                                     flags, NULL);
3769         if (!s)
3770             return 0;
3771 
3772         if (prog->extflags & RXf_CHECK_ALL) {
3773             /* we can match based purely on the result of INTUIT.
3774              * Set up captures etc just for $& and $-[0]
3775              * (an intuit-only match wont have $1,$2,..) */
3776             assert(!prog->nparens);
3777 
3778             /* s/// doesn't like it if $& is earlier than where we asked it to
3779              * start searching (which can happen on something like /.\G/) */
3780             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3781                     && (s < stringarg))
3782             {
3783                 /* this should only be possible under \G */
3784                 assert(prog->intflags & PREGf_GPOS_SEEN);
3785                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3786                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3787                 goto phooey;
3788             }
3789 
3790             /* match via INTUIT shouldn't have any captures.
3791              * Let @-, @+, $^N know */
3792             RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0;
3793             RXp_MATCH_UTF8_set(prog, utf8_target);
3794             SSize_t match_start = s - strbeg;
3795             SSize_t match_end = utf8_target
3796                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3797                 : s - strbeg + prog->minlenret;
3798             CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end);
3799             if ( !(flags & REXEC_NOT_FIRST) )
3800                 S_reg_set_capture_string(aTHX_ rx,
3801                                         strbeg, strend,
3802                                         sv, flags, utf8_target);
3803 
3804             return 1;
3805         }
3806     }
3807 
3808     multiline = prog->extflags & RXf_PMf_MULTILINE;
3809 
3810     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3811         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3812                               "String too short [regexec_flags]...\n"));
3813         goto phooey;
3814     }
3815 
3816     /* Check validity of program. */
3817     if (UCHARAT(progi->program) != REG_MAGIC) {
3818         Perl_croak(aTHX_ "corrupted regexp program");
3819     }
3820 
3821     RXp_MATCH_TAINTED_off(prog);
3822     RXp_MATCH_UTF8_set(prog, utf8_target);
3823 
3824     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3825     reginfo->intuit = 0;
3826     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3827     reginfo->warned = FALSE;
3828     reginfo->sv = sv;
3829     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3830     /* see how far we have to get to not match where we matched before */
3831     reginfo->till = stringarg + minend;
3832 
3833     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3834         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3835            S_cleanup_regmatch_info_aux has executed (registered by
3836            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3837            magic belonging to this SV.
3838            Not newSVsv, either, as it does not COW.
3839         */
3840         reginfo->sv = newSV_type(SVt_NULL);
3841         SvSetSV_nosteal(reginfo->sv, sv);
3842         SAVEFREESV(reginfo->sv);
3843     }
3844 
3845     /* reserve next 2 or 3 slots in PL_regmatch_state:
3846      * slot N+0: may currently be in use: skip it
3847      * slot N+1: use for regmatch_info_aux struct
3848      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3849      * slot N+3: ready for use by regmatch()
3850      */
3851 
3852     {
3853         regmatch_state *old_regmatch_state;
3854         regmatch_slab  *old_regmatch_slab;
3855         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3856 
3857         /* on first ever match, allocate first slab */
3858         if (!PL_regmatch_slab) {
3859             Newx(PL_regmatch_slab, 1, regmatch_slab);
3860             PL_regmatch_slab->prev = NULL;
3861             PL_regmatch_slab->next = NULL;
3862             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3863         }
3864 
3865         old_regmatch_state = PL_regmatch_state;
3866         old_regmatch_slab  = PL_regmatch_slab;
3867 
3868         for (i=0; i <= max; i++) {
3869             if (i == 1)
3870                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3871             else if (i ==2)
3872                 reginfo->info_aux_eval =
3873                 reginfo->info_aux->info_aux_eval =
3874                             &(PL_regmatch_state->u.info_aux_eval);
3875 
3876             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3877                 PL_regmatch_state = S_push_slab(aTHX);
3878         }
3879 
3880         /* note initial PL_regmatch_state position; at end of match we'll
3881          * pop back to there and free any higher slabs */
3882 
3883         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3884         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3885         reginfo->info_aux->poscache = NULL;
3886 
3887         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3888 
3889         if ((prog->extflags & RXf_EVAL_SEEN))
3890             S_setup_eval_state(aTHX_ reginfo);
3891         else
3892             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3893     }
3894 
3895     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3896         /* We have to be careful. If the previous successful match
3897            was from this regex we don't want a subsequent partially
3898            successful match to clobber the old results.
3899            So when we detect this possibility we add a swap buffer
3900            to the re, and switch the buffer each match. If we fail,
3901            we switch it back; otherwise we leave it swapped.
3902         */
3903         swap = RXp_OFFSp(prog);
3904         /* avoid leak if we die, or clean up anyway if match completes */
3905         SAVEFREEPV(swap);
3906         Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair);
3907         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3908             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3909             0,
3910             PTR2UV(prog),
3911             PTR2UV(swap),
3912             PTR2UV(RXp_OFFSp(prog))
3913         ));
3914     }
3915 
3916     if (prog->recurse_locinput)
3917         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3918 
3919     /* Simplest case: anchored match (but not \G) need be tried only once,
3920      * or with MBOL, only at the beginning of each line.
3921      *
3922      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3923      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3924      * match at the start of the string then it won't match anywhere else
3925      * either; while with /.*.../, if it doesn't match at the beginning,
3926      * the earliest it could match is at the start of the next line */
3927 
3928     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3929         char *end;
3930 
3931         if (regtry(reginfo, &s))
3932             goto got_it;
3933 
3934         if (!(prog->intflags & PREGf_ANCH_MBOL))
3935             goto phooey;
3936 
3937         /* didn't match at start, try at other newline positions */
3938 
3939         if (minlen)
3940             dontbother = minlen - 1;
3941         end = HOP3c(strend, -dontbother, strbeg) - 1;
3942 
3943         /* skip to next newline */
3944 
3945         while (s <= end) { /* note it could be possible to match at the end of the string */
3946             /* NB: newlines are the same in unicode as they are in latin */
3947             if (*s++ != '\n')
3948                 continue;
3949             if (prog->check_substr || prog->check_utf8) {
3950             /* note that with PREGf_IMPLICIT, intuit can only fail
3951              * or return the start position, so it's of limited utility.
3952              * Nevertheless, I made the decision that the potential for
3953              * quick fail was still worth it - DAPM */
3954                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3955                 if (!s)
3956                     goto phooey;
3957             }
3958             if (regtry(reginfo, &s))
3959                 goto got_it;
3960         }
3961         goto phooey;
3962     } /* end anchored search */
3963 
3964     /* anchored \G match */
3965     if (prog->intflags & PREGf_ANCH_GPOS)
3966     {
3967         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3968         assert(prog->intflags & PREGf_GPOS_SEEN);
3969         /* For anchored \G, the only position it can match from is
3970          * (ganch-gofs); we already set startpos to this above; if intuit
3971          * moved us on from there, we can't possibly succeed */
3972         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3973         if (s == startpos && regtry(reginfo, &s))
3974             goto got_it;
3975         goto phooey;
3976     }
3977 
3978     /* Messy cases:  unanchored match. */
3979 
3980     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3981         /* we have /x+whatever/ */
3982         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3983         char ch;
3984 #ifdef DEBUGGING
3985         int did_match = 0;
3986 #endif
3987         if (utf8_target) {
3988             if (! prog->anchored_utf8) {
3989                 to_utf8_substr(prog);
3990             }
3991             ch = SvPVX_const(prog->anchored_utf8)[0];
3992             REXEC_FBC_UTF8_SCAN(
3993                 if (*s == ch) {
3994                     DEBUG_EXECUTE_r( did_match = 1 );
3995                     if (regtry(reginfo, &s)) goto got_it;
3996                     s += UTF8_SAFE_SKIP(s, strend);
3997                     while (s < strend && *s == ch)
3998                         s += UTF8SKIP(s);
3999                 }
4000             );
4001 
4002         }
4003         else {
4004             if (! prog->anchored_substr) {
4005                 if (! to_byte_substr(prog)) {
4006                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4007                 }
4008             }
4009             ch = SvPVX_const(prog->anchored_substr)[0];
4010             REXEC_FBC_NON_UTF8_SCAN(
4011                 if (*s == ch) {
4012                     DEBUG_EXECUTE_r( did_match = 1 );
4013                     if (regtry(reginfo, &s)) goto got_it;
4014                     s++;
4015                     while (s < strend && *s == ch)
4016                         s++;
4017                 }
4018             );
4019         }
4020         DEBUG_EXECUTE_r(if (!did_match)
4021                 Perl_re_printf( aTHX_
4022                                   "Did not find anchored character...\n")
4023                );
4024     }
4025     else if (prog->anchored_substr != NULL
4026               || prog->anchored_utf8 != NULL
4027               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
4028                   && prog->float_max_offset < strend - s)) {
4029         SV *must;
4030         SSize_t back_max;
4031         SSize_t back_min;
4032         char *last;
4033         char *last1;		/* Last position checked before */
4034 #ifdef DEBUGGING
4035         int did_match = 0;
4036 #endif
4037         if (prog->anchored_substr || prog->anchored_utf8) {
4038             if (utf8_target) {
4039                 if (! prog->anchored_utf8) {
4040                     to_utf8_substr(prog);
4041                 }
4042                 must = prog->anchored_utf8;
4043             }
4044             else {
4045                 if (! prog->anchored_substr) {
4046                     if (! to_byte_substr(prog)) {
4047                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4048                     }
4049                 }
4050                 must = prog->anchored_substr;
4051             }
4052             back_max = back_min = prog->anchored_offset;
4053         } else {
4054             if (utf8_target) {
4055                 if (! prog->float_utf8) {
4056                     to_utf8_substr(prog);
4057                 }
4058                 must = prog->float_utf8;
4059             }
4060             else {
4061                 if (! prog->float_substr) {
4062                     if (! to_byte_substr(prog)) {
4063                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4064                     }
4065                 }
4066                 must = prog->float_substr;
4067             }
4068             back_max = prog->float_max_offset;
4069             back_min = prog->float_min_offset;
4070         }
4071 
4072         if (back_min<0) {
4073             last = strend;
4074         } else {
4075             last = HOP3c(strend,	/* Cannot start after this */
4076                   -(SSize_t)(CHR_SVLEN(must)
4077                          - (SvTAIL(must) != 0) + back_min), strbeg);
4078         }
4079         if (s > reginfo->strbeg)
4080             last1 = HOPc(s, -1);
4081         else
4082             last1 = s - 1;	/* bogus */
4083 
4084         /* XXXX check_substr already used to find "s", can optimize if
4085            check_substr==must. */
4086         dontbother = 0;
4087         strend = HOPc(strend, -dontbother);
4088         while ( (s <= last) &&
4089                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
4090                                   (unsigned char*)strend, must,
4091                                   multiline ? FBMrf_MULTILINE : 0)) ) {
4092             DEBUG_EXECUTE_r( did_match = 1 );
4093             if (HOPc(s, -back_max) > last1) {
4094                 last1 = HOPc(s, -back_min);
4095                 s = HOPc(s, -back_max);
4096             }
4097             else {
4098                 char * const t = (last1 >= reginfo->strbeg)
4099                                     ? HOPc(last1, 1) : last1 + 1;
4100 
4101                 last1 = HOPc(s, -back_min);
4102                 s = t;
4103             }
4104             if (utf8_target) {
4105                 while (s <= last1) {
4106                     if (regtry(reginfo, &s))
4107                         goto got_it;
4108                     if (s >= last1) {
4109                         s++; /* to break out of outer loop */
4110                         break;
4111                     }
4112                     s += UTF8SKIP(s);
4113                 }
4114             }
4115             else {
4116                 while (s <= last1) {
4117                     if (regtry(reginfo, &s))
4118                         goto got_it;
4119                     s++;
4120                 }
4121             }
4122         }
4123         DEBUG_EXECUTE_r(if (!did_match) {
4124             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4125                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4126             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
4127                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
4128                                ? "anchored" : "floating"),
4129                 quoted, RE_SV_TAIL(must));
4130         });
4131         goto phooey;
4132     }
4133     else if ( (c = progi->regstclass) ) {
4134         if (minlen) {
4135             const OPCODE op = OP(progi->regstclass);
4136             /* don't bother with what can't match */
4137             if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4138                 strend = HOPc(strend, -(minlen - 1));
4139         }
4140         DEBUG_EXECUTE_r({
4141             SV * const prop = sv_newmortal();
4142             regprop(prog, prop, c, reginfo, NULL);
4143             {
4144                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4145                     s,strend-s,PL_dump_re_max_len);
4146                 Perl_re_printf( aTHX_
4147                     "Matching stclass %.*s against %s (%d bytes)\n",
4148                     (int)SvCUR(prop), SvPVX_const(prop),
4149                      quoted, (int)(strend - s));
4150             }
4151         });
4152         if (find_byclass(prog, c, s, strend, reginfo))
4153             goto got_it;
4154         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4155     }
4156     else {
4157         dontbother = 0;
4158         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4159             /* Trim the end. */
4160             char *last= NULL;
4161             SV* float_real;
4162             STRLEN len;
4163             const char *little;
4164 
4165             if (utf8_target) {
4166                 if (! prog->float_utf8) {
4167                     to_utf8_substr(prog);
4168                 }
4169                 float_real = prog->float_utf8;
4170             }
4171             else {
4172                 if (! prog->float_substr) {
4173                     if (! to_byte_substr(prog)) {
4174                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4175                     }
4176                 }
4177                 float_real = prog->float_substr;
4178             }
4179 
4180             little = SvPV_const(float_real, len);
4181             if (SvTAIL(float_real)) {
4182                     /* This means that float_real contains an artificial \n on
4183                      * the end due to the presence of something like this:
4184                      * /foo$/ where we can match both "foo" and "foo\n" at the
4185                      * end of the string.  So we have to compare the end of the
4186                      * string first against the float_real without the \n and
4187                      * then against the full float_real with the string.  We
4188                      * have to watch out for cases where the string might be
4189                      * smaller than the float_real or the float_real without
4190                      * the \n. */
4191                     char *checkpos= strend - len;
4192                     DEBUG_OPTIMISE_r(
4193                         Perl_re_printf( aTHX_
4194                             "%sChecking for float_real.%s\n",
4195                             PL_colors[4], PL_colors[5]));
4196                     if (checkpos + 1 < strbeg) {
4197                         /* can't match, even if we remove the trailing \n
4198                          * string is too short to match */
4199                         DEBUG_EXECUTE_r(
4200                             Perl_re_printf( aTHX_
4201                                 "%sString shorter than required trailing substring, cannot match.%s\n",
4202                                 PL_colors[4], PL_colors[5]));
4203                         goto phooey;
4204                     } else if (memEQ(checkpos + 1, little, len - 1)) {
4205                         /* can match, the end of the string matches without the
4206                          * "\n" */
4207                         last = checkpos + 1;
4208                     } else if (checkpos < strbeg) {
4209                         /* cant match, string is too short when the "\n" is
4210                          * included */
4211                         DEBUG_EXECUTE_r(
4212                             Perl_re_printf( aTHX_
4213                                 "%sString does not contain required trailing substring, cannot match.%s\n",
4214                                 PL_colors[4], PL_colors[5]));
4215                         goto phooey;
4216                     } else if (!multiline) {
4217                         /* non multiline match, so compare with the "\n" at the
4218                          * end of the string */
4219                         if (memEQ(checkpos, little, len)) {
4220                             last= checkpos;
4221                         } else {
4222                             DEBUG_EXECUTE_r(
4223                                 Perl_re_printf( aTHX_
4224                                     "%sString does not contain required trailing substring, cannot match.%s\n",
4225                                     PL_colors[4], PL_colors[5]));
4226                             goto phooey;
4227                         }
4228                     } else {
4229                         /* multiline match, so we have to search for a place
4230                          * where the full string is located */
4231                         goto find_last;
4232                     }
4233             } else {
4234                   find_last:
4235                     if (len)
4236                         last = rninstr(s, strend, little, little + len);
4237                     else
4238                         last = strend;	/* matching "$" */
4239             }
4240             if (!last) {
4241                 /* at one point this block contained a comment which was
4242                  * probably incorrect, which said that this was a "should not
4243                  * happen" case.  Even if it was true when it was written I am
4244                  * pretty sure it is not anymore, so I have removed the comment
4245                  * and replaced it with this one. Yves */
4246                 DEBUG_EXECUTE_r(
4247                     Perl_re_printf( aTHX_
4248                         "%sString does not contain required substring, cannot match.%s\n",
4249                         PL_colors[4], PL_colors[5]
4250                     ));
4251                 goto phooey;
4252             }
4253             dontbother = strend - last + prog->float_min_offset;
4254         }
4255         if (minlen && (dontbother < minlen))
4256             dontbother = minlen - 1;
4257         strend -= dontbother; 		   /* this one's always in bytes! */
4258         /* We don't know much -- general case. */
4259         if (utf8_target) {
4260             for (;;) {
4261                 if (regtry(reginfo, &s))
4262                     goto got_it;
4263                 if (s >= strend)
4264                     break;
4265                 s += UTF8SKIP(s);
4266             };
4267         }
4268         else {
4269             do {
4270                 if (regtry(reginfo, &s))
4271                     goto got_it;
4272             } while (s++ < strend);
4273         }
4274     }
4275 
4276     /* Failure. */
4277     goto phooey;
4278 
4279   got_it:
4280     /* s/// doesn't like it if $& is earlier than where we asked it to
4281      * start searching (which can happen on something like /.\G/) */
4282     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4283             && (RXp_OFFS_START(prog,0) < stringarg - strbeg))
4284     {
4285         /* this should only be possible under \G */
4286         assert(prog->intflags & PREGf_GPOS_SEEN);
4287         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4288             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4289         goto phooey;
4290     }
4291 
4292     /* clean up; this will trigger destructors that will free all slabs
4293      * above the current one, and cleanup the regmatch_info_aux
4294      * and regmatch_info_aux_eval sructs */
4295 
4296     LEAVE_SCOPE(oldsave);
4297 
4298     if (RXp_PAREN_NAMES(prog))
4299         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4300 
4301     /* make sure $`, $&, $', and $digit will work later */
4302     if ( !(flags & REXEC_NOT_FIRST) )
4303         S_reg_set_capture_string(aTHX_ rx,
4304                                     strbeg, reginfo->strend,
4305                                     sv, flags, utf8_target);
4306 
4307     return 1;
4308 
4309   phooey:
4310     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4311                           PL_colors[4], PL_colors[5]));
4312 
4313     if (swap) {
4314         /* we failed :-( roll it back.
4315          * Since the swap buffer will be freed on scope exit which follows
4316          * shortly, restore the old captures by copying 'swap's original
4317          * data to the new offs buffer
4318          */
4319         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4320             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4321             0,
4322             PTR2UV(prog),
4323             PTR2UV(RXp_OFFSp(prog)),
4324             PTR2UV(swap)
4325         ));
4326 
4327         Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair);
4328     }
4329 
4330     /* clean up; this will trigger destructors that will free all slabs
4331      * above the current one, and cleanup the regmatch_info_aux
4332      * and regmatch_info_aux_eval sructs */
4333 
4334     LEAVE_SCOPE(oldsave);
4335 
4336     return 0;
4337 }
4338 
4339 
4340 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4341  * Do inc before dec, in case old and new rex are the same */
4342 #define SET_reg_curpm(Re2)                          \
4343     if (reginfo->info_aux_eval) {                   \
4344         (void)ReREFCNT_inc(Re2);		    \
4345         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
4346         PM_SETRE((PL_reg_curpm), (Re2));	    \
4347     }
4348 
4349 
4350 /*
4351  - regtry - try match at specific point
4352  */
4353 STATIC bool			/* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4354 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4355 {
4356     CHECKPOINT lastcp;
4357     REGEXP *const rx = reginfo->prog;
4358     regexp *const prog = ReANY(rx);
4359     SSize_t result;
4360 #ifdef DEBUGGING
4361     U32 depth = 0; /* used by REGCP_SET */
4362 #endif
4363     RXi_GET_DECL(prog,progi);
4364     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4365 
4366     PERL_ARGS_ASSERT_REGTRY;
4367 
4368     reginfo->cutpoint=NULL;
4369 
4370     RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg;
4371     RXp_LASTPAREN(prog) = 0;
4372     RXp_LASTCLOSEPAREN(prog) = 0;
4373 
4374     /* XXXX What this code is doing here?!!!  There should be no need
4375        to do this again and again, RXp_LASTPAREN(prog) should take care of
4376        this!  --ilya*/
4377 
4378     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4379      * Actually, the code in regcppop() (which Ilya may be meaning by
4380      * RXp_LASTPAREN(prog)), is not needed at all by the test suite
4381      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4382      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4383      * Meanwhile, this code *is* needed for the
4384      * above-mentioned test suite tests to succeed.  The common theme
4385      * on those tests seems to be returning null fields from matches.
4386      * --jhi updated by dapm */
4387 
4388     /* After encountering a variant of the issue mentioned above I think
4389      * the point Ilya was making is that if we properly unwind whenever
4390      * we set lastparen to a smaller value then we should not need to do
4391      * this every time, only when needed. So if we have tests that fail if
4392      * we remove this, then it suggests somewhere else we are improperly
4393      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4394      * places it is called, and related regcp() routines. - Yves */
4395 #if 1
4396     if (prog->nparens) {
4397         regexp_paren_pair *pp = RXp_OFFSp(prog);
4398         I32 i;
4399         for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) {
4400             ++pp;
4401             pp->start = -1;
4402             pp->end = -1;
4403         }
4404     }
4405 #endif
4406     REGCP_SET(lastcp);
4407     result = regmatch(reginfo, *startposp, progi->program + 1);
4408     if (result != -1) {
4409         RXp_OFFSp(prog)[0].end = result;
4410         return 1;
4411     }
4412     if (reginfo->cutpoint)
4413         *startposp= reginfo->cutpoint;
4414     REGCP_UNWIND(lastcp);
4415     return 0;
4416 }
4417 
4418 /* this is used to determine how far from the left messages like
4419    'failed...' are printed in regexec.c. It should be set such that
4420    messages are inline with the regop output that created them.
4421 */
4422 #define REPORT_CODE_OFF 29
4423 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4424 #ifdef DEBUGGING
4425 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4426 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4427 {
4428     va_list ap;
4429     int result;
4430     PerlIO *f= Perl_debug_log;
4431     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4432     va_start(ap, depth);
4433     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4434     result = PerlIO_vprintf(f, fmt, ap);
4435     va_end(ap);
4436     return result;
4437 }
4438 #endif /* DEBUGGING */
4439 
4440 /* grab a new slab and return the first slot in it */
4441 
4442 STATIC regmatch_state *
S_push_slab(pTHX)4443 S_push_slab(pTHX)
4444 {
4445     regmatch_slab *s = PL_regmatch_slab->next;
4446     if (!s) {
4447         Newx(s, 1, regmatch_slab);
4448         s->prev = PL_regmatch_slab;
4449         s->next = NULL;
4450         PL_regmatch_slab->next = s;
4451     }
4452     PL_regmatch_slab = s;
4453     return SLAB_FIRST(s);
4454 }
4455 
4456 #ifdef DEBUGGING
4457 
4458 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4459 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4460     const char *start, const char *end, const char *blurb)
4461 {
4462     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4463 
4464     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4465 
4466     if (!PL_colorset)
4467             reginitcolors();
4468     {
4469         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4470             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4471 
4472         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4473             start, end - start, PL_dump_re_max_len);
4474 
4475         Perl_re_printf( aTHX_
4476             "%s%s REx%s %s against %s\n",
4477                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4478 
4479         if (utf8_target||utf8_pat)
4480             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4481                 utf8_pat ? "pattern" : "",
4482                 utf8_pat && utf8_target ? " and " : "",
4483                 utf8_target ? "string" : ""
4484             );
4485     }
4486 }
4487 
4488 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4489 S_dump_exec_pos(pTHX_ const char *locinput,
4490                       const regnode *scan,
4491                       const char *loc_regeol,
4492                       const char *loc_bostr,
4493                       const char *loc_reg_starttry,
4494                       const bool utf8_target,
4495                       const U32 depth
4496                 )
4497 {
4498     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4499     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4500     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4501     /* The part of the string before starttry has one color
4502        (pref0_len chars), between starttry and current
4503        position another one (pref_len - pref0_len chars),
4504        after the current position the third one.
4505        We assume that pref0_len <= pref_len, otherwise we
4506        decrease pref0_len.  */
4507     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4508         ? (5 + taill) - l : locinput - loc_bostr;
4509     int pref0_len;
4510 
4511     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4512 
4513     if (utf8_target) {
4514         while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4515             pref_len++;
4516         }
4517     }
4518     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4519     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4520         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4521               ? (5 + taill) - pref_len : loc_regeol - locinput);
4522     if (utf8_target) {
4523         while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4524             l--;
4525         }
4526     }
4527     if (pref0_len < 0)
4528         pref0_len = 0;
4529     if (pref0_len > pref_len)
4530         pref0_len = pref_len;
4531     {
4532         const int is_uni = utf8_target ? 1 : 0;
4533 
4534         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4535             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4536 
4537         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4538                     (locinput - pref_len + pref0_len),
4539                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4540 
4541         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4542                     locinput, loc_regeol - locinput, 10, 0, 1);
4543 
4544         const STRLEN tlen=len0+len1+len2;
4545         Perl_re_printf( aTHX_
4546                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4547                     (IV)(locinput - loc_bostr),
4548                     len0, s0,
4549                     len1, s1,
4550                     (docolor ? "" : "> <"),
4551                     len2, s2,
4552                     (int)(tlen > 19 ? 0 :  19 - tlen),
4553                     "",
4554                     (UV)depth);
4555     }
4556 }
4557 
4558 #endif
4559 
4560 /* reg_check_named_buff_matched()
4561  * Checks to see if a named buffer has matched. The data array of
4562  * buffer numbers corresponding to the buffer is expected to reside
4563  * in the regexp->data->data array in the slot stored in the ARG1u() of
4564  * node involved. Note that this routine doesn't actually care about the
4565  * name, that information is not preserved from compilation to execution.
4566  * Returns the index of the leftmost defined buffer with the given name
4567  * or 0 if non of the buffers matched.
4568  */
4569 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4570 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4571 {
4572     I32 n;
4573     RXi_GET_DECL(rex,rexi);
4574     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
4575     I32 *nums=(I32*)SvPVX(sv_dat);
4576 
4577     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4578 
4579     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4580         if ((I32)RXp_LASTPAREN(rex) >= nums[n] &&
4581             RXp_OFFS_END(rex,nums[n]) != -1)
4582         {
4583             return nums[n];
4584         }
4585     }
4586     return 0;
4587 }
4588 
4589 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4590 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4591                           struct next_matchable_info * m,
4592                           regmatch_info *reginfo)
4593 {
4594     /* This function determines various characteristics about every possible
4595      * initial match of the passed-in EXACTish <text_node>, and stores them in
4596      * <*m>.
4597      *
4598      * That includes a match string and a parallel mask, such that if you AND
4599      * the target string with the mask and compare with the match string,
4600      * you'll have a pretty good idea, perhaps even perfect, if that portion of
4601      * the target matches or not.
4602      *
4603      * The motivation behind this function is to allow the caller to set up
4604      * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4605      * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4606      * beginning oF B, which is the passed in <text_node>  That's where this
4607      * function comes in.  The values it returns can quickly be used to rule
4608      * out many, or all, cases of possible matches not actually being the
4609      * beginning of B, <text_node>.  It is also used in regrepeat() where we
4610      * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4611      * determine where the span of 'A's stop.
4612      *
4613      * If <text_node> is of type EXACT, there is only one possible character
4614      * that can match its first character, and so the situation is quite
4615      * simple.  But things can get much more complicated if folding is
4616      * involved.  It may be that the first character of an EXACTFish node
4617      * doesn't participate in any possible fold, e.g., punctuation, so it can
4618      * be matched only by itself.  The vast majority of characters that are in
4619      * folds match just two things, their lower and upper-case equivalents.
4620      * But not all are like that; some have multiple possible matches, or match
4621      * sequences of more than one character.  This function sorts all that out.
4622      *
4623      * It returns information about all possibilities of what the first
4624      * character(s) of <text_node> could look like.  Again, if <text_node> is a
4625      * plain EXACT node, that's just the actual first bytes of the first
4626      * character; but otherwise it is the bytes, that when masked, match all
4627      * possible combinations of all the initial bytes of all the characters
4628      * that could match, folded.  (Actually, this is a slight over promise.  It
4629      * handles only up to the initial 5 bytes, which is enough for all Unicode
4630      * characters, but not for all non-Unicode ones.)
4631      *
4632      * Here's an example to clarify.  Suppose the first character of
4633      * <text_node> is the letter 'C', and we are under /i matching.  That means
4634      * 'c' also matches.  The representations of these two characters differ in
4635      * just one bit, so the mask would be a zero in that position and ones in
4636      * the other 7.  And the returned string would be the AND of these two
4637      * characters, and would be one byte long, since these characters are each
4638      * a single byte.  ANDing the target <text_node> with this mask will yield
4639      * the returned string if and only if <text_node> begins with one of these
4640      * two characters.  So, the function would also return that the definitive
4641      * length matched is 1 byte.
4642      *
4643      * Now, suppose instead of the letter 'C',  <text_node> begins with the
4644      * letter 'F'.  The situation is much more complicated because there are
4645      * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4646      * begins with 'f', and hence could match.  We add these into the returned
4647      * string and mask, but the result isn't definitive; the caller has to
4648      * check further if its AND and compare pass.  But the failure of that
4649      * compare will quickly rule out most possible inputs.
4650      *
4651      * Much of this could be done in regcomp.c at compile time, except for
4652      * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4653      * could be used for one or the other eventualities.
4654      *
4655      * If this function determines that no possible character in the target
4656      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4657      * situation occurs if the first character in <text_node> requires UTF-8 to
4658      * represent, and the target string isn't in UTF-8.)
4659      *
4660      * Some analysis is in GH #18414, located at the time of this writing at:
4661      * https://github.com/Perl/perl5/issues/18414
4662      */
4663 
4664     const bool utf8_target = reginfo->is_utf8_target;
4665     bool utf8_pat = reginfo->is_utf8_pat;
4666 
4667     PERL_UINT_FAST8_T i;
4668 
4669     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4670      */
4671     U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4672     U8 lengths[MAX_MATCHES] = { 0 };
4673 
4674     U8 index_of_longest = 0;
4675 
4676     U8 *pat = (U8*)STRING(text_node);
4677     Size_t pat_len = STR_LEN(text_node);
4678     U8 op = OP(text_node);
4679 
4680     U8 byte_mask[5]  = {0};
4681     U8 byte_anded[5] = {0};
4682 
4683     /* There are some folds in Unicode to multiple characters.  This will hold
4684      * such characters that could fold to the beginning of 'text_node' */
4685     UV multi_fold_from = 0;
4686 
4687     /* We may have to create a modified copy of the pattern */
4688     U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4689 
4690     m->max_length = 0;
4691     m->min_length = 255;
4692     m->count = 0;
4693 
4694     /* Even if the first character in the node can match something in Latin1,
4695      * if there is anything in the node that can't, the match must fail */
4696     if (! utf8_target && isEXACT_REQ8(op)) {
4697         return FALSE;
4698     }
4699 
4700 /* Define a temporary op for use in this function, using an existing one that
4701  * should never be a real op during execution */
4702 #define TURKISH  PSEUDO
4703 
4704     /* What to do about these two nodes had to be deferred to runtime (which is
4705      * now).  If the extra information we now have so indicates, turn them into
4706      * EXACTFU nodes */
4707     if (   (op == EXACTF && utf8_target)
4708         || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4709     {
4710         if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) {
4711             op = TURKISH;
4712         }
4713         else {
4714             op = EXACTFU;
4715         }
4716 
4717         /* And certain situations are better handled if we create a modified
4718          * version of the pattern */
4719         if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4720                            specific problematic characters */
4721             if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4722 
4723                 /* The node could start with characters that are the first ones
4724                  * of a multi-character fold. */
4725                 multi_fold_from
4726                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4727                 if (multi_fold_from) {
4728 
4729                     /* Here, they do form a sequence that matches the fold of a
4730                      * single character.  That single character then is a
4731                      * possible match.  Below we will look again at this, but
4732                      * the code below is expecting every character in the
4733                      * pattern to be folded, which the input isn't required to
4734                      * be in this case.  So, just fold the single character,
4735                      * and the result will be in the expected form. */
4736                     _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4737                                        FOLD_FLAGS_FULL);
4738                     pat = mod_pat;
4739                 }
4740                          /* Turkish has a couple extra possibilities. */
4741                 else if (   UNLIKELY(op == TURKISH)
4742                          &&  pat_len >= 3
4743                          &&  isALPHA_FOLD_EQ(pat[0], 'f')
4744                          && (   memBEGINs(pat + 1, pat_len - 1,
4745                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4746                              || (   pat_len >= 4
4747                                  && isALPHA_FOLD_EQ(pat[1], 'f')
4748                                  && memBEGINs(pat + 2, pat_len - 2,
4749                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4750                 ))) {
4751                     /* The macros for finding a multi-char fold don't include
4752                      * the Turkish possibilities, in which U+130 folds to 'i'.
4753                      * Hard-code these.  It's very unlikely that Unicode will
4754                      * ever add any others.  */
4755                     if (pat[1] == 'f') {
4756                         pat_len = 3;
4757                         Copy("ffi", mod_pat, pat_len, U8);
4758                     }
4759                     else {
4760                         pat_len = 2;
4761                         Copy("fi", mod_pat, pat_len, U8);
4762                     }
4763                     pat = mod_pat;
4764                 }
4765                 else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4766                          &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4767                          &&  LIKELY(memNEs(pat, pat_len,
4768                                            LATIN_SMALL_LETTER_SHARP_S_UTF8))
4769                          && (LIKELY(op != TURKISH || *pat != 'I')))
4770                 {
4771                     /* For all cases of things between 0-255, except the ones
4772                      * in the conditional above, the fold is just the lower
4773                      * case, which is faster than the more general case. */
4774                     mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4775                                                                      pat[1]));
4776                     pat_len = 1;
4777                     pat = mod_pat;
4778                     utf8_pat = FALSE;
4779                 }
4780                 else {  /* Code point above 255, or needs special handling */
4781                     _to_utf8_fold_flags(pat, pat + pat_len,
4782                                         mod_pat, &pat_len,
4783                                         FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4784                     pat = mod_pat;
4785                 }
4786             }
4787         }
4788         else if /* Below is not a UTF-8 pattern; there's a somewhat different
4789                    set of problematic characters */
4790                 ((multi_fold_from
4791                           = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4792         {
4793             /* We may have to canonicalize a multi-char fold, as in the UTF-8
4794              * case */
4795             _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4796                                FOLD_FLAGS_FULL);
4797             pat = mod_pat;
4798         }
4799         else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4800             mod_pat[0] = mod_pat[1] = 's';
4801             pat_len = 2;
4802             utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4803                                        chars, and speeds copying */
4804             pat = mod_pat;
4805         }
4806         else if (LIKELY(op != TURKISH || *pat != 'I')) {
4807             mod_pat[0] = toLOWER_L1(*pat);
4808             pat_len = 1;
4809             pat = mod_pat;
4810         }
4811     }
4812     else if /* Below isn't a node that we convert to UTF-8 */
4813             (     utf8_target
4814              && ! utf8_pat
4815              &&   op == EXACTFAA_NO_TRIE
4816              &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4817     {
4818         /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4819          * did this at compile time when the pattern was UTF-8 , but otherwise
4820          * we couldn't do it earlier, because it requires a UTF-8 target for
4821          * this match to be legal. */
4822         pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4823         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4824              LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4825         pat = mod_pat;
4826         utf8_pat = TRUE;
4827     }
4828 
4829     /* Here, we have taken care of the initial work for a few very problematic
4830      * situations, possibly creating a modified pattern.
4831      *
4832      * Now ready for the general case.  We build up all the possible things
4833      * that could match the first character of the pattern into the elements of
4834      * 'matches[]'
4835      *
4836      * Everything generally matches at least itself.  But if there is a
4837      * UTF8ness mismatch, we have to convert to that of the target string. */
4838     if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4839         matches[0][0] = pat[0];
4840         lengths[0] = 1;
4841         m->count++;
4842     }
4843     else if (utf8_target) {
4844         if (utf8_pat) {
4845             lengths[0] = UTF8SKIP(pat);
4846             Copy(pat, matches[0], lengths[0], U8);
4847             m->count++;
4848         }
4849         else {  /* target is UTF-8, pattern isn't */
4850             matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4851             matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4852             lengths[0] = 2;
4853             m->count++;
4854         }
4855     }
4856     else if (! utf8_pat) {  /* Neither is UTF-8 */
4857         matches[0][0] = pat[0];
4858         lengths[0] = 1;
4859         m->count++;
4860     }
4861     else     /* target isn't UTF-8; pattern is.  No match possible unless the
4862                 pattern's first character can fit in a byte */
4863          if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4864     {
4865         matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4866         lengths[0] = 1;
4867         m->count++;
4868     }
4869 
4870     /* Here we have taken care of any necessary node-type changes */
4871 
4872     if (m->count) {
4873         m->max_length = lengths[0];
4874         m->min_length = lengths[0];
4875     }
4876 
4877     /* For non-folding nodes, there are no other possible candidate matches,
4878      * but for foldable ones, we have to look further. */
4879     if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4880         UV folded;  /* The first character in the pattern, folded */
4881         U32 first_fold_from;    /* A character that folds to it */
4882         const U32 * remaining_fold_froms;   /* The remaining characters that
4883                                                fold to it, if any */
4884         Size_t folds_to_count;  /* The total number of characters that fold to
4885                                    'folded' */
4886 
4887         /* If the node begins with a sequence of more than one character that
4888          * together form the fold of a single character, it is called a
4889          * 'multi-character fold', and the normal functions don't handle this
4890          * case.  We set 'multi_fold_from' to the single folded-from character,
4891          * which is handled in an extra iteration below */
4892         if (utf8_pat) {
4893             folded = valid_utf8_to_uvchr(pat, NULL);
4894             multi_fold_from
4895                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4896         }
4897         else {
4898             folded = *pat;
4899 
4900             /* This may generate illegal combinations for things like EXACTF,
4901              * but rather than repeat the logic and exclude them here, all such
4902              * illegalities are checked for and skipped below in the loop */
4903             multi_fold_from
4904                         = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4905         }
4906 
4907         /* Everything matches at least itself; initialize to that because the
4908          * only the branches below that set it are the ones where the number
4909          * isn't 1. */
4910         folds_to_count = 1;
4911 
4912         /* There are a few special cases for locale-dependent nodes, where the
4913          * run-time context was needed before we could know what matched */
4914         if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4915             first_fold_from = PL_fold_locale[folded];
4916         }
4917         else if (   op == EXACTFL && utf8_target && utf8_pat
4918                  && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4919                                             LATIN_SMALL_LETTER_LONG_S_UTF8))
4920         {
4921             first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4922         }
4923         else if (UNLIKELY(    op == TURKISH
4924                           && (   isALPHA_FOLD_EQ(folded, 'i')
4925                               || inRANGE(folded,
4926                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4927                                          LATIN_SMALL_LETTER_DOTLESS_I))))
4928         {   /* Turkish folding requires special handling */
4929             if (folded == 'i')
4930                 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4931             else if (folded == 'I')
4932                 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4933             else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4934                 first_fold_from = 'i';
4935             else first_fold_from = 'I';
4936         }
4937         else {
4938             /* Here, isn't a special case: use the generic function to
4939              * calculate what folds to this */
4940           redo_multi:
4941             /* Look up what code points (besides itself) fold to 'folded';
4942              * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4943             folds_to_count = _inverse_folds(folded, &first_fold_from,
4944                                                        &remaining_fold_froms);
4945         }
4946 
4947         /* Add each character that folds to 'folded' to the list of them,
4948          * subject to limitations based on the node type and target UTF8ness.
4949          * If there was a character that folded to multiple characters, do an
4950          * extra iteration for it.  (Note the extra iteration if there is a
4951          * multi-character fold) */
4952         for (i = 0; i < folds_to_count
4953                       + UNLIKELY(multi_fold_from != 0); i++)
4954         {
4955             UV fold_from = 0;
4956 
4957             if (i >= folds_to_count) {  /* Final iteration: handle the
4958                                            multi-char */
4959                 fold_from = multi_fold_from;
4960             }
4961             else if (i == 0) {
4962                 fold_from = first_fold_from;
4963             }
4964             else if (i < folds_to_count) {
4965                 fold_from = remaining_fold_froms[i-1];
4966             }
4967 
4968             if (folded == fold_from) {  /* We already added the character
4969                                            itself */
4970                 continue;
4971             }
4972 
4973             /* EXACTF doesn't have any non-ascii folds */
4974             if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4975                 continue;
4976             }
4977 
4978             /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4979              * */
4980             if (    isASCII(folded) != isASCII(fold_from)
4981                 &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4982 
4983             {
4984                 continue;
4985             }
4986 
4987             /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4988              * locale, but those have been converted to EXACTFU above) */
4989             if (   op == EXACTFL
4990                 && (folded < 256) != (fold_from < 256))
4991             {
4992                 continue;
4993             }
4994 
4995             /* If this triggers, it likely is because of the unlikely case
4996              * where a new Unicode standard has changed what MAX_MATCHES should
4997              * be set to */
4998             assert(m->count < MAX_MATCHES);
4999 
5000             /* Add this character to the list of possible matches */
5001             if (utf8_target) {
5002                 uvchr_to_utf8(matches[(U8) m->count], fold_from);
5003                 lengths[m->count] = UVCHR_SKIP(fold_from);
5004                 m->count++;
5005             }
5006             else { /* Non-UTF8 target: no code point above 255 can appear in it
5007                     */
5008                 if (fold_from > 255) {
5009                     continue;
5010                 }
5011 
5012                 matches[m->count][0] = fold_from;
5013                 lengths[m->count] = 1;
5014                 m->count++;
5015             }
5016 
5017             /* Update min and mlengths */
5018             if (m->min_length > lengths[m->count-1]) {
5019                 m->min_length = lengths[m->count-1];
5020             }
5021 
5022             if (m->max_length < lengths[m->count-1]) {
5023                 index_of_longest = m->count - 1;
5024                 m->max_length = lengths[index_of_longest];
5025             }
5026         } /* looped through each potential fold */
5027 
5028         /* If there is something that folded to an initial multi-character
5029          * fold, repeat, using it.  This catches some edge cases.  An example
5030          * of one is /ss/i when UTF-8 encoded.  The function
5031          * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
5032          * (LATIN SMALL SHARP S).  If it returned a list of characters, this
5033          * code wouldn't be needed.  But since it doesn't, we have to look what
5034          * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
5035          * */
5036         if (multi_fold_from) {
5037             folded = multi_fold_from;
5038             multi_fold_from = 0;
5039             goto redo_multi;
5040         }
5041     } /* End of finding things that participate in this fold */
5042 
5043     if (m->count == 0) {    /* If nothing found, can't match */
5044         m->min_length = 0;
5045         return FALSE;
5046     }
5047 
5048     /* Have calculated all possible matches.  Now calculate the mask and AND
5049      * values */
5050     m->initial_exact = 0;
5051     m->initial_definitive = 0;
5052 
5053     {
5054         unsigned int mask_ones = 0;
5055         unsigned int possible_ones = 0;
5056         U8 j;
5057 
5058         /* For each byte that is in all possible matches ... */
5059         for (j = 0; j < MIN(m->min_length, 5); j++) {
5060 
5061             /* Initialize the accumulator for this byte */
5062             byte_mask[j] = 0xFF;
5063             byte_anded[j] = matches[0][j];
5064 
5065             /* Then the rest of the rows (folds).  The mask is based on, like,
5066              * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
5067              * where they differ. */
5068             for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
5069                 byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
5070                 byte_anded[j] &= matches[i][j];
5071             }
5072 
5073             /* Keep track of the number of initial mask bytes that are all one
5074              * bits.  The code calling this can use this number to know that
5075              * a string that matches this number of bytes in the pattern is an
5076              * exact match of that pattern for this number of bytes.  But also
5077              * counted are the number of initial bytes that in total have a
5078              * single zero bit.  If a string matches those, masked, it must be
5079              * one of two possibilites, both of which this function has
5080              * determined are legal.  (But if that single 0 is one of the
5081              * initial bits for masking a UTF-8 start byte, that could
5082              * incorrectly lead to different length strings appearing to be
5083              * equivalent, so only do this optimization when the matchables are
5084              * all the same length.  This was uncovered by testing
5085              * /\x{029E}/i.) */
5086             if (m->min_length == m->max_length) {
5087                 mask_ones += PL_bitcount[byte_mask[j]];
5088                 possible_ones += 8;
5089                 if (mask_ones + 1 >= possible_ones) {
5090                     m->initial_definitive++;
5091                     if (mask_ones >= possible_ones) {
5092                         m->initial_exact++;
5093                     }
5094                 }
5095             }
5096         }
5097     }
5098 
5099     /* The first byte is separate for speed */
5100     m->first_byte_mask = byte_mask[0];
5101     m->first_byte_anded = byte_anded[0];
5102 
5103     /* Then pack up to the next 4 bytes into a word */
5104     m->mask32 = m->anded32 = 0;
5105     for (i = 1; i < MIN(m->min_length, 5); i++) {
5106         U8 which = i;
5107         U8 shift = (which - 1) * 8;
5108         m->mask32  |= (U32) byte_mask[i]  << shift;
5109         m->anded32 |= (U32) byte_anded[i] << shift;
5110     }
5111 
5112     /* Finally, take the match strings and place them sequentially into a
5113      * one-dimensional array.  (This is done to save significant space in the
5114      * structure.) Sort so the longest (presumably the least likely) is last.
5115      * XXX When this gets moved to regcomp, may want to fully sort shortest
5116      * first, but above we generally used the folded code point first, and
5117      * those tend to be no longer than their upper case values, so this is
5118      * already pretty well sorted by size.
5119      *
5120      * If the asserts fail, it's most likely because a new version of the
5121      * Unicode standard requires more space; simply increase the declaration
5122      * size. */
5123     {
5124         U8 cur_pos = 0;
5125         U8 output_index = 0;
5126 
5127         if (m->count > 1) { /* No need to sort a single entry */
5128             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5129 
5130                 /* Keep the same order for all but the longest.  (If the
5131                  * asserts fail, it could be because m->matches is declared too
5132                  * short, either because of a new Unicode release, or an
5133                  * overlooked test case, or it could be a bug.) */
5134                 if (i != index_of_longest) {
5135                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5136                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5137                     cur_pos += lengths[i];
5138                     m->lengths[output_index++] = lengths[i];
5139                 }
5140             }
5141         }
5142 
5143         assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5144         Copy(matches[index_of_longest], m->matches + cur_pos,
5145              lengths[index_of_longest], U8);
5146 
5147         /* Place the longest match last */
5148         m->lengths[output_index] = lengths[index_of_longest];
5149     }
5150 
5151 
5152     return TRUE;
5153 }
5154 
5155 PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5156 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5157 S_test_EXACTISH_ST(const char * loc,
5158                    struct next_matchable_info info)
5159 {
5160     /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5161      * bytes starting at 'loc' can match based on 'next_matchable_info' */
5162 
5163     U32 input32 = 0;
5164 
5165     /* Check the first byte */
5166     if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5167         return FALSE;
5168 
5169     /* Pack the next up-to-4 bytes into a 32 bit word */
5170     switch (info.min_length) {
5171         default:
5172             input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5173             /* FALLTHROUGH */
5174         case 4:
5175             input32 |= (U8) loc[3] << 2 * 8;
5176             /* FALLTHROUGH */
5177         case 3:
5178             input32 |= (U8) loc[2] << 1 * 8;
5179             /* FALLTHROUGH */
5180         case 2:
5181             input32 |= (U8) loc[1];
5182             break;
5183         case 1:
5184             return TRUE;    /* We already tested and passed the 0th byte */
5185         case 0:
5186             ASSUME(0);
5187     }
5188 
5189     /* And AND that with the mask and compare that with the assembled ANDED
5190      * values */
5191     return (input32 & info.mask32) == info.anded32;
5192 }
5193 
5194 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5195 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5196 {
5197     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5198      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5199 
5200     PERL_ARGS_ASSERT_ISGCB;
5201 
5202     switch (GCB_table[before][after]) {
5203         case GCB_BREAKABLE:
5204             return TRUE;
5205 
5206         case GCB_NOBREAK:
5207             return FALSE;
5208 
5209         case GCB_RI_then_RI:
5210             {
5211                 int RI_count = 1;
5212                 U8 * temp_pos = (U8 *) curpos;
5213 
5214                 /* Do not break within emoji flag sequences. That is, do not
5215                  * break between regional indicator (RI) symbols if there is an
5216                  * odd number of RI characters before the break point.
5217                  *  GB12   sot (RI RI)* RI × RI
5218                  *  GB13 [^RI] (RI RI)* RI × RI */
5219 
5220                 while (backup_one_GCB(strbeg,
5221                                     &temp_pos,
5222                                     utf8_target) == GCB_Regional_Indicator)
5223                 {
5224                     RI_count++;
5225                 }
5226 
5227                 return RI_count % 2 != 1;
5228             }
5229 
5230         case GCB_EX_then_EM:
5231 
5232             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
5233             {
5234                 U8 * temp_pos = (U8 *) curpos;
5235                 GCB_enum prev;
5236 
5237                 do {
5238                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5239                 }
5240                 while (prev == GCB_Extend);
5241 
5242                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5243             }
5244 
5245         case GCB_Maybe_Emoji_NonBreak:
5246 
5247             {
5248 
5249             /* Do not break within emoji modifier sequences or emoji zwj sequences.
5250               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5251               */
5252                 U8 * temp_pos = (U8 *) curpos;
5253                 GCB_enum prev;
5254 
5255                 do {
5256                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5257                 }
5258                 while (prev == GCB_Extend);
5259 
5260                 return prev != GCB_ExtPict_XX;
5261             }
5262 
5263         default:
5264             break;
5265     }
5266 
5267 #ifdef DEBUGGING
5268     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5269                                   before, after, GCB_table[before][after]);
5270     assert(0);
5271 #endif
5272     return TRUE;
5273 }
5274 
5275 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5276 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5277 {
5278     GCB_enum gcb;
5279 
5280     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5281 
5282     if (*curpos < strbeg) {
5283         return GCB_EDGE;
5284     }
5285 
5286     if (utf8_target) {
5287         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5288         U8 * prev_prev_char_pos;
5289 
5290         if (! prev_char_pos) {
5291             return GCB_EDGE;
5292         }
5293 
5294         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5295             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5296             *curpos = prev_char_pos;
5297             prev_char_pos = prev_prev_char_pos;
5298         }
5299         else {
5300             *curpos = (U8 *) strbeg;
5301             return GCB_EDGE;
5302         }
5303     }
5304     else {
5305         if (*curpos - 2 < strbeg) {
5306             *curpos = (U8 *) strbeg;
5307             return GCB_EDGE;
5308         }
5309         (*curpos)--;
5310         gcb = getGCB_VAL_CP(*(*curpos - 1));
5311     }
5312 
5313     return gcb;
5314 }
5315 
5316 /* Combining marks attach to most classes that precede them, but this defines
5317  * the exceptions (from TR14) */
5318 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5319                                      || prev == LB_Mandatory_Break      \
5320                                      || prev == LB_Carriage_Return      \
5321                                      || prev == LB_Line_Feed            \
5322                                      || prev == LB_Next_Line            \
5323                                      || prev == LB_Space                \
5324                                      || prev == LB_ZWSpace))
5325 
5326 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5327 S_isLB(pTHX_ LB_enum before,
5328              LB_enum after,
5329              const U8 * const strbeg,
5330              const U8 * const curpos,
5331              const U8 * const strend,
5332              const bool utf8_target)
5333 {
5334     U8 * temp_pos = (U8 *) curpos;
5335     LB_enum prev = before;
5336 
5337     /* Is the boundary between 'before' and 'after' line-breakable?
5338      * Most of this is just a table lookup of a generated table from Unicode
5339      * rules.  But some rules require context to decide, and so have to be
5340      * implemented in code */
5341 
5342     PERL_ARGS_ASSERT_ISLB;
5343 
5344     /* Rule numbers in the comments below are as of Unicode 9.0 */
5345 
5346   redo:
5347     before = prev;
5348     switch (LB_table[before][after]) {
5349         case LB_BREAKABLE:
5350             return TRUE;
5351 
5352         case LB_NOBREAK:
5353         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5354             return FALSE;
5355 
5356         case LB_SP_foo + LB_BREAKABLE:
5357         case LB_SP_foo + LB_NOBREAK:
5358         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5359 
5360             /* When we have something following a SP, we have to look at the
5361              * context in order to know what to do.
5362              *
5363              * SP SP should not reach here because LB7: Do not break before
5364              * spaces.  (For two spaces in a row there is nothing that
5365              * overrides that) */
5366             assert(after != LB_Space);
5367 
5368             /* Here we have a space followed by a non-space.  Mostly this is a
5369              * case of LB18: "Break after spaces".  But there are complications
5370              * as the handling of spaces is somewhat tricky.  They are in a
5371              * number of rules, which have to be applied in priority order, but
5372              * something earlier in the string can cause a rule to be skipped
5373              * and a lower priority rule invoked.  A prime example is LB7 which
5374              * says don't break before a space.  But rule LB8 (lower priority)
5375              * says that the first break opportunity after a ZW is after any
5376              * span of spaces immediately after it.  If a ZW comes before a SP
5377              * in the input, rule LB8 applies, and not LB7.  Other such rules
5378              * involve combining marks which are rules 9 and 10, but they may
5379              * override higher priority rules if they come earlier in the
5380              * string.  Since we're doing random access into the middle of the
5381              * string, we have to look for rules that should get applied based
5382              * on both string position and priority.  Combining marks do not
5383              * attach to either ZW nor SP, so we don't have to consider them
5384              * until later.
5385              *
5386              * To check for LB8, we have to find the first non-space character
5387              * before this span of spaces */
5388             do {
5389                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5390             }
5391             while (prev == LB_Space);
5392 
5393             /* LB8 Break before any character following a zero-width space,
5394              * even if one or more spaces intervene.
5395              *      ZW SP* ÷
5396              * So if we have a ZW just before this span, and to get here this
5397              * is the final space in the span. */
5398             if (prev == LB_ZWSpace) {
5399                 return TRUE;
5400             }
5401 
5402             /* Here, not ZW SP+.  There are several rules that have higher
5403              * priority than LB18 and can be resolved now, as they don't depend
5404              * on anything earlier in the string (except ZW, which we have
5405              * already handled).  One of these rules is LB11 Do not break
5406              * before Word joiner, but we have specially encoded that in the
5407              * lookup table so it is caught by the single test below which
5408              * catches the other ones. */
5409             if (LB_table[LB_Space][after] - LB_SP_foo
5410                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5411             {
5412                 return FALSE;
5413             }
5414 
5415             /* If we get here, we have to XXX consider combining marks. */
5416             if (prev == LB_Combining_Mark) {
5417 
5418                 /* What happens with these depends on the character they
5419                  * follow.  */
5420                 do {
5421                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5422                 }
5423                 while (prev == LB_Combining_Mark);
5424 
5425                 /* Most times these attach to and inherit the characteristics
5426                  * of that character, but not always, and when not, they are to
5427                  * be treated as AL by rule LB10. */
5428                 if (! LB_CM_ATTACHES_TO(prev)) {
5429                     prev = LB_Alphabetic;
5430                 }
5431             }
5432 
5433             /* Here, we have the character preceding the span of spaces all set
5434              * up.  We follow LB18: "Break after spaces" unless the table shows
5435              * that is overridden */
5436             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5437 
5438         case LB_CM_ZWJ_foo:
5439 
5440             /* We don't know how to treat the CM except by looking at the first
5441              * non-CM character preceding it.  ZWJ is treated as CM */
5442             do {
5443                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5444             }
5445             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5446 
5447             /* Here, 'prev' is that first earlier non-CM character.  If the CM
5448              * attaches to it, then it inherits the behavior of 'prev'.  If it
5449              * doesn't attach, it is to be treated as an AL */
5450             if (! LB_CM_ATTACHES_TO(prev)) {
5451                 prev = LB_Alphabetic;
5452             }
5453 
5454             goto redo;
5455 
5456         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5457         case LB_HY_or_BA_then_foo + LB_NOBREAK:
5458 
5459             /* LB21a Don't break after Hebrew + Hyphen.
5460              * HL (HY | BA) × */
5461 
5462             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5463                                                           == LB_Hebrew_Letter)
5464             {
5465                 return FALSE;
5466             }
5467 
5468             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5469 
5470         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5471         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5472 
5473             /* LB25a (PR | PO) × ( OP | HY )? NU */
5474             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5475                 return FALSE;
5476             }
5477 
5478             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5479                                                                 == LB_BREAKABLE;
5480 
5481         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5482         case LB_SY_or_IS_then_various + LB_NOBREAK:
5483         {
5484             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5485 
5486             LB_enum temp = prev;
5487             do {
5488                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5489             }
5490             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5491             if (temp == LB_Numeric) {
5492                 return FALSE;
5493             }
5494 
5495             return LB_table[prev][after] - LB_SY_or_IS_then_various
5496                                                                == LB_BREAKABLE;
5497         }
5498 
5499         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5500         case LB_various_then_PO_or_PR + LB_NOBREAK:
5501         {
5502             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5503 
5504             LB_enum temp = prev;
5505             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5506             {
5507                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5508             }
5509             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5510                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5511             }
5512             if (temp == LB_Numeric) {
5513                 return FALSE;
5514             }
5515             return LB_various_then_PO_or_PR;
5516         }
5517 
5518         case LB_RI_then_RI + LB_NOBREAK:
5519         case LB_RI_then_RI + LB_BREAKABLE:
5520             {
5521                 int RI_count = 1;
5522 
5523                 /* LB30a Break between two regional indicator symbols if and
5524                  * only if there are an even number of regional indicators
5525                  * preceding the position of the break.
5526                  *
5527                  *    sot (RI RI)* RI × RI
5528                  *  [^RI] (RI RI)* RI × RI */
5529 
5530                 while (backup_one_LB(strbeg,
5531                                      &temp_pos,
5532                                      utf8_target) == LB_Regional_Indicator)
5533                 {
5534                     RI_count++;
5535                 }
5536 
5537                 return RI_count % 2 == 0;
5538             }
5539 
5540         default:
5541             break;
5542     }
5543 
5544 #ifdef DEBUGGING
5545     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5546                                   before, after, LB_table[before][after]);
5547     assert(0);
5548 #endif
5549     return TRUE;
5550 }
5551 
5552 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5553 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5554 {
5555 
5556     LB_enum lb;
5557 
5558     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5559 
5560     if (*curpos >= strend) {
5561         return LB_EDGE;
5562     }
5563 
5564     if (utf8_target) {
5565         *curpos += UTF8SKIP(*curpos);
5566         if (*curpos >= strend) {
5567             return LB_EDGE;
5568         }
5569         lb = getLB_VAL_UTF8(*curpos, strend);
5570     }
5571     else {
5572         (*curpos)++;
5573         if (*curpos >= strend) {
5574             return LB_EDGE;
5575         }
5576         lb = getLB_VAL_CP(**curpos);
5577     }
5578 
5579     return lb;
5580 }
5581 
5582 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5583 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5584 {
5585     LB_enum lb;
5586 
5587     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5588 
5589     if (*curpos < strbeg) {
5590         return LB_EDGE;
5591     }
5592 
5593     if (utf8_target) {
5594         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5595         U8 * prev_prev_char_pos;
5596 
5597         if (! prev_char_pos) {
5598             return LB_EDGE;
5599         }
5600 
5601         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5602             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5603             *curpos = prev_char_pos;
5604             prev_char_pos = prev_prev_char_pos;
5605         }
5606         else {
5607             *curpos = (U8 *) strbeg;
5608             return LB_EDGE;
5609         }
5610     }
5611     else {
5612         if (*curpos - 2 < strbeg) {
5613             *curpos = (U8 *) strbeg;
5614             return LB_EDGE;
5615         }
5616         (*curpos)--;
5617         lb = getLB_VAL_CP(*(*curpos - 1));
5618     }
5619 
5620     return lb;
5621 }
5622 
5623 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5624 S_isSB(pTHX_ SB_enum before,
5625              SB_enum after,
5626              const U8 * const strbeg,
5627              const U8 * const curpos,
5628              const U8 * const strend,
5629              const bool utf8_target)
5630 {
5631     /* returns a boolean indicating if there is a Sentence Boundary Break
5632      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5633 
5634     U8 * lpos = (U8 *) curpos;
5635     bool has_para_sep = FALSE;
5636     bool has_sp = FALSE;
5637 
5638     PERL_ARGS_ASSERT_ISSB;
5639 
5640     /* Break at the start and end of text.
5641         SB1.  sot  ÷
5642         SB2.  ÷  eot
5643       But unstated in Unicode is don't break if the text is empty */
5644     if (before == SB_EDGE || after == SB_EDGE) {
5645         return before != after;
5646     }
5647 
5648     /* SB 3: Do not break within CRLF. */
5649     if (before == SB_CR && after == SB_LF) {
5650         return FALSE;
5651     }
5652 
5653     /* Break after paragraph separators.  CR and LF are considered
5654      * so because Unicode views text as like word processing text where there
5655      * are no newlines except between paragraphs, and the word processor takes
5656      * care of wrapping without there being hard line-breaks in the text *./
5657        SB4.  Sep | CR | LF  ÷ */
5658     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5659         return TRUE;
5660     }
5661 
5662     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5663      * (See Section 6.2, Replacing Ignore Rules.)
5664         SB5.  X (Extend | Format)*  →  X */
5665     if (after == SB_Extend || after == SB_Format) {
5666 
5667         /* Implied is that the these characters attach to everything
5668          * immediately prior to them except for those separator-type
5669          * characters.  And the rules earlier have already handled the case
5670          * when one of those immediately precedes the extend char */
5671         return FALSE;
5672     }
5673 
5674     if (before == SB_Extend || before == SB_Format) {
5675         U8 * temp_pos = lpos;
5676         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5677         if (   backup != SB_EDGE
5678             && backup != SB_Sep
5679             && backup != SB_CR
5680             && backup != SB_LF)
5681         {
5682             before = backup;
5683             lpos = temp_pos;
5684         }
5685 
5686         /* Here, both 'before' and 'backup' are these types; implied is that we
5687          * don't break between them */
5688         if (backup == SB_Extend || backup == SB_Format) {
5689             return FALSE;
5690         }
5691     }
5692 
5693     /* Do not break after ambiguous terminators like period, if they are
5694      * immediately followed by a number or lowercase letter, if they are
5695      * between uppercase letters, if the first following letter (optionally
5696      * after certain punctuation) is lowercase, or if they are followed by
5697      * "continuation" punctuation such as comma, colon, or semicolon. For
5698      * example, a period may be an abbreviation or numeric period, and thus may
5699      * not mark the end of a sentence.
5700 
5701      * SB6. ATerm  ×  Numeric */
5702     if (before == SB_ATerm && after == SB_Numeric) {
5703         return FALSE;
5704     }
5705 
5706     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5707     if (before == SB_ATerm && after == SB_Upper) {
5708         U8 * temp_pos = lpos;
5709         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5710         if (backup == SB_Upper || backup == SB_Lower) {
5711             return FALSE;
5712         }
5713     }
5714 
5715     /* The remaining rules that aren't the final one, all require an STerm or
5716      * an ATerm after having backed up over some Close* Sp*, and in one case an
5717      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5718      * So do that backup now, setting flags if either Sp or a paragraph
5719      * separator are found */
5720 
5721     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5722         has_para_sep = TRUE;
5723         before = backup_one_SB(strbeg, &lpos, utf8_target);
5724     }
5725 
5726     if (before == SB_Sp) {
5727         has_sp = TRUE;
5728         do {
5729             before = backup_one_SB(strbeg, &lpos, utf8_target);
5730         }
5731         while (before == SB_Sp);
5732     }
5733 
5734     while (before == SB_Close) {
5735         before = backup_one_SB(strbeg, &lpos, utf8_target);
5736     }
5737 
5738     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5739      * most cases an STerm */
5740     if (before == SB_STerm || before == SB_ATerm) {
5741 
5742         /* So, here the lhs matches
5743          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5744          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5745          * The rules that apply here are:
5746          *
5747          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5748                                            | LF | STerm | ATerm) )* Lower
5749            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5750            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5751            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5752            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5753          */
5754 
5755         /* And all but SB11 forbid having seen a paragraph separator */
5756         if (! has_para_sep) {
5757             if (before == SB_ATerm) {          /* SB8 */
5758                 U8 * rpos = (U8 *) curpos;
5759                 SB_enum later = after;
5760 
5761                 while (    later != SB_OLetter
5762                         && later != SB_Upper
5763                         && later != SB_Lower
5764                         && later != SB_Sep
5765                         && later != SB_CR
5766                         && later != SB_LF
5767                         && later != SB_STerm
5768                         && later != SB_ATerm
5769                         && later != SB_EDGE)
5770                 {
5771                     later = advance_one_SB(&rpos, strend, utf8_target);
5772                 }
5773                 if (later == SB_Lower) {
5774                     return FALSE;
5775                 }
5776             }
5777 
5778             if (   after == SB_SContinue    /* SB8a */
5779                 || after == SB_STerm
5780                 || after == SB_ATerm)
5781             {
5782                 return FALSE;
5783             }
5784 
5785             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5786                 if (   after == SB_Close
5787                     || after == SB_Sp
5788                     || after == SB_Sep
5789                     || after == SB_CR
5790                     || after == SB_LF)
5791                 {
5792                     return FALSE;
5793                 }
5794             }
5795 
5796             /* SB10.  This and SB9 could probably be combined some way, but khw
5797              * has decided to follow the Unicode rule book precisely for
5798              * simplified maintenance */
5799             if (   after == SB_Sp
5800                 || after == SB_Sep
5801                 || after == SB_CR
5802                 || after == SB_LF)
5803             {
5804                 return FALSE;
5805             }
5806         }
5807 
5808         /* SB11.  */
5809         return TRUE;
5810     }
5811 
5812     /* Otherwise, do not break.
5813     SB12.  Any  ×  Any */
5814 
5815     return FALSE;
5816 }
5817 
5818 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5819 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5820 {
5821     SB_enum sb;
5822 
5823     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5824 
5825     if (*curpos >= strend) {
5826         return SB_EDGE;
5827     }
5828 
5829     if (utf8_target) {
5830         do {
5831             *curpos += UTF8SKIP(*curpos);
5832             if (*curpos >= strend) {
5833                 return SB_EDGE;
5834             }
5835             sb = getSB_VAL_UTF8(*curpos, strend);
5836         } while (sb == SB_Extend || sb == SB_Format);
5837     }
5838     else {
5839         do {
5840             (*curpos)++;
5841             if (*curpos >= strend) {
5842                 return SB_EDGE;
5843             }
5844             sb = getSB_VAL_CP(**curpos);
5845         } while (sb == SB_Extend || sb == SB_Format);
5846     }
5847 
5848     return sb;
5849 }
5850 
5851 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5852 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5853 {
5854     SB_enum sb;
5855 
5856     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5857 
5858     if (*curpos < strbeg) {
5859         return SB_EDGE;
5860     }
5861 
5862     if (utf8_target) {
5863         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5864         if (! prev_char_pos) {
5865             return SB_EDGE;
5866         }
5867 
5868         /* Back up over Extend and Format.  curpos is always just to the right
5869          * of the character whose value we are getting */
5870         do {
5871             U8 * prev_prev_char_pos;
5872             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5873                                                                       strbeg)))
5874             {
5875                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5876                 *curpos = prev_char_pos;
5877                 prev_char_pos = prev_prev_char_pos;
5878             }
5879             else {
5880                 *curpos = (U8 *) strbeg;
5881                 return SB_EDGE;
5882             }
5883         } while (sb == SB_Extend || sb == SB_Format);
5884     }
5885     else {
5886         do {
5887             if (*curpos - 2 < strbeg) {
5888                 *curpos = (U8 *) strbeg;
5889                 return SB_EDGE;
5890             }
5891             (*curpos)--;
5892             sb = getSB_VAL_CP(*(*curpos - 1));
5893         } while (sb == SB_Extend || sb == SB_Format);
5894     }
5895 
5896     return sb;
5897 }
5898 
5899 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5900 S_isWB(pTHX_ WB_enum previous,
5901              WB_enum before,
5902              WB_enum after,
5903              const U8 * const strbeg,
5904              const U8 * const curpos,
5905              const U8 * const strend,
5906              const bool utf8_target)
5907 {
5908     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5909      *  a Unicode word break, using their published algorithm, but tailored for
5910      *  Perl by treating spans of white space as one unit.  Context may be
5911      *  needed to make this determination.  If the value for the character
5912      *  before 'before' is known, it is passed as 'previous'; otherwise that
5913      *  should be set to WB_UNKNOWN.  The other input parameters give the
5914      *  boundaries and current position in the matching of the string.  That
5915      *  is, 'curpos' marks the position where the character whose wb value is
5916      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5917 
5918     U8 * before_pos = (U8 *) curpos;
5919     U8 * after_pos = (U8 *) curpos;
5920     WB_enum prev = before;
5921     WB_enum next;
5922 
5923     PERL_ARGS_ASSERT_ISWB;
5924 
5925     /* Rule numbers in the comments below are as of Unicode 9.0 */
5926 
5927   redo:
5928     before = prev;
5929     switch (WB_table[before][after]) {
5930         case WB_BREAKABLE:
5931             return TRUE;
5932 
5933         case WB_NOBREAK:
5934             return FALSE;
5935 
5936         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5937             next = advance_one_WB(&after_pos, strend, utf8_target,
5938                                  FALSE /* Don't skip Extend nor Format */ );
5939             /* A space immediately preceding an Extend or Format is attached
5940              * to by them, and hence gets separated from previous spaces.
5941              * Otherwise don't break between horizontal white space */
5942             return next == WB_Extend || next == WB_Format;
5943 
5944         /* WB4 Ignore Format and Extend characters, except when they appear at
5945          * the beginning of a region of text.  This code currently isn't
5946          * general purpose, but it works as the rules are currently and likely
5947          * to be laid out.  The reason it works is that when 'they appear at
5948          * the beginning of a region of text', the rule is to break before
5949          * them, just like any other character.  Therefore, the default rule
5950          * applies and we don't have to look in more depth.  Should this ever
5951          * change, we would have to have 2 'case' statements, like in the rules
5952          * below, and backup a single character (not spacing over the extend
5953          * ones) and then see if that is one of the region-end characters and
5954          * go from there */
5955         case WB_Ex_or_FO_or_ZWJ_then_foo:
5956             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5957             goto redo;
5958 
5959         case WB_DQ_then_HL + WB_BREAKABLE:
5960         case WB_DQ_then_HL + WB_NOBREAK:
5961 
5962             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5963 
5964             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5965                                                             == WB_Hebrew_Letter)
5966             {
5967                 return FALSE;
5968             }
5969 
5970              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5971 
5972         case WB_HL_then_DQ + WB_BREAKABLE:
5973         case WB_HL_then_DQ + WB_NOBREAK:
5974 
5975             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5976 
5977             if (advance_one_WB(&after_pos, strend, utf8_target,
5978                                        TRUE /* Do skip Extend and Format */ )
5979                                                             == WB_Hebrew_Letter)
5980             {
5981                 return FALSE;
5982             }
5983 
5984             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5985 
5986         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5987         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5988 
5989             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5990              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5991 
5992             next = advance_one_WB(&after_pos, strend, utf8_target,
5993                                        TRUE /* Do skip Extend and Format */ );
5994 
5995             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5996             {
5997                 return FALSE;
5998             }
5999 
6000             return WB_table[before][after]
6001                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
6002 
6003         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
6004         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
6005 
6006             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
6007              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
6008 
6009             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
6010             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
6011             {
6012                 return FALSE;
6013             }
6014 
6015             return WB_table[before][after]
6016                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
6017 
6018         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
6019         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
6020 
6021             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
6022              * */
6023 
6024             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
6025                                                             == WB_Numeric)
6026             {
6027                 return FALSE;
6028             }
6029 
6030             return WB_table[before][after]
6031                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
6032 
6033         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
6034         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
6035 
6036             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
6037 
6038             if (advance_one_WB(&after_pos, strend, utf8_target,
6039                                        TRUE /* Do skip Extend and Format */ )
6040                                                             == WB_Numeric)
6041             {
6042                 return FALSE;
6043             }
6044 
6045             return WB_table[before][after]
6046                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
6047 
6048         case WB_RI_then_RI + WB_NOBREAK:
6049         case WB_RI_then_RI + WB_BREAKABLE:
6050             {
6051                 int RI_count = 1;
6052 
6053                 /* Do not break within emoji flag sequences. That is, do not
6054                  * break between regional indicator (RI) symbols if there is an
6055                  * odd number of RI characters before the potential break
6056                  * point.
6057                  *
6058                  * WB15   sot (RI RI)* RI × RI
6059                  * WB16 [^RI] (RI RI)* RI × RI */
6060 
6061                 while (backup_one_WB(&previous,
6062                                      strbeg,
6063                                      &before_pos,
6064                                      utf8_target) == WB_Regional_Indicator)
6065                 {
6066                     RI_count++;
6067                 }
6068 
6069                 return RI_count % 2 != 1;
6070             }
6071 
6072         default:
6073             break;
6074     }
6075 
6076 #ifdef DEBUGGING
6077     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
6078                                   before, after, WB_table[before][after]);
6079     assert(0);
6080 #endif
6081     return TRUE;
6082 }
6083 
6084 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)6085 S_advance_one_WB(pTHX_ U8 ** curpos,
6086                        const U8 * const strend,
6087                        const bool utf8_target,
6088                        const bool skip_Extend_Format)
6089 {
6090     WB_enum wb;
6091 
6092     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6093 
6094     if (*curpos >= strend) {
6095         return WB_EDGE;
6096     }
6097 
6098     if (utf8_target) {
6099 
6100         /* Advance over Extend and Format */
6101         do {
6102             *curpos += UTF8SKIP(*curpos);
6103             if (*curpos >= strend) {
6104                 return WB_EDGE;
6105             }
6106             wb = getWB_VAL_UTF8(*curpos, strend);
6107         } while (    skip_Extend_Format
6108                  && (wb == WB_Extend || wb == WB_Format));
6109     }
6110     else {
6111         do {
6112             (*curpos)++;
6113             if (*curpos >= strend) {
6114                 return WB_EDGE;
6115             }
6116             wb = getWB_VAL_CP(**curpos);
6117         } while (    skip_Extend_Format
6118                  && (wb == WB_Extend || wb == WB_Format));
6119     }
6120 
6121     return wb;
6122 }
6123 
6124 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)6125 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6126 {
6127     WB_enum wb;
6128 
6129     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6130 
6131     /* If we know what the previous character's break value is, don't have
6132         * to look it up */
6133     if (*previous != WB_UNKNOWN) {
6134         wb = *previous;
6135 
6136         /* But we need to move backwards by one */
6137         if (utf8_target) {
6138             *curpos = reghopmaybe3(*curpos, -1, strbeg);
6139             if (! *curpos) {
6140                 *previous = WB_EDGE;
6141                 *curpos = (U8 *) strbeg;
6142             }
6143             else {
6144                 *previous = WB_UNKNOWN;
6145             }
6146         }
6147         else {
6148             (*curpos)--;
6149             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6150         }
6151 
6152         /* And we always back up over these three types */
6153         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6154             return wb;
6155         }
6156     }
6157 
6158     if (*curpos < strbeg) {
6159         return WB_EDGE;
6160     }
6161 
6162     if (utf8_target) {
6163         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6164         if (! prev_char_pos) {
6165             return WB_EDGE;
6166         }
6167 
6168         /* Back up over Extend and Format.  curpos is always just to the right
6169          * of the character whose value we are getting */
6170         do {
6171             U8 * prev_prev_char_pos;
6172             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6173                                                    -1,
6174                                                    strbeg)))
6175             {
6176                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6177                 *curpos = prev_char_pos;
6178                 prev_char_pos = prev_prev_char_pos;
6179             }
6180             else {
6181                 *curpos = (U8 *) strbeg;
6182                 return WB_EDGE;
6183             }
6184         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6185     }
6186     else {
6187         do {
6188             if (*curpos - 2 < strbeg) {
6189                 *curpos = (U8 *) strbeg;
6190                 return WB_EDGE;
6191             }
6192             (*curpos)--;
6193             wb = getWB_VAL_CP(*(*curpos - 1));
6194         } while (wb == WB_Extend || wb == WB_Format);
6195     }
6196 
6197     return wb;
6198 }
6199 
6200 /* Macros for regmatch(), using its internal variables */
6201 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6202 #define NEXTCHR_IS_EOS (nextbyte < 0)
6203 
6204 #define SET_nextchr \
6205     nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6206 
6207 #define SET_locinput(p) \
6208     locinput = (p);  \
6209     SET_nextchr
6210 
6211 #define sayYES goto yes
6212 #define sayNO goto no
6213 #define sayNO_SILENT goto no_silent
6214 
6215 /* we don't use STMT_START/END here because it leads to
6216    "unreachable code" warnings, which are bogus, but distracting. */
6217 #define CACHEsayNO \
6218     if (ST.cache_mask) \
6219        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6220     sayNO
6221 
6222 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6223 (                                                           \
6224     (   ( st )                                         ) && \
6225     (   ( st )->u.eval.close_paren                     ) && \
6226     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6227 )
6228 
6229 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6230 (                                                           \
6231     (   ( st )                                         ) && \
6232     (   ( st )->u.eval.close_paren                     ) && \
6233     (   ( expr )                                       ) && \
6234     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6235 )
6236 
6237 
6238 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6239     (st)->u.eval.close_paren = ( (expr) + 1 )
6240 
6241 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6242     (st)->u.eval.close_paren = 0
6243 
6244 /* push a new state then goto it */
6245 
6246 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6247     pushinput = input; \
6248     pusheol = eol; \
6249     pushsr0 = sr0; \
6250     scan = node; \
6251     st->resume_state = state; \
6252     goto push_state;
6253 
6254 /* push a new state with success backtracking, then goto it */
6255 
6256 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6257     pushinput = input; \
6258     pusheol = eol;     \
6259     pushsr0 = sr0; \
6260     scan = node; \
6261     st->resume_state = state; \
6262     goto push_yes_state;
6263 
6264 #define DEBUG_STATE_pp(pp)                                  \
6265     DEBUG_STATE_r({                                         \
6266         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6267         Perl_re_printf( aTHX_                               \
6268             "%*s" pp " %s%s%s%s%s\n",                       \
6269             INDENT_CHARS(depth), "",                        \
6270             REGNODE_NAME(st->resume_state),                  \
6271             ((st==yes_state||st==mark_state) ? "[" : ""),   \
6272             ((st==yes_state) ? "Y" : ""),                   \
6273             ((st==mark_state) ? "M" : ""),                  \
6274             ((st==yes_state||st==mark_state) ? "]" : "")    \
6275         );                                                  \
6276     });
6277 
6278 /*
6279 
6280 regmatch() - main matching routine
6281 
6282 This is basically one big switch statement in a loop. We execute an op,
6283 set 'next' to point the next op, and continue. If we come to a point which
6284 we may need to backtrack to on failure such as (A|B|C), we push a
6285 backtrack state onto the backtrack stack. On failure, we pop the top
6286 state, and re-enter the loop at the state indicated. If there are no more
6287 states to pop, we return failure.
6288 
6289 Sometimes we also need to backtrack on success; for example /A+/, where
6290 after successfully matching one A, we need to go back and try to
6291 match another one; similarly for lookahead assertions: if the assertion
6292 completes successfully, we backtrack to the state just before the assertion
6293 and then carry on.  In these cases, the pushed state is marked as
6294 'backtrack on success too'. This marking is in fact done by a chain of
6295 pointers, each pointing to the previous 'yes' state. On success, we pop to
6296 the nearest yes state, discarding any intermediate failure-only states.
6297 Sometimes a yes state is pushed just to force some cleanup code to be
6298 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6299 it to free the inner regex.
6300 
6301 Note that failure backtracking rewinds the cursor position, while
6302 success backtracking leaves it alone.
6303 
6304 A pattern is complete when the END op is executed, while a subpattern
6305 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6306 ops trigger the "pop to last yes state if any, otherwise return true"
6307 behaviour.
6308 
6309 A common convention in this function is to use A and B to refer to the two
6310 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6311 the subpattern to be matched possibly multiple times, while B is the entire
6312 rest of the pattern. Variable and state names reflect this convention.
6313 
6314 The states in the main switch are the union of ops and failure/success of
6315 substates associated with that op.  For example, IFMATCH is the op
6316 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6317 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6318 successfully matched A and IFMATCH_A_fail is a state saying that we have
6319 just failed to match A. Resume states always come in pairs. The backtrack
6320 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6321 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6322 on success or failure.
6323 
6324 The struct that holds a backtracking state is actually a big union, with
6325 one variant for each major type of op. The variable st points to the
6326 top-most backtrack struct. To make the code clearer, within each
6327 block of code we #define ST to alias the relevant union.
6328 
6329 Here's a concrete example of a (vastly oversimplified) IFMATCH
6330 implementation:
6331 
6332     switch (state) {
6333     ....
6334 
6335 #define ST st->u.ifmatch
6336 
6337     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6338         ST.foo = ...; // some state we wish to save
6339         ...
6340         // push a yes backtrack state with a resume value of
6341         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6342         // first node of A:
6343         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6344         // NOTREACHED
6345 
6346     case IFMATCH_A: // we have successfully executed A; now continue with B
6347         next = B;
6348         bar = ST.foo; // do something with the preserved value
6349         break;
6350 
6351     case IFMATCH_A_fail: // A failed, so the assertion failed
6352         ...;   // do some housekeeping, then ...
6353         sayNO; // propagate the failure
6354 
6355 #undef ST
6356 
6357     ...
6358     }
6359 
6360 For any old-timers reading this who are familiar with the old recursive
6361 approach, the code above is equivalent to:
6362 
6363     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6364     {
6365         int foo = ...
6366         ...
6367         if (regmatch(A)) {
6368             next = B;
6369             bar = foo;
6370             break;
6371         }
6372         ...;   // do some housekeeping, then ...
6373         sayNO; // propagate the failure
6374     }
6375 
6376 The topmost backtrack state, pointed to by st, is usually free. If you
6377 want to claim it, populate any ST.foo fields in it with values you wish to
6378 save, then do one of
6379 
6380         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6381         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6382 
6383 which sets that backtrack state's resume value to 'resume_state', pushes a
6384 new free entry to the top of the backtrack stack, then goes to 'node'.
6385 On backtracking, the free slot is popped, and the saved state becomes the
6386 new free state. An ST.foo field in this new top state can be temporarily
6387 accessed to retrieve values, but once the main loop is re-entered, it
6388 becomes available for reuse.
6389 
6390 Note that the depth of the backtrack stack constantly increases during the
6391 left-to-right execution of the pattern, rather than going up and down with
6392 the pattern nesting. For example the stack is at its maximum at Z at the
6393 end of the pattern, rather than at X in the following:
6394 
6395     /(((X)+)+)+....(Y)+....Z/
6396 
6397 The only exceptions to this are lookahead/behind assertions and the cut,
6398 (?>A), which pop all the backtrack states associated with A before
6399 continuing.
6400 
6401 Backtrack state structs are allocated in slabs of about 4K in size.
6402 PL_regmatch_state and st always point to the currently active state,
6403 and PL_regmatch_slab points to the slab currently containing
6404 PL_regmatch_state.  The first time regmatch() is called, the first slab is
6405 allocated, and is never freed until interpreter destruction. When the slab
6406 is full, a new one is allocated and chained to the end. At exit from
6407 regmatch(), slabs allocated since entry are freed.
6408 
6409 In order to work with variable length lookbehinds, an upper limit is placed on
6410 lookbehinds which is set to where the match position is at the end of where the
6411 lookbehind would get to.  Nothing in the lookbehind should match above that,
6412 except we should be able to look beyond if for things like \b, which need the
6413 next character in the string to be able to determine if this is a boundary or
6414 not.  We also can't match the end of string/line unless we are also at the end
6415 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6416 that match a width, we have to add a condition that they are within the legal
6417 bounds of our window into the string.
6418 
6419 */
6420 
6421 /* returns -1 on failure, $+[0] on success */
6422 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6423 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6424 {
6425     const bool utf8_target = reginfo->is_utf8_target;
6426     const U32 uniflags = UTF8_ALLOW_DEFAULT;
6427     REGEXP *rex_sv = reginfo->prog;
6428     regexp *rex = ReANY(rex_sv);
6429     RXi_GET_DECL(rex,rexi);
6430     /* the current state. This is a cached copy of PL_regmatch_state */
6431     regmatch_state *st;
6432     /* cache heavy used fields of st in registers */
6433     regnode *scan;
6434     regnode *next;
6435     U32 n = 0;	/* general value; init to avoid compiler warning */
6436     U32 utmp = 0;  /* tmp variable - valid for at most one opcode */
6437     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6438     SSize_t endref = 0; /* offset of end of backref when ln is start */
6439     char *locinput = startpos;
6440     char *loceol = reginfo->strend;
6441     char *pushinput; /* where to continue after a PUSH */
6442     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6443     U8   *pushsr0;   /* save starting pos of script run */
6444     PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6445                                      at EOS */
6446 
6447     bool result = 0;	    /* return value of S_regmatch */
6448     U32 depth = 0;            /* depth of backtrack stack */
6449     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6450     const U32 max_nochange_depth =
6451         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6452         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6453     regmatch_state *yes_state = NULL; /* state to pop to on success of
6454                                                             subpattern */
6455     /* mark_state piggy backs on the yes_state logic so that when we unwind
6456        the stack on success we can update the mark_state as we go */
6457     regmatch_state *mark_state = NULL; /* last mark state we have seen */
6458     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6459     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6460     U32 state_num;
6461     bool no_final = 0;      /* prevent failure from backtracking? */
6462     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6463     char *startpoint = locinput;
6464     SV *popmark = NULL;     /* are we looking for a mark? */
6465     SV *sv_commit = NULL;   /* last mark name seen in failure */
6466     SV *sv_yes_mark = NULL; /* last mark name we have seen
6467                                during a successful match */
6468     U32 lastopen = 0;       /* last open we saw */
6469     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6470     SV* const oreplsv = GvSVn(PL_replgv);
6471     /* these three flags are set by various ops to signal information to
6472      * the very next op. They have a useful lifetime of exactly one loop
6473      * iteration, and are not preserved or restored by state pushes/pops
6474      */
6475     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
6476     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
6477     int logical = 0;	    /* the following EVAL is:
6478                                 0: (?{...})
6479                                 1: (?(?{...})X|Y)
6480                                 2: (??{...})
6481                                or the following IFMATCH/UNLESSM is:
6482                                 false: plain (?=foo)
6483                                 true:  used as a condition: (?(?=foo))
6484                             */
6485     PAD* last_pad = NULL;
6486     dMULTICALL;
6487     U8 gimme = G_SCALAR;
6488     CV *caller_cv = NULL;	/* who called us */
6489     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
6490     U32 maxopenparen = 0;       /* max '(' index seen so far */
6491     int to_complement;  /* Invert the result? */
6492     char_class_number_ classnum;
6493     bool is_utf8_pat = reginfo->is_utf8_pat;
6494     bool match = FALSE;
6495     I32 orig_savestack_ix = PL_savestack_ix;
6496     U8 * script_run_begin = NULL;
6497     char *match_end= NULL; /* where a match MUST end to be considered successful */
6498     bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6499     re_fold_t folder = NULL;  /* used by various EXACTish regops */
6500     const U8 * fold_array = NULL; /* used by various EXACTish regops */
6501 
6502 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6503 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6504 #  define SOLARIS_BAD_OPTIMIZER
6505     const U32 *pl_charclass_dup = PL_charclass;
6506 #  define PL_charclass pl_charclass_dup
6507 #endif
6508 
6509 #ifdef DEBUGGING
6510     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6511 #endif
6512 
6513     /* protect against undef(*^R) */
6514     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6515 
6516     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6517     multicall_oldcatch = 0;
6518     PERL_UNUSED_VAR(multicall_cop);
6519 
6520     PERL_ARGS_ASSERT_REGMATCH;
6521 
6522     st = PL_regmatch_state;
6523 
6524     /* Note that nextbyte is a byte even in UTF */
6525     SET_nextchr;
6526     scan = prog;
6527 
6528     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6529             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6530             Perl_re_printf( aTHX_ "regmatch start\n" );
6531     }));
6532 
6533     while (scan != NULL) {
6534         next = scan + NEXT_OFF(scan);
6535         if (next == scan)
6536             next = NULL;
6537         state_num = OP(scan);
6538 
6539       reenter_switch:
6540         DEBUG_EXECUTE_r(
6541             if (state_num <= REGNODE_MAX) {
6542                 SV * const prop = sv_newmortal();
6543                 regnode *rnext = regnext(scan);
6544 
6545                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6546                 regprop(rex, prop, scan, reginfo, NULL);
6547                 Perl_re_printf( aTHX_
6548                     "%*s%" IVdf ":%s(%" IVdf ")\n",
6549                     INDENT_CHARS(depth), "",
6550                     (IV)(scan - rexi->program),
6551                     SvPVX_const(prop),
6552                     (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6553                         0 : (IV)(rnext - rexi->program));
6554             }
6555         );
6556 
6557         to_complement = 0;
6558 
6559         SET_nextchr;
6560         assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6561 
6562         switch (state_num) {
6563             SV * anyofh_list;
6564 
6565         case SBOL: /*  /^../ and /\A../  */
6566             if (locinput == reginfo->strbeg)
6567                 break;
6568             sayNO;
6569 
6570         case MBOL: /*  /^../m  */
6571             if (locinput == reginfo->strbeg ||
6572                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6573             {
6574                 break;
6575             }
6576             sayNO;
6577 
6578         case GPOS: /*  \G  */
6579             if (locinput == reginfo->ganch)
6580                 break;
6581             sayNO;
6582 
6583         case KEEPS: /*   \K  */
6584             /* update the startpoint */
6585             st->u.keeper.val = RXp_OFFS_START(rex,0);
6586             RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg;
6587             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6588                             script_run_begin);
6589             NOT_REACHED; /* NOTREACHED */
6590 
6591         case KEEPS_next_fail:
6592             /* rollback the start point change */
6593             RXp_OFFSp(rex)[0].start = st->u.keeper.val;
6594             sayNO_SILENT;
6595             NOT_REACHED; /* NOTREACHED */
6596 
6597         case MEOL: /* /..$/m  */
6598             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6599                 sayNO;
6600             break;
6601 
6602         case SEOL: /* /..$/  */
6603             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6604                 sayNO;
6605             if (reginfo->strend - locinput > 1)
6606                 sayNO;
6607             break;
6608 
6609         case EOS: /*  \z  */
6610             if (!NEXTCHR_IS_EOS)
6611                 sayNO;
6612             break;
6613 
6614         case SANY: /*  /./s  */
6615             if (NEXTCHR_IS_EOS || locinput >= loceol)
6616                 sayNO;
6617             goto increment_locinput;
6618 
6619         case REG_ANY: /*  /./  */
6620             if (   NEXTCHR_IS_EOS
6621                 || locinput >= loceol
6622                 || nextbyte == '\n')
6623             {
6624                 sayNO;
6625             }
6626             goto increment_locinput;
6627 
6628 
6629 #undef  ST
6630 #define ST st->u.trie
6631         case TRIEC: /* (ab|cd) with known charclass */
6632             /* In this case the charclass data is available inline so
6633                we can fail fast without a lot of extra overhead.
6634              */
6635             if ( !   NEXTCHR_IS_EOS
6636                 &&   locinput < loceol
6637                 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6638             {
6639                 DEBUG_EXECUTE_r(
6640                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6641                               depth, PL_colors[4], PL_colors[5])
6642                 );
6643                 sayNO_SILENT;
6644                 NOT_REACHED; /* NOTREACHED */
6645             }
6646             /* FALLTHROUGH */
6647         case TRIE:  /* (ab|cd)  */
6648             /* the basic plan of execution of the trie is:
6649              * At the beginning, run though all the states, and
6650              * find the longest-matching word. Also remember the position
6651              * of the shortest matching word. For example, this pattern:
6652              *    1  2 3 4    5
6653              *    ab|a|x|abcd|abc
6654              * when matched against the string "abcde", will generate
6655              * accept states for all words except 3, with the longest
6656              * matching word being 4, and the shortest being 2 (with
6657              * the position being after char 1 of the string).
6658              *
6659              * Then for each matching word, in word order (i.e. 1,2,4,5),
6660              * we run the remainder of the pattern; on each try setting
6661              * the current position to the character following the word,
6662              * returning to try the next word on failure.
6663              *
6664              * We avoid having to build a list of words at runtime by
6665              * using a compile-time structure, wordinfo[].prev, which
6666              * gives, for each word, the previous accepting word (if any).
6667              * In the case above it would contain the mappings 1->2, 2->0,
6668              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6669              * the longest word (4 above), a list of all words, by
6670              * following the list of prev pointers; this gives us the
6671              * unordered list 4,5,1,2. Then given the current word we have
6672              * just tried, we can go through the list and find the
6673              * next-biggest word to try (so if we just failed on word 2,
6674              * the next in the list is 4).
6675              *
6676              * Since at runtime we don't record the matching position in
6677              * the string for each word, we have to work that out for
6678              * each word we're about to process. The wordinfo table holds
6679              * the character length of each word; given that we recorded
6680              * at the start: the position of the shortest word and its
6681              * length in chars, we just need to move the pointer the
6682              * difference between the two char lengths. Depending on
6683              * Unicode status and folding, that's cheap or expensive.
6684              *
6685              * This algorithm is optimised for the case where are only a
6686              * small number of accept states, i.e. 0,1, or maybe 2.
6687              * With lots of accepts states, and having to try all of them,
6688              * it becomes quadratic on number of accept states to find all
6689              * the next words.
6690              */
6691 
6692             {
6693                 /* what type of TRIE am I? (utf8 makes this contextual) */
6694                 DECL_TRIE_TYPE(scan);
6695 
6696                 /* what trie are we using right now */
6697                 reg_trie_data * const trie
6698                     = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ];
6699                 ST.before_paren = trie->before_paren;
6700                 ST.after_paren = trie->after_paren;
6701                 assert(ST.before_paren<=rex->nparens);
6702                 assert(ST.after_paren<=rex->nparens);
6703 
6704                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]);
6705                 U32 state = trie->startstate;
6706 
6707                 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) {
6708                     CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6709                     if (utf8_target
6710                         && ! NEXTCHR_IS_EOS
6711                         && UTF8_IS_ABOVE_LATIN1(nextbyte)
6712                         && FLAGS(scan) == EXACTL)
6713                     {
6714                         /* We only output for EXACTL, as we let the folder
6715                          * output this message for EXACTFLU8 to avoid
6716                          * duplication */
6717                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6718                                                                reginfo->strend);
6719                     }
6720                 }
6721                 if (   trie->bitmap
6722                     && (     NEXTCHR_IS_EOS
6723                         ||   locinput >= loceol
6724                         || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6725                 {
6726                     if (trie->states[ state ].wordnum) {
6727                          DEBUG_EXECUTE_r(
6728                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6729                                           depth, PL_colors[4], PL_colors[5])
6730                         );
6731                         if (!trie->jump)
6732                             break;
6733                     } else {
6734                         DEBUG_EXECUTE_r(
6735                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6736                                           depth, PL_colors[4], PL_colors[5])
6737                         );
6738                         sayNO_SILENT;
6739                    }
6740                 }
6741 
6742             {
6743                 U8 *uc = ( U8* )locinput;
6744 
6745                 STRLEN len = 0;
6746                 STRLEN foldlen = 0;
6747                 U8 *uscan = (U8*)NULL;
6748                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6749                 U32 charcount = 0; /* how many input chars we have matched */
6750                 U32 accepted = 0; /* have we seen any accepting states? */
6751 
6752                 ST.jump = trie->jump;
6753                 ST.j_before_paren = trie->j_before_paren;
6754                 ST.j_after_paren= trie->j_after_paren;
6755                 ST.me = scan;
6756                 ST.firstpos = NULL;
6757                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6758                 ST.nextword = 0;
6759 
6760                 /* fully traverse the TRIE; note the position of the
6761                    shortest accept state and the wordnum of the longest
6762                    accept state */
6763 
6764                 while ( state && uc <= (U8*)(loceol) ) {
6765                     U32 base = trie->states[ state ].trans.base;
6766                     UV uvc = 0;
6767                     U16 charid = 0;
6768                     U16 wordnum;
6769                     wordnum = trie->states[ state ].wordnum;
6770 
6771                     if (wordnum) { /* it's an accept state */
6772                         if (!accepted) {
6773                             accepted = 1;
6774                             /* record first match position */
6775                             if (ST.longfold) {
6776                                 ST.firstpos = (U8*)locinput;
6777                                 ST.firstchars = 0;
6778                             }
6779                             else {
6780                                 ST.firstpos = uc;
6781                                 ST.firstchars = charcount;
6782                             }
6783                         }
6784                         if (!ST.nextword || wordnum < ST.nextword)
6785                             ST.nextword = wordnum;
6786                         ST.topword = wordnum;
6787                     }
6788 
6789                     DEBUG_TRIE_EXECUTE_r({
6790                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6791                                 /* HERE */
6792                                 PerlIO_printf( Perl_debug_log,
6793                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6794                                     INDENT_CHARS(depth), "", PL_colors[4],
6795                                     (UV)state, (accepted ? 'Y' : 'N'));
6796                     });
6797 
6798                     /* read a char and goto next state */
6799                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6800                         I32 offset;
6801                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6802                                              (U8 *) loceol, uscan,
6803                                              len, uvc, charid, foldlen,
6804                                              foldbuf, uniflags);
6805                         charcount++;
6806                         if (foldlen>0)
6807                             ST.longfold = TRUE;
6808                         if (charid &&
6809                              ( ((offset =
6810                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6811 
6812                              && ((U32)offset < trie->lasttrans)
6813                              && trie->trans[offset].check == state)
6814                         {
6815                             state = trie->trans[offset].next;
6816                         }
6817                         else {
6818                             state = 0;
6819                         }
6820                         uc += len;
6821 
6822                     }
6823                     else {
6824                         state = 0;
6825                     }
6826                     DEBUG_TRIE_EXECUTE_r(
6827                         Perl_re_printf( aTHX_
6828                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6829                             charid, uvc, (UV)state, PL_colors[5] );
6830                     );
6831                 }
6832                 if (!accepted)
6833                    sayNO;
6834 
6835                 /* calculate total number of accept states */
6836                 {
6837                     U16 w = ST.topword;
6838                     accepted = 0;
6839                     while (w) {
6840                         w = trie->wordinfo[w].prev;
6841                         accepted++;
6842                     }
6843                     ST.accepted = accepted;
6844                 }
6845 
6846                 DEBUG_EXECUTE_r(
6847                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6848                         depth,
6849                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6850                 );
6851                 goto trie_first_try; /* jump into the fail handler */
6852             }}
6853             NOT_REACHED; /* NOTREACHED */
6854 
6855         case TRIE_next_fail: /* we failed - try next alternative */
6856         {
6857             U8 *uc;
6858             if (RE_PESSIMISTIC_PARENS) {
6859                 REGCP_UNWIND(ST.lastcp);
6860                 regcppop(rex,&maxopenparen);
6861             }
6862             if ( ST.jump ) {
6863                 /* undo any captures done in the tail part of a branch,
6864                  * e.g.
6865                  *    /(?:X(.)(.)|Y(.)).../
6866                  * where the trie just matches X then calls out to do the
6867                  * rest of the branch */
6868                 REGCP_UNWIND(ST.cp);
6869                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6870                 if (ST.after_paren) {
6871                     assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
6872                     CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail");
6873                 }
6874             }
6875             if (!--ST.accepted) {
6876                 DEBUG_EXECUTE_r({
6877                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6878                         depth,
6879                         PL_colors[4],
6880                         PL_colors[5] );
6881                 });
6882                 sayNO_SILENT;
6883             }
6884             {
6885                 /* Find next-highest word to process.  Note that this code
6886                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6887                 U16 min = 0;
6888                 U16 word;
6889                 U16 const nextword = ST.nextword;
6890                 reg_trie_wordinfo * const wordinfo
6891                     = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo;
6892                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6893                     if (word > nextword && (!min || word < min))
6894                         min = word;
6895                 }
6896                 ST.nextword = min;
6897             }
6898 
6899           trie_first_try:
6900             if (do_cutgroup) {
6901                 do_cutgroup = 0;
6902                 no_final = 0;
6903             }
6904 
6905             if ( ST.jump ) {
6906                 ST.lastparen = RXp_LASTPAREN(rex);
6907                 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
6908                 REGCP_SET(ST.cp);
6909             }
6910 
6911             /* find start char of end of current word */
6912             {
6913                 U32 chars; /* how many chars to skip */
6914                 reg_trie_data * const trie
6915                     = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
6916 
6917                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6918                             >=  ST.firstchars);
6919                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6920                             - ST.firstchars;
6921                 uc = ST.firstpos;
6922 
6923                 if (ST.longfold) {
6924                     /* the hard option - fold each char in turn and find
6925                      * its folded length (which may be different */
6926                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6927                     STRLEN foldlen;
6928                     STRLEN len;
6929                     UV uvc;
6930                     U8 *uscan;
6931 
6932                     while (chars) {
6933                         if (utf8_target) {
6934                             /* XXX This assumes the length is well-formed, as
6935                              * does the UTF8SKIP below */
6936                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6937                                                     uniflags);
6938                             uc += len;
6939                         }
6940                         else {
6941                             uvc = *uc;
6942                             uc++;
6943                         }
6944                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6945                         uscan = foldbuf;
6946                         while (foldlen) {
6947                             if (!--chars)
6948                                 break;
6949                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6950                                                  uniflags);
6951                             uscan += len;
6952                             foldlen -= len;
6953                         }
6954                     }
6955                 }
6956                 else {
6957                     if (utf8_target)
6958                         uc = utf8_hop(uc, chars);
6959                     else
6960                         uc += chars;
6961                 }
6962             }
6963             if (ST.jump && ST.jump[ST.nextword]) {
6964                 scan = ST.me + ST.jump[ST.nextword];
6965                 ST.before_paren = ST.j_before_paren[ST.nextword];
6966                 assert(ST.before_paren <= rex->nparens);
6967                 ST.after_paren = ST.j_after_paren[ST.nextword];
6968                 assert(ST.after_paren <= rex->nparens);
6969             } else {
6970                 scan = ST.me + NEXT_OFF(ST.me);
6971             }
6972 
6973 
6974             DEBUG_EXECUTE_r({
6975                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6976                     depth,
6977                     PL_colors[4],
6978                     ST.nextword,
6979                     PL_colors[5]
6980                     );
6981             });
6982 
6983             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6984                 if (RE_PESSIMISTIC_PARENS) {
6985                     (void)regcppush(rex, 0, maxopenparen);
6986                     REGCP_SET(ST.lastcp);
6987                 }
6988                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6989                                 script_run_begin);
6990                 NOT_REACHED; /* NOTREACHED */
6991             }
6992             /* only one choice left - just continue */
6993             DEBUG_EXECUTE_r({
6994                 AV *const trie_words
6995                     = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]);
6996                 SV ** const tmp = trie_words
6997                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6998                 SV *sv= tmp ? sv_newmortal() : NULL;
6999 
7000                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
7001                     depth, PL_colors[4],
7002                     ST.nextword,
7003                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
7004                             PL_colors[0], PL_colors[1],
7005                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
7006                         )
7007                     : "not compiled under -Dr",
7008                     PL_colors[5] );
7009             });
7010 
7011             locinput = (char*)uc;
7012             continue; /* execute rest of RE */
7013             /* NOTREACHED */
7014         }
7015 #undef  ST
7016 
7017         case LEXACT_REQ8:
7018             if (! utf8_target) {
7019                 sayNO;
7020             }
7021             /* FALLTHROUGH */
7022 
7023         case LEXACT:
7024         {
7025             char *s;
7026 
7027             s = STRINGl(scan);
7028             ln = STR_LENl(scan);
7029             goto join_short_long_exact;
7030 
7031         case EXACTL:             /*  /abc/l       */
7032             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7033 
7034             /* Complete checking would involve going through every character
7035              * matched by the string to see if any is above latin1.  But the
7036              * comparison otherwise might very well be a fast assembly
7037              * language routine, and I (khw) don't think slowing things down
7038              * just to check for this warning is worth it.  So this just checks
7039              * the first character */
7040             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
7041                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
7042             }
7043             goto do_exact;
7044         case EXACT_REQ8:
7045             if (! utf8_target) {
7046                 sayNO;
7047             }
7048             /* FALLTHROUGH */
7049 
7050         case EXACT:             /*  /abc/        */
7051           do_exact:
7052             s = STRINGs(scan);
7053             ln = STR_LENs(scan);
7054 
7055           join_short_long_exact:
7056             if (utf8_target != is_utf8_pat) {
7057                 /* The target and the pattern have differing utf8ness. */
7058                 char *l = locinput;
7059                 const char * const e = s + ln;
7060 
7061                 if (utf8_target) {
7062                     /* The target is utf8, the pattern is not utf8.
7063                      * Above-Latin1 code points can't match the pattern;
7064                      * invariants match exactly, and the other Latin1 ones need
7065                      * to be downgraded to a single byte in order to do the
7066                      * comparison.  (If we could be confident that the target
7067                      * is not malformed, this could be refactored to have fewer
7068                      * tests by just assuming that if the first bytes match, it
7069                      * is an invariant, but there are tests in the test suite
7070                      * dealing with (??{...}) which violate this) */
7071                     while (s < e) {
7072                         if (   l >= loceol
7073                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
7074                         {
7075                             sayNO;
7076                         }
7077                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
7078                             if (*l != *s) {
7079                                 sayNO;
7080                             }
7081                             l++;
7082                         }
7083                         else {
7084                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
7085                             {
7086                                 sayNO;
7087                             }
7088                             l += 2;
7089                         }
7090                         s++;
7091                     }
7092                 }
7093                 else {
7094                     /* The target is not utf8, the pattern is utf8. */
7095                     while (s < e) {
7096                         if (   l >= loceol
7097                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
7098                         {
7099                             sayNO;
7100                         }
7101                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
7102                             if (*s != *l) {
7103                                 sayNO;
7104                             }
7105                             s++;
7106                         }
7107                         else {
7108                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7109                             {
7110                                 sayNO;
7111                             }
7112                             s += 2;
7113                         }
7114                         l++;
7115                     }
7116                 }
7117                 locinput = l;
7118             }
7119             else {
7120                 /* The target and the pattern have the same utf8ness. */
7121                 /* Inline the first character, for speed. */
7122                 if (   loceol - locinput < ln
7123                     || UCHARAT(s) != nextbyte
7124                     || (ln > 1 && memNE(s, locinput, ln)))
7125                 {
7126                     sayNO;
7127                 }
7128                 locinput += ln;
7129             }
7130             break;
7131             }
7132 
7133         case EXACTFL:            /*  /abc/il      */
7134           {
7135             const char * s;
7136             U32 fold_utf8_flags;
7137 
7138             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7139             folder = Perl_foldEQ_locale;
7140             fold_array = PL_fold_locale;
7141             fold_utf8_flags = FOLDEQ_LOCALE;
7142             goto do_exactf;
7143 
7144         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
7145                                       is effectively /u; hence to match, target
7146                                       must be UTF-8. */
7147             if (! utf8_target) {
7148                 sayNO;
7149             }
7150             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7151                                              | FOLDEQ_S2_FOLDS_SANE;
7152             folder = S_foldEQ_latin1_s2_folded;
7153             fold_array = PL_fold_latin1;
7154             goto do_exactf;
7155 
7156         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
7157             if (! utf8_target) {
7158                 sayNO;
7159             }
7160             assert(is_utf8_pat);
7161             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7162 #ifdef DEBUGGING
7163             /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7164              * In theory neither of these variables should be used in this mode. */
7165             folder = NULL;
7166             fold_array = NULL;
7167 #endif
7168             goto do_exactf;
7169 
7170         case EXACTFUP:          /*  /foo/iu, and something is problematic in
7171                                     'foo' so can't take shortcuts. */
7172             assert(! is_utf8_pat);
7173             folder = Perl_foldEQ_latin1;
7174             fold_array = PL_fold_latin1;
7175             fold_utf8_flags = 0;
7176             goto do_exactf;
7177 
7178         case EXACTFU:            /*  /abc/iu      */
7179             folder = S_foldEQ_latin1_s2_folded;
7180             fold_array = PL_fold_latin1;
7181             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7182             goto do_exactf;
7183 
7184         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7185                                    patterns */
7186             assert(! is_utf8_pat);
7187             /* FALLTHROUGH */
7188         case EXACTFAA:            /*  /abc/iaa     */
7189             folder = S_foldEQ_latin1_s2_folded;
7190             fold_array = PL_fold_latin1;
7191             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7192             if (is_utf8_pat || ! utf8_target) {
7193 
7194                 /* The possible presence of a MICRO SIGN in the pattern forbids
7195                  * us to view a non-UTF-8 pattern as folded when there is a
7196                  * UTF-8 target */
7197                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7198                                   |FOLDEQ_S2_FOLDS_SANE;
7199             }
7200             goto do_exactf;
7201 
7202 
7203         case EXACTF:             /*  /abc/i    This node only generated for
7204                                                non-utf8 patterns */
7205             assert(! is_utf8_pat);
7206             folder = Perl_foldEQ;
7207             fold_array = PL_fold;
7208             fold_utf8_flags = 0;
7209 
7210           do_exactf:
7211             s = STRINGs(scan);
7212             ln = STR_LENs(scan);
7213 
7214             if (   utf8_target
7215                 || is_utf8_pat
7216                 || state_num == EXACTFUP
7217                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7218             {
7219               /* Either target or the pattern are utf8, or has the issue where
7220                * the fold lengths may differ. */
7221                 const char * const l = locinput;
7222                 char *e = loceol;
7223 
7224                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7225                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7226                 {
7227                     sayNO;
7228                 }
7229                 locinput = e;
7230                 break;
7231             }
7232 
7233             /* Neither the target nor the pattern are utf8 */
7234             assert(fold_array);
7235             if (UCHARAT(s) != nextbyte
7236                 && !NEXTCHR_IS_EOS
7237                 && UCHARAT(s) != fold_array[nextbyte])
7238             {
7239                 sayNO;
7240             }
7241             if (loceol - locinput < ln)
7242                 sayNO;
7243             assert(folder);
7244             if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7245                 sayNO;
7246             locinput += ln;
7247             break;
7248         }
7249 
7250         case NBOUNDL: /*  /\B/l  */
7251             to_complement = 1;
7252             /* FALLTHROUGH */
7253 
7254         case BOUNDL:  /*  /\b/l  */
7255         {
7256             bool b1, b2;
7257             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7258 
7259             if (FLAGS(scan) != TRADITIONAL_BOUND) {
7260                 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7261                 goto boundu;
7262             }
7263 
7264             if (utf8_target) {
7265                 if (locinput == reginfo->strbeg)
7266                     b1 = isWORDCHAR_LC('\n');
7267                 else {
7268                     U8 *p = reghop3((U8*)locinput, -1,
7269                                     (U8*)(reginfo->strbeg));
7270                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7271                 }
7272                 b2 = (NEXTCHR_IS_EOS)
7273                     ? isWORDCHAR_LC('\n')
7274                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7275                                               (U8*) reginfo->strend);
7276             }
7277             else { /* Here the string isn't utf8 */
7278                 b1 = (locinput == reginfo->strbeg)
7279                      ? isWORDCHAR_LC('\n')
7280                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
7281                 b2 = (NEXTCHR_IS_EOS)
7282                     ? isWORDCHAR_LC('\n')
7283                     : isWORDCHAR_LC(nextbyte);
7284             }
7285             if (to_complement ^ (b1 == b2)) {
7286                 sayNO;
7287             }
7288             break;
7289         }
7290 
7291         case NBOUND:  /*  /\B/   */
7292             to_complement = 1;
7293             /* FALLTHROUGH */
7294 
7295         case BOUND:   /*  /\b/   */
7296             if (utf8_target) {
7297                 goto bound_utf8;
7298             }
7299             goto bound_ascii_match_only;
7300 
7301         case NBOUNDA: /*  /\B/a  */
7302             to_complement = 1;
7303             /* FALLTHROUGH */
7304 
7305         case BOUNDA:  /*  /\b/a  */
7306         {
7307             bool b1, b2;
7308 
7309           bound_ascii_match_only:
7310             /* Here the string isn't utf8, or is utf8 and only ascii characters
7311              * are to match \w.  In the latter case looking at the byte just
7312              * prior to the current one may be just the final byte of a
7313              * multi-byte character.  This is ok.  There are two cases:
7314              * 1) it is a single byte character, and then the test is doing
7315              *    just what it's supposed to.
7316              * 2) it is a multi-byte character, in which case the final byte is
7317              *    never mistakable for ASCII, and so the test will say it is
7318              *    not a word character, which is the correct answer. */
7319             b1 = (locinput == reginfo->strbeg)
7320                  ? isWORDCHAR_A('\n')
7321                  : isWORDCHAR_A(UCHARAT(locinput - 1));
7322             b2 = (NEXTCHR_IS_EOS)
7323                 ? isWORDCHAR_A('\n')
7324                 : isWORDCHAR_A(nextbyte);
7325             if (to_complement ^ (b1 == b2)) {
7326                 sayNO;
7327             }
7328             break;
7329         }
7330 
7331         case NBOUNDU: /*  /\B/u  */
7332             to_complement = 1;
7333             /* FALLTHROUGH */
7334 
7335         case BOUNDU:  /*  /\b/u  */
7336 
7337           boundu:
7338             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7339                 match = FALSE;
7340             }
7341             else if (utf8_target) {
7342               bound_utf8:
7343                 switch((bound_type) FLAGS(scan)) {
7344                     case TRADITIONAL_BOUND:
7345                     {
7346                         bool b1, b2;
7347                         if (locinput == reginfo->strbeg) {
7348                             b1 = 0 /* isWORDCHAR_L1('\n') */;
7349                         }
7350                         else {
7351                             U8 *p = reghop3((U8*)locinput, -1,
7352                                             (U8*)(reginfo->strbeg));
7353 
7354                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7355                         }
7356                         b2 = (NEXTCHR_IS_EOS)
7357                             ? 0 /* isWORDCHAR_L1('\n') */
7358                             : isWORDCHAR_utf8_safe((U8*)locinput,
7359                                                    (U8*) reginfo->strend);
7360                         match = cBOOL(b1 != b2);
7361                         break;
7362                     }
7363                     case GCB_BOUND:
7364                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7365                             match = TRUE; /* GCB always matches at begin and
7366                                              end */
7367                         }
7368                         else {
7369                             /* Find the gcb values of previous and current
7370                              * chars, then see if is a break point */
7371                             match = isGCB(getGCB_VAL_UTF8(
7372                                                 reghop3((U8*)locinput,
7373                                                         -1,
7374                                                         (U8*)(reginfo->strbeg)),
7375                                                 (U8*) reginfo->strend),
7376                                           getGCB_VAL_UTF8((U8*) locinput,
7377                                                         (U8*) reginfo->strend),
7378                                           (U8*) reginfo->strbeg,
7379                                           (U8*) locinput,
7380                                           utf8_target);
7381                         }
7382                         break;
7383 
7384                     case LB_BOUND:
7385                         if (locinput == reginfo->strbeg) {
7386                             match = FALSE;
7387                         }
7388                         else if (NEXTCHR_IS_EOS) {
7389                             match = TRUE;
7390                         }
7391                         else {
7392                             match = isLB(getLB_VAL_UTF8(
7393                                                 reghop3((U8*)locinput,
7394                                                         -1,
7395                                                         (U8*)(reginfo->strbeg)),
7396                                                 (U8*) reginfo->strend),
7397                                           getLB_VAL_UTF8((U8*) locinput,
7398                                                         (U8*) reginfo->strend),
7399                                           (U8*) reginfo->strbeg,
7400                                           (U8*) locinput,
7401                                           (U8*) reginfo->strend,
7402                                           utf8_target);
7403                         }
7404                         break;
7405 
7406                     case SB_BOUND: /* Always matches at begin and end */
7407                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7408                             match = TRUE;
7409                         }
7410                         else {
7411                             match = isSB(getSB_VAL_UTF8(
7412                                                 reghop3((U8*)locinput,
7413                                                         -1,
7414                                                         (U8*)(reginfo->strbeg)),
7415                                                 (U8*) reginfo->strend),
7416                                           getSB_VAL_UTF8((U8*) locinput,
7417                                                         (U8*) reginfo->strend),
7418                                           (U8*) reginfo->strbeg,
7419                                           (U8*) locinput,
7420                                           (U8*) reginfo->strend,
7421                                           utf8_target);
7422                         }
7423                         break;
7424 
7425                     case WB_BOUND:
7426                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7427                             match = TRUE;
7428                         }
7429                         else {
7430                             match = isWB(WB_UNKNOWN,
7431                                          getWB_VAL_UTF8(
7432                                                 reghop3((U8*)locinput,
7433                                                         -1,
7434                                                         (U8*)(reginfo->strbeg)),
7435                                                 (U8*) reginfo->strend),
7436                                           getWB_VAL_UTF8((U8*) locinput,
7437                                                         (U8*) reginfo->strend),
7438                                           (U8*) reginfo->strbeg,
7439                                           (U8*) locinput,
7440                                           (U8*) reginfo->strend,
7441                                           utf8_target);
7442                         }
7443                         break;
7444                 }
7445             }
7446             else {  /* Not utf8 target */
7447                 switch((bound_type) FLAGS(scan)) {
7448                     case TRADITIONAL_BOUND:
7449                     {
7450                         bool b1, b2;
7451                         b1 = (locinput == reginfo->strbeg)
7452                             ? 0 /* isWORDCHAR_L1('\n') */
7453                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
7454                         b2 = (NEXTCHR_IS_EOS)
7455                             ? 0 /* isWORDCHAR_L1('\n') */
7456                             : isWORDCHAR_L1(nextbyte);
7457                         match = cBOOL(b1 != b2);
7458                         break;
7459                     }
7460 
7461                     case GCB_BOUND:
7462                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7463                             match = TRUE; /* GCB always matches at begin and
7464                                              end */
7465                         }
7466                         else {  /* Only CR-LF combo isn't a GCB in 0-255
7467                                    range */
7468                             match =    UCHARAT(locinput - 1) != '\r'
7469                                     || UCHARAT(locinput) != '\n';
7470                         }
7471                         break;
7472 
7473                     case LB_BOUND:
7474                         if (locinput == reginfo->strbeg) {
7475                             match = FALSE;
7476                         }
7477                         else if (NEXTCHR_IS_EOS) {
7478                             match = TRUE;
7479                         }
7480                         else {
7481                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7482                                          getLB_VAL_CP(UCHARAT(locinput)),
7483                                          (U8*) reginfo->strbeg,
7484                                          (U8*) locinput,
7485                                          (U8*) reginfo->strend,
7486                                          utf8_target);
7487                         }
7488                         break;
7489 
7490                     case SB_BOUND: /* Always matches at begin and end */
7491                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7492                             match = TRUE;
7493                         }
7494                         else {
7495                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7496                                          getSB_VAL_CP(UCHARAT(locinput)),
7497                                          (U8*) reginfo->strbeg,
7498                                          (U8*) locinput,
7499                                          (U8*) reginfo->strend,
7500                                          utf8_target);
7501                         }
7502                         break;
7503 
7504                     case WB_BOUND:
7505                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7506                             match = TRUE;
7507                         }
7508                         else {
7509                             match = isWB(WB_UNKNOWN,
7510                                          getWB_VAL_CP(UCHARAT(locinput -1)),
7511                                          getWB_VAL_CP(UCHARAT(locinput)),
7512                                          (U8*) reginfo->strbeg,
7513                                          (U8*) locinput,
7514                                          (U8*) reginfo->strend,
7515                                          utf8_target);
7516                         }
7517                         break;
7518                 }
7519             }
7520 
7521             if (to_complement ^ ! match) {
7522                 sayNO;
7523             }
7524             break;
7525 
7526         case ANYOFPOSIXL:
7527         case ANYOFL:  /*  /[abc]/l      */
7528             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7529             CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7530 
7531             /* FALLTHROUGH */
7532         case ANYOFD:  /*   /[abc]/d       */
7533         case ANYOF:  /*   /[abc]/       */
7534             if (NEXTCHR_IS_EOS || locinput >= loceol)
7535                 sayNO;
7536             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7537                 && ! ANYOF_FLAGS(scan)
7538                 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7539             {
7540                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7541                     sayNO;
7542                 }
7543                 locinput++;
7544             }
7545             else {
7546                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7547                                                                    utf8_target))
7548                 {
7549                     sayNO;
7550                 }
7551                 goto increment_locinput;
7552             }
7553             break;
7554 
7555         case ANYOFM:
7556             if (   NEXTCHR_IS_EOS
7557                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan)
7558                 || locinput >= loceol)
7559             {
7560                 sayNO;
7561             }
7562             locinput++; /* ANYOFM is always single byte */
7563             break;
7564 
7565         case NANYOFM:
7566             if (   NEXTCHR_IS_EOS
7567                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan)
7568                 || locinput >= loceol)
7569             {
7570                 sayNO;
7571             }
7572             goto increment_locinput;
7573             break;
7574 
7575         case ANYOFH:
7576             if (   ! utf8_target
7577                 ||   NEXTCHR_IS_EOS
7578                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7579                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7580                 || ! _invlist_contains_cp(anyofh_list,
7581                                           utf8_to_uvchr_buf((U8 *) locinput,
7582                                                             (U8 *) loceol,
7583                                                             NULL)))
7584             {
7585                 sayNO;
7586             }
7587             goto increment_locinput;
7588             break;
7589 
7590         case ANYOFHb:
7591             if (   ! utf8_target
7592                 ||   NEXTCHR_IS_EOS
7593                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
7594                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7595                 || ! _invlist_contains_cp(anyofh_list,
7596                                           utf8_to_uvchr_buf((U8 *) locinput,
7597                                                             (U8 *) loceol,
7598                                                             NULL)))
7599             {
7600                 sayNO;
7601             }
7602             goto increment_locinput;
7603             break;
7604 
7605         case ANYOFHbbm:
7606             if (   ! utf8_target
7607                 ||   NEXTCHR_IS_EOS
7608                 ||   ANYOF_FLAGS(scan) != (U8) locinput[0]
7609                 ||   locinput >= reginfo->strend
7610                 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7611                                    (U8) locinput[1] & UTF_CONTINUATION_MASK))
7612             {
7613                 sayNO;
7614             }
7615             goto increment_locinput;
7616             break;
7617 
7618         case ANYOFHr:
7619             if (   ! utf8_target
7620                 ||   NEXTCHR_IS_EOS
7621                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7622                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7623                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7624                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7625                 || ! _invlist_contains_cp(anyofh_list,
7626                                           utf8_to_uvchr_buf((U8 *) locinput,
7627                                                             (U8 *) loceol,
7628                                                             NULL)))
7629             {
7630                 sayNO;
7631             }
7632             goto increment_locinput;
7633             break;
7634 
7635         case ANYOFHs:
7636             if (   ! utf8_target
7637                 ||   NEXTCHR_IS_EOS
7638                 ||   loceol - locinput < FLAGS(scan)
7639                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7640                 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7641                 || ! _invlist_contains_cp(anyofh_list,
7642                                           utf8_to_uvchr_buf((U8 *) locinput,
7643                                                             (U8 *) loceol,
7644                                                             NULL)))
7645             {
7646                 sayNO;
7647             }
7648             goto increment_locinput;
7649             break;
7650 
7651         case ANYOFR:
7652             if (NEXTCHR_IS_EOS) {
7653                 sayNO;
7654             }
7655 
7656             if (utf8_target) {
7657                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7658                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7659                                                 (U8 *) reginfo->strend,
7660                                                 NULL),
7661                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7662                 {
7663                     sayNO;
7664                 }
7665             }
7666             else {
7667                 if (! withinCOUNT((U8) *locinput,
7668                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7669                 {
7670                     sayNO;
7671                 }
7672             }
7673             goto increment_locinput;
7674             break;
7675 
7676         case ANYOFRb:
7677             if (NEXTCHR_IS_EOS) {
7678                 sayNO;
7679             }
7680 
7681             if (utf8_target) {
7682                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
7683                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7684                                                 (U8 *) reginfo->strend,
7685                                                 NULL),
7686                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
7687                 {
7688                     sayNO;
7689                 }
7690             }
7691             else {
7692                 if (! withinCOUNT((U8) *locinput,
7693                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7694                 {
7695                     sayNO;
7696                 }
7697             }
7698             goto increment_locinput;
7699             break;
7700 
7701         /* The argument (FLAGS) to all the POSIX node types is the class number
7702          * */
7703 
7704         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7705             to_complement = 1;
7706             /* FALLTHROUGH */
7707 
7708         case POSIXL:    /* \w or [:punct:] etc. under /l */
7709             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7710             if (NEXTCHR_IS_EOS || locinput >= loceol)
7711                 sayNO;
7712 
7713             /* Use isFOO_lc() for characters within Latin1.  (Note that
7714              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7715              * wouldn't be invariant) */
7716             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7717                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7718                     sayNO;
7719                 }
7720 
7721                 locinput++;
7722                 break;
7723             }
7724 
7725             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7726                 /* An above Latin-1 code point, or malformed */
7727                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7728                                                        reginfo->strend);
7729                 goto utf8_posix_above_latin1;
7730             }
7731 
7732             /* Here is a UTF-8 variant code point below 256 and the target is
7733              * UTF-8 */
7734             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7735                                             EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7736                                             *(locinput + 1))))))
7737             {
7738                 sayNO;
7739             }
7740 
7741             goto increment_locinput;
7742 
7743         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7744             to_complement = 1;
7745             /* FALLTHROUGH */
7746 
7747         case POSIXD:    /* \w or [:punct:] etc. under /d */
7748             if (utf8_target) {
7749                 goto utf8_posix;
7750             }
7751             goto posixa;
7752 
7753         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7754 
7755             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7756                 sayNO;
7757             }
7758 
7759             /* All UTF-8 variants match */
7760             if (! UTF8_IS_INVARIANT(nextbyte)) {
7761                 goto increment_locinput;
7762             }
7763 
7764             to_complement = 1;
7765             goto join_nposixa;
7766 
7767         case POSIXA:    /* \w or [:punct:] etc. under /a */
7768 
7769           posixa:
7770             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7771              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7772              * character is a single byte */
7773 
7774             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7775                 sayNO;
7776             }
7777 
7778           join_nposixa:
7779 
7780             if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7781                                                                 FLAGS(scan)))))
7782             {
7783                 sayNO;
7784             }
7785 
7786             /* Here we are either not in utf8, or we matched a utf8-invariant,
7787              * so the next char is the next byte */
7788             locinput++;
7789             break;
7790 
7791         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7792             to_complement = 1;
7793             /* FALLTHROUGH */
7794 
7795         case POSIXU:    /* \w or [:punct:] etc. under /u */
7796           utf8_posix:
7797             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7798                 sayNO;
7799             }
7800 
7801             /* Use generic_isCC_() for characters within Latin1.  (Note that
7802              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7803              * wouldn't be invariant) */
7804             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7805                 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7806                                                            FLAGS(scan)))))
7807                 {
7808                     sayNO;
7809                 }
7810                 locinput++;
7811             }
7812             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7813                 if (! (to_complement
7814                        ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7815                                                                *(locinput + 1)),
7816                                              FLAGS(scan)))))
7817                 {
7818                     sayNO;
7819                 }
7820                 locinput += 2;
7821             }
7822             else {  /* Handle above Latin-1 code points */
7823               utf8_posix_above_latin1:
7824                 classnum = (char_class_number_) FLAGS(scan);
7825                 switch (classnum) {
7826                     default:
7827                         if (! (to_complement
7828                            ^ cBOOL(_invlist_contains_cp(
7829                                       PL_XPosix_ptrs[classnum],
7830                                       utf8_to_uvchr_buf((U8 *) locinput,
7831                                                         (U8 *) reginfo->strend,
7832                                                         NULL)))))
7833                         {
7834                             sayNO;
7835                         }
7836                         break;
7837                     case CC_ENUM_SPACE_:
7838                         if (! (to_complement
7839                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7840                         {
7841                             sayNO;
7842                         }
7843                         break;
7844                     case CC_ENUM_BLANK_:
7845                         if (! (to_complement
7846                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7847                         {
7848                             sayNO;
7849                         }
7850                         break;
7851                     case CC_ENUM_XDIGIT_:
7852                         if (! (to_complement
7853                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7854                         {
7855                             sayNO;
7856                         }
7857                         break;
7858                     case CC_ENUM_VERTSPACE_:
7859                         if (! (to_complement
7860                                         ^ cBOOL(is_VERTWS_high(locinput))))
7861                         {
7862                             sayNO;
7863                         }
7864                         break;
7865                     case CC_ENUM_CNTRL_:    /* These can't match above Latin1 */
7866                     case CC_ENUM_ASCII_:
7867                         if (! to_complement) {
7868                             sayNO;
7869                         }
7870                         break;
7871                 }
7872                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7873             }
7874             break;
7875 
7876         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7877                        a Unicode extended Grapheme Cluster */
7878             if (NEXTCHR_IS_EOS || locinput >= loceol)
7879                 sayNO;
7880             if  (! utf8_target) {
7881 
7882                 /* Match either CR LF  or '.', as all the other possibilities
7883                  * require utf8 */
7884                 locinput++;	    /* Match the . or CR */
7885                 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7886                                        match the LF */
7887                     && locinput <  loceol
7888                     && UCHARAT(locinput) == '\n')
7889                 {
7890                     locinput++;
7891                 }
7892             }
7893             else {
7894 
7895                 /* Get the gcb type for the current character */
7896                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7897                                                        (U8*) reginfo->strend);
7898 
7899                 /* Then scan through the input until we get to the first
7900                  * character whose type is supposed to be a gcb with the
7901                  * current character.  (There is always a break at the
7902                  * end-of-input) */
7903                 locinput += UTF8SKIP(locinput);
7904                 while (locinput < loceol) {
7905                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7906                                                          (U8*) reginfo->strend);
7907                     if (isGCB(prev_gcb, cur_gcb,
7908                               (U8*) reginfo->strbeg, (U8*) locinput,
7909                               utf8_target))
7910                     {
7911                         break;
7912                     }
7913 
7914                     prev_gcb = cur_gcb;
7915                     locinput += UTF8SKIP(locinput);
7916                 }
7917 
7918 
7919             }
7920             break;
7921 
7922         case REFFLN:  /*  /\g{name}/il  */
7923         {   /* The capture buffer cases.  The ones beginning with N for the
7924                named buffers just convert to the equivalent numbered and
7925                pretend they were called as the corresponding numbered buffer
7926                op.  */
7927             /* don't initialize these in the declaration, it makes C++
7928                unhappy */
7929             const char *s;
7930             char type;
7931             re_fold_t folder;
7932             const U8 *fold_array;
7933             UV utf8_fold_flags;
7934 
7935             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7936             folder = Perl_foldEQ_locale;
7937             fold_array = PL_fold_locale;
7938             type = REFFL;
7939             utf8_fold_flags = FOLDEQ_LOCALE;
7940             goto do_nref;
7941 
7942         case REFFAN:  /*  /\g{name}/iaa  */
7943             folder = Perl_foldEQ_latin1;
7944             fold_array = PL_fold_latin1;
7945             type = REFFA;
7946             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7947             goto do_nref;
7948 
7949         case REFFUN:  /*  /\g{name}/iu  */
7950             folder = Perl_foldEQ_latin1;
7951             fold_array = PL_fold_latin1;
7952             type = REFFU;
7953             utf8_fold_flags = 0;
7954             goto do_nref;
7955 
7956         case REFFN:  /*  /\g{name}/i  */
7957             folder = Perl_foldEQ;
7958             fold_array = PL_fold;
7959             type = REFF;
7960             utf8_fold_flags = 0;
7961             goto do_nref;
7962 
7963         case REFN:  /*  /\g{name}/   */
7964             type = REF;
7965             folder = NULL;
7966             fold_array = NULL;
7967             utf8_fold_flags = 0;
7968           do_nref:
7969 
7970             /* For the named back references, find the corresponding buffer
7971              * number */
7972             n = reg_check_named_buff_matched(rex,scan);
7973 
7974             if ( ! n ) {
7975                 sayNO;
7976             }
7977             goto do_nref_ref_common;
7978 
7979         case REFFL:  /*  /\1/il  */
7980             CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7981             folder = Perl_foldEQ_locale;
7982             fold_array = PL_fold_locale;
7983             utf8_fold_flags = FOLDEQ_LOCALE;
7984             goto do_ref;
7985 
7986         case REFFA:  /*  /\1/iaa  */
7987             folder = Perl_foldEQ_latin1;
7988             fold_array = PL_fold_latin1;
7989             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7990             goto do_ref;
7991 
7992         case REFFU:  /*  /\1/iu  */
7993             folder = Perl_foldEQ_latin1;
7994             fold_array = PL_fold_latin1;
7995             utf8_fold_flags = 0;
7996             goto do_ref;
7997 
7998         case REFF:  /*  /\1/i  */
7999             folder = Perl_foldEQ;
8000             fold_array = PL_fold;
8001             utf8_fold_flags = 0;
8002             goto do_ref;
8003 
8004 #undef  ST
8005 #define ST st->u.backref
8006         case REF:  /*  /\1/    */
8007             folder = NULL;
8008             fold_array = NULL;
8009             utf8_fold_flags = 0;
8010 
8011           do_ref:
8012             type = OP(scan);
8013             n = ARG1u(scan);  /* which paren pair */
8014             if (rex->logical_to_parno) {
8015                 n = rex->logical_to_parno[n];
8016                 do {
8017                     if ( RXp_LASTPAREN(rex) < n ||
8018                          RXp_OFFS_START(rex,n) == -1 ||
8019                          RXp_OFFS_END(rex,n) == -1
8020                     ) {
8021                         n = rex->parno_to_logical_next[n];
8022                     }
8023                     else {
8024                         break;
8025                     }
8026                 } while(n);
8027 
8028                 if (!n) /* this means there is nothing that matched */
8029                     sayNO;
8030             }
8031 
8032           do_nref_ref_common:
8033             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8034             if (RXp_LASTPAREN(rex) < n)
8035                 sayNO;
8036 
8037             ln = RXp_OFFSp(rex)[n].start;
8038             endref = RXp_OFFSp(rex)[n].end;
8039             if (ln == -1 || endref == -1)
8040                 sayNO;			/* Do not match unless seen CLOSEn. */
8041 
8042             if (ln == endref)
8043                 goto ref_yes;
8044 
8045             s = reginfo->strbeg + ln;
8046             if (type != REF	/* REF can do byte comparison */
8047                 && (utf8_target || type == REFFU || type == REFFL))
8048             {
8049                 char * limit = loceol;
8050 
8051                 /* This call case insensitively compares the entire buffer
8052                     * at s, with the current input starting at locinput, but
8053                     * not going off the end given by loceol, and
8054                     * returns in <limit> upon success, how much of the
8055                     * current input was matched */
8056                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
8057                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
8058                 {
8059                     sayNO;
8060                 }
8061                 locinput = limit;
8062                 goto ref_yes;
8063             }
8064 
8065             /* Not utf8:  Inline the first character, for speed. */
8066             if ( ! NEXTCHR_IS_EOS
8067                 && locinput < loceol
8068                 && UCHARAT(s) != nextbyte
8069                 && (   type == REF
8070                     || UCHARAT(s) != fold_array[nextbyte]))
8071             {
8072                 sayNO;
8073             }
8074             ln = endref - ln;
8075             if (locinput + ln > loceol)
8076                 sayNO;
8077             if (ln > 1 && (type == REF
8078                            ? memNE(s, locinput, ln)
8079                            : ! folder(aTHX_ locinput, s, ln)))
8080                 sayNO;
8081             locinput += ln;
8082         }
8083         ref_yes:
8084             if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */
8085                 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen);
8086                 REGCP_SET(ST.lastcp);
8087                 PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
8088                                 script_run_begin);
8089             }
8090             break;
8091             NOT_REACHED; /* NOTREACHED */
8092 
8093         case REF_next:
8094             sayYES;
8095             break;
8096 
8097         case REF_next_fail:
8098             REGCP_UNWIND(ST.lastcp);
8099             regcppop(rex, &maxopenparen);
8100             sayNO;
8101             break;
8102 
8103         case NOTHING: /* null op; e.g. the 'nothing' following
8104                        * the '*' in m{(a+|b)*}' */
8105             break;
8106         case TAIL: /* placeholder while compiling (A|B|C) */
8107             break;
8108 
8109 #undef  ST
8110 #define ST st->u.eval
8111 #define CUR_EVAL cur_eval->u.eval
8112 
8113         {
8114             SV *ret;
8115             REGEXP *re_sv;
8116             regexp *re;
8117             regexp_internal *rei;
8118             regnode *startpoint;
8119             U32 arg;
8120 
8121         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
8122             arg = ARG1u(scan);
8123             if (cur_eval && cur_eval->locinput == locinput) {
8124                 if ( ++nochange_depth > max_nochange_depth )
8125                     Perl_croak(aTHX_
8126                         "Pattern subroutine nesting without pos change"
8127                         " exceeded limit in regex");
8128             } else {
8129                 nochange_depth = 0;
8130             }
8131             re_sv = rex_sv;
8132             re = rex;
8133             rei = rexi;
8134             startpoint = scan + ARG2i(scan);
8135             EVAL_CLOSE_PAREN_SET( st, arg );
8136             /* Detect infinite recursion
8137              *
8138              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
8139              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
8140              * So we track the position in the string we are at each time
8141              * we recurse and if we try to enter the same routine twice from
8142              * the same position we throw an error.
8143              */
8144             if ( rex->recurse_locinput[arg] == locinput ) {
8145                 /* FIXME: we should show the regop that is failing as part
8146                  * of the error message. */
8147                 Perl_croak(aTHX_ "Infinite recursion in regex");
8148             } else {
8149                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8150                 rex->recurse_locinput[arg]= locinput;
8151 
8152                 DEBUG_r({
8153                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8154                     DEBUG_STACK_r({
8155                         Perl_re_exec_indentf( aTHX_
8156                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8157                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8158                         );
8159                     });
8160                 });
8161             }
8162 
8163             /* Save all the positions seen so far. */
8164             ST.cp = regcppush(rex, 0, maxopenparen);
8165             REGCP_SET(ST.lastcp);
8166 
8167             /* and then jump to the code we share with EVAL */
8168             goto eval_recurse_doit;
8169             /* NOTREACHED */
8170 
8171         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
8172             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8173                 if ( ++nochange_depth > max_nochange_depth )
8174                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8175             } else {
8176                 nochange_depth = 0;
8177             }
8178             {
8179                 /* execute the code in the {...} */
8180 
8181                 dSP;
8182                 IV before;
8183                 OP * const oop = PL_op;
8184                 COP * const ocurcop = PL_curcop;
8185                 OP *nop;
8186                 CV *newcv;
8187 
8188                 /* save *all* paren positions */
8189                 ST.cp = regcppush(rex, 0, maxopenparen);
8190                 REGCP_SET(ST.lastcp);
8191 
8192                 if (!caller_cv)
8193                     caller_cv = find_runcv(NULL);
8194 
8195                 n = ARG1u(scan);
8196 
8197                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8198                     newcv = (ReANY(
8199                                     (REGEXP*)(rexi->data->data[n])
8200                             ))->qr_anoncv;
8201                     nop = (OP*)rexi->data->data[n+1];
8202                 }
8203                 else if (rexi->data->what[n] == 'l') { /* literal code */
8204                     newcv = caller_cv;
8205                     nop = (OP*)rexi->data->data[n];
8206                     assert(CvDEPTH(newcv));
8207                 }
8208                 else {
8209                     /* literal with own CV */
8210                     assert(rexi->data->what[n] == 'L');
8211                     newcv = rex->qr_anoncv;
8212                     nop = (OP*)rexi->data->data[n];
8213                 }
8214 
8215                 /* Some notes about MULTICALL and the context and save stacks.
8216                  *
8217                  * In something like
8218                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8219                  * since codeblocks don't introduce a new scope (so that
8220                  * local() etc accumulate), at the end of a successful
8221                  * match there will be a SAVEt_CLEARSV on the savestack
8222                  * for each of $x, $y, $z. If the three code blocks above
8223                  * happen to have come from different CVs (e.g. via
8224                  * embedded qr//s), then we must ensure that during any
8225                  * savestack unwinding, PL_comppad always points to the
8226                  * right pad at each moment. We achieve this by
8227                  * interleaving SAVEt_COMPPAD's on the savestack whenever
8228                  * there is a change of pad.
8229                  * In theory whenever we call a code block, we should
8230                  * push a CXt_SUB context, then pop it on return from
8231                  * that code block. This causes a bit of an issue in that
8232                  * normally popping a context also clears the savestack
8233                  * back to cx->blk_oldsaveix, but here we specifically
8234                  * don't want to clear the save stack on exit from the
8235                  * code block.
8236                  * Also for efficiency we don't want to keep pushing and
8237                  * popping the single SUB context as we backtrack etc.
8238                  * So instead, we push a single context the first time
8239                  * we need, it, then hang onto it until the end of this
8240                  * function. Whenever we encounter a new code block, we
8241                  * update the CV etc if that's changed. During the times
8242                  * in this function where we're not executing a code
8243                  * block, having the SUB context still there is a bit
8244                  * naughty - but we hope that no-one notices.
8245                  * When the SUB context is initially pushed, we fake up
8246                  * cx->blk_oldsaveix to be as if we'd pushed this context
8247                  * on first entry to S_regmatch rather than at some random
8248                  * point during the regexe execution. That way if we
8249                  * croak, popping the context stack will ensure that
8250                  * *everything* SAVEd by this function is undone and then
8251                  * the context popped, rather than e.g., popping the
8252                  * context (and restoring the original PL_comppad) then
8253                  * popping more of the savestack and restoring a bad
8254                  * PL_comppad.
8255                  */
8256 
8257                 /* If this is the first EVAL, push a MULTICALL. On
8258                  * subsequent calls, if we're executing a different CV, or
8259                  * if PL_comppad has got messed up from backtracking
8260                  * through SAVECOMPPADs, then refresh the context.
8261                  */
8262                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8263                 {
8264                     U8 flags = (CXp_SUB_RE |
8265                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8266                     SAVECOMPPAD();
8267                     if (last_pushed_cv) {
8268                         CHANGE_MULTICALL_FLAGS(newcv, flags);
8269                     }
8270                     else {
8271                         PUSH_MULTICALL_FLAGS(newcv, flags);
8272                     }
8273                     /* see notes above */
8274                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8275 
8276                     last_pushed_cv = newcv;
8277                 }
8278                 else {
8279                     /* these assignments are just to silence compiler
8280                      * warnings */
8281                     multicall_cop = NULL;
8282                 }
8283                 last_pad = PL_comppad;
8284 
8285                 /* the initial nextstate you would normally execute
8286                  * at the start of an eval (which would cause error
8287                  * messages to come from the eval), may be optimised
8288                  * away from the execution path in the regex code blocks;
8289                  * so manually set PL_curcop to it initially */
8290                 {
8291                     OP *o = cUNOPx(nop)->op_first;
8292                     assert(o->op_type == OP_NULL);
8293                     if (o->op_targ == OP_SCOPE) {
8294                         o = cUNOPo->op_first;
8295                     }
8296                     else {
8297                         assert(o->op_targ == OP_LEAVE);
8298                         o = cUNOPo->op_first;
8299                         assert(o->op_type == OP_ENTER);
8300                         o = OpSIBLING(o);
8301                     }
8302 
8303                     if (o->op_type != OP_STUB) {
8304                         assert(    o->op_type == OP_NEXTSTATE
8305                                 || o->op_type == OP_DBSTATE
8306                                 || (o->op_type == OP_NULL
8307                                     &&  (  o->op_targ == OP_NEXTSTATE
8308                                         || o->op_targ == OP_DBSTATE
8309                                         )
8310                                     )
8311                         );
8312                         PL_curcop = (COP*)o;
8313                     }
8314                 }
8315                 nop = nop->op_next;
8316 
8317                 DEBUG_STATE_r( Perl_re_printf( aTHX_
8318                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8319 
8320                 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg;
8321                 if (reginfo->info_aux_eval->pos_magic)
8322                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8323                                   reginfo->sv, reginfo->strbeg,
8324                                   locinput - reginfo->strbeg);
8325 
8326                 if (sv_yes_mark) {
8327                     SV *sv_mrk = get_sv("REGMARK", 1);
8328                     sv_setsv(sv_mrk, sv_yes_mark);
8329                 }
8330 
8331                 /* we don't use MULTICALL here as we want to call the
8332                  * first op of the block of interest, rather than the
8333                  * first op of the sub. Also, we don't want to free
8334                  * the savestack frame */
8335                 before = (IV)(SP-PL_stack_base);
8336                 PL_op = nop;
8337                 CALLRUNOPS(aTHX);			/* Scalar context. */
8338                 SPAGAIN;
8339                 if ((IV)(SP-PL_stack_base) == before)
8340                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8341                 else {
8342                     ret = POPs;
8343                     PUTBACK;
8344                 }
8345 
8346                 /* before restoring everything, evaluate the returned
8347                  * value, so that 'uninit' warnings don't use the wrong
8348                  * PL_op or pad. Also need to process any magic vars
8349                  * (e.g. $1) *before* parentheses are restored */
8350 
8351                 PL_op = NULL;
8352 
8353                 re_sv = NULL;
8354                 if (logical == 0) {       /* /(?{ ... })/ and /(*{ ... })/ */
8355                     SV *replsv = save_scalar(PL_replgv);
8356                     sv_setsv(replsv, ret); /* $^R */
8357                     SvSETMAGIC(replsv);
8358                 }
8359                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8360                     sw = cBOOL(SvTRUE_NN(ret));
8361                     logical = 0;
8362                 }
8363                 else {                   /*  /(??{ ... })  */
8364                     /*  if its overloaded, let the regex compiler handle
8365                      *  it; otherwise extract regex, or stringify  */
8366                     if (SvGMAGICAL(ret))
8367                         ret = sv_mortalcopy(ret);
8368                     if (!SvAMAGIC(ret)) {
8369                         SV *sv = ret;
8370                         if (SvROK(sv))
8371                             sv = SvRV(sv);
8372                         if (SvTYPE(sv) == SVt_REGEXP)
8373                             re_sv = (REGEXP*) sv;
8374                         else if (SvSMAGICAL(ret)) {
8375                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8376                             if (mg)
8377                                 re_sv = (REGEXP *) mg->mg_obj;
8378                         }
8379 
8380                         /* force any undef warnings here */
8381                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8382                             ret = sv_mortalcopy(ret);
8383                             (void) SvPV_force_nolen(ret);
8384                         }
8385                     }
8386 
8387                 }
8388 
8389                 /* *** Note that at this point we don't restore
8390                  * PL_comppad, (or pop the CxSUB) on the assumption it may
8391                  * be used again soon. This is safe as long as nothing
8392                  * in the regexp code uses the pad ! */
8393                 PL_op = oop;
8394                 PL_curcop = ocurcop;
8395                 regcp_restore(rex, ST.lastcp, &maxopenparen);
8396                 PL_curpm_under = PL_curpm;
8397                 PL_curpm = PL_reg_curpm;
8398 
8399                 if (logical != 2) {
8400                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8401                                     script_run_begin);
8402                     /* NOTREACHED */
8403                 }
8404             }
8405 
8406                 /* only /(??{ ... })/  from now on */
8407                 logical = 0;
8408                 {
8409                     /* extract RE object from returned value; compiling if
8410                      * necessary */
8411 
8412                     if (re_sv) {
8413                         re_sv = reg_temp_copy(NULL, re_sv);
8414                     }
8415                     else {
8416                         U32 pm_flags = 0;
8417 
8418                         if (SvUTF8(ret) && IN_BYTES) {
8419                             /* In use 'bytes': make a copy of the octet
8420                              * sequence, but without the flag on */
8421                             STRLEN len;
8422                             const char *const p = SvPV(ret, len);
8423                             ret = newSVpvn_flags(p, len, SVs_TEMP);
8424                         }
8425                         if (rex->intflags & PREGf_USE_RE_EVAL)
8426                             pm_flags |= PMf_USE_RE_EVAL;
8427 
8428                         /* if we got here, it should be an engine which
8429                          * supports compiling code blocks and stuff */
8430                         assert(rex->engine && rex->engine->op_comp);
8431                         assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME));
8432                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8433                                     rex->engine, NULL, NULL,
8434                                     /* copy /msixn etc to inner pattern */
8435                                     ARG2i(scan),
8436                                     pm_flags);
8437 
8438                         if (!(SvFLAGS(ret)
8439                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
8440                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8441                             /* This isn't a first class regexp. Instead, it's
8442                                caching a regexp onto an existing, Perl visible
8443                                scalar.  */
8444                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8445                         }
8446                     }
8447                     SAVEFREESV(re_sv);
8448                     re = ReANY(re_sv);
8449                 }
8450                 RXp_MATCH_COPIED_off(re);
8451                 RXp_SUBBEG(re) = RXp_SUBBEG(rex);
8452                 RXp_SUBLEN(re) = RXp_SUBLEN(rex);
8453                 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex);
8454                 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex);
8455                 RXp_LASTPAREN(re) = 0;
8456                 RXp_LASTCLOSEPAREN(re) = 0;
8457                 rei = RXi_GET(re);
8458                 DEBUG_EXECUTE_r(
8459                     debug_start_match(re_sv, utf8_target, locinput,
8460                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
8461                 );
8462                 startpoint = rei->program + 1;
8463                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8464                                              * close_paren only for GOSUB */
8465                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8466 
8467                 /* note we saved the paren state earlier:
8468                 ST.cp = regcppush(rex, 0, maxopenparen);
8469                 REGCP_SET(ST.lastcp);
8470                 */
8471                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8472                 maxopenparen = 0;
8473                 /* run the pattern returned from (??{...}) */
8474 
8475               eval_recurse_doit: /* Share code with GOSUB below this line
8476                             * At this point we expect the stack context to be
8477                             * set up correctly */
8478 
8479                 /* invalidate the S-L poscache. We're now executing a
8480                  * different set of WHILEM ops (and their associated
8481                  * indexes) against the same string, so the bits in the
8482                  * cache are meaningless. Setting maxiter to zero forces
8483                  * the cache to be invalidated and zeroed before reuse.
8484                  * XXX This is too dramatic a measure. Ideally we should
8485                  * save the old cache and restore when running the outer
8486                  * pattern again */
8487                 reginfo->poscache_maxiter = 0;
8488 
8489                 /* the new regexp might have a different is_utf8_pat than we do */
8490                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8491 
8492                 ST.prev_rex = rex_sv;
8493                 ST.prev_curlyx = cur_curlyx;
8494                 rex_sv = re_sv;
8495                 SET_reg_curpm(rex_sv);
8496                 rex = re;
8497                 rexi = rei;
8498                 cur_curlyx = NULL;
8499                 ST.B = next;
8500                 ST.prev_eval = cur_eval;
8501                 cur_eval = st;
8502                 /* now continue from first node in postoned RE */
8503                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8504                                     loceol, script_run_begin);
8505                 NOT_REACHED; /* NOTREACHED */
8506         }
8507 
8508         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8509             /* note: this is called twice; first after popping B, then A */
8510             DEBUG_STACK_r({
8511                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8512                     depth, cur_eval, ST.prev_eval);
8513             });
8514 
8515 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8516             if ( cur_eval && CUR_EVAL.close_paren ) {\
8517                 DEBUG_STACK_r({ \
8518                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8519                         depth,    \
8520                         CUR_EVAL.close_paren - 1,\
8521                         cur_eval, \
8522                         VAL);     \
8523                 });               \
8524                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8525             }
8526 
8527             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8528 
8529             rex_sv = ST.prev_rex;
8530             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8531             SET_reg_curpm(rex_sv);
8532             rex = ReANY(rex_sv);
8533             rexi = RXi_GET(rex);
8534             {
8535                 /* preserve $^R across LEAVE's. See Bug 121070. */
8536                 SV *save_sv= GvSV(PL_replgv);
8537                 SV *replsv;
8538                 SvREFCNT_inc(save_sv);
8539                 regcpblow(ST.cp); /* LEAVE in disguise */
8540                 /* don't move this initialization up */
8541                 replsv = GvSV(PL_replgv);
8542                 sv_setsv(replsv, save_sv);
8543                 SvSETMAGIC(replsv);
8544                 SvREFCNT_dec(save_sv);
8545             }
8546             cur_eval = ST.prev_eval;
8547             cur_curlyx = ST.prev_curlyx;
8548 
8549             /* Invalidate cache. See "invalidate" comment above. */
8550             reginfo->poscache_maxiter = 0;
8551             if ( nochange_depth )
8552                 nochange_depth--;
8553 
8554             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8555             sayYES;
8556 
8557 
8558         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8559             REGCP_UNWIND(ST.lastcp);
8560             regcppop(rex, &maxopenparen);
8561             sayNO;
8562 
8563         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8564             /* note: this is called twice; first after popping B, then A */
8565             DEBUG_STACK_r({
8566                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8567                     depth, cur_eval, ST.prev_eval);
8568             });
8569 
8570             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8571 
8572             rex_sv = ST.prev_rex;
8573             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8574             SET_reg_curpm(rex_sv);
8575             rex = ReANY(rex_sv);
8576             rexi = RXi_GET(rex);
8577 
8578             REGCP_UNWIND(ST.lastcp);
8579             regcppop(rex, &maxopenparen);
8580             cur_eval = ST.prev_eval;
8581             cur_curlyx = ST.prev_curlyx;
8582 
8583             /* Invalidate cache. See "invalidate" comment above. */
8584             reginfo->poscache_maxiter = 0;
8585             if ( nochange_depth )
8586                 nochange_depth--;
8587 
8588             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8589             sayNO_SILENT;
8590 #undef ST
8591 
8592         case OPEN: /*  (  */
8593             n = PARNO(scan);  /* which paren pair */
8594             RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg;
8595             if (n > maxopenparen)
8596                 maxopenparen = n;
8597             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8598                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8599                 depth,
8600                 PTR2UV(rex),
8601                 PTR2UV(RXp_OFFSp(rex)),
8602                 (UV)n,
8603                 (IV)RXp_OFFSp(rex)[n].start_tmp,
8604                 (UV)maxopenparen
8605             ));
8606             lastopen = n;
8607             break;
8608 
8609         case SROPEN: /*  (*SCRIPT_RUN:  */
8610             script_run_begin = (U8 *) locinput;
8611             break;
8612 
8613 
8614         case CLOSE:  /*  )  */
8615             n = PARNO(scan);  /* which paren pair */
8616             CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8617                              locinput - reginfo->strbeg);
8618             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8619                 goto fake_end;
8620 
8621             break;
8622 
8623         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8624 
8625             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8626             {
8627                 sayNO;
8628             }
8629 
8630             break;
8631 
8632 
8633         case ACCEPT:  /*  (*ACCEPT)  */
8634             is_accepted = true;
8635             if (FLAGS(scan))
8636                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
8637             utmp = ARG2u(scan);
8638 
8639             if ( utmp ) {
8640                 regnode *cursor;
8641                 for (
8642                     cursor = scan;
8643                     cursor && ( OP(cursor) != END );
8644                     cursor = (
8645                                REGNODE_TYPE( OP(cursor) ) == END
8646                                || REGNODE_TYPE( OP(cursor) ) == WHILEM
8647                              )
8648                              ? REGNODE_AFTER(cursor)
8649                              : regnext(cursor)
8650                 ){
8651                     if ( OP(cursor) != CLOSE )
8652                         continue;
8653 
8654                     n = PARNO(cursor);
8655 
8656                     if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8657                         continue;       /* so skip this one */
8658 
8659                     CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8660                                      locinput - reginfo->strbeg);
8661 
8662                     if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8663                         break;
8664                 }
8665             }
8666             goto fake_end;
8667             /* NOTREACHED */
8668 
8669         case GROUPP:  /*  (?(1))  */
8670             n = ARG1u(scan);  /* which paren pair */
8671             sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1);
8672             break;
8673 
8674         case GROUPPN:  /*  (?(<name>))  */
8675             /* reg_check_named_buff_matched returns 0 for no match */
8676             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8677             break;
8678 
8679         case INSUBP:   /*  (?(R))  */
8680             n = ARG1u(scan);
8681             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8682              * of SCAN is already set up as matches a eval.close_paren */
8683             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8684             break;
8685 
8686         case DEFINEP:  /*  (?(DEFINE))  */
8687             sw = 0;
8688             break;
8689 
8690         case IFTHEN:   /*  (?(cond)A|B)  */
8691             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8692             if (sw)
8693                 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8694             else {
8695                 next = scan + ARG1u(scan);
8696                 if (OP(next) == IFTHEN) /* Fake one. */
8697                     next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8698             }
8699             break;
8700 
8701         case LOGICAL:  /* modifier for EVAL and IFMATCH */
8702             logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */
8703             break;
8704 
8705 /*******************************************************************
8706 
8707 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8708 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8709 STAR/PLUS/CURLY/CURLYN are used instead.)
8710 
8711 A*B is compiled as <CURLYX><A><WHILEM><B>
8712 
8713 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8714 state, which contains the current count, initialised to -1. It also sets
8715 cur_curlyx to point to this state, with any previous value saved in the
8716 state block.
8717 
8718 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8719 since the pattern may possibly match zero times (i.e. it's a while {} loop
8720 rather than a do {} while loop).
8721 
8722 Each entry to WHILEM represents a successful match of A. The count in the
8723 CURLYX block is incremented, another WHILEM state is pushed, and execution
8724 passes to A or B depending on greediness and the current count.
8725 
8726 For example, if matching against the string a1a2a3b (where the aN are
8727 substrings that match /A/), then the match progresses as follows: (the
8728 pushed states are interspersed with the bits of strings matched so far):
8729 
8730     <CURLYX cnt=-1>
8731     <CURLYX cnt=0><WHILEM>
8732     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8733     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8734     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8735     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8736 
8737 (Contrast this with something like CURLYM, which maintains only a single
8738 backtrack state:
8739 
8740     <CURLYM cnt=0> a1
8741     a1 <CURLYM cnt=1> a2
8742     a1 a2 <CURLYM cnt=2> a3
8743     a1 a2 a3 <CURLYM cnt=3> b
8744 )
8745 
8746 Each WHILEM state block marks a point to backtrack to upon partial failure
8747 of A or B, and also contains some minor state data related to that
8748 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8749 overall state, such as the count, and pointers to the A and B ops.
8750 
8751 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8752 must always point to the *current* CURLYX block, the rules are:
8753 
8754 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8755 and set cur_curlyx to point the new block.
8756 
8757 When popping the CURLYX block after a successful or unsuccessful match,
8758 restore the previous cur_curlyx.
8759 
8760 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8761 to the outer one saved in the CURLYX block.
8762 
8763 When popping the WHILEM block after a successful or unsuccessful B match,
8764 restore the previous cur_curlyx.
8765 
8766 Here's an example for the pattern (AI* BI)*BO
8767 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8768 
8769 cur_
8770 curlyx backtrack stack
8771 ------ ---------------
8772 NULL
8773 CO     <CO prev=NULL> <WO>
8774 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8775 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8776 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8777 
8778 At this point the pattern succeeds, and we work back down the stack to
8779 clean up, restoring as we go:
8780 
8781 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8782 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8783 CO     <CO prev=NULL> <WO>
8784 NULL
8785 
8786 *******************************************************************/
8787 
8788 #define ST st->u.curlyx
8789 
8790         case CURLYX:    /* start of /A*B/  (for complex A) */
8791         {
8792             /* No need to save/restore up to this paren */
8793             I32 parenfloor = FLAGS(scan);
8794 
8795             assert(next); /* keep Coverity happy */
8796             if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8797                 next += ARG1u(next);
8798 
8799             /* XXXX Probably it is better to teach regpush to support
8800                parenfloor > maxopenparen ... */
8801             if (parenfloor > (I32)RXp_LASTPAREN(rex))
8802                 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */
8803 
8804             ST.prev_curlyx= cur_curlyx;
8805             cur_curlyx = st;
8806             ST.cp = PL_savestack_ix;
8807 
8808             /* these fields contain the state of the current curly.
8809              * they are accessed by subsequent WHILEMs */
8810             ST.parenfloor = parenfloor;
8811             ST.me = scan;
8812             ST.B = next;
8813             ST.minmod = minmod;
8814             minmod = 0;
8815             ST.count = -1;	/* this will be updated by WHILEM */
8816             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8817 
8818             PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8819                                 script_run_begin);
8820             NOT_REACHED; /* NOTREACHED */
8821         }
8822 
8823         case CURLYX_end: /* just finished matching all of A*B */
8824             cur_curlyx = ST.prev_curlyx;
8825             sayYES;
8826             NOT_REACHED; /* NOTREACHED */
8827 
8828         case CURLYX_end_fail: /* just failed to match all of A*B */
8829             regcpblow(ST.cp);
8830             cur_curlyx = ST.prev_curlyx;
8831             sayNO;
8832             NOT_REACHED; /* NOTREACHED */
8833 
8834 
8835 #undef ST
8836 #define ST st->u.whilem
8837 
8838         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8839         {
8840             /* see the discussion above about CURLYX/WHILEM */
8841             I32 n;
8842             int min, max;
8843             /* U16 first_paren, last_paren; */
8844             regnode *A;
8845 
8846             assert(cur_curlyx); /* keep Coverity happy */
8847 
8848             min = ARG1i(cur_curlyx->u.curlyx.me);
8849             max = ARG2i(cur_curlyx->u.curlyx.me);
8850             /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */
8851             /* last_paren = ARG3b(cur_curlyx->u.curlyx.me);  */
8852             A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8853             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8854             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8855             ST.cache_offset = 0;
8856             ST.cache_mask = 0;
8857 
8858             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8859                   depth, (long)n, min, max)
8860             );
8861 
8862             /* First just match a string of min A's. */
8863             if (n < min) {
8864                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8865                 cur_curlyx->u.curlyx.lastloc = locinput;
8866                 REGCP_SET(ST.lastcp);
8867 
8868                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8869                                 script_run_begin);
8870                 NOT_REACHED; /* NOTREACHED */
8871             }
8872 
8873             /* If degenerate A matches "", assume A done. */
8874 
8875             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8876                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8877                    depth)
8878                 );
8879                 goto do_whilem_B_max;
8880             }
8881 
8882             /* super-linear cache processing.
8883              *
8884              * The idea here is that for certain types of CURLYX/WHILEM -
8885              * principally those whose upper bound is infinity (and
8886              * excluding regexes that have things like \1 and other very
8887              * non-regular expressiony things), then if a pattern like
8888              * /....A*.../ fails and we backtrack to the WHILEM, then we
8889              * make a note that this particular WHILEM op was at string
8890              * position 47 (say) when the rest of pattern failed. Then, if
8891              * we ever find ourselves back at that WHILEM, and at string
8892              * position 47 again, we can just fail immediately rather than
8893              * running the rest of the pattern again.
8894              *
8895              * This is very handy when patterns start to go
8896              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8897              * with a combinatorial explosion of backtracking.
8898              *
8899              * The cache is implemented as a bit array, with one bit per
8900              * string byte position per WHILEM op (up to 16) - so its
8901              * between 0.25 and 2x the string size.
8902              *
8903              * To avoid allocating a poscache buffer every time, we do an
8904              * initially countdown; only after we have  executed a WHILEM
8905              * op (string-length x #WHILEMs) times do we allocate the
8906              * cache.
8907              *
8908              * The top 4 bits of FLAGS(scan) byte say how many different
8909              * relevant CURLLYX/WHILEM op pairs there are, while the
8910              * bottom 4-bits is the identifying index number of this
8911              * WHILEM.
8912              */
8913 
8914             if (FLAGS(scan)) {
8915 
8916                 if (!reginfo->poscache_maxiter) {
8917                     /* start the countdown: Postpone detection until we
8918                      * know the match is not *that* much linear. */
8919                     reginfo->poscache_maxiter
8920                         =    (reginfo->strend - reginfo->strbeg + 1)
8921                            * (FLAGS(scan)>>4);
8922                     /* possible overflow for long strings and many CURLYX's */
8923                     if (reginfo->poscache_maxiter < 0)
8924                         reginfo->poscache_maxiter = I32_MAX;
8925                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8926                 }
8927 
8928                 if (reginfo->poscache_iter-- == 0) {
8929                     /* initialise cache */
8930                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8931                     regmatch_info_aux *const aux = reginfo->info_aux;
8932                     if (aux->poscache) {
8933                         if ((SSize_t)reginfo->poscache_size < size) {
8934                             Renew(aux->poscache, size, char);
8935                             reginfo->poscache_size = size;
8936                         }
8937                         Zero(aux->poscache, size, char);
8938                     }
8939                     else {
8940                         reginfo->poscache_size = size;
8941                         Newxz(aux->poscache, size, char);
8942                     }
8943                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8944       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8945                               PL_colors[4], PL_colors[5])
8946                     );
8947                 }
8948 
8949                 if (reginfo->poscache_iter < 0) {
8950                     /* have we already failed at this position? */
8951                     SSize_t offset, mask;
8952 
8953                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8954                     offset  = (FLAGS(scan) & 0xf) - 1
8955                                 +   (locinput - reginfo->strbeg)
8956                                   * (FLAGS(scan)>>4);
8957                     mask    = 1 << (offset % 8);
8958                     offset /= 8;
8959                     if (reginfo->info_aux->poscache[offset] & mask) {
8960                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8961                             depth)
8962                         );
8963                         cur_curlyx->u.curlyx.count--;
8964                         sayNO; /* cache records failure */
8965                     }
8966                     ST.cache_offset = offset;
8967                     ST.cache_mask   = mask;
8968                 }
8969             }
8970 
8971             /* Prefer B over A for minimal matching. */
8972 
8973             if (cur_curlyx->u.curlyx.minmod) {
8974                 ST.save_curlyx = cur_curlyx;
8975                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8976                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8977                                     locinput, loceol, script_run_begin);
8978                 NOT_REACHED; /* NOTREACHED */
8979             }
8980 
8981             /* Prefer A over B for maximal matching. */
8982 
8983             if (n < max) { /* More greed allowed? */
8984                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8985                             maxopenparen);
8986                 cur_curlyx->u.curlyx.lastloc = locinput;
8987                 REGCP_SET(ST.lastcp);
8988                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8989                                 script_run_begin);
8990                 NOT_REACHED; /* NOTREACHED */
8991             }
8992             goto do_whilem_B_max;
8993         }
8994         NOT_REACHED; /* NOTREACHED */
8995 
8996         case WHILEM_B_min: /* just matched B in a minimal match */
8997         case WHILEM_B_max: /* just matched B in a maximal match */
8998             cur_curlyx = ST.save_curlyx;
8999             sayYES;
9000             NOT_REACHED; /* NOTREACHED */
9001 
9002         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
9003             cur_curlyx = ST.save_curlyx;
9004             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9005             cur_curlyx->u.curlyx.count--;
9006             CACHEsayNO;
9007             NOT_REACHED; /* NOTREACHED */
9008 
9009         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
9010             /* FALLTHROUGH */
9011         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
9012             REGCP_UNWIND(ST.lastcp);
9013             regcppop(rex, &maxopenparen);
9014             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9015             cur_curlyx->u.curlyx.count--;
9016             CACHEsayNO;
9017             NOT_REACHED; /* NOTREACHED */
9018 
9019         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
9020             REGCP_UNWIND(ST.lastcp);
9021             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9022             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
9023                 depth)
9024             );
9025 
9026           do_whilem_B_max:
9027             /* now try B */
9028             ST.save_curlyx = cur_curlyx;
9029             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
9030             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
9031                                 locinput, loceol, script_run_begin);
9032             NOT_REACHED; /* NOTREACHED */
9033 
9034         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
9035             cur_curlyx = ST.save_curlyx;
9036 
9037             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) {
9038                 /* Maximum greed exceeded */
9039                 cur_curlyx->u.curlyx.count--;
9040                 CACHEsayNO;
9041             }
9042 
9043             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
9044             );
9045             /* Try grabbing another A and see if it helps. */
9046             cur_curlyx->u.curlyx.lastloc = locinput;
9047             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
9048             REGCP_SET(ST.lastcp);
9049             PUSH_STATE_GOTO(WHILEM_A_min,
9050                 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
9051                 locinput, loceol, script_run_begin);
9052             NOT_REACHED; /* NOTREACHED */
9053 
9054 #undef  ST
9055 #define ST st->u.branch
9056 
9057         case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
9058             next = scan + ARG1u(scan);
9059             if (next == scan)
9060                 next = NULL;
9061             ST.before_paren = ARG2a(scan);
9062             ST.after_paren = ARG2b(scan);
9063             goto branch_logic;
9064             NOT_REACHED; /* NOTREACHED */
9065 
9066         case BRANCH:	    /*  /(...|A|...)/ */
9067             ST.before_paren = ARG1a(scan);
9068             ST.after_paren = ARG1b(scan);
9069           branch_logic:
9070             scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
9071             assert(scan);
9072             ST.lastparen = RXp_LASTPAREN(rex);
9073             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9074             ST.next_branch = next;
9075             REGCP_SET(ST.cp);
9076             if (RE_PESSIMISTIC_PARENS) {
9077                 regcppush(rex, 0, maxopenparen);
9078                 REGCP_SET(ST.lastcp);
9079             }
9080 
9081             /* Now go into the branch */
9082             if (has_cutgroup) {
9083                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9084                                     script_run_begin);
9085             } else {
9086                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9087                                 script_run_begin);
9088             }
9089             NOT_REACHED; /* NOTREACHED */
9090 
9091         case CUTGROUP:  /*  /(*THEN)/  */
9092             sv_yes_mark = st->u.mark.mark_name = FLAGS(scan)
9093                 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ])
9094                 : NULL;
9095             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
9096                             script_run_begin);
9097             NOT_REACHED; /* NOTREACHED */
9098 
9099         case CUTGROUP_next_fail:
9100             do_cutgroup = 1;
9101             no_final = 1;
9102             if (st->u.mark.mark_name)
9103                 sv_commit = st->u.mark.mark_name;
9104             sayNO;
9105             NOT_REACHED; /* NOTREACHED */
9106 
9107         case BRANCH_next:
9108             sayYES;
9109             NOT_REACHED; /* NOTREACHED */
9110 
9111         case BRANCH_next_fail: /* that branch failed; try the next, if any */
9112             if (do_cutgroup) {
9113                 do_cutgroup = 0;
9114                 no_final = 0;
9115             }
9116             if (RE_PESSIMISTIC_PARENS) {
9117                 REGCP_UNWIND(ST.lastcp);
9118                 regcppop(rex,&maxopenparen);
9119             }
9120             REGCP_UNWIND(ST.cp);
9121             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9122             CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail");
9123             scan = ST.next_branch;
9124             /* no more branches? */
9125             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
9126                 DEBUG_EXECUTE_r({
9127                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
9128                         depth,
9129                         PL_colors[4],
9130                         PL_colors[5] );
9131                 });
9132                 sayNO_SILENT;
9133             }
9134             continue; /* execute next BRANCH[J] op */
9135             /* NOTREACHED */
9136 
9137         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
9138             minmod = 1;
9139             break;
9140 
9141 #undef  ST
9142 #define ST st->u.curlym
9143 
9144         case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
9145 
9146             /* This is an optimisation of CURLYX that enables us to push
9147              * only a single backtracking state, no matter how many matches
9148              * there are in {m,n}. It relies on the pattern being constant
9149              * length, with no parens to influence future backrefs
9150              */
9151 
9152             ST.me = scan;
9153             scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
9154 
9155             ST.lastparen      = RXp_LASTPAREN(rex);
9156             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9157 
9158             /* if paren positive, emulate an OPEN/CLOSE around A */
9159             if (FLAGS(ST.me)) {
9160                 U32 paren = FLAGS(ST.me);
9161                 lastopen = paren;
9162                 if (paren > maxopenparen)
9163                     maxopenparen = paren;
9164                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
9165             }
9166             ST.A = scan;
9167             ST.B = next;
9168             ST.alen = 0;
9169             ST.count = 0;
9170             ST.minmod = minmod;
9171             minmod = 0;
9172             ST.Binfo.count = -1;
9173             REGCP_SET(ST.cp);
9174 
9175             if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */
9176                 goto curlym_do_B;
9177 
9178           curlym_do_A: /* execute the A in /A{m,n}B/  */
9179             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9180                                 script_run_begin);
9181             NOT_REACHED; /* NOTREACHED */
9182 
9183         case CURLYM_A: /* we've just matched an A */
9184             ST.count++;
9185             /* after first match, determine A's length: u.curlym.alen */
9186             if (ST.count == 1) {
9187                 if (reginfo->is_utf8_target) {
9188                     char *s = st->locinput;
9189                     while (s < locinput) {
9190                         ST.alen++;
9191                         s += UTF8SKIP(s);
9192                     }
9193                 }
9194                 else {
9195                     ST.alen = locinput - st->locinput;
9196                 }
9197                 if (ST.alen == 0)
9198                     ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me);
9199             }
9200             DEBUG_EXECUTE_r(
9201                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9202                           depth, (IV) ST.count, (IV)ST.alen)
9203             );
9204 
9205             if (FLAGS(ST.me)) {
9206                 /* emulate CLOSE: mark current A as captured */
9207                 U32 paren = (U32)FLAGS(ST.me);
9208                 CLOSE_CAPTURE(rex, paren,
9209                     HOPc(locinput, -ST.alen) - reginfo->strbeg,
9210                     locinput - reginfo->strbeg);
9211             }
9212 
9213             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9214                 goto fake_end;
9215 
9216 
9217             if (!is_accepted) {
9218                 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me));
9219                 if ( max == REG_INFTY || ST.count < max )
9220                     goto curlym_do_A; /* try to match another A */
9221             }
9222             goto curlym_do_B; /* try to match B */
9223 
9224         case CURLYM_A_fail: /* just failed to match an A */
9225             REGCP_UNWIND(ST.cp);
9226 
9227 
9228             if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/
9229                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9230                 sayNO;
9231 
9232           curlym_do_B: /* execute the B in /A{m,n}B/  */
9233             if (is_accepted)
9234                 goto curlym_close_B;
9235 
9236             if (ST.Binfo.count < 0) {
9237                 /* calculate possible match of 1st char following curly */
9238                 assert(ST.B);
9239                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9240                     regnode *text_node = ST.B;
9241                     if (! HAS_TEXT(text_node))
9242                         FIND_NEXT_IMPT(text_node);
9243                     if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9244                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9245                                                         &ST.Binfo, reginfo))
9246                         {
9247                             sayNO;
9248                         }
9249                     }
9250                 }
9251             }
9252 
9253             DEBUG_EXECUTE_r(
9254                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
9255                     depth, (IV)ST.count)
9256             );
9257             if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9258                 assert(ST.Binfo.count > 0);
9259 
9260                 /* Do a quick test to hopefully rule out most non-matches */
9261                 if (     locinput + ST.Binfo.min_length > loceol
9262                     || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9263                 {
9264                     DEBUG_OPTIMISE_r(
9265                         Perl_re_exec_indentf( aTHX_
9266                             "CURLYM Fast bail next target=0x%X anded==0x%X"
9267                                                                 " mask=0x%X\n",
9268                             depth,
9269                             (int) nextbyte, ST.Binfo.first_byte_anded,
9270                                             ST.Binfo.first_byte_mask)
9271                     );
9272                     state_num = CURLYM_B_fail;
9273                     goto reenter_switch;
9274                 }
9275             }
9276 
9277           curlym_close_B:
9278             if (FLAGS(ST.me)) {
9279                 /* emulate CLOSE: mark current A as captured */
9280                 U32 paren = (U32)FLAGS(ST.me);
9281                 if (ST.count || is_accepted) {
9282                     CLOSE_CAPTURE(rex, paren,
9283                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
9284                         locinput - reginfo->strbeg);
9285                 }
9286                 else
9287                     RXp_OFFSp(rex)[paren].end = -1;
9288 
9289                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9290                 {
9291                     if (ST.count || is_accepted)
9292                         goto fake_end;
9293                     else
9294                         sayNO;
9295                 }
9296             }
9297 
9298             if (is_accepted)
9299                 goto fake_end;
9300 
9301             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9302                             script_run_begin);
9303             NOT_REACHED; /* NOTREACHED */
9304 
9305         case CURLYM_B_fail: /* just failed to match a B */
9306             REGCP_UNWIND(ST.cp);
9307             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9308             if (ST.minmod) {
9309                 I32 max = ARG2i(ST.me);
9310                 if (max != REG_INFTY && ST.count == max)
9311                     sayNO;
9312                 goto curlym_do_A; /* try to match a further A */
9313             }
9314             /* backtrack one A */
9315             if (ST.count == ARG1i(ST.me) /* min */)
9316                 sayNO;
9317             ST.count--;
9318             SET_locinput(HOPc(locinput, -ST.alen));
9319             goto curlym_do_B; /* try to match B */
9320 
9321 #undef ST
9322 #define ST st->u.curly
9323 
9324 #define CURLY_SETPAREN(paren, success)                                      \
9325     if (paren) {                                                            \
9326         if (success) {                                                      \
9327             CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \
9328                                  locinput - reginfo->strbeg);               \
9329         }                                                                   \
9330         else {                                                              \
9331             RXp_OFFSp(rex)[paren].end = -1;                                 \
9332             RXp_LASTPAREN(rex)  = ST.lastparen;                             \
9333             RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen;                    \
9334         }                                                                   \
9335     }
9336 
9337         case STAR:		/*  /A*B/ where A is width 1 char */
9338             ST.paren = 0;
9339             ST.min = 0;
9340             ST.max = REG_INFTY;
9341             scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9342             goto repeat;
9343 
9344         case PLUS:		/*  /A+B/ where A is width 1 char */
9345             ST.paren = 0;
9346             ST.min = 1;
9347             ST.max = REG_INFTY;
9348             scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9349             goto repeat;
9350 
9351         case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
9352             ST.paren = FLAGS(scan);     /* Which paren to set */
9353             ST.lastparen      = RXp_LASTPAREN(rex);
9354             ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9355             if (ST.paren > maxopenparen)
9356                 maxopenparen = ST.paren;
9357             ST.min = ARG1i(scan);  /* min to match */
9358             ST.max = ARG2i(scan);  /* max to match */
9359             scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9360 
9361             /* handle the single-char capture called as a GOSUB etc */
9362             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9363             {
9364                 char *li = locinput;
9365                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9366                     sayNO;
9367                 SET_locinput(li);
9368                 goto fake_end;
9369             }
9370 
9371             goto repeat;
9372 
9373         case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
9374             ST.paren = 0;
9375             ST.min = ARG1i(scan);  /* min to match */
9376             ST.max = ARG2i(scan);  /* max to match */
9377             scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9378           repeat:
9379             /*
9380             * Lookahead to avoid useless match attempts
9381             * when we know what character comes next.
9382             *
9383             * Used to only do .*x and .*?x, but now it allows
9384             * for )'s, ('s and (?{ ... })'s to be in the way
9385             * of the quantifier and the EXACT-like node.  -- japhy
9386             */
9387 
9388             assert(ST.min <= ST.max);
9389             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9390                 ST.Binfo.count = 0;
9391             }
9392             else {
9393                 regnode *text_node = next;
9394 
9395                 if (! HAS_TEXT(text_node))
9396                     FIND_NEXT_IMPT(text_node);
9397 
9398                 if (! HAS_TEXT(text_node))
9399                     ST.Binfo.count = 0;
9400                 else {
9401                     if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9402                         ST.Binfo.count = 0;
9403                     }
9404                     else {
9405                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9406                                                         &ST.Binfo, reginfo))
9407                         {
9408                             sayNO;
9409                         }
9410                     }
9411                 }
9412             }
9413 
9414             ST.A = scan;
9415             ST.B = next;
9416             if (minmod) {
9417                 char *li = locinput;
9418                 minmod = 0;
9419                 if (ST.min &&
9420                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9421                             < ST.min)
9422                     sayNO;
9423                 SET_locinput(li);
9424                 ST.count = ST.min;
9425                 REGCP_SET(ST.cp);
9426 
9427                 if (ST.Binfo.count <= 0)
9428                     goto curly_try_B_min;
9429 
9430                 ST.oldloc = locinput;
9431 
9432                 /* set ST.maxpos to the furthest point along the
9433                  * string that could possibly match, i.e., that a match could
9434                  * start at. */
9435                 if  (ST.max == REG_INFTY) {
9436                     ST.maxpos = loceol - 1;
9437                     if (utf8_target)
9438                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9439                             ST.maxpos--;
9440                 }
9441                 else if (utf8_target) {
9442                     int m = ST.max - ST.min;
9443                     for (ST.maxpos = locinput;
9444                          m >0 && ST.maxpos <  loceol; m--)
9445                         ST.maxpos += UTF8SKIP(ST.maxpos);
9446                 }
9447                 else {
9448                     ST.maxpos = locinput + ST.max - ST.min;
9449                     if (ST.maxpos >=  loceol)
9450                         ST.maxpos =  loceol - 1;
9451                 }
9452                 goto curly_try_B_min_known;
9453 
9454             }
9455             else {
9456                 /* avoid taking address of locinput, so it can remain
9457                  * a register var */
9458                 char *li = locinput;
9459                 if (ST.max)
9460                     ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9461                 else
9462                     ST.count = 0;
9463                 if (ST.count < ST.min)
9464                     sayNO;
9465                 SET_locinput(li);
9466                 if ((ST.count > ST.min)
9467                     && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9468                 {
9469                     /* A{m,n} must come at the end of the string, there's
9470                      * no point in backing off ... */
9471                     ST.min = ST.count;
9472                     /* ...except that $ and \Z can match before *and* after
9473                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9474                        We may back off by one in this case. */
9475                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9476                         ST.min--;
9477                 }
9478                 REGCP_SET(ST.cp);
9479                 goto curly_try_B_max;
9480             }
9481             NOT_REACHED; /* NOTREACHED */
9482 
9483         case CURLY_B_min_fail:
9484             /* failed to find B in a non-greedy match. */
9485             if (RE_PESSIMISTIC_PARENS) {
9486                 REGCP_UNWIND(ST.lastcp);
9487                 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9488             }
9489             REGCP_UNWIND(ST.cp);
9490             if (ST.paren) {
9491                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9492             }
9493 
9494             if (ST.Binfo.count == 0) {
9495                 /* failed -- move forward one */
9496                 char *li = locinput;
9497                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9498                     sayNO;
9499                 }
9500                 locinput = li;
9501                 ST.count++;
9502                 if (!(   ST.count <= ST.max
9503                         /* count overflow ? */
9504                      || (ST.max == REG_INFTY && ST.count > 0))
9505                 )
9506                     sayNO;
9507             }
9508             else {
9509                 int n;
9510                 /* Couldn't or didn't -- move forward. */
9511                 ST.oldloc = locinput;
9512                 if (utf8_target)
9513                     locinput += UTF8SKIP(locinput);
9514                 else
9515                     locinput++;
9516                 ST.count++;
9517 
9518               curly_try_B_min_known:
9519                 /* find the next place where 'B' could work, then call B */
9520                 if (locinput + ST.Binfo.initial_exact < loceol) {
9521                     if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9522 
9523                         /* Here, the mask is all 1's for the entire length of
9524                          * any possible match.  (That actually means that there
9525                          * is only one possible match.)  Look for the next
9526                          * occurrence */
9527                         locinput = ninstr(locinput, loceol,
9528                                         (char *) ST.Binfo.matches,
9529                                         (char *) ST.Binfo.matches
9530                                                     + ST.Binfo.initial_exact);
9531                         if (locinput == NULL) {
9532                             sayNO;
9533                         }
9534                     }
9535                     else do {
9536                         /* If the first byte(s) of the mask are all ones, it
9537                          * means those bytes must match identically, so can use
9538                          * ninstr() to find the next possible matchpoint */
9539                         if (ST.Binfo.initial_exact > 0) {
9540                             locinput = ninstr(locinput, loceol,
9541                                               (char *) ST.Binfo.matches,
9542                                               (char *) ST.Binfo.matches
9543                                                      + ST.Binfo.initial_exact);
9544                         }
9545                         else { /* Otherwise find the next byte that matches,
9546                                   masked */
9547                             locinput = (char *) find_next_masked(
9548                                                 (U8 *) locinput, (U8 *) loceol,
9549                                                 ST.Binfo.first_byte_anded,
9550                                                 ST.Binfo.first_byte_mask);
9551                             /* Advance to the end of a multi-byte character */
9552                             if (utf8_target) {
9553                                 while (   locinput < loceol
9554                                     && UTF8_IS_CONTINUATION(*locinput))
9555                                 {
9556                                     locinput++;
9557                                 }
9558                             }
9559                         }
9560                         if (   locinput == NULL
9561                             || locinput + ST.Binfo.min_length > loceol)
9562                         {
9563                             sayNO;
9564                         }
9565 
9566                         /* Here, we have found a possible match point; if can't
9567                          * rule it out, quit the loop so can check fully */
9568                         if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9569                             break;
9570                         }
9571 
9572                         locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9573 
9574                     } while (locinput <= ST.maxpos);
9575                 }
9576 
9577                 if (locinput > ST.maxpos)
9578                     sayNO;
9579 
9580                 n = (utf8_target)
9581                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9582                     : (STRLEN) (locinput - ST.oldloc);
9583 
9584 
9585                 /* Here is at the beginning of a character that meets the mask
9586                  * criteria.  Need to make sure that some real possibility */
9587 
9588                 if (n) {
9589                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9590                      * at what may be the beginning of b; check that everything
9591                      * between oldloc and locinput matches */
9592                     char *li = ST.oldloc;
9593                     ST.count += n;
9594                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9595                         sayNO;
9596                     assert(n == REG_INFTY || locinput == li);
9597                 }
9598             }
9599 
9600           curly_try_B_min:
9601             if (RE_PESSIMISTIC_PARENS) {
9602                 (void)regcppush(rex, 0, maxopenparen);
9603                 REGCP_SET(ST.lastcp);
9604             }
9605             CURLY_SETPAREN(ST.paren, ST.count);
9606             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9607                             script_run_begin);
9608             NOT_REACHED; /* NOTREACHED */
9609 
9610 
9611           curly_try_B_max:
9612             /* a successful greedy match: now try to match B */
9613             if (        ST.Binfo.count <= 0
9614                 || (    ST.Binfo.count > 0
9615                     &&  locinput + ST.Binfo.min_length <= loceol
9616                     &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9617             {
9618                 if (RE_PESSIMISTIC_PARENS) {
9619                     (void)regcppush(rex, 0, maxopenparen);
9620                     REGCP_SET(ST.lastcp);
9621                 }
9622                 CURLY_SETPAREN(ST.paren, ST.count);
9623                 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9624                                 script_run_begin);
9625                 NOT_REACHED; /* NOTREACHED */
9626             }
9627             goto CURLY_B_all_failed;
9628             NOT_REACHED; /* NOTREACHED */
9629 
9630         case CURLY_B_max_fail:
9631             /* failed to find B in a greedy match */
9632 
9633             if (RE_PESSIMISTIC_PARENS) {
9634                 REGCP_UNWIND(ST.lastcp);
9635                 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9636             }
9637           CURLY_B_all_failed:
9638             REGCP_UNWIND(ST.cp);
9639             if (ST.paren) {
9640                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9641             }
9642             /*  back up. */
9643             if (--ST.count < ST.min)
9644                 sayNO;
9645             locinput = HOPc(locinput, -1);
9646             goto curly_try_B_max;
9647 
9648 #undef ST
9649 
9650         case END: /*  last op of main pattern  */
9651           fake_end:
9652             if (cur_eval) {
9653                 /* we've just finished A in /(??{A})B/; now continue with B */
9654                 is_accepted= false;
9655                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9656                 st->u.eval.prev_rex = rex_sv;		/* inner */
9657 
9658                 /* Save *all* the positions. */
9659                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9660                 rex_sv = CUR_EVAL.prev_rex;
9661                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9662                 SET_reg_curpm(rex_sv);
9663                 rex = ReANY(rex_sv);
9664                 rexi = RXi_GET(rex);
9665 
9666                 st->u.eval.prev_curlyx = cur_curlyx;
9667                 cur_curlyx = CUR_EVAL.prev_curlyx;
9668 
9669                 REGCP_SET(st->u.eval.lastcp);
9670 
9671                 /* Restore parens of the outer rex without popping the
9672                  * savestack */
9673                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9674 
9675                 st->u.eval.prev_eval = cur_eval;
9676                 cur_eval = CUR_EVAL.prev_eval;
9677                 DEBUG_EXECUTE_r(
9678                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9679                                       depth, cur_eval););
9680                 if ( nochange_depth )
9681                     nochange_depth--;
9682 
9683                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9684 
9685                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9686                                     st->u.eval.prev_eval->u.eval.B,
9687                                     locinput, loceol, script_run_begin);
9688             }
9689 
9690             if (locinput < reginfo->till) {
9691                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9692                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9693                                       PL_colors[4],
9694                                       (long)(locinput - startpos),
9695                                       (long)(reginfo->till - startpos),
9696                                       PL_colors[5]));
9697 
9698                 sayNO_SILENT;		/* Cannot match: too short. */
9699             }
9700             sayYES;			/* Success! */
9701 
9702         case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9703                                 matches end at the right spot, required for
9704                                 variable length matches. */
9705             if (match_end && locinput != match_end)
9706             {
9707                 DEBUG_EXECUTE_r(
9708                 Perl_re_exec_indentf( aTHX_
9709                     "%sLOOKBEHIND_END: subpattern failed...%s\n",
9710                     depth, PL_colors[4], PL_colors[5]));
9711                 sayNO;            /* Variable length match didn't line up */
9712             }
9713             /* FALLTHROUGH */
9714 
9715         case SUCCEED: /* successful SUSPEND/CURLYM and
9716                                             *lookahead* IFMATCH/UNLESSM*/
9717             DEBUG_EXECUTE_r(
9718             Perl_re_exec_indentf( aTHX_
9719                 "%sSUCCEED: subpattern success...%s\n",
9720                 depth, PL_colors[4], PL_colors[5]));
9721             sayYES;			/* Success! */
9722 
9723 #undef  ST
9724 #define ST st->u.ifmatch
9725 
9726         case SUSPEND:	/* (?>A) */
9727             ST.wanted = 1;
9728             ST.start = locinput;
9729             ST.end = loceol;
9730             ST.count = 1;
9731             goto do_ifmatch;
9732 
9733         case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9734             ST.wanted = 0;
9735             goto ifmatch_trivial_fail_test;
9736 
9737         case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9738             ST.wanted = 1;
9739           ifmatch_trivial_fail_test:
9740             ST.prev_match_end= match_end;
9741             ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be
9742                                               lookbehind count, requires
9743                                               non-zero flags */
9744             if (! FLAGS(scan)) {    /* 'flags' zero means lookahed */
9745 
9746                 /* Lookahead starts here and ends at the normal place */
9747                 ST.start = locinput;
9748                 ST.end = loceol;
9749                 match_end = NULL;
9750             }
9751             else {
9752                 PERL_UINT_FAST8_T back_count = FLAGS(scan);
9753                 char * s;
9754                 match_end = locinput;
9755 
9756                 /* Lookbehind can look beyond the current position */
9757                 ST.end = loceol;
9758 
9759                 /* ... and starts at the first place in the input that is in
9760                  * the range of the possible start positions */
9761                 for (; ST.count > 0; ST.count--, back_count--) {
9762                     s = HOPBACKc(locinput, back_count);
9763                     if (s) {
9764                         ST.start = s;
9765                         goto do_ifmatch;
9766                     }
9767                 }
9768 
9769                 /* If the lookbehind doesn't start in the actual string, is a
9770                  * trivial match failure */
9771                 match_end = ST.prev_match_end;
9772                 if (logical) {
9773                     logical = 0;
9774                     sw = 1 - cBOOL(ST.wanted);
9775                 }
9776                 else if (ST.wanted)
9777                     sayNO;
9778 
9779                 /* Here, we didn't want it to match, so is actually success */
9780                 next = scan + ARG1u(scan);
9781                 if (next == scan)
9782                     next = NULL;
9783                 break;
9784             }
9785 
9786           do_ifmatch:
9787             ST.me = scan;
9788             ST.logical = logical;
9789             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9790 
9791             /* execute body of (?...A) */
9792             PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9793                                 ST.end, script_run_begin);
9794             NOT_REACHED; /* NOTREACHED */
9795 
9796         {
9797             bool matched;
9798 
9799         case IFMATCH_A_fail: /* body of (?...A) failed */
9800             if (! ST.logical && ST.count > 1) {
9801 
9802                 /* It isn't a real failure until we've tried all starting
9803                  * positions.  Move to the next starting position and retry */
9804                 ST.count--;
9805                 ST.start = HOPc(ST.start, 1);
9806                 scan = ST.me;
9807                 logical = ST.logical;
9808                 goto do_ifmatch;
9809             }
9810 
9811             /* Here, all starting positions have been tried. */
9812             matched = FALSE;
9813             goto ifmatch_done;
9814 
9815         case IFMATCH_A: /* body of (?...A) succeeded */
9816             matched = TRUE;
9817           ifmatch_done:
9818             sw = matched == ST.wanted;
9819             match_end = ST.prev_match_end;
9820             if (! ST.logical && !sw) {
9821                 sayNO;
9822             }
9823 
9824             if (OP(ST.me) != SUSPEND) {
9825                 /* restore old position except for (?>...) */
9826                 locinput = st->locinput;
9827                 loceol = st->loceol;
9828                 script_run_begin = st->sr0;
9829             }
9830             scan = ST.me + ARG1u(ST.me);
9831             if (scan == ST.me)
9832                 scan = NULL;
9833             continue; /* execute B */
9834         }
9835 
9836 #undef ST
9837 
9838         case LONGJMP: /*  alternative with many branches compiles to
9839                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9840             next = scan + ARG1u(scan);
9841             if (next == scan)
9842                 next = NULL;
9843             break;
9844 
9845         case COMMIT:  /*  (*COMMIT)  */
9846             reginfo->cutpoint = loceol;
9847             /* FALLTHROUGH */
9848 
9849         case PRUNE:   /*  (*PRUNE)   */
9850             if (FLAGS(scan))
9851                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9852             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9853                             script_run_begin);
9854             NOT_REACHED; /* NOTREACHED */
9855 
9856         case COMMIT_next_fail:
9857             no_final = 1;
9858             /* FALLTHROUGH */
9859             sayNO;
9860             NOT_REACHED; /* NOTREACHED */
9861 
9862         case OPFAIL:   /* (*FAIL)  */
9863             if (FLAGS(scan))
9864                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9865             if (logical) {
9866                 /* deal with (?(?!)X|Y) properly,
9867                  * make sure we trigger the no branch
9868                  * of the trailing IFTHEN structure*/
9869                 sw= 0;
9870                 break;
9871             } else {
9872                 sayNO;
9873             }
9874             NOT_REACHED; /* NOTREACHED */
9875 
9876 #define ST st->u.mark
9877         case MARKPOINT: /*  (*MARK:foo)  */
9878             ST.prev_mark = mark_state;
9879             ST.mark_name = sv_commit = sv_yes_mark
9880                 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9881             mark_state = st;
9882             ST.mark_loc = locinput;
9883             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9884                                 script_run_begin);
9885             NOT_REACHED; /* NOTREACHED */
9886 
9887         case MARKPOINT_next:
9888             mark_state = ST.prev_mark;
9889             sayYES;
9890             NOT_REACHED; /* NOTREACHED */
9891 
9892         case MARKPOINT_next_fail:
9893             if (popmark && sv_eq(ST.mark_name,popmark))
9894             {
9895                 if (ST.mark_loc > startpoint)
9896                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9897                 popmark = NULL; /* we found our mark */
9898                 sv_commit = ST.mark_name;
9899 
9900                 DEBUG_EXECUTE_r({
9901                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9902                             depth,
9903                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9904                 });
9905             }
9906             mark_state = ST.prev_mark;
9907             sv_yes_mark = mark_state ?
9908                 mark_state->u.mark.mark_name : NULL;
9909             sayNO;
9910             NOT_REACHED; /* NOTREACHED */
9911 
9912         case SKIP:  /*  (*SKIP)  */
9913             if (!FLAGS(scan)) {
9914                 /* (*SKIP) : if we fail we cut here*/
9915                 ST.mark_name = NULL;
9916                 ST.mark_loc = locinput;
9917                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9918                                 script_run_begin);
9919             } else {
9920                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9921                    otherwise do nothing.  Meaning we need to scan
9922                  */
9923                 regmatch_state *cur = mark_state;
9924                 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9925 
9926                 while (cur) {
9927                     if ( sv_eq( cur->u.mark.mark_name,
9928                                 find ) )
9929                     {
9930                         ST.mark_name = find;
9931                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9932                                          script_run_begin);
9933                     }
9934                     cur = cur->u.mark.prev_mark;
9935                 }
9936             }
9937             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9938             break;
9939 
9940         case SKIP_next_fail:
9941             if (ST.mark_name) {
9942                 /* (*CUT:NAME) - Set up to search for the name as we
9943                    collapse the stack*/
9944                 popmark = ST.mark_name;
9945             } else {
9946                 /* (*CUT) - No name, we cut here.*/
9947                 if (ST.mark_loc > startpoint)
9948                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9949                 /* but we set sv_commit to latest mark_name if there
9950                    is one so they can test to see how things lead to this
9951                    cut */
9952                 if (mark_state)
9953                     sv_commit=mark_state->u.mark.mark_name;
9954             }
9955             no_final = 1;
9956             sayNO;
9957             NOT_REACHED; /* NOTREACHED */
9958 #undef ST
9959 
9960         case LNBREAK: /* \R */
9961             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9962                 locinput += n;
9963             } else
9964                 sayNO;
9965             break;
9966 
9967         default:
9968             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9969                           PTR2UV(scan), OP(scan));
9970             Perl_croak(aTHX_ "regexp memory corruption");
9971 
9972         /* this is a point to jump to in order to increment
9973          * locinput by one character */
9974           increment_locinput:
9975             assert(!NEXTCHR_IS_EOS);
9976             if (utf8_target) {
9977                 locinput += PL_utf8skip[nextbyte];
9978                 /* locinput is allowed to go 1 char off the end (signifying
9979                  * EOS), but not 2+ */
9980                 if (locinput >  loceol)
9981                     sayNO;
9982             }
9983             else
9984                 locinput++;
9985             break;
9986 
9987         } /* end switch */
9988 
9989         /* switch break jumps here */
9990         scan = next; /* prepare to execute the next op and ... */
9991         continue;    /* ... jump back to the top, reusing st */
9992         /* NOTREACHED */
9993 
9994       push_yes_state:
9995         /* push a state that backtracks on success */
9996         st->u.yes.prev_yes_state = yes_state;
9997         yes_state = st;
9998         /* FALLTHROUGH */
9999       push_state:
10000         /* push a new regex state, then continue at scan  */
10001         {
10002             regmatch_state *newst;
10003             DECLARE_AND_GET_RE_DEBUG_FLAGS;
10004 
10005             DEBUG_r( /* DEBUG_STACK_r */
10006               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
10007                 regmatch_state *cur = st;
10008                 regmatch_state *curyes = yes_state;
10009                 U32 i;
10010                 regmatch_slab *slab = PL_regmatch_slab;
10011                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
10012                     if (cur < SLAB_FIRST(slab)) {
10013                         slab = slab->prev;
10014                         cur = SLAB_LAST(slab);
10015                     }
10016                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
10017                         depth,
10018                         i ? "    " : "push",
10019                         depth - i, REGNODE_NAME(cur->resume_state),
10020                         (curyes == cur) ? "yes" : ""
10021                     );
10022                     if (curyes == cur)
10023                         curyes = cur->u.yes.prev_yes_state;
10024                 }
10025             } else {
10026                 DEBUG_STATE_pp("push")
10027             });
10028             depth++;
10029             st->locinput = locinput;
10030             st->loceol = loceol;
10031             st->sr0 = script_run_begin;
10032             newst = st+1;
10033             if (newst >  SLAB_LAST(PL_regmatch_slab))
10034                 newst = S_push_slab(aTHX);
10035             PL_regmatch_state = newst;
10036 
10037             locinput = pushinput;
10038             loceol = pusheol;
10039             script_run_begin = pushsr0;
10040             st = newst;
10041             continue;
10042             /* NOTREACHED */
10043         }
10044     }
10045 #ifdef SOLARIS_BAD_OPTIMIZER
10046 #  undef PL_charclass
10047 #endif
10048 
10049     /*
10050     * We get here only if there's trouble -- normally "case END" is
10051     * the terminating point.
10052     */
10053     Perl_croak(aTHX_ "corrupted regexp pointers");
10054     NOT_REACHED; /* NOTREACHED */
10055 
10056   yes:
10057     if (yes_state) {
10058         /* we have successfully completed a subexpression, but we must now
10059          * pop to the state marked by yes_state and continue from there */
10060         assert(st != yes_state);
10061 #ifdef DEBUGGING
10062         while (st != yes_state) {
10063             st--;
10064             if (st < SLAB_FIRST(PL_regmatch_slab)) {
10065                 PL_regmatch_slab = PL_regmatch_slab->prev;
10066                 st = SLAB_LAST(PL_regmatch_slab);
10067             }
10068             DEBUG_STATE_r({
10069                 if (no_final) {
10070                     DEBUG_STATE_pp("pop (no final)");
10071                 } else {
10072                     DEBUG_STATE_pp("pop (yes)");
10073                 }
10074             });
10075             depth--;
10076         }
10077 #else
10078         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
10079             || yes_state > SLAB_LAST(PL_regmatch_slab))
10080         {
10081             /* not in this slab, pop slab */
10082             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
10083             PL_regmatch_slab = PL_regmatch_slab->prev;
10084             st = SLAB_LAST(PL_regmatch_slab);
10085         }
10086         depth -= (st - yes_state);
10087 #endif
10088         st = yes_state;
10089         yes_state = st->u.yes.prev_yes_state;
10090         PL_regmatch_state = st;
10091 
10092         if (no_final) {
10093             locinput= st->locinput;
10094             loceol= st->loceol;
10095             script_run_begin = st->sr0;
10096         }
10097         state_num = st->resume_state + no_final;
10098         goto reenter_switch;
10099     }
10100 
10101     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
10102                           PL_colors[4], PL_colors[5]));
10103 
10104     if (reginfo->info_aux_eval) {
10105         /* each successfully executed (?{...}) block does the equivalent of
10106          *   local $^R = do {...}
10107          * When popping the save stack, all these locals would be undone;
10108          * bypass this by setting the outermost saved $^R to the latest
10109          * value */
10110         /* I don't know if this is needed or works properly now.
10111          * see code related to PL_replgv elsewhere in this file.
10112          * Yves
10113          */
10114         if (oreplsv != GvSV(PL_replgv)) {
10115             sv_setsv(oreplsv, GvSV(PL_replgv));
10116             SvSETMAGIC(oreplsv);
10117         }
10118     }
10119     result = 1;
10120     goto final_exit;
10121 
10122   no:
10123     DEBUG_EXECUTE_r(
10124         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
10125             depth,
10126             PL_colors[4], PL_colors[5])
10127         );
10128 
10129   no_silent:
10130     if (no_final) {
10131         if (yes_state) {
10132             goto yes;
10133         } else {
10134             goto final_exit;
10135         }
10136     }
10137     if (depth) {
10138         /* there's a previous state to backtrack to */
10139         st--;
10140         if (st < SLAB_FIRST(PL_regmatch_slab)) {
10141             PL_regmatch_slab = PL_regmatch_slab->prev;
10142             st = SLAB_LAST(PL_regmatch_slab);
10143         }
10144         PL_regmatch_state = st;
10145         locinput= st->locinput;
10146         loceol= st->loceol;
10147         script_run_begin = st->sr0;
10148 
10149         DEBUG_STATE_pp("pop");
10150         depth--;
10151         if (yes_state == st)
10152             yes_state = st->u.yes.prev_yes_state;
10153 
10154         state_num = st->resume_state + 1; /* failure = success + 1 */
10155         PERL_ASYNC_CHECK();
10156         goto reenter_switch;
10157     }
10158     result = 0;
10159 
10160   final_exit:
10161     if (rex->intflags & PREGf_VERBARG_SEEN) {
10162         SV *sv_err = get_sv("REGERROR", 1);
10163         SV *sv_mrk = get_sv("REGMARK", 1);
10164         if (result) {
10165             sv_commit = &PL_sv_no;
10166             if (!sv_yes_mark)
10167                 sv_yes_mark = &PL_sv_yes;
10168         } else {
10169             if (!sv_commit)
10170                 sv_commit = &PL_sv_yes;
10171             sv_yes_mark = &PL_sv_no;
10172         }
10173         assert(sv_err);
10174         assert(sv_mrk);
10175         sv_setsv(sv_err, sv_commit);
10176         sv_setsv(sv_mrk, sv_yes_mark);
10177     }
10178 
10179 
10180     if (last_pushed_cv) {
10181         dSP;
10182         /* see "Some notes about MULTICALL" above */
10183         POP_MULTICALL;
10184         PERL_UNUSED_VAR(SP);
10185     }
10186     else
10187         LEAVE_SCOPE(orig_savestack_ix);
10188 
10189     assert(!result ||  locinput - reginfo->strbeg >= 0);
10190     return result ?  locinput - reginfo->strbeg : -1;
10191 }
10192 
10193 /*
10194  - regrepeat - repeatedly match something simple, report how many
10195  *
10196  * What 'simple' means is a node which can be the operand of a quantifier like
10197  * '+', or {1,3}
10198  *
10199  * startposp - pointer to a pointer to the start position.  This is updated
10200  *             to point to the byte following the highest successful
10201  *             match.
10202  * p         - the regnode to be repeatedly matched against.
10203  * loceol    - pointer to the end position beyond which we aren't supposed to
10204  *             look.
10205  * reginfo   - struct holding match state, such as utf8_target
10206  * max       - maximum number of things to match.
10207  * depth     - (for debugging) backtracking depth.
10208  */
10209 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max comma_pDEPTH)10210 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10211             char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH)
10212 {
10213     char *scan;     /* Pointer to current position in target string */
10214     I32 c;
10215     char *this_eol = loceol;   /* potentially adjusted version. */
10216     I32 hardcount = 0;  /* How many matches so far */
10217     bool utf8_target = reginfo->is_utf8_target;
10218     unsigned int to_complement = 0;  /* Invert the result? */
10219     char_class_number_ classnum;
10220 
10221     PERL_ARGS_ASSERT_REGREPEAT;
10222 
10223     /* This routine is structured so that we switch on the input OP.  Each OP
10224      * case: statement contains a loop to repeatedly apply the OP, advancing
10225      * the input until it fails, or reaches the end of the input, or until it
10226      * reaches the upper limit of matches. */
10227 
10228     scan = *startposp;
10229     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
10230                                max */
10231         max = I32_MAX;
10232     else if (! utf8_target && this_eol - scan > max)
10233         this_eol = scan + max;
10234 
10235     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10236      * down to the maximum of how far we should go in it (but leaving it set to
10237      * the real end if the maximum permissible would take us beyond that).
10238      * This allows us to make the loop exit condition that we haven't gone past
10239      * <this_eol> to also mean that we haven't exceeded the max permissible
10240      * count, saving a test each time through the loop.  But it assumes that
10241      * the OP matches a single byte, which is true for most of the OPs below
10242      * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
10243      * have this characteristic have to compensate.
10244      *
10245      * There is no such adjustment for UTF-8 targets, since the number of bytes
10246      * per character can vary.  OPs will have to test both that the count is
10247      * less than the max permissible (using <hardcount> to keep track), and
10248      * that we are still within the bounds of the string (using <this_eol>.  A
10249      * few OPs match a single byte no matter what the encoding.  They can omit
10250      * the max test if, for the UTF-8 case, they do the adjustment that was
10251      * skipped above.
10252      *
10253      * Thus, the code above sets things up for the common case; and exceptional
10254      * cases need extra work; the common case is to make sure <scan> doesn't go
10255      * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10256      * count doesn't exceed the maximum permissible */
10257 
10258     switch (with_t_UTF8ness(OP(p), utf8_target)) {
10259         SV * anyofh_list;
10260 
10261       case REG_ANY_t8:
10262         while (scan < this_eol && hardcount < max && *scan != '\n') {
10263             scan += UTF8SKIP(scan);
10264             hardcount++;
10265         }
10266         break;
10267 
10268       case REG_ANY_tb:
10269         scan = (char *) memchr(scan, '\n', this_eol - scan);
10270         if (! scan) {
10271             scan = this_eol;
10272         }
10273         break;
10274 
10275       case SANY_t8:
10276         while (scan < this_eol && hardcount < max) {
10277             scan += UTF8SKIP(scan);
10278             hardcount++;
10279         }
10280         break;
10281 
10282       case SANY_tb:
10283         scan = this_eol;
10284         break;
10285 
10286       case EXACT_REQ8_tb:
10287       case LEXACT_REQ8_tb:
10288       case EXACTFU_REQ8_tb:
10289         break;
10290 
10291       case EXACTL_t8:
10292         if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10293             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10294         }
10295         /* FALLTHROUGH */
10296 
10297       case EXACTL_tb:
10298       case EXACTFL_t8:
10299       case EXACTFL_tb:
10300       case EXACTFLU8_t8:
10301       case EXACTFLU8_tb:
10302         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10303         /* FALLTHROUGH */
10304 
10305       case EXACT_REQ8_t8:
10306       case LEXACT_REQ8_t8:
10307       case EXACTFU_REQ8_t8:
10308       case LEXACT_t8:
10309       case LEXACT_tb:
10310       case EXACT_t8:
10311       case EXACT_tb:
10312       case EXACTF_t8:
10313       case EXACTF_tb:
10314       case EXACTFAA_NO_TRIE_t8:
10315       case EXACTFAA_NO_TRIE_tb:
10316       case EXACTFAA_t8:
10317       case EXACTFAA_tb:
10318       case EXACTFU_t8:
10319       case EXACTFU_tb:
10320       case EXACTFUP_t8:
10321       case EXACTFUP_tb:
10322 
10323       {
10324         struct next_matchable_info Binfo;
10325         PERL_UINT_FAST8_T definitive_len;
10326 
10327         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10328 
10329         /* Set up termination info, and quit if we can rule out that we've
10330          * gotten a match of the termination criteria */
10331         if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10332             ||   scan + Binfo.min_length > this_eol
10333             || ! S_test_EXACTISH_ST(scan, Binfo))
10334         {
10335             break;
10336         }
10337 
10338         definitive_len = Binfo.initial_definitive;
10339 
10340         /* Here there are potential matches, and the first byte(s) matched our
10341          * filter
10342          *
10343          * If we got a definitive match of some initial bytes, there is no
10344          * possibility of false positives as far as it got */
10345         if (definitive_len > 0) {
10346 
10347             /* If as far as it got is the maximum possible, there were no false
10348              * positives at all.  Since we have everything set up, see how many
10349              * repeats there are. */
10350             if (definitive_len >= Binfo.max_length) {
10351 
10352                 /* We've already found one match */
10353                 scan += definitive_len;
10354                 hardcount++;
10355 
10356                 /* If want more than the one match, and there is room for more,
10357                  * see if there are any */
10358                 if (hardcount < max && scan + definitive_len <= this_eol) {
10359 
10360                     /* If the character is only a single byte long, just span
10361                      * all such bytes. */
10362                     if (definitive_len == 1) {
10363                         const char * orig_scan = scan;
10364 
10365                         if (this_eol - (scan - hardcount) > max) {
10366                             this_eol = scan - hardcount + max;
10367                         }
10368 
10369                         /* Use different routines depending on whether it's an
10370                          * exact match or matches with a mask */
10371                         if (Binfo.initial_exact == 1) {
10372                             scan = (char *) find_span_end((U8 *) scan,
10373                                                           (U8 *) this_eol,
10374                                                           Binfo.matches[0]);
10375                         }
10376                         else {
10377                             scan = (char *) find_span_end_mask(
10378                                                        (U8 *) scan,
10379                                                        (U8 *) this_eol,
10380                                                        Binfo.first_byte_anded,
10381                                                        Binfo.first_byte_mask);
10382                         }
10383 
10384                         hardcount += scan - orig_scan;
10385                     }
10386                     else { /* Here, the full character definitive match is more
10387                               than one byte */
10388                         while (   hardcount < max
10389                                && scan + definitive_len <= this_eol
10390                                && S_test_EXACTISH_ST(scan, Binfo))
10391                         {
10392                                 scan += definitive_len;
10393                                 hardcount++;
10394                         }
10395                     }
10396                 }
10397 
10398                 break;
10399             }   /* End of a full character is definitively matched */
10400 
10401             /* Here, an initial portion of the character matched definitively,
10402              * and the rest matched as well, but could have false positives */
10403 
10404             do {
10405                 int i;
10406                 U8 * matches = Binfo.matches;
10407 
10408                 /* The first bytes were definitive.  Look at the remaining */
10409                 for (i = 0; i < Binfo.count; i++) {
10410                     if (memEQ(scan + definitive_len,
10411                               matches + definitive_len,
10412                               Binfo.lengths[i] - definitive_len))
10413                     {
10414                         goto found_a_completion;
10415                     }
10416 
10417                     matches += Binfo.lengths[i];
10418                 }
10419 
10420                 /* Didn't find anything to complete our initial match.  Stop
10421                  * here */
10422                 break;
10423 
10424               found_a_completion:
10425 
10426                 /* Here, matched a full character, Include it in the result,
10427                  * and then look to see if the next char matches */
10428                 hardcount++;
10429                 scan += Binfo.lengths[i];
10430 
10431             } while (   hardcount < max
10432                      && scan + definitive_len < this_eol
10433                      && S_test_EXACTISH_ST(scan, Binfo));
10434 
10435             /* Here, have advanced as far as possible */
10436             break;
10437         } /* End of found some initial bytes that definitively matched */
10438 
10439         /* Here, we can't rule out that we have found the beginning of 'B', but
10440          * there were no initial bytes that could rule out anything
10441          * definitively. Use brute force to examine all the possibilities */
10442         while (scan < this_eol && hardcount < max) {
10443             int i;
10444             U8 * matches = Binfo.matches;
10445 
10446             for (i = 0; i < Binfo.count; i++) {
10447                 if (memEQ(scan, matches, Binfo.lengths[i])) {
10448                     goto found1;
10449                 }
10450 
10451                 matches += Binfo.lengths[i];
10452             }
10453 
10454             break;
10455 
10456           found1:
10457             hardcount++;
10458             scan += Binfo.lengths[i];
10459         }
10460 
10461         break;
10462       }
10463 
10464       case ANYOFPOSIXL_t8:
10465       case ANYOFL_t8:
10466         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10467         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10468 
10469         /* FALLTHROUGH */
10470       case ANYOFD_t8:
10471       case ANYOF_t8:
10472         while (   hardcount < max
10473                && scan < this_eol
10474                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10475         {
10476             scan += UTF8SKIP(scan);
10477             hardcount++;
10478         }
10479         break;
10480 
10481       case ANYOFPOSIXL_tb:
10482       case ANYOFL_tb:
10483         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10484         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10485         /* FALLTHROUGH */
10486 
10487       case ANYOFD_tb:
10488       case ANYOF_tb:
10489         if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10490             while (   scan < this_eol
10491                    && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10492                 scan++;
10493         }
10494         else {
10495             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10496                 scan++;
10497         }
10498         break;
10499 
10500       case ANYOFM_t8:
10501         if (this_eol - scan > max) {
10502 
10503             /* We didn't adjust <this_eol> at the beginning of this routine
10504              * because is UTF-8, but it is actually ok to do so, since here, to
10505              * match, 1 char == 1 byte. */
10506             this_eol = scan + max;
10507         }
10508         /* FALLTHROUGH */
10509 
10510       case ANYOFM_tb:
10511         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10512                                            (U8) ARG1u(p), FLAGS(p));
10513         break;
10514 
10515       case NANYOFM_t8:
10516         while (     hardcount < max
10517                &&   scan < this_eol
10518                &&  (*scan & FLAGS(p)) != ARG1u(p))
10519         {
10520             scan += UTF8SKIP(scan);
10521             hardcount++;
10522         }
10523         break;
10524 
10525       case NANYOFM_tb:
10526         scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10527                                          (U8) ARG1u(p), FLAGS(p));
10528         break;
10529 
10530       case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10531       case ANYOFHb_tb:
10532       case ANYOFHbbm_tb:
10533       case ANYOFHr_tb:
10534       case ANYOFHs_tb:
10535         break;
10536 
10537       case ANYOFH_t8:
10538         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10539         while (  hardcount < max
10540                && scan < this_eol
10541                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10542                && _invlist_contains_cp(anyofh_list,
10543                                              utf8_to_uvchr_buf((U8 *) scan,
10544                                                                (U8 *) this_eol,
10545                                                                NULL)))
10546         {
10547             scan += UTF8SKIP(scan);
10548             hardcount++;
10549         }
10550         break;
10551 
10552       case ANYOFHb_t8:
10553         /* we know the first byte must be the FLAGS field */
10554         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10555         while (   hardcount < max
10556                && scan < this_eol
10557                && (U8) *scan == ANYOF_FLAGS(p)
10558                && _invlist_contains_cp(anyofh_list,
10559                                              utf8_to_uvchr_buf((U8 *) scan,
10560                                                                (U8 *) this_eol,
10561                                                                NULL)))
10562         {
10563             scan += UTF8SKIP(scan);
10564             hardcount++;
10565         }
10566         break;
10567 
10568       case ANYOFHbbm_t8:
10569         while (   hardcount < max
10570                && scan + 1 < this_eol
10571                && (U8) *scan == ANYOF_FLAGS(p)
10572                && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10573                                 (U8) scan[1] & UTF_CONTINUATION_MASK))
10574         {
10575             scan += 2;  /* This node only matces 2-byte UTF-8 */
10576             hardcount++;
10577         }
10578         break;
10579 
10580       case ANYOFHr_t8:
10581         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10582         while (  hardcount < max
10583                && scan < this_eol
10584                && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10585                           LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10586                           HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10587                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10588                && _invlist_contains_cp(anyofh_list,
10589                                              utf8_to_uvchr_buf((U8 *) scan,
10590                                                                (U8 *) this_eol,
10591                                                                NULL)))
10592         {
10593             scan += UTF8SKIP(scan);
10594             hardcount++;
10595         }
10596         break;
10597 
10598       case ANYOFHs_t8:
10599         anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10600         while (   hardcount < max
10601                && scan + FLAGS(p) < this_eol
10602                && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10603                && _invlist_contains_cp(anyofh_list,
10604                                              utf8_to_uvchr_buf((U8 *) scan,
10605                                                                (U8 *) this_eol,
10606                                                                NULL)))
10607         {
10608             scan += UTF8SKIP(scan);
10609             hardcount++;
10610         }
10611         break;
10612 
10613       case ANYOFR_t8:
10614         while (   hardcount < max
10615                && scan < this_eol
10616                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10617                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10618                                             (U8 *) this_eol,
10619                                             NULL),
10620                               ANYOFRbase(p), ANYOFRdelta(p)))
10621         {
10622             scan += UTF8SKIP(scan);
10623             hardcount++;
10624         }
10625         break;
10626 
10627       case ANYOFR_tb:
10628         while (   hardcount < max
10629                && scan < this_eol
10630                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10631         {
10632             scan++;
10633             hardcount++;
10634         }
10635         break;
10636 
10637       case ANYOFRb_t8:
10638         while (   hardcount < max
10639                && scan < this_eol
10640                && (U8) *scan == ANYOF_FLAGS(p)
10641                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10642                                             (U8 *) this_eol,
10643                                             NULL),
10644                               ANYOFRbase(p), ANYOFRdelta(p)))
10645         {
10646             scan += UTF8SKIP(scan);
10647             hardcount++;
10648         }
10649         break;
10650 
10651       case ANYOFRb_tb:
10652         while (   hardcount < max
10653                && scan < this_eol
10654                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10655         {
10656             scan++;
10657             hardcount++;
10658         }
10659         break;
10660 
10661     /* The argument (FLAGS) to all the POSIX node types is the class number */
10662 
10663       case NPOSIXL_tb:
10664         to_complement = 1;
10665         /* FALLTHROUGH */
10666 
10667       case POSIXL_tb:
10668         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10669         while (   scan < this_eol
10670                && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10671         {
10672             scan++;
10673         }
10674         break;
10675 
10676       case NPOSIXL_t8:
10677         to_complement = 1;
10678         /* FALLTHROUGH */
10679 
10680       case POSIXL_t8:
10681         while (   hardcount < max && scan < this_eol
10682                && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10683                                                       (U8 *) scan,
10684                                                       (U8 *) this_eol)))
10685         {
10686             scan += UTF8SKIP(scan);
10687             hardcount++;
10688         }
10689         break;
10690 
10691       case POSIXD_tb:
10692         /* FALLTHROUGH */
10693 
10694       case POSIXA_t8:
10695         if (this_eol - scan > max) {
10696 
10697             /* We didn't adjust <this_eol> at the beginning of this routine
10698              * because is UTF-8, but it is actually ok to do so, since here, to
10699              * match, 1 char == 1 byte. */
10700             this_eol = scan + max;
10701         }
10702         /* FALLTHROUGH */
10703 
10704       case POSIXA_tb:
10705         while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10706             scan++;
10707         }
10708         break;
10709 
10710       case NPOSIXD_tb:
10711         /* FALLTHROUGH */
10712 
10713       case NPOSIXA_tb:
10714         while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10715             scan++;
10716         }
10717         break;
10718 
10719       case NPOSIXA_t8:
10720 
10721         /* The complement of something that matches only ASCII matches all
10722          * non-ASCII, plus everything in ASCII that isn't in the class. */
10723         while (   hardcount < max && scan < this_eol
10724                && (   ! isASCII_utf8_safe(scan, loceol)
10725                    || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10726             {
10727                 scan += UTF8SKIP(scan);
10728                 hardcount++;
10729             }
10730         break;
10731 
10732       case NPOSIXU_tb:
10733         to_complement = 1;
10734         /* FALLTHROUGH */
10735 
10736       case POSIXU_tb:
10737         while (   scan < this_eol
10738                && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10739         {
10740             scan++;
10741         }
10742         break;
10743 
10744       case NPOSIXU_t8:
10745       case NPOSIXD_t8:
10746         to_complement = 1;
10747         /* FALLTHROUGH */
10748 
10749       case POSIXD_t8:
10750       case POSIXU_t8:
10751         classnum = (char_class_number_) FLAGS(p);
10752         switch (classnum) {
10753           default:
10754             while (   hardcount < max && scan < this_eol
10755                    && to_complement
10756                     ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10757                        utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10758             {
10759                 scan += UTF8SKIP(scan);
10760                 hardcount++;
10761             }
10762             break;
10763 
10764             /* For the classes below, the knowledge of how to handle every code
10765              * point is compiled into Perl via a macro.  This code is written
10766              * for making the loops as tight as possible.  It could be
10767              * refactored to save space instead. */
10768 
10769           case CC_ENUM_SPACE_:
10770             while (   hardcount < max
10771                    && scan < this_eol
10772                    && (to_complement
10773                                    ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10774             {
10775                 scan += UTF8SKIP(scan);
10776                 hardcount++;
10777             }
10778             break;
10779           case CC_ENUM_BLANK_:
10780             while (   hardcount < max
10781                    && scan < this_eol
10782                    && (to_complement
10783                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10784             {
10785                 scan += UTF8SKIP(scan);
10786                 hardcount++;
10787             }
10788             break;
10789           case CC_ENUM_XDIGIT_:
10790             while (   hardcount < max
10791                    && scan < this_eol
10792                    && (to_complement
10793                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10794             {
10795                 scan += UTF8SKIP(scan);
10796                 hardcount++;
10797             }
10798             break;
10799           case CC_ENUM_VERTSPACE_:
10800             while (   hardcount < max
10801                    && scan < this_eol
10802                    && (to_complement
10803                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10804             {
10805                 scan += UTF8SKIP(scan);
10806                 hardcount++;
10807             }
10808             break;
10809           case CC_ENUM_CNTRL_:
10810             while (   hardcount < max
10811                    && scan < this_eol
10812                    && (to_complement
10813                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10814             {
10815                 scan += UTF8SKIP(scan);
10816                 hardcount++;
10817             }
10818             break;
10819         }
10820         break;
10821 
10822       case LNBREAK_t8:
10823         while (    hardcount < max && scan < this_eol
10824                && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10825         {
10826             scan += c;
10827             hardcount++;
10828         }
10829         break;
10830 
10831       case LNBREAK_tb:
10832         /* LNBREAK can match one or two latin chars, which is ok, but we have
10833          * to use hardcount in this situation, and throw away the adjustment to
10834          * <this_eol> done before the switch statement */
10835         while (
10836             hardcount < max && scan < loceol
10837             && (c = is_LNBREAK_latin1_safe(scan, loceol))
10838         ) {
10839             scan += c;
10840             hardcount++;
10841         }
10842         break;
10843 
10844     default:
10845         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10846                          " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10847         NOT_REACHED; /* NOTREACHED */
10848 
10849     }
10850 
10851     if (hardcount)
10852         c = hardcount;
10853     else
10854         c = scan - *startposp;
10855     *startposp = scan;
10856 
10857     DEBUG_r({
10858         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10859         DEBUG_EXECUTE_r({
10860             SV * const prop = sv_newmortal();
10861             regprop(prog, prop, p, reginfo, NULL);
10862             Perl_re_exec_indentf( aTHX_
10863                         "%s can match %" IVdf " times out of %" IVdf "...\n",
10864                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10865         });
10866     });
10867 
10868     return(c);
10869 }
10870 
10871 /*
10872  - reginclass - determine if a character falls into a character class
10873 
10874   n is the ANYOF-type regnode
10875   p is the target string
10876   p_end points to one byte beyond the end of the target string
10877   utf8_target tells whether p is in UTF-8.
10878 
10879   Returns true if matched; false otherwise.
10880 
10881   Note that this can be a synthetic start class, a combination of various
10882   nodes, so things you think might be mutually exclusive, such as locale,
10883   aren't.  It can match both locale and non-locale
10884 
10885  */
10886 
10887 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10888 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10889 {
10890     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10891                         ? 0
10892                         : ANYOF_FLAGS(n);
10893     bool match = FALSE;
10894     UV c = *p;
10895 
10896     PERL_ARGS_ASSERT_REGINCLASS;
10897 
10898     /* If c is not already the code point, get it.  Note that
10899      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10900     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10901         STRLEN c_len = 0;
10902         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10903         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10904         if (c_len == (STRLEN)-1) {
10905             _force_out_malformed_utf8_message(p, p_end,
10906                                               utf8n_flags,
10907                                               1 /* 1 means die */ );
10908             NOT_REACHED; /* NOTREACHED */
10909         }
10910         if (     c > 255
10911             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10912             && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10913         {
10914             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10915         }
10916     }
10917 
10918     /* If this character is potentially in the bitmap, check it */
10919     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10920         if (ANYOF_BITMAP_TEST(n, c))
10921             match = TRUE;
10922         else if (  (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10923                  && OP(n) == ANYOFD
10924                  && ! utf8_target
10925                  && ! isASCII(c))
10926         {
10927             match = TRUE;
10928         }
10929         else if (flags & ANYOF_LOCALE_FLAGS) {
10930             if (  (flags & ANYOFL_FOLD)
10931                 && c < 256
10932                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10933             {
10934                 match = TRUE;
10935             }
10936             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10937                      && c <= U8_MAX  /* param to isFOO_lc() */
10938             ) {
10939                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10940                  * if the class includes the Posix character class given by
10941                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10942                  * complemented Posix class given by int(bit/2), so the
10943                  * remainder modulo 2 tells us if to complement or not.
10944                  *
10945                  * Note that this code assumes that all the classes are closed
10946                  * under folding.  For example, if a character matches \w, then
10947                  * its fold does too; and vice versa.  This should be true for
10948                  * any well-behaved locale for all the currently defined Posix
10949                  * classes, except for :lower: and :upper:, which are handled
10950                  * by the pseudo-class :cased: which matches if either of the
10951                  * other two does.  To get rid of this assumption, an outer
10952                  * loop could be used below to iterate over both the source
10953                  * character, and its fold (if different) */
10954 
10955                 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10956 
10957                 do {
10958                     /* Find the next set bit indicating a class to try matching
10959                      * against */
10960                     U8 bit_pos = lsbit_pos32(posixl_bits);
10961 
10962                     if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10963                         match = TRUE;
10964                         break;
10965                     }
10966 
10967                     /* Remove this class from consideration; repeat */
10968                     POSIXL_CLEAR(posixl_bits, bit_pos);
10969                 } while(posixl_bits != 0);
10970             }
10971         }
10972     }
10973 
10974     /* If the bitmap didn't (or couldn't) match, and something outside the
10975      * bitmap could match, try that. */
10976     if (!match) {
10977         if (      c >= NUM_ANYOF_CODE_POINTS
10978             &&    ANYOF_ONLY_HAS_BITMAP(n)
10979             && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10980         {
10981             /* In this case, the ARG is set up so that the final bit indicates
10982              * whether it matches or not */
10983             match = ARG1u(n) & 1;
10984         }
10985         else
10986             /* Here, the main way it could match is if the code point is
10987              * outside the bitmap and an inversion list exists to handle such
10988              * things.  The other ways all occur when a flag is set to indicate
10989              * we need special handling.  That handling doesn't come in to
10990              * effect for ANYOFD nodes unless the target string is UTF-8 and
10991              * that matters to code point being matched. */
10992              if (    c >= NUM_ANYOF_CODE_POINTS
10993                  || (   (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
10994                      && (   UNLIKELY(OP(n) != ANYOFD)
10995                          || (utf8_target && ! isASCII_uvchr(c)
10996 #                               if NUM_ANYOF_CODE_POINTS > 256
10997                                                                && c < 256
10998 #                               endif
10999                                                                          ))))
11000         {
11001             /* Here, we have an inversion list for outside-the-bitmap code
11002              * points and/or the flag is set indicating special handling is
11003              * needed.  The flag is set when there is auxiliary data beyond the
11004              * normal inversion list, or if there is the possibility of a
11005              * special Turkic locale match, even without auxiliary data.
11006              *
11007              * First check if there is an inversion list and/or auxiliary data.
11008              * */
11009             if (ANYOF_HAS_AUX(n)) {
11010                 SV* only_utf8_locale = NULL;
11011 
11012                 /* This call will return in 'definition' the union of the base
11013                  * non-bitmap inversion list, if any, plus the deferred
11014                  * definitions of user-defined properties, if any.  It croaks
11015                  * if there is such a property but which still has no definition
11016                  * available */
11017                 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
11018                                                       &only_utf8_locale, NULL);
11019                 if (definition) {
11020                     /* Most likely is the outside-the-bitmap inversion list. */
11021                     if (_invlist_contains_cp(definition, c)) {
11022                         match = TRUE;
11023                     }
11024                     else /* Failing that, hardcode the two tests for a Turkic
11025                             locale */
11026                          if (   UNLIKELY(IN_UTF8_TURKIC_LOCALE)
11027                              && isALPHA_FOLD_EQ(*p, 'i'))
11028                     {
11029                         /* Turkish locales have these hard-coded rules
11030                          * overriding normal ones */
11031                         if (*p == 'i') {
11032                             if (_invlist_contains_cp(definition,
11033                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11034                             {
11035                                 match = TRUE;
11036                             }
11037                         }
11038                         else if (_invlist_contains_cp(definition,
11039                                                  LATIN_SMALL_LETTER_DOTLESS_I))
11040                         {
11041                             match = TRUE;
11042                         }
11043                     }
11044                 }
11045 
11046                 if (   UNLIKELY(only_utf8_locale)
11047                     && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
11048                     && ! match)
11049                 {
11050                     match = _invlist_contains_cp(only_utf8_locale, c);
11051                 }
11052             }
11053 
11054             /* In a Turkic locale under folding, hard-code the I i case pair
11055              * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
11056              * flag set unless [Ii] were match possibilities */
11057             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) {
11058                 if (utf8_target) {
11059                     if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11060                         if (ANYOF_BITMAP_TEST(n, 'i')) {
11061                             match = TRUE;
11062                         }
11063                     }
11064                     else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
11065                         if (ANYOF_BITMAP_TEST(n, 'I')) {
11066                             match = TRUE;
11067                         }
11068                     }
11069                 }
11070 
11071 #if NUM_ANYOF_CODE_POINTS > 256
11072                 /* Larger bitmap means these special cases aren't handled
11073                  * outside the bitmap above. */
11074                 if (*p == 'i') {
11075                     if (ANYOF_BITMAP_TEST(n,
11076                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11077                     {
11078                         match = TRUE;
11079                     }
11080                 }
11081                 else if (*p == 'I') {
11082                     if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
11083                         match = TRUE;
11084                     }
11085                 }
11086 #endif
11087             }
11088         }
11089 
11090         if (   UNICODE_IS_SUPER(c)
11091             && (flags & ANYOF_WARN_SUPER__shared)
11092             && OP(n) != ANYOFD
11093             && ckWARN_d(WARN_NON_UNICODE))
11094         {
11095             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
11096                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
11097         }
11098     }
11099 
11100 #if ANYOF_INVERT != 1
11101     /* Depending on compiler optimization cBOOL takes time, so if don't have to
11102      * use it, don't */
11103 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
11104 #endif
11105 
11106     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
11107     return (flags & ANYOF_INVERT) ^ match;
11108 }
11109 
11110 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)11111 S_reghop3(U8 *s, SSize_t off, const U8* lim)
11112 {
11113     /* return the position 'off' UTF-8 characters away from 's', forward if
11114      * 'off' >= 0, backwards if negative.  But don't go outside of position
11115      * 'lim', which better be < s  if off < 0 */
11116 
11117     PERL_ARGS_ASSERT_REGHOP3;
11118 
11119     if (off >= 0) {
11120         while (off-- && s < lim) {
11121             /* XXX could check well-formedness here */
11122             U8 *new_s = s + UTF8SKIP(s);
11123             if (new_s > lim) /* lim may be in the middle of a long character */
11124                 return s;
11125             s = new_s;
11126         }
11127     }
11128     else {
11129         while (off++ && s > lim) {
11130             s--;
11131             if (UTF8_IS_CONTINUED(*s)) {
11132                 while (s > lim && UTF8_IS_CONTINUATION(*s))
11133                     s--;
11134                 if (! UTF8_IS_START(*s)) {
11135                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11136                 }
11137             }
11138             /* XXX could check well-formedness here */
11139         }
11140     }
11141     return s;
11142 }
11143 
11144 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)11145 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
11146 {
11147     PERL_ARGS_ASSERT_REGHOP4;
11148 
11149     if (off >= 0) {
11150         while (off-- && s < rlim) {
11151             /* XXX could check well-formedness here */
11152             s += UTF8SKIP(s);
11153         }
11154     }
11155     else {
11156         while (off++ && s > llim) {
11157             s--;
11158             if (UTF8_IS_CONTINUED(*s)) {
11159                 while (s > llim && UTF8_IS_CONTINUATION(*s))
11160                     s--;
11161                 if (! UTF8_IS_START(*s)) {
11162                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11163                 }
11164             }
11165             /* XXX could check well-formedness here */
11166         }
11167     }
11168     return s;
11169 }
11170 
11171 /* like reghop3, but returns NULL on overrun, rather than returning last
11172  * char pos */
11173 
11174 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)11175 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
11176 {
11177     PERL_ARGS_ASSERT_REGHOPMAYBE3;
11178 
11179     if (off >= 0) {
11180         while (off-- && s < lim) {
11181             /* XXX could check well-formedness here */
11182             s += UTF8SKIP(s);
11183         }
11184         if (off >= 0)
11185             return NULL;
11186     }
11187     else {
11188         while (off++ && s > lim) {
11189             s--;
11190             if (UTF8_IS_CONTINUED(*s)) {
11191                 while (s > lim && UTF8_IS_CONTINUATION(*s))
11192                     s--;
11193                 if (! UTF8_IS_START(*s)) {
11194                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11195                 }
11196             }
11197             /* XXX could check well-formedness here */
11198         }
11199         if (off <= 0)
11200             return NULL;
11201     }
11202     return s;
11203 }
11204 
11205 
11206 /* when executing a regex that may have (?{}), extra stuff needs setting
11207    up that will be visible to the called code, even before the current
11208    match has finished. In particular:
11209 
11210    * $_ is localised to the SV currently being matched;
11211    * pos($_) is created if necessary, ready to be updated on each call-out
11212      to code;
11213    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11214      isn't set until the current pattern is successfully finished), so that
11215      $1 etc of the match-so-far can be seen;
11216    * save the old values of subbeg etc of the current regex, and  set then
11217      to the current string (again, this is normally only done at the end
11218      of execution)
11219 */
11220 
11221 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)11222 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11223 {
11224     MAGIC *mg;
11225     regexp *const rex = ReANY(reginfo->prog);
11226     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11227 
11228     eval_state->rex = rex;
11229     eval_state->sv  = reginfo->sv;
11230 
11231     if (reginfo->sv) {
11232         /* Make $_ available to executed code. */
11233         if (reginfo->sv != DEFSV) {
11234             SAVE_DEFSV;
11235             DEFSV_set(reginfo->sv);
11236         }
11237         /* will be dec'd by S_cleanup_regmatch_info_aux */
11238         SvREFCNT_inc_NN(reginfo->sv);
11239 
11240         if (!(mg = mg_find_mglob(reginfo->sv))) {
11241             /* prepare for quick setting of pos */
11242             mg = sv_magicext_mglob(reginfo->sv);
11243             mg->mg_len = -1;
11244         }
11245         eval_state->pos_magic = mg;
11246         eval_state->pos       = mg->mg_len;
11247         eval_state->pos_flags = mg->mg_flags;
11248     }
11249     else
11250         eval_state->pos_magic = NULL;
11251 
11252     if (!PL_reg_curpm) {
11253         /* PL_reg_curpm is a fake PMOP that we can attach the current
11254          * regex to and point PL_curpm at, so that $1 et al are visible
11255          * within a /(?{})/. It's just allocated once per interpreter the
11256          * first time its needed */
11257         Newxz(PL_reg_curpm, 1, PMOP);
11258 #ifdef USE_ITHREADS
11259         {
11260             SV* const repointer = &PL_sv_undef;
11261             /* this regexp is also owned by the new PL_reg_curpm, which
11262                will try to free it.  */
11263             av_push(PL_regex_padav, repointer);
11264             PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11265             PL_regex_pad = AvARRAY(PL_regex_padav);
11266         }
11267 #endif
11268     }
11269     SET_reg_curpm(reginfo->prog);
11270     eval_state->curpm = PL_curpm;
11271     PL_curpm_under = PL_curpm;
11272     PL_curpm = PL_reg_curpm;
11273     if (RXp_MATCH_COPIED(rex)) {
11274         /*  Here is a serious problem: we cannot rewrite subbeg,
11275             since it may be needed if this match fails.  Thus
11276             $` inside (?{}) could fail... */
11277         eval_state->subbeg     = RXp_SUBBEG(rex);
11278         eval_state->sublen     = RXp_SUBLEN(rex);
11279         eval_state->suboffset  = RXp_SUBOFFSET(rex);
11280         eval_state->subcoffset = RXp_SUBCOFFSET(rex);
11281 #ifdef PERL_ANY_COW
11282         eval_state->saved_copy = RXp_SAVED_COPY(rex);
11283 #endif
11284         RXp_MATCH_COPIED_off(rex);
11285     }
11286     else
11287         eval_state->subbeg = NULL;
11288     RXp_SUBBEG(rex) = (char *)reginfo->strbeg;
11289     RXp_SUBOFFSET(rex) = 0;
11290     RXp_SUBCOFFSET(rex) = 0;
11291     RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg;
11292 }
11293 
11294 
11295 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11296 
11297 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)11298 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11299 {
11300     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11301     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
11302     regmatch_slab *s;
11303 
11304     Safefree(aux->poscache);
11305 
11306     if (eval_state) {
11307 
11308         /* undo the effects of S_setup_eval_state() */
11309 
11310         if (eval_state->subbeg) {
11311             regexp * const rex = eval_state->rex;
11312             RXp_SUBBEG(rex) = eval_state->subbeg;
11313             RXp_SUBLEN(rex)     = eval_state->sublen;
11314             RXp_SUBOFFSET(rex)  = eval_state->suboffset;
11315             RXp_SUBCOFFSET(rex) = eval_state->subcoffset;
11316 #ifdef PERL_ANY_COW
11317             RXp_SAVED_COPY(rex) = eval_state->saved_copy;
11318 #endif
11319             RXp_MATCH_COPIED_on(rex);
11320         }
11321         if (eval_state->pos_magic)
11322         {
11323             eval_state->pos_magic->mg_len = eval_state->pos;
11324             eval_state->pos_magic->mg_flags =
11325                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11326                | (eval_state->pos_flags & MGf_BYTES);
11327         }
11328 
11329         PL_curpm = eval_state->curpm;
11330         SvREFCNT_dec(eval_state->sv);
11331     }
11332 
11333     PL_regmatch_state = aux->old_regmatch_state;
11334     PL_regmatch_slab  = aux->old_regmatch_slab;
11335 
11336     /* free all slabs above current one - this must be the last action
11337      * of this function, as aux and eval_state are allocated within
11338      * slabs and may be freed here */
11339 
11340     s = PL_regmatch_slab->next;
11341     if (s) {
11342         PL_regmatch_slab->next = NULL;
11343         while (s) {
11344             regmatch_slab * const osl = s;
11345             s = s->next;
11346             Safefree(osl);
11347         }
11348     }
11349 }
11350 
11351 
11352 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)11353 S_to_utf8_substr(pTHX_ regexp *prog)
11354 {
11355     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11356      * on the converted value */
11357 
11358     int i = 1;
11359 
11360     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11361 
11362     do {
11363         if (prog->substrs->data[i].substr
11364             && !prog->substrs->data[i].utf8_substr) {
11365             SV* const sv = newSVsv(prog->substrs->data[i].substr);
11366             prog->substrs->data[i].utf8_substr = sv;
11367             sv_utf8_upgrade(sv);
11368             if (SvVALID(prog->substrs->data[i].substr)) {
11369                 if (SvTAIL(prog->substrs->data[i].substr)) {
11370                     /* Trim the trailing \n that fbm_compile added last
11371                        time.  */
11372                     SvCUR_set(sv, SvCUR(sv) - 1);
11373                     /* Whilst this makes the SV technically "invalid" (as its
11374                        buffer is no longer followed by "\0") when fbm_compile()
11375                        adds the "\n" back, a "\0" is restored.  */
11376                     fbm_compile(sv, FBMcf_TAIL);
11377                 } else
11378                     fbm_compile(sv, 0);
11379             }
11380             if (prog->substrs->data[i].substr == prog->check_substr)
11381                 prog->check_utf8 = sv;
11382         }
11383     } while (i--);
11384 }
11385 
11386 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11387 S_to_byte_substr(pTHX_ regexp *prog)
11388 {
11389     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11390      * on the converted value; returns FALSE if can't be converted. */
11391 
11392     int i = 1;
11393 
11394     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11395 
11396     do {
11397         if (prog->substrs->data[i].utf8_substr
11398             && !prog->substrs->data[i].substr) {
11399             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11400             if (! sv_utf8_downgrade(sv, TRUE)) {
11401                 SvREFCNT_dec_NN(sv);
11402                 return FALSE;
11403             }
11404             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11405                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11406                     /* Trim the trailing \n that fbm_compile added last
11407                         time.  */
11408                     SvCUR_set(sv, SvCUR(sv) - 1);
11409                     fbm_compile(sv, FBMcf_TAIL);
11410                 } else
11411                     fbm_compile(sv, 0);
11412             }
11413             prog->substrs->data[i].substr = sv;
11414             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11415                 prog->check_substr = sv;
11416         }
11417     } while (i--);
11418 
11419     return TRUE;
11420 }
11421 
11422 #ifndef PERL_IN_XSUB_RE
11423 
11424 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11425 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11426 {
11427     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11428      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11429      * the larger string bounded by 'strbeg' and 'strend'.
11430      *
11431      * 'cp' needs to be assigned (if not, a future version of the Unicode
11432      * Standard could make it something that combines with adjacent characters,
11433      * so code using it would then break), and there has to be a GCB break
11434      * before and after the character. */
11435 
11436 
11437     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11438     const U8 * prev_cp_start;
11439 
11440     PERL_ARGS_ASSERT_IS_GRAPHEME;
11441 
11442     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11443         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11444     {
11445         /* These are considered graphemes */
11446         return TRUE;
11447     }
11448 
11449     /* Otherwise, unassigned code points are forbidden */
11450     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11451                                     _invlist_search(PL_Assigned_invlist, cp))))
11452     {
11453         return FALSE;
11454     }
11455 
11456     cp_gcb_val = getGCB_VAL_CP(cp);
11457 
11458     /* Find the GCB value of the previous code point in the input */
11459     prev_cp_start = utf8_hop_back(s, -1, strbeg);
11460     if (UNLIKELY(prev_cp_start == s)) {
11461         prev_cp_gcb_val = GCB_EDGE;
11462     }
11463     else {
11464         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11465     }
11466 
11467     /* And check that is a grapheme boundary */
11468     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11469                 TRUE /* is UTF-8 encoded */ ))
11470     {
11471         return FALSE;
11472     }
11473 
11474     /* Similarly verify there is a break between the current character and the
11475      * following one */
11476     s += UTF8SKIP(s);
11477     if (s >= strend) {
11478         next_cp_gcb_val = GCB_EDGE;
11479     }
11480     else {
11481         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11482     }
11483 
11484     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11485 }
11486 
11487 /*
11488 =for apidoc_section $unicode
11489 
11490 =for apidoc isSCRIPT_RUN
11491 
11492 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11493 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11494 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11495 two degenerate cases given below, this function returns TRUE iff all code
11496 points in it come from any combination of three "scripts" given by the Unicode
11497 "Script Extensions" property: Common, Inherited, and possibly one other.
11498 Additionally all decimal digits must come from the same consecutive sequence of
11499 10.
11500 
11501 For example, if all the characters in the sequence are Greek, or Common, or
11502 Inherited, this function will return TRUE, provided any decimal digits in it
11503 are from the same block of digits in Common.  (These are the ASCII digits
11504 "0".."9" and additionally a block for full width forms of these, and several
11505 others used in mathematical notation.)   For scripts (unlike Greek) that have
11506 their own digits defined this will accept either digits from that set or from
11507 one of the Common digit sets, but not a combination of the two.  Some scripts,
11508 such as Arabic, have more than one set of digits.  All digits must come from
11509 the same set for this function to return TRUE.
11510 
11511 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11512 contain the script found, using the C<SCX_enum> typedef.  Its value will be
11513 C<SCX_INVALID> if the function returns FALSE.
11514 
11515 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11516 will be C<SCX_INVALID>.
11517 
11518 If the sequence contains a single code point which is unassigned to a character
11519 in the version of Unicode being used, the function will return TRUE, and the
11520 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11521 in the input sequence will result in the function treating the input as not
11522 being a script run.
11523 
11524 The returned script will be C<SCX_Inherited> iff all the code points in it are
11525 from the Inherited script.
11526 
11527 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11528 it are from the Inherited or Common scripts.
11529 
11530 =cut
11531 
11532 */
11533 
11534 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11535 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11536 {
11537     /* Basically, it looks at each character in the sequence to see if the
11538      * above conditions are met; if not it fails.  It uses an inversion map to
11539      * find the enum corresponding to the script of each character.  But this
11540      * is complicated by the fact that a few code points can be in any of
11541      * several scripts.  The data has been constructed so that there are
11542      * additional enum values (all negative) for these situations.  The
11543      * absolute value of those is an index into another table which contains
11544      * pointers to auxiliary tables for each such situation.  Each aux array
11545      * lists all the scripts for the given situation.  There is another,
11546      * parallel, table that gives the number of entries in each aux table.
11547      * These are all defined in charclass_invlists.h */
11548 
11549     /* XXX Here are the additional things UTS 39 says could be done:
11550      *
11551      * Forbid sequences of the same nonspacing mark
11552      *
11553      * Check to see that all the characters are in the sets of exemplar
11554      * characters for at least one language in the Unicode Common Locale Data
11555      * Repository [CLDR]. */
11556 
11557 
11558     /* Things that match /\d/u */
11559     SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11560     UV * decimals_array = invlist_array(decimals_invlist);
11561 
11562     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11563      * not currently known) */
11564     UV zero_of_run = 0;
11565 
11566     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11567     SCX_enum script_of_char = SCX_INVALID;
11568 
11569     /* If the script remains not fully determined from iteration to iteration,
11570      * this is the current intersection of the possiblities.  */
11571     SCX_enum * intersection = NULL;
11572     PERL_UINT_FAST8_T intersection_len = 0;
11573 
11574     bool retval = TRUE;
11575     SCX_enum * ret_script = NULL;
11576 
11577     assert(send >= s);
11578 
11579     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11580 
11581     /* All code points in 0..255 are either Common or Latin, so must be a
11582      * script run.  We can return immediately unless we need to know which
11583      * script it is. */
11584     if (! utf8_target && LIKELY(send > s)) {
11585         if (ret_script == NULL) {
11586             return TRUE;
11587         }
11588 
11589         /* If any character is Latin, the run is Latin */
11590         while (s < send) {
11591             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11592                 *ret_script = SCX_Latin;
11593                 return TRUE;
11594             }
11595         }
11596 
11597         /* Here, all are Common */
11598         *ret_script = SCX_Common;
11599         return TRUE;
11600     }
11601 
11602     /* Look at each character in the sequence */
11603     while (s < send) {
11604         /* If the current character being examined is a digit, this is the code
11605          * point of the zero for its sequence of 10 */
11606         UV zero_of_char;
11607 
11608         UV cp;
11609 
11610         /* The code allows all scripts to use the ASCII digits.  This is
11611          * because they are in the Common script.  Hence any ASCII ones found
11612          * are ok, unless and until a digit from another set has already been
11613          * encountered.  digit ranges in Common are not similarly blessed) */
11614         if (UNLIKELY(isDIGIT(*s))) {
11615             if (UNLIKELY(script_of_run == SCX_Unknown)) {
11616                 retval = FALSE;
11617                 break;
11618             }
11619             if (zero_of_run) {
11620                 if (zero_of_run != '0') {
11621                     retval = FALSE;
11622                     break;
11623                 }
11624             }
11625             else {
11626                 zero_of_run = '0';
11627             }
11628             s++;
11629             continue;
11630         }
11631 
11632         /* Here, isn't an ASCII digit.  Find the code point of the character */
11633         if (! UTF8_IS_INVARIANT(*s)) {
11634             Size_t len;
11635             cp = valid_utf8_to_uvchr((U8 *) s, &len);
11636             s += len;
11637         }
11638         else {
11639             cp = *(s++);
11640         }
11641 
11642         /* If is within the range [+0 .. +9] of the script's zero, it also is a
11643          * digit in that script.  We can skip the rest of this code for this
11644          * character. */
11645         if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11646             continue;
11647         }
11648 
11649         /* Find the character's script.  The correct values are hard-coded here
11650          * for small-enough code points. */
11651         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11652                                unlikely to change */
11653             if (       cp > 255
11654                 || (   isALPHA_L1(cp)
11655                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
11656             {
11657                 script_of_char = SCX_Latin;
11658             }
11659             else {
11660                 script_of_char = SCX_Common;
11661             }
11662         }
11663         else {
11664             script_of_char = _Perl_SCX_invmap[
11665                                        _invlist_search(PL_SCX_invlist, cp)];
11666         }
11667 
11668         /* We arbitrarily accept a single unassigned character, but not in
11669          * combination with anything else, and not a run of them. */
11670         if (   UNLIKELY(script_of_run == SCX_Unknown)
11671             || UNLIKELY(   script_of_run != SCX_INVALID
11672                         && script_of_char == SCX_Unknown))
11673         {
11674             retval = FALSE;
11675             break;
11676         }
11677 
11678         /* For the first character, or the run is inherited, the run's script
11679          * is set to the char's */
11680         if (   UNLIKELY(script_of_run == SCX_INVALID)
11681             || UNLIKELY(script_of_run == SCX_Inherited))
11682         {
11683             script_of_run = script_of_char;
11684         }
11685 
11686         /* For the character's script to be Unknown, it must be the first
11687          * character in the sequence (for otherwise a test above would have
11688          * prevented us from reaching here), and we have set the run's script
11689          * to it.  Nothing further to be done for this character */
11690         if (UNLIKELY(script_of_char == SCX_Unknown)) {
11691             continue;
11692         }
11693 
11694         /* We accept 'inherited' script characters currently even at the
11695          * beginning.  (We know that no characters in Inherited are digits, or
11696          * we'd have to check for that) */
11697         if (UNLIKELY(script_of_char == SCX_Inherited)) {
11698             continue;
11699         }
11700 
11701         /* If the run so far is Common, and the new character isn't, change the
11702          * run's script to that of this character */
11703         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11704             script_of_run = script_of_char;
11705         }
11706 
11707         /* Now we can see if the script of the new character is the same as
11708          * that of the run */
11709         if (LIKELY(script_of_char == script_of_run)) {
11710             /* By far the most common case */
11711             goto scripts_match;
11712         }
11713 
11714         /* Here, the script of the run isn't Common.  But characters in Common
11715          * match any script */
11716         if (script_of_char == SCX_Common) {
11717             goto scripts_match;
11718         }
11719 
11720 #ifndef HAS_SCX_AUX_TABLES
11721 
11722         /* Too early a Unicode version to have a code point belonging to more
11723          * than one script, so, if the scripts don't exactly match, fail */
11724         PERL_UNUSED_VAR(intersection_len);
11725         retval = FALSE;
11726         break;
11727 
11728 #else
11729 
11730         /* Here there is no exact match between the character's script and the
11731          * run's.  And we've handled the special cases of scripts Unknown,
11732          * Inherited, and Common.
11733          *
11734          * Negative script numbers signify that the value may be any of several
11735          * scripts, and we need to look at auxiliary information to make our
11736          * determination.  But if both are non-negative, we can fail now */
11737         if (LIKELY(script_of_char >= 0)) {
11738             const SCX_enum * search_in;
11739             PERL_UINT_FAST8_T search_in_len;
11740             PERL_UINT_FAST8_T i;
11741 
11742             if (LIKELY(script_of_run >= 0)) {
11743                 retval = FALSE;
11744                 break;
11745             }
11746 
11747             /* Use the previously constructed set of possible scripts, if any.
11748              * */
11749             if (intersection) {
11750                 search_in = intersection;
11751                 search_in_len = intersection_len;
11752             }
11753             else {
11754                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11755                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11756             }
11757 
11758             for (i = 0; i < search_in_len; i++) {
11759                 if (search_in[i] == script_of_char) {
11760                     script_of_run = script_of_char;
11761                     goto scripts_match;
11762                 }
11763             }
11764 
11765             retval = FALSE;
11766             break;
11767         }
11768         else if (LIKELY(script_of_run >= 0)) {
11769             /* script of character could be one of several, but run is a single
11770              * script */
11771             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11772             const PERL_UINT_FAST8_T search_in_len
11773                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11774             PERL_UINT_FAST8_T i;
11775 
11776             for (i = 0; i < search_in_len; i++) {
11777                 if (search_in[i] == script_of_run) {
11778                     script_of_char = script_of_run;
11779                     goto scripts_match;
11780                 }
11781             }
11782 
11783             retval = FALSE;
11784             break;
11785         }
11786         else {
11787             /* Both run and char could be in one of several scripts.  If the
11788              * intersection is empty, then this character isn't in this script
11789              * run.  Otherwise, we need to calculate the intersection to use
11790              * for future iterations of the loop, unless we are already at the
11791              * final character */
11792             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11793             const PERL_UINT_FAST8_T char_len
11794                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11795             const SCX_enum * search_run;
11796             PERL_UINT_FAST8_T run_len;
11797 
11798             SCX_enum * new_overlap = NULL;
11799             PERL_UINT_FAST8_T i, j;
11800 
11801             if (intersection) {
11802                 search_run = intersection;
11803                 run_len = intersection_len;
11804             }
11805             else {
11806                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11807                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11808             }
11809 
11810             intersection_len = 0;
11811 
11812             for (i = 0; i < run_len; i++) {
11813                 for (j = 0; j < char_len; j++) {
11814                     if (search_run[i] == search_char[j]) {
11815 
11816                         /* Here, the script at i,j matches.  That means this
11817                          * character is in the run.  But continue on to find
11818                          * the complete intersection, for the next loop
11819                          * iteration, and for the digit check after it.
11820                          *
11821                          * On the first found common script, we malloc space
11822                          * for the intersection list for the worst case of the
11823                          * intersection, which is the minimum of the number of
11824                          * scripts remaining in each set. */
11825                         if (intersection_len == 0) {
11826                             Newx(new_overlap,
11827                                  MIN(run_len - i, char_len - j),
11828                                  SCX_enum);
11829                         }
11830                         new_overlap[intersection_len++] = search_run[i];
11831                     }
11832                 }
11833             }
11834 
11835             /* Here we've looked through everything.  If they have no scripts
11836              * in common, not a run */
11837             if (intersection_len == 0) {
11838                 retval = FALSE;
11839                 break;
11840             }
11841 
11842             /* If there is only a single script in common, set to that.
11843              * Otherwise, use the intersection going forward */
11844             Safefree(intersection);
11845             intersection = NULL;
11846             if (intersection_len == 1) {
11847                 script_of_run = script_of_char = new_overlap[0];
11848                 Safefree(new_overlap);
11849                 new_overlap = NULL;
11850             }
11851             else {
11852                 intersection = new_overlap;
11853             }
11854         }
11855 
11856 #endif
11857 
11858   scripts_match:
11859 
11860         /* Here, the script of the character is compatible with that of the
11861          * run.  That means that in most cases, it continues the script run.
11862          * Either it and the run match exactly, or one or both can be in any of
11863          * several scripts, and the intersection is not empty.  However, if the
11864          * character is a decimal digit, it could still mean failure if it is
11865          * from the wrong sequence of 10.  So, we need to look at if it's a
11866          * digit.  We've already handled the 10 digits [0-9], and the next
11867          * lowest one is this one: */
11868         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11869             continue;   /* Not a digit; this character is part of the run */
11870         }
11871 
11872         /* If we have a definitive '0' for the script of this character, we
11873          * know that for this to be a digit, it must be in the range of +0..+9
11874          * of that zero. */
11875         if (   script_of_char >= 0
11876             && (zero_of_char = script_zeros[script_of_char]))
11877         {
11878             if (! withinCOUNT(cp, zero_of_char, 9)) {
11879                 continue;   /* Not a digit; this character is part of the run
11880                              */
11881             }
11882 
11883         }
11884         else {  /* Need to look up if this character is a digit or not */
11885             SSize_t index_of_zero_of_char;
11886             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11887             if (     UNLIKELY(index_of_zero_of_char < 0)
11888                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11889             {
11890                 continue;   /* Not a digit; this character is part of the run.
11891                              */
11892             }
11893 
11894             zero_of_char = decimals_array[index_of_zero_of_char];
11895         }
11896 
11897         /* Here, the character is a decimal digit, and the zero of its sequence
11898          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11899          * they better be the same. */
11900         if (zero_of_run) {
11901             if (zero_of_run != zero_of_char) {
11902                 retval = FALSE;
11903                 break;
11904             }
11905         }
11906         else {  /* Otherwise we now have a zero for this run */
11907             zero_of_run = zero_of_char;
11908         }
11909     } /* end of looping through CLOSESR text */
11910 
11911     Safefree(intersection);
11912 
11913     if (ret_script != NULL) {
11914         if (retval) {
11915             *ret_script = script_of_run;
11916         }
11917         else {
11918             *ret_script = SCX_INVALID;
11919         }
11920     }
11921 
11922     return retval;
11923 }
11924 #endif /* ifndef PERL_IN_XSUB_RE */
11925 
11926 /* Buffer logic. */
11927 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)11928 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
11929                     const U32 flags)
11930 {
11931     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
11932 
11933     PERL_UNUSED_ARG(value);
11934 
11935     if (flags & RXapif_FETCH) {
11936         return reg_named_buff_fetch(rx, key, flags);
11937     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
11938         Perl_croak_no_modify();
11939         return NULL;
11940     } else if (flags & RXapif_EXISTS) {
11941         return reg_named_buff_exists(rx, key, flags)
11942             ? &PL_sv_yes
11943             : &PL_sv_no;
11944     } else if (flags & RXapif_REGNAMES) {
11945         return reg_named_buff_all(rx, flags);
11946     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
11947         return reg_named_buff_scalar(rx, flags);
11948     } else {
11949         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
11950         return NULL;
11951     }
11952 }
11953 
11954 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)11955 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
11956                          const U32 flags)
11957 {
11958     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
11959     PERL_UNUSED_ARG(lastkey);
11960 
11961     if (flags & RXapif_FIRSTKEY)
11962         return reg_named_buff_firstkey(rx, flags);
11963     else if (flags & RXapif_NEXTKEY)
11964         return reg_named_buff_nextkey(rx, flags);
11965     else {
11966         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
11967                                             (int)flags);
11968         return NULL;
11969     }
11970 }
11971 
11972 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)11973 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
11974                           const U32 flags)
11975 {
11976     SV *ret;
11977     struct regexp *const rx = ReANY(r);
11978 
11979     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
11980 
11981     if (rx && RXp_PAREN_NAMES(rx)) {
11982         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
11983         if (he_str) {
11984             IV i;
11985             SV* sv_dat=HeVAL(he_str);
11986             I32 *nums=(I32*)SvPVX(sv_dat);
11987             AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
11988             for ( i=0; i<SvIVX(sv_dat); i++ ) {
11989                 if ((I32)(rx->nparens) >= nums[i]
11990                     && RXp_OFFS_VALID(rx,nums[i]))
11991                 {
11992                     ret = newSVpvs("");
11993                     Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE);
11994                     if (!retarray)
11995                         return ret;
11996                 } else {
11997                     if (retarray)
11998                         ret = newSV_type(SVt_NULL);
11999                 }
12000                 if (retarray)
12001                     av_push_simple(retarray, ret);
12002             }
12003             if (retarray)
12004                 return newRV_noinc(MUTABLE_SV(retarray));
12005         }
12006     }
12007     return NULL;
12008 }
12009 
12010 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)12011 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
12012                            const U32 flags)
12013 {
12014     struct regexp *const rx = ReANY(r);
12015 
12016     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
12017 
12018     if (rx && RXp_PAREN_NAMES(rx)) {
12019         if (flags & RXapif_ALL) {
12020             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
12021         } else {
12022             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
12023             if (sv) {
12024                 SvREFCNT_dec_NN(sv);
12025                 return TRUE;
12026             } else {
12027                 return FALSE;
12028             }
12029         }
12030     } else {
12031         return FALSE;
12032     }
12033 }
12034 
12035 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)12036 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
12037 {
12038     struct regexp *const rx = ReANY(r);
12039 
12040     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
12041 
12042     if ( rx && RXp_PAREN_NAMES(rx) ) {
12043         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
12044 
12045         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
12046     } else {
12047         return FALSE;
12048     }
12049 }
12050 
12051 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)12052 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
12053 {
12054     struct regexp *const rx = ReANY(r);
12055     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12056 
12057     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
12058 
12059     if (rx && RXp_PAREN_NAMES(rx)) {
12060         HV *hv = RXp_PAREN_NAMES(rx);
12061         HE *temphe;
12062         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12063             IV i;
12064             IV parno = 0;
12065             SV* sv_dat = HeVAL(temphe);
12066             I32 *nums = (I32*)SvPVX(sv_dat);
12067             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12068                 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12069                     RXp_OFFS_VALID(rx,nums[i]))
12070                 {
12071                     parno = nums[i];
12072                     break;
12073                 }
12074             }
12075             if (parno || flags & RXapif_ALL) {
12076                 return newSVhek(HeKEY_hek(temphe));
12077             }
12078         }
12079     }
12080     return NULL;
12081 }
12082 
12083 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)12084 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
12085 {
12086     SV *ret;
12087     AV *av;
12088     SSize_t length;
12089     struct regexp *const rx = ReANY(r);
12090 
12091     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
12092 
12093     if (rx && RXp_PAREN_NAMES(rx)) {
12094         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
12095             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
12096         } else if (flags & RXapif_ONE) {
12097             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
12098             av = MUTABLE_AV(SvRV(ret));
12099             length = av_count(av);
12100             SvREFCNT_dec_NN(ret);
12101             return newSViv(length);
12102         } else {
12103             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
12104                                                 (int)flags);
12105             return NULL;
12106         }
12107     }
12108     return &PL_sv_undef;
12109 }
12110 
12111 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)12112 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
12113 {
12114     struct regexp *const rx = ReANY(r);
12115     AV *av = newAV();
12116 
12117     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
12118 
12119     if (rx && RXp_PAREN_NAMES(rx)) {
12120         HV *hv= RXp_PAREN_NAMES(rx);
12121         HE *temphe;
12122         (void)hv_iterinit(hv);
12123         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12124             IV i;
12125             IV parno = 0;
12126             SV* sv_dat = HeVAL(temphe);
12127             I32 *nums = (I32*)SvPVX(sv_dat);
12128             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12129                 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12130                     RXp_OFFS_VALID(rx,nums[i]))
12131                 {
12132                     parno = nums[i];
12133                     break;
12134                 }
12135             }
12136             if (parno || flags & RXapif_ALL) {
12137                 av_push_simple(av, newSVhek(HeKEY_hek(temphe)));
12138             }
12139         }
12140     }
12141 
12142     return newRV_noinc(MUTABLE_SV(av));
12143 }
12144 
12145 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re,const I32 paren,SV * const sv)12146 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren,
12147                              SV * const sv)
12148 {
12149     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
12150     Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0);
12151 }
12152 
12153 #ifndef PERL_IN_XSUB_RE
12154 
12155 void
Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re,const I32 paren,SV * const sv,U32 flags)12156 Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren,
12157                                    SV * const sv, U32 flags)
12158 {
12159     struct regexp *const rx = ReANY(re);
12160     char *s = NULL;
12161     SSize_t i,t = 0;
12162     SSize_t s1, t1;
12163     I32 n = paren;
12164     I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12165 
12166     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS;
12167 
12168     if (      n == RX_BUFF_IDX_CARET_PREMATCH
12169            || n == RX_BUFF_IDX_CARET_FULLMATCH
12170            || n == RX_BUFF_IDX_CARET_POSTMATCH
12171        )
12172     {
12173         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12174         if (!keepcopy) {
12175             /* on something like
12176              *    $r = qr/.../;
12177              *    /$qr/p;
12178              * the KEEPCOPY is set on the PMOP rather than the regex */
12179             if (PL_curpm && re == PM_GETRE(PL_curpm))
12180                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12181         }
12182         if (!keepcopy)
12183             goto ret_undef;
12184     }
12185 
12186     if (!RXp_SUBBEG(rx))
12187         goto ret_undef;
12188 
12189     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
12190         /* no need to distinguish between them any more */
12191         n = RX_BUFF_IDX_FULLMATCH;
12192 
12193     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
12194         && (i = RXp_OFFS_START(rx,0)) != -1)
12195     {
12196         /* $`, ${^PREMATCH} */
12197         s = RXp_SUBBEG(rx);
12198     }
12199     else
12200     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
12201         && (t = RXp_OFFS_END(rx,0)) != -1)
12202     {
12203         /* $', ${^POSTMATCH} */
12204         s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t;
12205         i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t;
12206     }
12207     else /* when flags is true we do an absolute lookup, and compare against rx->nparens */
12208     if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) {
12209         I32 *map = (!flags && n) ? rx->logical_to_parno : NULL;
12210         I32 true_parno = map ? map[n] : n;
12211         do {
12212             if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1)  &&
12213                 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1))
12214             {
12215                 /* $&, ${^MATCH}, $1 ... */
12216                 i = t1 - s1;
12217                 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx);
12218                 goto found_it;
12219             }
12220             else if (map) {
12221                 true_parno = rx->parno_to_logical_next[true_parno];
12222             }
12223             else {
12224                 break;
12225             }
12226         } while (true_parno);
12227         goto ret_undef;
12228     } else {
12229         goto ret_undef;
12230     }
12231 
12232   found_it:
12233     assert(s >= RXp_SUBBEG(rx));
12234     assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) );
12235     if (i >= 0) {
12236 #ifdef NO_TAINT_SUPPORT
12237         sv_setpvn(sv, s, i);
12238 #else
12239         const int oldtainted = TAINT_get;
12240         TAINT_NOT;
12241         sv_setpvn(sv, s, i);
12242         TAINT_set(oldtainted);
12243 #endif
12244         if (RXp_MATCH_UTF8(rx))
12245             SvUTF8_on(sv);
12246         else
12247             SvUTF8_off(sv);
12248         if (TAINTING_get) {
12249             if (RXp_MATCH_TAINTED(rx)) {
12250                 if (SvTYPE(sv) >= SVt_PVMG) {
12251                     MAGIC* const mg = SvMAGIC(sv);
12252                     MAGIC* mgt;
12253                     TAINT;
12254                     SvMAGIC_set(sv, mg->mg_moremagic);
12255                     SvTAINT(sv);
12256                     if ((mgt = SvMAGIC(sv))) {
12257                         mg->mg_moremagic = mgt;
12258                         SvMAGIC_set(sv, mg);
12259                     }
12260                 } else {
12261                     TAINT;
12262                     SvTAINT(sv);
12263                 }
12264             } else
12265                 SvTAINTED_off(sv);
12266         }
12267     } else {
12268       ret_undef:
12269         sv_set_undef(sv);
12270         return;
12271     }
12272 }
12273 
12274 #endif
12275 
12276 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)12277 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
12278                                                          SV const * const value)
12279 {
12280     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
12281 
12282     PERL_UNUSED_ARG(rx);
12283     PERL_UNUSED_ARG(paren);
12284     PERL_UNUSED_ARG(value);
12285 
12286     if (!PL_localizing)
12287         Perl_croak_no_modify();
12288 }
12289 
12290 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)12291 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
12292                               const I32 paren)
12293 {
12294     struct regexp *const rx = ReANY(r);
12295     I32 i,j;
12296     I32 s1, t1;
12297     I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12298 
12299     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
12300 
12301     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
12302         || paren == RX_BUFF_IDX_CARET_FULLMATCH
12303         || paren == RX_BUFF_IDX_CARET_POSTMATCH
12304     )
12305     {
12306         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12307         if (!keepcopy) {
12308             /* on something like
12309              *    $r = qr/.../;
12310              *    /$qr/p;
12311              * the KEEPCOPY is set on the PMOP rather than the regex */
12312             if (PL_curpm && r == PM_GETRE(PL_curpm))
12313                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12314         }
12315         if (!keepcopy)
12316             goto warn_undef;
12317     }
12318 
12319     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
12320     switch (paren) {
12321       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
12322       case RX_BUFF_IDX_PREMATCH:       /* $` */
12323         if ( (i = RXp_OFFS_START(rx,0)) != -1) {
12324             if (i > 0) {
12325                 s1 = 0;
12326                 t1 = i;
12327                 goto getlen;
12328             }
12329         }
12330         return 0;
12331 
12332       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
12333       case RX_BUFF_IDX_POSTMATCH:       /* $' */
12334         if ( (j = RXp_OFFS_END(rx,0)) != -1 ) {
12335             i = RXp_SUBLEN(rx) - j;
12336             if (i > 0) {
12337                 s1 = j;
12338                 t1 = RXp_SUBLEN(rx);
12339                 goto getlen;
12340             }
12341         }
12342         return 0;
12343 
12344       default: /* $& / ${^MATCH}, $1, $2, ... */
12345         if (paren <= logical_nparens) {
12346             I32 true_paren = rx->logical_to_parno
12347                              ? rx->logical_to_parno[paren]
12348                              : paren;
12349             do {
12350                 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) &&
12351                     ((t1 = RXp_OFFS_END(rx,true_paren)) != -1))
12352                 {
12353                     i = t1 - s1;
12354                     goto getlen;
12355                 } else if (rx->parno_to_logical_next) {
12356                     true_paren = rx->parno_to_logical_next[true_paren];
12357                 } else {
12358                     break;
12359                 }
12360             } while(true_paren);
12361         }
12362       warn_undef:
12363         if (ckWARN(WARN_UNINITIALIZED))
12364             report_uninit((const SV *)sv);
12365         return 0;
12366     }
12367   getlen:
12368     if (i > 0 && RXp_MATCH_UTF8(rx)) {
12369         const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1;
12370         const U8 *ep;
12371         STRLEN el;
12372 
12373         i = t1 - s1;
12374         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
12375             i = el;
12376     }
12377     return i;
12378 }
12379 
12380 /*
12381  * ex: set ts=8 sts=4 sw=4 et:
12382  */
12383