1 /* regexec.c
2 */
3
4 /*
5 * One Ring to rule them all, One Ring to find them
6 *
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34 */
35
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39
40 /*
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
65 ****
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEXEC_C
75 #include "perl.h"
76
77 #ifdef PERL_IN_XSUB_RE
78 # include "re_comp.h"
79 #else
80 # include "regcomp.h"
81 #endif
82
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
85
86 static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
89
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
91 STMT_START { \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
95 } \
96 } STMT_END
97
98 static const char sets_utf8_locale_required[] =
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
100
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
102 STMT_START { \
103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
106 } \
107 } STMT_END
108
109 #ifdef DEBUGGING
110 /* At least one required character in the target string is expressible only in
111 * UTF-8. */
112 static const char non_utf8_target_but_utf8_required[]
113 = "Can't match, because target string needs to be in UTF-8\n";
114 #endif
115
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
118 goto target; \
119 } STMT_END
120
121 #ifndef STATIC
122 #define STATIC static
123 #endif
124
125 /*
126 * Forwards.
127 */
128
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
130
131 #define HOPc(pos,off) \
132 (char *)(reginfo->is_utf8_target \
133 ? reghop3((U8*)pos, off, \
134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
135 : (U8*)(pos + off))
136
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
142 ? (U8*)pos - off \
143 : NULL)
144
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
146
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
149
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
155 ? (U8*)pos + off \
156 : NULL)
157
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
164
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
167 : (U8*)(pos + off))
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
169
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
172
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
175 *
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
182 */
183 #define JUMPABLE(rn) ( \
184 OP(rn) == OPEN || \
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
187 OP(rn) == EVAL || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
190 OP(rn) == KEEPS || \
191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
192 )
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
194
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
196
197 /*
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
200 */
201 #define FIND_NEXT_IMPT(rn) STMT_START { \
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || PL_regkind[type] == CURLY) \
205 rn = NEXTOPER(NEXTOPER(rn)); \
206 else if (type == PLUS) \
207 rn = NEXTOPER(rn); \
208 else if (type == IFMATCH) \
209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210 else rn += NEXT_OFF(rn); \
211 } \
212 } STMT_END
213
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
216
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
220
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
226
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen _pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
229 {
230 const int retval = PL_savestack_ix;
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
235 I32 p;
236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
237
238 PERL_ARGS_ASSERT_REGCPPUSH;
239
240 if (paren_elems_to_push < 0)
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
244
245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247 " out of range (%lu-%ld)",
248 total_elems,
249 (unsigned long)maxopenparen,
250 (long)parenfloor);
251
252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
253
254 DEBUG_BUFFERS_r(
255 if ((int)maxopenparen > (int)parenfloor)
256 Perl_re_exec_indentf( aTHX_
257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
258 depth,
259 PTR2UV(rex),
260 PTR2UV(rex->offs)
261 );
262 );
263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
267 SSPUSHINT(rex->offs[p].start_tmp);
268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
270 depth,
271 (UV)p,
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
274 (IV)rex->offs[p].end
275 ));
276 }
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278 SSPUSHINT(maxopenparen);
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
282
283 return retval;
284 }
285
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp) \
288 DEBUG_STATE_r( \
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
291 depth, (IV)PL_savestack_ix \
292 ) \
293 ); \
294 cp = PL_savestack_ix
295
296 #define REGCP_UNWIND(cp) \
297 DEBUG_STATE_r( \
298 if (cp != PL_savestack_ix) \
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
302 depth, (IV)(cp), (IV)PL_savestack_ix \
303 ) \
304 ); \
305 regcpblow(cp)
306
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
316 depth, \
317 PTR2UV(rex), \
318 PTR2UV(rex->offs), \
319 (UV)ix, \
320 (IV)rex->offs[ix].start, \
321 (IV)rex->offs[ix].end, \
322 (UV)rex->lastparen \
323 ))
324
325 #define UNWIND_PAREN(lp, lcp) \
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
328 depth, \
329 PTR2UV(rex), \
330 PTR2UV(rex->offs), \
331 (UV)(lp), \
332 (UV)(rex->lastparen), \
333 (UV)(lcp) \
334 )); \
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
339
340
341 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p _pDEPTH)342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
343 {
344 UV i;
345 U32 paren;
346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
347
348 PERL_ARGS_ASSERT_REGCPPOP;
349
350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
351 i = SSPOPUV;
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
356 *maxopenparen_p = SSPOPINT;
357
358 i -= REGCP_OTHER_ELEMS;
359 /* Now restore the parentheses context. */
360 DEBUG_BUFFERS_r(
361 if (i || rex->lastparen + 1 <= rex->nparens)
362 Perl_re_exec_indentf( aTHX_
363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
364 depth,
365 PTR2UV(rex),
366 PTR2UV(rex->offs)
367 );
368 );
369 paren = *maxopenparen_p;
370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
371 SSize_t tmps;
372 rex->offs[paren].start_tmp = SSPOPINT;
373 rex->offs[paren].start = SSPOPIV;
374 tmps = SSPOPIV;
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
379 depth,
380 (UV)paren,
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
385 );
386 paren--;
387 }
388 #if 1
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403 " \\%" UVuf ": %s ..-1 undeffing\n",
404 depth,
405 (UV)i,
406 (i > *maxopenparen_p) ? "-1" : " "
407 ));
408 }
409 #endif
410 }
411
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
414
415 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p _pDEPTH)416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
417 {
418 I32 tmpix = PL_savestack_ix;
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
420
421 PL_savestack_ix = ix;
422 regcppop(rex, maxopenparen_p);
423 PL_savestack_ix = tmpix;
424 }
425
426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
427
428 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
430 {
431 /* Returns a boolean as to whether or not 'character' is a member of the
432 * Posix character class given by 'classnum' that should be equivalent to a
433 * value in the typedef '_char_class_number'.
434 *
435 * Ideally this could be replaced by a just an array of function pointers
436 * to the C library functions that implement the macros this calls.
437 * However, to compile, the precise function signatures are required, and
438 * these may vary from platform to platform. To avoid having to figure
439 * out what those all are on each platform, I (khw) am using this method,
440 * which adds an extra layer of function call overhead (unless the C
441 * optimizer strips it away). But we don't particularly care about
442 * performance with locales anyway. */
443
444 switch ((_char_class_number) classnum) {
445 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
447 case _CC_ENUM_ASCII: return isASCII_LC(character);
448 case _CC_ENUM_BLANK: return isBLANK_LC(character);
449 case _CC_ENUM_CASED: return isLOWER_LC(character)
450 || isUPPER_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
453 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
454 case _CC_ENUM_LOWER: return isLOWER_LC(character);
455 case _CC_ENUM_PRINT: return isPRINT_LC(character);
456 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
457 case _CC_ENUM_SPACE: return isSPACE_LC(character);
458 case _CC_ENUM_UPPER: return isUPPER_LC(character);
459 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
460 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
461 default: /* VERTSPACE should never occur in locales */
462 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
463 }
464
465 NOT_REACHED; /* NOTREACHED */
466 return FALSE;
467 }
468
469 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(const char * s1,const char * s2,I32 len)470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
471 {
472 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
473 * folded. Works on all folds representable without UTF-8, except for
474 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
475 * check that the strings each have at least 'len' characters.
476 *
477 * There is almost an identical API function where s2 need not be folded:
478 * Perl_foldEQ_latin1() */
479
480 const U8 *a = (const U8 *)s1;
481 const U8 *b = (const U8 *)s2;
482
483 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
484
485 assert(len >= 0);
486
487 while (len--) {
488 assert(! isUPPER_L1(*b));
489 if (toLOWER_L1(*a) != *b) {
490 return 0;
491 }
492 a++, b++;
493 }
494 return 1;
495 }
496
497 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
499 {
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
504 *
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters use Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires an inversion list, and use the Unicode macro otherwise. */
509
510
511 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
512
513 if (UTF8_IS_INVARIANT(*character)) {
514 return isFOO_lc(classnum, *character);
515 }
516 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517 return isFOO_lc(classnum,
518 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
519 }
520
521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
522
523 switch ((_char_class_number) classnum) {
524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
528 default:
529 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530 utf8_to_uvchr_buf(character, e, NULL));
531 }
532
533 return FALSE; /* Things like CNTRL are always below 256 */
534 }
535
536 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
538 {
539 /* Returns the position of the first byte in the sequence between 's' and
540 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
541 * */
542
543 PERL_ARGS_ASSERT_FIND_SPAN_END;
544
545 assert(send >= s);
546
547 if ((STRLEN) (send - s) >= PERL_WORDSIZE
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
550 {
551 PERL_UINTMAX_T span_word;
552
553 /* Process per-byte until reach word boundary. XXX This loop could be
554 * eliminated if we knew that this platform had fast unaligned reads */
555 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556 if (*s != span_byte) {
557 return s;
558 }
559 s++;
560 }
561
562 /* Create a word filled with the bytes we are spanning */
563 span_word = PERL_COUNT_MULTIPLIER * span_byte;
564
565 /* Process per-word as long as we have at least a full word left */
566 do {
567
568 /* Keep going if the whole word is composed of 'span_byte's */
569 if ((* (PERL_UINTMAX_T *) s) == span_word) {
570 s += PERL_WORDSIZE;
571 continue;
572 }
573
574 /* Here, at least one byte in the word isn't 'span_byte'. */
575
576 #ifdef EBCDIC
577
578 break;
579
580 #else
581
582 /* This xor leaves 1 bits only in those non-matching bytes */
583 span_word ^= * (PERL_UINTMAX_T *) s;
584
585 /* Make sure the upper bit of each non-matching byte is set. This
586 * makes each such byte look like an ASCII platform variant byte */
587 span_word |= span_word << 1;
588 span_word |= span_word << 2;
589 span_word |= span_word << 4;
590
591 /* That reduces the problem to what this function solves */
592 return s + variant_byte_number(span_word);
593
594 #endif
595
596 } while (s + PERL_WORDSIZE <= send);
597 }
598
599 /* Process the straggler bytes beyond the final word boundary */
600 while (s < send) {
601 if (*s != span_byte) {
602 return s;
603 }
604 s++;
605 }
606
607 return s;
608 }
609
610 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
612 {
613 /* Returns the position of the first byte in the sequence between 's'
614 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615 * returns 'send' if none found. It uses word-level operations instead of
616 * byte to speed up the process */
617
618 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
619
620 assert(send >= s);
621 assert((byte & mask) == byte);
622
623 #ifndef EBCDIC
624
625 if ((STRLEN) (send - s) >= PERL_WORDSIZE
626 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
628 {
629 PERL_UINTMAX_T word, mask_word;
630
631 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632 if (((*s) & mask) == byte) {
633 return s;
634 }
635 s++;
636 }
637
638 word = PERL_COUNT_MULTIPLIER * byte;
639 mask_word = PERL_COUNT_MULTIPLIER * mask;
640
641 do {
642 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
643
644 /* If 'masked' contains bytes with the bit pattern of 'byte' within
645 * it, xoring with 'word' will leave each of the 8 bits in such
646 * bytes be 0, and no byte containing any other bit pattern will be
647 * 0. */
648 masked ^= word;
649
650 /* This causes the most significant bit to be set to 1 for any
651 * bytes in the word that aren't completely 0 */
652 masked |= masked << 1;
653 masked |= masked << 2;
654 masked |= masked << 4;
655
656 /* The msbits are the same as what marks a byte as variant, so we
657 * can use this mask. If all msbits are 1, the word doesn't
658 * contain 'byte' */
659 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
660 s += PERL_WORDSIZE;
661 continue;
662 }
663
664 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665 * and any that are, are 0. Complement and re-AND to swap that */
666 masked = ~ masked;
667 masked &= PERL_VARIANTS_WORD_MASK;
668
669 /* This reduces the problem to that solved by this function */
670 s += variant_byte_number(masked);
671 return s;
672
673 } while (s + PERL_WORDSIZE <= send);
674 }
675
676 #endif
677
678 while (s < send) {
679 if (((*s) & mask) == byte) {
680 return s;
681 }
682 s++;
683 }
684
685 return s;
686 }
687
688 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
690 {
691 /* Returns the position of the first byte in the sequence between 's' and
692 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693 * 'span_byte' should have been ANDed with 'mask' in the call of this
694 * function. Returns 'send' if none found. Works like find_span_end(),
695 * except for the AND */
696
697 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
698
699 assert(send >= s);
700 assert((span_byte & mask) == span_byte);
701
702 if ((STRLEN) (send - s) >= PERL_WORDSIZE
703 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
705 {
706 PERL_UINTMAX_T span_word, mask_word;
707
708 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709 if (((*s) & mask) != span_byte) {
710 return s;
711 }
712 s++;
713 }
714
715 span_word = PERL_COUNT_MULTIPLIER * span_byte;
716 mask_word = PERL_COUNT_MULTIPLIER * mask;
717
718 do {
719 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
720
721 if (masked == span_word) {
722 s += PERL_WORDSIZE;
723 continue;
724 }
725
726 #ifdef EBCDIC
727
728 break;
729
730 #else
731
732 masked ^= span_word;
733 masked |= masked << 1;
734 masked |= masked << 2;
735 masked |= masked << 4;
736 return s + variant_byte_number(masked);
737
738 #endif
739
740 } while (s + PERL_WORDSIZE <= send);
741 }
742
743 while (s < send) {
744 if (((*s) & mask) != span_byte) {
745 return s;
746 }
747 s++;
748 }
749
750 return s;
751 }
752
753 /*
754 * pregexec and friends
755 */
756
757 #ifndef PERL_IN_XSUB_RE
758 /*
759 - pregexec - match a regexp against a string
760 */
761 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend: pointer to null at end of string */
766 /* strbeg: real beginning of string */
767 /* minend: end of match must be >= minend bytes after stringarg. */
768 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
769 * itself is accessed via the pointers above */
770 /* nosave: For optimizations. */
771 {
772 PERL_ARGS_ASSERT_PREGEXEC;
773
774 return
775 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776 nosave ? 0 : REXEC_COPY_STR);
777 }
778 #endif
779
780
781
782 /* re_intuit_start():
783 *
784 * Based on some optimiser hints, try to find the earliest position in the
785 * string where the regex could match.
786 *
787 * rx: the regex to match against
788 * sv: the SV being matched: only used for utf8 flag; the string
789 * itself is accessed via the pointers below. Note that on
790 * something like an overloaded SV, SvPOK(sv) may be false
791 * and the string pointers may point to something unrelated to
792 * the SV itself.
793 * strbeg: real beginning of string
794 * strpos: the point in the string at which to begin matching
795 * strend: pointer to the byte following the last char of the string
796 * flags currently unused; set to 0
797 * data: currently unused; set to NULL
798 *
799 * The basic idea of re_intuit_start() is to use some known information
800 * about the pattern, namely:
801 *
802 * a) the longest known anchored substring (i.e. one that's at a
803 * constant offset from the beginning of the pattern; but not
804 * necessarily at a fixed offset from the beginning of the
805 * string);
806 * b) the longest floating substring (i.e. one that's not at a constant
807 * offset from the beginning of the pattern);
808 * c) Whether the pattern is anchored to the string; either
809 * an absolute anchor: /^../, or anchored to \n: /^.../m,
810 * or anchored to pos(): /\G/;
811 * d) A start class: a real or synthetic character class which
812 * represents which characters are legal at the start of the pattern;
813 *
814 * to either quickly reject the match, or to find the earliest position
815 * within the string at which the pattern might match, thus avoiding
816 * running the full NFA engine at those earlier locations, only to
817 * eventually fail and retry further along.
818 *
819 * Returns NULL if the pattern can't match, or returns the address within
820 * the string which is the earliest place the match could occur.
821 *
822 * The longest of the anchored and floating substrings is called 'check'
823 * and is checked first. The other is called 'other' and is checked
824 * second. The 'other' substring may not be present. For example,
825 *
826 * /(abc|xyz)ABC\d{0,3}DEFG/
827 *
828 * will have
829 *
830 * check substr (float) = "DEFG", offset 6..9 chars
831 * other substr (anchored) = "ABC", offset 3..3 chars
832 * stclass = [ax]
833 *
834 * Be aware that during the course of this function, sometimes 'anchored'
835 * refers to a substring being anchored relative to the start of the
836 * pattern, and sometimes to the pattern itself being anchored relative to
837 * the string. For example:
838 *
839 * /\dabc/: "abc" is anchored to the pattern;
840 * /^\dabc/: "abc" is anchored to the pattern and the string;
841 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
842 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843 * but the pattern is anchored to the string.
844 */
845
846 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)847 Perl_re_intuit_start(pTHX_
848 REGEXP * const rx,
849 SV *sv,
850 const char * const strbeg,
851 char *strpos,
852 char *strend,
853 const U32 flags,
854 re_scream_pos_data *data)
855 {
856 struct regexp *const prog = ReANY(rx);
857 SSize_t start_shift = prog->check_offset_min;
858 /* Should be nonnegative! */
859 SSize_t end_shift = 0;
860 /* current lowest pos in string where the regex can start matching */
861 char *rx_origin = strpos;
862 SV *check;
863 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864 U8 other_ix = 1 - prog->substrs->check_ix;
865 bool ml_anch = 0;
866 char *other_last = strpos;/* latest pos 'other' substr already checked to */
867 char *check_at = NULL; /* check substr found at this pos */
868 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869 RXi_GET_DECL(prog,progi);
870 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
871 regmatch_info *const reginfo = ®info_buf;
872 DECLARE_AND_GET_RE_DEBUG_FLAGS;
873
874 PERL_ARGS_ASSERT_RE_INTUIT_START;
875 PERL_UNUSED_ARG(flags);
876 PERL_UNUSED_ARG(data);
877
878 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879 "Intuit: trying to determine minimum start position...\n"));
880
881 /* for now, assume that all substr offsets are positive. If at some point
882 * in the future someone wants to do clever things with lookbehind and
883 * -ve offsets, they'll need to fix up any code in this function
884 * which uses these offsets. See the thread beginning
885 * <20140113145929.GF27210@iabyn.com>
886 */
887 assert(prog->substrs->data[0].min_offset >= 0);
888 assert(prog->substrs->data[0].max_offset >= 0);
889 assert(prog->substrs->data[1].min_offset >= 0);
890 assert(prog->substrs->data[1].max_offset >= 0);
891 assert(prog->substrs->data[2].min_offset >= 0);
892 assert(prog->substrs->data[2].max_offset >= 0);
893
894 /* for now, assume that if both present, that the floating substring
895 * doesn't start before the anchored substring.
896 * If you break this assumption (e.g. doing better optimisations
897 * with lookahead/behind), then you'll need to audit the code in this
898 * function carefully first
899 */
900 assert(
901 ! ( (prog->anchored_utf8 || prog->anchored_substr)
902 && (prog->float_utf8 || prog->float_substr))
903 || (prog->float_min_offset >= prog->anchored_offset));
904
905 /* byte rather than char calculation for efficiency. It fails
906 * to quickly reject some cases that can't match, but will reject
907 * them later after doing full char arithmetic */
908 if (prog->minlen > strend - strpos) {
909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910 " String too short...\n"));
911 goto fail;
912 }
913
914 RXp_MATCH_UTF8_set(prog, utf8_target);
915 reginfo->is_utf8_target = cBOOL(utf8_target);
916 reginfo->info_aux = NULL;
917 reginfo->strbeg = strbeg;
918 reginfo->strend = strend;
919 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
920 reginfo->intuit = 1;
921 /* not actually used within intuit, but zero for safety anyway */
922 reginfo->poscache_maxiter = 0;
923
924 if (utf8_target) {
925 if ((!prog->anchored_utf8 && prog->anchored_substr)
926 || (!prog->float_utf8 && prog->float_substr))
927 to_utf8_substr(prog);
928 check = prog->check_utf8;
929 } else {
930 if (!prog->check_substr && prog->check_utf8) {
931 if (! to_byte_substr(prog)) {
932 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
933 }
934 }
935 check = prog->check_substr;
936 }
937
938 /* dump the various substring data */
939 DEBUG_OPTIMISE_MORE_r({
940 int i;
941 for (i=0; i<=2; i++) {
942 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943 : prog->substrs->data[i].substr);
944 if (!sv)
945 continue;
946
947 Perl_re_printf( aTHX_
948 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949 " useful=%" IVdf " utf8=%d [%s]\n",
950 i,
951 (IV)prog->substrs->data[i].min_offset,
952 (IV)prog->substrs->data[i].max_offset,
953 (IV)prog->substrs->data[i].end_shift,
954 BmUSEFUL(sv),
955 utf8_target ? 1 : 0,
956 SvPEEK(sv));
957 }
958 });
959
960 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
961
962 /* ml_anch: check after \n?
963 *
964 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965 * with /.*.../, these flags will have been added by the
966 * compiler:
967 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
968 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
969 */
970 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
971 && !(prog->intflags & PREGf_IMPLICIT);
972
973 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974 /* we are only allowed to match at BOS or \G */
975
976 /* trivially reject if there's a BOS anchor and we're not at BOS.
977 *
978 * Note that we don't try to do a similar quick reject for
979 * \G, since generally the caller will have calculated strpos
980 * based on pos() and gofs, so the string is already correctly
981 * anchored by definition; and handling the exceptions would
982 * be too fiddly (e.g. REXEC_IGNOREPOS).
983 */
984 if ( strpos != strbeg
985 && (prog->intflags & PREGf_ANCH_SBOL))
986 {
987 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988 " Not at start...\n"));
989 goto fail;
990 }
991
992 /* in the presence of an anchor, the anchored (relative to the
993 * start of the regex) substr must also be anchored relative
994 * to strpos. So quickly reject if substr isn't found there.
995 * This works for \G too, because the caller will already have
996 * subtracted gofs from pos, and gofs is the offset from the
997 * \G to the start of the regex. For example, in /.abc\Gdef/,
998 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999 * caller will have set strpos=pos()-4; we look for the substr
1000 * at position pos()-4+1, which lines up with the "a" */
1001
1002 if (prog->check_offset_min == prog->check_offset_max) {
1003 /* Substring at constant offset from beg-of-str... */
1004 SSize_t slen = SvCUR(check);
1005 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1006
1007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008 " Looking for check substr at fixed offset %" IVdf "...\n",
1009 (IV)prog->check_offset_min));
1010
1011 if (SvTAIL(check)) {
1012 /* In this case, the regex is anchored at the end too.
1013 * Unless it's a multiline match, the lengths must match
1014 * exactly, give or take a \n. NB: slen >= 1 since
1015 * the last char of check is \n */
1016 if (!multiline
1017 && ( strend - s > slen
1018 || strend - s < slen - 1
1019 || (strend - s == slen && strend[-1] != '\n')))
1020 {
1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022 " String too long...\n"));
1023 goto fail_finish;
1024 }
1025 /* Now should match s[0..slen-2] */
1026 slen--;
1027 }
1028 if (slen && (strend - s < slen
1029 || *SvPVX_const(check) != *s
1030 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1031 {
1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033 " String not equal...\n"));
1034 goto fail_finish;
1035 }
1036
1037 check_at = s;
1038 goto success_at_start;
1039 }
1040 }
1041 }
1042
1043 end_shift = prog->check_end_shift;
1044
1045 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1046 if (end_shift < 0)
1047 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048 (IV)end_shift, RX_PRECOMP(rx));
1049 #endif
1050
1051 restart:
1052
1053 /* This is the (re)entry point of the main loop in this function.
1054 * The goal of this loop is to:
1055 * 1) find the "check" substring in the region rx_origin..strend
1056 * (adjusted by start_shift / end_shift). If not found, reject
1057 * immediately.
1058 * 2) If it exists, look for the "other" substr too if defined; for
1059 * example, if the check substr maps to the anchored substr, then
1060 * check the floating substr, and vice-versa. If not found, go
1061 * back to (1) with rx_origin suitably incremented.
1062 * 3) If we find an rx_origin position that doesn't contradict
1063 * either of the substrings, then check the possible additional
1064 * constraints on rx_origin of /^.../m or a known start class.
1065 * If these fail, then depending on which constraints fail, jump
1066 * back to here, or to various other re-entry points further along
1067 * that skip some of the first steps.
1068 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069 * substring. If the start position was determined to be at the
1070 * beginning of the string - so, not rejected, but not optimised,
1071 * since we have to run regmatch from position 0 - decrement the
1072 * BmUSEFUL() count. Otherwise increment it.
1073 */
1074
1075
1076 /* first, look for the 'check' substring */
1077
1078 {
1079 U8* start_point;
1080 U8* end_point;
1081
1082 DEBUG_OPTIMISE_MORE_r({
1083 Perl_re_printf( aTHX_
1084 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085 " Start shift: %" IVdf " End shift %" IVdf
1086 " Real end Shift: %" IVdf "\n",
1087 (IV)(rx_origin - strbeg),
1088 (IV)prog->check_offset_min,
1089 (IV)start_shift,
1090 (IV)end_shift,
1091 (IV)prog->check_end_shift);
1092 });
1093
1094 end_point = HOPBACK3(strend, end_shift, rx_origin);
1095 if (!end_point)
1096 goto fail_finish;
1097 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1098 if (!start_point)
1099 goto fail_finish;
1100
1101
1102 /* If the regex is absolutely anchored to either the start of the
1103 * string (SBOL) or to pos() (ANCH_GPOS), then
1104 * check_offset_max represents an upper bound on the string where
1105 * the substr could start. For the ANCH_GPOS case, we assume that
1106 * the caller of intuit will have already set strpos to
1107 * pos()-gofs, so in this case strpos + offset_max will still be
1108 * an upper bound on the substr.
1109 */
1110 if (!ml_anch
1111 && prog->intflags & PREGf_ANCH
1112 && prog->check_offset_max != SSize_t_MAX)
1113 {
1114 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115 const char * const anchor =
1116 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117 SSize_t targ_len = (char*)end_point - anchor;
1118
1119 if (check_len > targ_len) {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 "Target string too short to match required substring...\n"));
1122 goto fail_finish;
1123 }
1124
1125 /* do a bytes rather than chars comparison. It's conservative;
1126 * so it skips doing the HOP if the result can't possibly end
1127 * up earlier than the old value of end_point.
1128 */
1129 assert(anchor + check_len <= (char *)end_point);
1130 if (prog->check_offset_max + check_len < targ_len) {
1131 end_point = HOP3lim((U8*)anchor,
1132 prog->check_offset_max,
1133 end_point - check_len
1134 )
1135 + check_len;
1136 if (end_point < start_point)
1137 goto fail_finish;
1138 }
1139 }
1140
1141 check_at = fbm_instr( start_point, end_point,
1142 check, multiline ? FBMrf_MULTILINE : 0);
1143
1144 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146 (IV)((char*)start_point - strbeg),
1147 (IV)((char*)end_point - strbeg),
1148 (IV)(check_at ? check_at - strbeg : -1)
1149 ));
1150
1151 /* Update the count-of-usability, remove useless subpatterns,
1152 unshift s. */
1153
1154 DEBUG_EXECUTE_r({
1155 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1158 (check_at ? "Found" : "Did not find"),
1159 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160 ? "anchored" : "floating"),
1161 quoted,
1162 RE_SV_TAIL(check),
1163 (check_at ? " at offset " : "...\n") );
1164 });
1165
1166 if (!check_at)
1167 goto fail_finish;
1168 /* set rx_origin to the minimum position where the regex could start
1169 * matching, given the constraint of the just-matched check substring.
1170 * But don't set it lower than previously.
1171 */
1172
1173 if (check_at - rx_origin > prog->check_offset_max)
1174 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175 /* Finish the diagnostic message */
1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177 "%ld (rx_origin now %" IVdf ")...\n",
1178 (long)(check_at - strbeg),
1179 (IV)(rx_origin - strbeg)
1180 ));
1181 }
1182
1183
1184 /* now look for the 'other' substring if defined */
1185
1186 if (prog->substrs->data[other_ix].utf8_substr
1187 || prog->substrs->data[other_ix].substr)
1188 {
1189 /* Take into account the "other" substring. */
1190 char *last, *last1;
1191 char *s;
1192 SV* must;
1193 struct reg_substr_datum *other;
1194
1195 do_other_substr:
1196 other = &prog->substrs->data[other_ix];
1197 if (!utf8_target && !other->substr) {
1198 if (!to_byte_substr(prog)) {
1199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1200 }
1201 }
1202
1203 /* if "other" is anchored:
1204 * we've previously found a floating substr starting at check_at.
1205 * This means that the regex origin must lie somewhere
1206 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207 * and max: HOP3(check_at, -check_offset_min)
1208 * (except that min will be >= strpos)
1209 * So the fixed substr must lie somewhere between
1210 * HOP3(min, anchored_offset)
1211 * HOP3(max, anchored_offset) + SvCUR(substr)
1212 */
1213
1214 /* if "other" is floating
1215 * Calculate last1, the absolute latest point where the
1216 * floating substr could start in the string, ignoring any
1217 * constraints from the earlier fixed match. It is calculated
1218 * as follows:
1219 *
1220 * strend - prog->minlen (in chars) is the absolute latest
1221 * position within the string where the origin of the regex
1222 * could appear. The latest start point for the floating
1223 * substr is float_min_offset(*) on from the start of the
1224 * regex. last1 simply combines thee two offsets.
1225 *
1226 * (*) You might think the latest start point should be
1227 * float_max_offset from the regex origin, and technically
1228 * you'd be correct. However, consider
1229 * /a\d{2,4}bcd\w/
1230 * Here, float min, max are 3,5 and minlen is 7.
1231 * This can match either
1232 * /a\d\dbcd\w/
1233 * /a\d\d\dbcd\w/
1234 * /a\d\d\d\dbcd\w/
1235 * In the first case, the regex matches minlen chars; in the
1236 * second, minlen+1, in the third, minlen+2.
1237 * In the first case, the floating offset is 3 (which equals
1238 * float_min), in the second, 4, and in the third, 5 (which
1239 * equals float_max). In all cases, the floating string bcd
1240 * can never start more than 4 chars from the end of the
1241 * string, which equals minlen - float_min. As the substring
1242 * starts to match more than float_min from the start of the
1243 * regex, it makes the regex match more than minlen chars,
1244 * and the two cancel each other out. So we can always use
1245 * float_min - minlen, rather than float_max - minlen for the
1246 * latest position in the string.
1247 *
1248 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1250 */
1251
1252 assert(prog->minlen >= other->min_offset);
1253 last1 = HOP3c(strend,
1254 other->min_offset - prog->minlen, strbeg);
1255
1256 if (other_ix) {/* i.e. if (other-is-float) */
1257 /* last is the latest point where the floating substr could
1258 * start, *given* any constraints from the earlier fixed
1259 * match. This constraint is that the floating string starts
1260 * <= float_max_offset chars from the regex origin (rx_origin).
1261 * If this value is less than last1, use it instead.
1262 */
1263 assert(rx_origin <= last1);
1264 last =
1265 /* this condition handles the offset==infinity case, and
1266 * is a short-cut otherwise. Although it's comparing a
1267 * byte offset to a char length, it does so in a safe way,
1268 * since 1 char always occupies 1 or more bytes,
1269 * so if a string range is (last1 - rx_origin) bytes,
1270 * it will be less than or equal to (last1 - rx_origin)
1271 * chars; meaning it errs towards doing the accurate HOP3
1272 * rather than just using last1 as a short-cut */
1273 (last1 - rx_origin) < other->max_offset
1274 ? last1
1275 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1276 }
1277 else {
1278 assert(strpos + start_shift <= check_at);
1279 last = HOP4c(check_at, other->min_offset - start_shift,
1280 strbeg, strend);
1281 }
1282
1283 s = HOP3c(rx_origin, other->min_offset, strend);
1284 if (s < other_last) /* These positions already checked */
1285 s = other_last;
1286
1287 must = utf8_target ? other->utf8_substr : other->substr;
1288 assert(SvPOK(must));
1289 {
1290 char *from = s;
1291 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1292
1293 if (to > strend)
1294 to = strend;
1295 if (from > to) {
1296 s = NULL;
1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299 (IV)(from - strbeg),
1300 (IV)(to - strbeg)
1301 ));
1302 }
1303 else {
1304 s = fbm_instr(
1305 (unsigned char*)from,
1306 (unsigned char*)to,
1307 must,
1308 multiline ? FBMrf_MULTILINE : 0
1309 );
1310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312 (IV)(from - strbeg),
1313 (IV)(to - strbeg),
1314 (IV)(s ? s - strbeg : -1)
1315 ));
1316 }
1317 }
1318
1319 DEBUG_EXECUTE_r({
1320 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1323 s ? "Found" : "Contradicts",
1324 other_ix ? "floating" : "anchored",
1325 quoted, RE_SV_TAIL(must));
1326 });
1327
1328
1329 if (!s) {
1330 /* last1 is latest possible substr location. If we didn't
1331 * find it before there, we never will */
1332 if (last >= last1) {
1333 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334 "; giving up...\n"));
1335 goto fail_finish;
1336 }
1337
1338 /* try to find the check substr again at a later
1339 * position. Maybe next time we'll find the "other" substr
1340 * in range too */
1341 other_last = HOP3c(last, 1, strend) /* highest failure */;
1342 rx_origin =
1343 other_ix /* i.e. if other-is-float */
1344 ? HOP3c(rx_origin, 1, strend)
1345 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348 (other_ix ? "floating" : "anchored"),
1349 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350 (IV)(rx_origin - strbeg)
1351 ));
1352 goto restart;
1353 }
1354 else {
1355 if (other_ix) { /* if (other-is-float) */
1356 /* other_last is set to s, not s+1, since its possible for
1357 * a floating substr to fail first time, then succeed
1358 * second time at the same floating position; e.g.:
1359 * "-AB--AABZ" =~ /\wAB\d*Z/
1360 * The first time round, anchored and float match at
1361 * "-(AB)--AAB(Z)" then fail on the initial \w character
1362 * class. Second time round, they match at "-AB--A(AB)(Z)".
1363 */
1364 other_last = s;
1365 }
1366 else {
1367 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368 other_last = HOP3c(s, 1, strend);
1369 }
1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371 " at offset %ld (rx_origin now %" IVdf ")...\n",
1372 (long)(s - strbeg),
1373 (IV)(rx_origin - strbeg)
1374 ));
1375
1376 }
1377 }
1378 else {
1379 DEBUG_OPTIMISE_MORE_r(
1380 Perl_re_printf( aTHX_
1381 " Check-only match: offset min:%" IVdf " max:%" IVdf
1382 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383 " strend:%" IVdf "\n",
1384 (IV)prog->check_offset_min,
1385 (IV)prog->check_offset_max,
1386 (IV)(check_at-strbeg),
1387 (IV)(rx_origin-strbeg),
1388 (IV)(rx_origin-check_at),
1389 (IV)(strend-strbeg)
1390 )
1391 );
1392 }
1393
1394 postprocess_substr_matches:
1395
1396 /* handle the extra constraint of /^.../m if present */
1397
1398 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1399 char *s;
1400
1401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402 " looking for /^/m anchor"));
1403
1404 /* we have failed the constraint of a \n before rx_origin.
1405 * Find the next \n, if any, even if it's beyond the current
1406 * anchored and/or floating substrings. Whether we should be
1407 * scanning ahead for the next \n or the next substr is debatable.
1408 * On the one hand you'd expect rare substrings to appear less
1409 * often than \n's. On the other hand, searching for \n means
1410 * we're effectively flipping between check_substr and "\n" on each
1411 * iteration as the current "rarest" candidate string, which
1412 * means for example that we'll quickly reject the whole string if
1413 * hasn't got a \n, rather than trying every substr position
1414 * first
1415 */
1416
1417 s = HOP3c(strend, - prog->minlen, strpos);
1418 if (s <= rx_origin ||
1419 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1420 {
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 " Did not find /%s^%s/m...\n",
1423 PL_colors[0], PL_colors[1]));
1424 goto fail_finish;
1425 }
1426
1427 /* earliest possible origin is 1 char after the \n.
1428 * (since *rx_origin == '\n', it's safe to ++ here rather than
1429 * HOP(rx_origin, 1)) */
1430 rx_origin++;
1431
1432 if (prog->substrs->check_ix == 0 /* check is anchored */
1433 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1434 {
1435 /* Position contradicts check-string; either because
1436 * check was anchored (and thus has no wiggle room),
1437 * or check was float and rx_origin is above the float range */
1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1441 goto restart;
1442 }
1443
1444 /* if we get here, the check substr must have been float,
1445 * is in range, and we may or may not have had an anchored
1446 * "other" substr which still contradicts */
1447 assert(prog->substrs->check_ix); /* check is float */
1448
1449 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450 /* whoops, the anchored "other" substr exists, so we still
1451 * contradict. On the other hand, the float "check" substr
1452 * didn't contradict, so just retry the anchored "other"
1453 * substr */
1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456 PL_colors[0], PL_colors[1],
1457 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458 (IV)(rx_origin - strbeg)
1459 ));
1460 goto do_other_substr;
1461 }
1462
1463 /* success: we don't contradict the found floating substring
1464 * (and there's no anchored substr). */
1465 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466 " Found /%s^%s/m with rx_origin %ld...\n",
1467 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1468 }
1469 else {
1470 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471 " (multiline anchor test skipped)\n"));
1472 }
1473
1474 success_at_start:
1475
1476
1477 /* if we have a starting character class, then test that extra constraint.
1478 * (trie stclasses are too expensive to use here, we are better off to
1479 * leave it to regmatch itself) */
1480
1481 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482 const U8* const str = (U8*)STRING(progi->regstclass);
1483
1484 /* XXX this value could be pre-computed */
1485 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486 ? (reginfo->is_utf8_pat
1487 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488 : (SSize_t)STR_LEN(progi->regstclass))
1489 : 1);
1490 char * endpos;
1491 char *s;
1492 /* latest pos that a matching float substr constrains rx start to */
1493 char *rx_max_float = NULL;
1494
1495 /* if the current rx_origin is anchored, either by satisfying an
1496 * anchored substring constraint, or a /^.../m constraint, then we
1497 * can reject the current origin if the start class isn't found
1498 * at the current position. If we have a float-only match, then
1499 * rx_origin is constrained to a range; so look for the start class
1500 * in that range. if neither, then look for the start class in the
1501 * whole rest of the string */
1502
1503 /* XXX DAPM it's not clear what the minlen test is for, and why
1504 * it's not used in the floating case. Nothing in the test suite
1505 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506 * Here are some old comments, which may or may not be correct:
1507 *
1508 * minlen == 0 is possible if regstclass is \b or \B,
1509 * and the fixed substr is ''$.
1510 * Since minlen is already taken into account, rx_origin+1 is
1511 * before strend; accidentally, minlen >= 1 guaranties no false
1512 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1513 * 0) below assumes that regstclass does not come from lookahead...
1514 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1515 * This leaves EXACTF-ish only, which are dealt with in
1516 * find_byclass().
1517 */
1518
1519 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521 else if (prog->float_substr || prog->float_utf8) {
1522 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523 endpos = HOP3clim(rx_max_float, cl_l, strend);
1524 }
1525 else
1526 endpos= strend;
1527
1528 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531 (IV)start_shift, (IV)(check_at - strbeg),
1532 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1533
1534 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1535 reginfo);
1536 if (!s) {
1537 if (endpos == strend) {
1538 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539 " Could not match STCLASS...\n") );
1540 goto fail;
1541 }
1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543 " This position contradicts STCLASS...\n") );
1544 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545 && !(prog->intflags & PREGf_IMPLICIT))
1546 goto fail;
1547
1548 /* Contradict one of substrings */
1549 if (prog->anchored_substr || prog->anchored_utf8) {
1550 if (prog->substrs->check_ix == 1) { /* check is float */
1551 /* Have both, check_string is floating */
1552 assert(rx_origin + start_shift <= check_at);
1553 if (rx_origin + start_shift != check_at) {
1554 /* not at latest position float substr could match:
1555 * Recheck anchored substring, but not floating.
1556 * The condition above is in bytes rather than
1557 * chars for efficiency. It's conservative, in
1558 * that it errs on the side of doing 'goto
1559 * do_other_substr'. In this case, at worst,
1560 * an extra anchored search may get done, but in
1561 * practice the extra fbm_instr() is likely to
1562 * get skipped anyway. */
1563 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565 (long)(other_last - strbeg),
1566 (IV)(rx_origin - strbeg)
1567 ));
1568 goto do_other_substr;
1569 }
1570 }
1571 }
1572 else {
1573 /* float-only */
1574
1575 if (ml_anch) {
1576 /* In the presence of ml_anch, we might be able to
1577 * find another \n without breaking the current float
1578 * constraint. */
1579
1580 /* strictly speaking this should be HOP3c(..., 1, ...),
1581 * but since we goto a block of code that's going to
1582 * search for the next \n if any, its safe here */
1583 rx_origin++;
1584 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586 PL_colors[0], PL_colors[1],
1587 (long)(rx_origin - strbeg)) );
1588 goto postprocess_substr_matches;
1589 }
1590
1591 /* strictly speaking this can never be true; but might
1592 * be if we ever allow intuit without substrings */
1593 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1594 goto fail;
1595
1596 rx_origin = rx_max_float;
1597 }
1598
1599 /* at this point, any matching substrings have been
1600 * contradicted. Start again... */
1601
1602 rx_origin = HOP3c(rx_origin, 1, strend);
1603
1604 /* uses bytes rather than char calculations for efficiency.
1605 * It's conservative: it errs on the side of doing 'goto restart',
1606 * where there is code that does a proper char-based test */
1607 if (rx_origin + start_shift + end_shift > strend) {
1608 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609 " Could not match STCLASS...\n") );
1610 goto fail;
1611 }
1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614 (prog->substrs->check_ix ? "floating" : "anchored"),
1615 (long)(rx_origin + start_shift - strbeg),
1616 (IV)(rx_origin - strbeg)
1617 ));
1618 goto restart;
1619 }
1620
1621 /* Success !!! */
1622
1623 if (rx_origin != s) {
1624 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625 " By STCLASS: moving %ld --> %ld\n",
1626 (long)(rx_origin - strbeg), (long)(s - strbeg))
1627 );
1628 }
1629 else {
1630 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631 " Does not contradict STCLASS...\n");
1632 );
1633 }
1634 }
1635
1636 /* Decide whether using the substrings helped */
1637
1638 if (rx_origin != strpos) {
1639 /* Fixed substring is found far enough so that the match
1640 cannot start at strpos. */
1641
1642 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1643 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1644 }
1645 else {
1646 /* The found rx_origin position does not prohibit matching at
1647 * strpos, so calling intuit didn't gain us anything. Decrement
1648 * the BmUSEFUL() count on the check substring, and if we reach
1649 * zero, free it. */
1650 if (!(prog->intflags & PREGf_NAUGHTY)
1651 && (utf8_target ? (
1652 prog->check_utf8 /* Could be deleted already */
1653 && --BmUSEFUL(prog->check_utf8) < 0
1654 && (prog->check_utf8 == prog->float_utf8)
1655 ) : (
1656 prog->check_substr /* Could be deleted already */
1657 && --BmUSEFUL(prog->check_substr) < 0
1658 && (prog->check_substr == prog->float_substr)
1659 )))
1660 {
1661 /* If flags & SOMETHING - do not do it many times on the same match */
1662 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1663 /* XXX Does the destruction order has to change with utf8_target? */
1664 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1667 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1668 check = NULL; /* abort */
1669 /* XXXX This is a remnant of the old implementation. It
1670 looks wasteful, since now INTUIT can use many
1671 other heuristics. */
1672 prog->extflags &= ~RXf_USE_INTUIT;
1673 }
1674 }
1675
1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1679
1680 return rx_origin;
1681
1682 fail_finish: /* Substring not found */
1683 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1684 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1685 fail:
1686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1687 PL_colors[4], PL_colors[5]));
1688 return NULL;
1689 }
1690
1691
1692 #define DECL_TRIE_TYPE(scan) \
1693 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1694 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1695 trie_utf8l, trie_flu8, trie_flu8_latin } \
1696 trie_type = ((scan->flags == EXACT) \
1697 ? (utf8_target ? trie_utf8 : trie_plain) \
1698 : (scan->flags == EXACTL) \
1699 ? (utf8_target ? trie_utf8l : trie_plain) \
1700 : (scan->flags == EXACTFAA) \
1701 ? (utf8_target \
1702 ? trie_utf8_exactfa_fold \
1703 : trie_latin_utf8_exactfa_fold) \
1704 : (scan->flags == EXACTFLU8 \
1705 ? (utf8_target \
1706 ? trie_flu8 \
1707 : trie_flu8_latin) \
1708 : (utf8_target \
1709 ? trie_utf8_fold \
1710 : trie_latin_utf8_fold)))
1711
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713 * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1715 STMT_START { \
1716 STRLEN skiplen; \
1717 U8 flags = FOLD_FLAGS_FULL; \
1718 switch (trie_type) { \
1719 case trie_flu8: \
1720 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1721 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1722 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1723 } \
1724 goto do_trie_utf8_fold; \
1725 case trie_utf8_exactfa_fold: \
1726 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1727 /* FALLTHROUGH */ \
1728 case trie_utf8_fold: \
1729 do_trie_utf8_fold: \
1730 if ( foldlen>0 ) { \
1731 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1732 foldlen -= len; \
1733 uscan += len; \
1734 len=0; \
1735 } else { \
1736 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1737 flags); \
1738 len = UTF8_SAFE_SKIP(uc, uc_end); \
1739 skiplen = UVCHR_SKIP( uvc ); \
1740 foldlen -= skiplen; \
1741 uscan = foldbuf + skiplen; \
1742 } \
1743 break; \
1744 case trie_flu8_latin: \
1745 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1746 goto do_trie_latin_utf8_fold; \
1747 case trie_latin_utf8_exactfa_fold: \
1748 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1749 /* FALLTHROUGH */ \
1750 case trie_latin_utf8_fold: \
1751 do_trie_latin_utf8_fold: \
1752 if ( foldlen>0 ) { \
1753 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1754 foldlen -= len; \
1755 uscan += len; \
1756 len=0; \
1757 } else { \
1758 len = 1; \
1759 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1760 skiplen = UVCHR_SKIP( uvc ); \
1761 foldlen -= skiplen; \
1762 uscan = foldbuf + skiplen; \
1763 } \
1764 break; \
1765 case trie_utf8l: \
1766 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1767 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1768 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1769 } \
1770 /* FALLTHROUGH */ \
1771 case trie_utf8: \
1772 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1773 break; \
1774 case trie_plain: \
1775 uvc = (UV)*uc; \
1776 len = 1; \
1777 } \
1778 if (uvc < 256) { \
1779 charid = trie->charmap[ uvc ]; \
1780 } \
1781 else { \
1782 charid = 0; \
1783 if (widecharmap) { \
1784 SV** const svpp = hv_fetch(widecharmap, \
1785 (char*)&uvc, sizeof(UV), 0); \
1786 if (svpp) \
1787 charid = (U16)SvIV(*svpp); \
1788 } \
1789 } \
1790 } STMT_END
1791
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1793 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794 startpos, doutf8, depth)
1795
1796 #define REXEC_FBC_UTF8_SCAN(CODE) \
1797 STMT_START { \
1798 while (s < strend) { \
1799 CODE \
1800 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1801 } \
1802 } STMT_END
1803
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1805 STMT_START { \
1806 while (s < strend) { \
1807 CODE \
1808 s++; \
1809 } \
1810 } STMT_END
1811
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1813 STMT_START { \
1814 while (s < strend) { \
1815 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1816 } \
1817 } STMT_END
1818
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1820 STMT_START { \
1821 while (s < strend) { \
1822 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1823 } \
1824 } STMT_END
1825
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1827 if (COND) { \
1828 FBC_CHECK_AND_TRY \
1829 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1830 previous_occurrence_end = s; \
1831 } \
1832 else { \
1833 s += UTF8SKIP(s); \
1834 }
1835
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1837 if (COND) { \
1838 FBC_CHECK_AND_TRY \
1839 s++; \
1840 previous_occurrence_end = s; \
1841 } \
1842 else { \
1843 s++; \
1844 }
1845
1846 /* We keep track of where the next character should start after an occurrence
1847 * of the one we're looking for. Knowing that, we can see right away if the
1848 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1849 * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY \
1851 if ( ( doevery \
1852 || s != previous_occurrence_end) \
1853 && ( reginfo->intuit \
1854 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1855 { \
1856 goto got_it; \
1857 }
1858
1859
1860 /* These differ from the above macros in that they call a function which
1861 * returns the next occurrence of the thing being looked for in 's'; and
1862 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c)
1863 * */
1864 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1865 while (s < strend) { \
1866 s = (char *) (f); \
1867 if (s >= strend) { \
1868 break; \
1869 } \
1870 \
1871 FBC_CHECK_AND_TRY \
1872 s += UTF8SKIP(s); \
1873 previous_occurrence_end = s; \
1874 }
1875
1876 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1877 while (s < strend) { \
1878 s = (char *) (f); \
1879 if (s >= strend) { \
1880 break; \
1881 } \
1882 \
1883 FBC_CHECK_AND_TRY \
1884 s++; \
1885 previous_occurrence_end = s; \
1886 }
1887
1888 /* This is like the above macro except the function returns NULL if there is no
1889 * occurrence, and there is a further condition that must be matched besides
1890 * the function */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \
1892 while (s < strend) { \
1893 s = (char *) (f); \
1894 if (s == NULL) { \
1895 s = (char *) strend; \
1896 break; \
1897 } \
1898 \
1899 if (COND) { \
1900 FBC_CHECK_AND_TRY \
1901 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1902 previous_occurrence_end = s; \
1903 } \
1904 else { \
1905 s += UTF8SKIP(s); \
1906 } \
1907 }
1908
1909 /* This differs from the above macros in that it is passed a single byte that
1910 * is known to begin the next occurrence of the thing being looked for in 's'.
1911 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1912 * at that position. */
1913 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1914 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \
1915 COND)
1916
1917 /* This is like the function above, but takes an entire string to look for
1918 * instead of a single byte */
1919 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \
1920 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \
1921 ninstr(s, strend, substr, substr_end), \
1922 COND)
1923
1924 /* The four macros below are slightly different versions of the same logic.
1925 *
1926 * The first is for /a and /aa when the target string is UTF-8. This can only
1927 * match ascii, but it must advance based on UTF-8. The other three handle
1928 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1929 * looking for the boundary (or non-boundary) between a word and non-word
1930 * character. The utf8 and non-utf8 cases have the same logic, but the details
1931 * must be different. Find the "wordness" of the character just prior to this
1932 * one, and compare it with the wordness of this one. If they differ, we have
1933 * a boundary. At the beginning of the string, pretend that the previous
1934 * character was a new-line.
1935 *
1936 * All these macros uncleanly have side-effects with each other and outside
1937 * variables. So far it's been too much trouble to clean-up
1938 *
1939 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1940 * a word character or not.
1941 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1942 * word/non-word
1943 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1944 *
1945 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1946 * are looking for a boundary or for a non-boundary. If we are looking for a
1947 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1948 * see if this tentative match actually works, and if so, to quit the loop
1949 * here. And vice-versa if we are looking for a non-boundary.
1950 *
1951 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1952 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1953 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1954 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1955 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1956 * complement. But in that branch we complement tmp, meaning that at the
1957 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1958 * which means at the top of the loop in the next iteration, it is
1959 * TEST_NON_UTF8(s-1) */
1960 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1961 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1962 tmp = TEST_NON_UTF8(tmp); \
1963 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1964 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1965 tmp = !tmp; \
1966 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1967 } \
1968 else { \
1969 IF_FAIL; \
1970 } \
1971 ); \
1972
1973 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1974 * TEST_UTF8 is a macro that for the same input code points returns identically
1975 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1976 * end pointer as well) */
1977 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1978 if (s == reginfo->strbeg) { \
1979 tmp = '\n'; \
1980 } \
1981 else { /* Back-up to the start of the previous character */ \
1982 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1983 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1984 0, UTF8_ALLOW_DEFAULT); \
1985 } \
1986 tmp = TEST_UV(tmp); \
1987 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
1988 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
1989 tmp = !tmp; \
1990 IF_SUCCESS; \
1991 } \
1992 else { \
1993 IF_FAIL; \
1994 } \
1995 );
1996
1997 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
1998 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
1999 * set-up by the FBC_BOUND, etc macros below */
2000 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2001 UTF8_CODE; \
2002 /* Here, things have been set up by the previous code so that tmp is the \
2003 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
2004 * against the EOS, which we treat as a \n */ \
2005 if (tmp == ! TEST_NON_UTF8('\n')) { \
2006 IF_SUCCESS; \
2007 } \
2008 else { \
2009 IF_FAIL; \
2010 }
2011
2012 /* Same as the macro above, but the target isn't UTF-8 */
2013 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2014 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2015 tmp = TEST_NON_UTF8(tmp); \
2016 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2017 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2018 IF_SUCCESS; \
2019 tmp = !tmp; \
2020 } \
2021 else { \
2022 IF_FAIL; \
2023 } \
2024 ); \
2025 /* Here, things have been set up by the previous code so that tmp is \
2026 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2027 * matches against the EOS, which we treat as a \n */ \
2028 if (tmp == ! TEST_NON_UTF8('\n')) { \
2029 IF_SUCCESS; \
2030 } \
2031 else { \
2032 IF_FAIL; \
2033 }
2034
2035 /* This is the macro to use when we want to see if something that looks like it
2036 * could match, actually does, and if so exits the loop. It needs to be used
2037 * only for bounds checking macros, as it allows for matching beyond the end of
2038 * string (which should be zero length without having to look at the string
2039 * contents) */
2040 #define REXEC_FBC_TRYIT \
2041 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2042 goto got_it
2043
2044 /* The only difference between the BOUND and NBOUND cases is that
2045 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2046 * NBOUND. This is accomplished by passing it as either the if or else clause,
2047 * with the other one being empty (PLACEHOLDER is defined as empty).
2048 *
2049 * The TEST_FOO parameters are for operating on different forms of input, but
2050 * all should be ones that return identically for the same underlying code
2051 * points */
2052
2053 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2054 FBC_BOUND_COMMON_UTF8( \
2055 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2056 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2057
2058 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2059 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2060
2061 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2062 FBC_BOUND_COMMON_UTF8( \
2063 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2064 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2065
2066 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2067 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2068
2069 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2070 FBC_BOUND_COMMON_UTF8( \
2071 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2072 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2073
2074 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2075 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2076
2077 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2078 FBC_BOUND_COMMON_UTF8( \
2079 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2080 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2081
2082 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2083 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2084
2085 #ifdef DEBUGGING
2086 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2088 IV cp_out = _invlist_search(invlist, cp_in);
2089 assert(cp_out >= 0);
2090 return cp_out;
2091 }
2092 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2093 invmap[S_get_break_val_cp_checked(invlist, cp)]
2094 #else
2095 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2096 invmap[_invlist_search(invlist, cp)]
2097 #endif
2098
2099 /* Takes a pointer to an inversion list, a pointer to its corresponding
2100 * inversion map, and a code point, and returns the code point's value
2101 * according to the two arrays. It assumes that all code points have a value.
2102 * This is used as the base macro for macros for particular properties */
2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2104 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2105
2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2107 * of a code point, returning the value for the first code point in the string.
2108 * And it takes the particular macro name that finds the desired value given a
2109 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2111 (__ASSERT_(pos < strend) \
2112 /* Note assumes is valid UTF-8 */ \
2113 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2114
2115 /* Returns the GCB value for the input code point */
2116 #define getGCB_VAL_CP(cp) \
2117 _generic_GET_BREAK_VAL_CP( \
2118 PL_GCB_invlist, \
2119 _Perl_GCB_invmap, \
2120 (cp))
2121
2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2123 * bounded by pos and strend */
2124 #define getGCB_VAL_UTF8(pos, strend) \
2125 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2126
2127 /* Returns the LB value for the input code point */
2128 #define getLB_VAL_CP(cp) \
2129 _generic_GET_BREAK_VAL_CP( \
2130 PL_LB_invlist, \
2131 _Perl_LB_invmap, \
2132 (cp))
2133
2134 /* Returns the LB value for the first code point in the UTF-8 encoded string
2135 * bounded by pos and strend */
2136 #define getLB_VAL_UTF8(pos, strend) \
2137 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2138
2139
2140 /* Returns the SB value for the input code point */
2141 #define getSB_VAL_CP(cp) \
2142 _generic_GET_BREAK_VAL_CP( \
2143 PL_SB_invlist, \
2144 _Perl_SB_invmap, \
2145 (cp))
2146
2147 /* Returns the SB value for the first code point in the UTF-8 encoded string
2148 * bounded by pos and strend */
2149 #define getSB_VAL_UTF8(pos, strend) \
2150 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2151
2152 /* Returns the WB value for the input code point */
2153 #define getWB_VAL_CP(cp) \
2154 _generic_GET_BREAK_VAL_CP( \
2155 PL_WB_invlist, \
2156 _Perl_WB_invmap, \
2157 (cp))
2158
2159 /* Returns the WB value for the first code point in the UTF-8 encoded string
2160 * bounded by pos and strend */
2161 #define getWB_VAL_UTF8(pos, strend) \
2162 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2163
2164 /* We know what class REx starts with. Try to find this position... */
2165 /* if reginfo->intuit, its a dryrun */
2166 /* annoyingly all the vars in this routine have different names from their counterparts
2167 in regmatch. /grrr */
2168 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2170 const char *strend, regmatch_info *reginfo)
2171 {
2172
2173 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2174 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2175
2176 char *pat_string; /* The pattern's exactish string */
2177 char *pat_end; /* ptr to end char of pat_string */
2178 re_fold_t folder; /* Function for computing non-utf8 folds */
2179 const U8 *fold_array; /* array for folding ords < 256 */
2180 STRLEN ln;
2181 STRLEN lnc;
2182 U8 c1;
2183 U8 c2;
2184 char *e = NULL;
2185
2186 /* In some cases we accept only the first occurence of 'x' in a sequence of
2187 * them. This variable points to just beyond the end of the previous
2188 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2189 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2190 * hop back.) */
2191 char * previous_occurrence_end = 0;
2192
2193 I32 tmp; /* Scratch variable */
2194 const bool utf8_target = reginfo->is_utf8_target;
2195 UV utf8_fold_flags = 0;
2196 const bool is_utf8_pat = reginfo->is_utf8_pat;
2197 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2198 with a result inverts that result, as 0^1 =
2199 1 and 1^1 = 0 */
2200 _char_class_number classnum;
2201
2202 RXi_GET_DECL(prog,progi);
2203
2204 PERL_ARGS_ASSERT_FIND_BYCLASS;
2205
2206 /* We know what class it must start with. The case statements below have
2207 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2208 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2209 * ('p8' and 'pb'. */
2210 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2211
2212 case ANYOFPOSIXL_t8_pb:
2213 case ANYOFPOSIXL_t8_p8:
2214 case ANYOFL_t8_pb:
2215 case ANYOFL_t8_p8:
2216 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2217 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2218
2219 /* FALLTHROUGH */
2220
2221 case ANYOFD_t8_pb:
2222 case ANYOFD_t8_p8:
2223 case ANYOF_t8_pb:
2224 case ANYOF_t8_p8:
2225 REXEC_FBC_UTF8_CLASS_SCAN(
2226 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2227 break;
2228
2229 case ANYOFPOSIXL_tb_pb:
2230 case ANYOFPOSIXL_tb_p8:
2231 case ANYOFL_tb_pb:
2232 case ANYOFL_tb_p8:
2233 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2234 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2235
2236 /* FALLTHROUGH */
2237
2238 case ANYOFD_tb_pb:
2239 case ANYOFD_tb_p8:
2240 case ANYOF_tb_pb:
2241 case ANYOF_tb_p8:
2242 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2243 /* We know that s is in the bitmap range since the target isn't
2244 * UTF-8, so what happens for out-of-range values is not relevant,
2245 * so exclude that from the flags */
2246 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2247 0));
2248 }
2249 else {
2250 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2251 }
2252 break;
2253
2254 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2255 case ANYOFM_tb_p8:
2256 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2258 break;
2259
2260 case ANYOFM_t8_pb:
2261 case ANYOFM_t8_p8:
2262 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2263 * we do anyway for performance reasons, as otherwise we would have to
2264 * examine all the continuation characters */
2265 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2266 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2267 break;
2268
2269 case NANYOFM_tb_pb:
2270 case NANYOFM_tb_p8:
2271 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2272 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2273 break;
2274
2275 case NANYOFM_t8_pb:
2276 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2277 variants. */
2278 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2279 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2280 (U8) ARG(c), FLAGS(c)));
2281 break;
2282
2283 /* These nodes all require at least one code point to be in UTF-8 to
2284 * match */
2285 case ANYOFH_tb_pb:
2286 case ANYOFH_tb_p8:
2287 case ANYOFHb_tb_pb:
2288 case ANYOFHb_tb_p8:
2289 case ANYOFHr_tb_pb:
2290 case ANYOFHr_tb_p8:
2291 case ANYOFHs_tb_pb:
2292 case ANYOFHs_tb_p8:
2293 case EXACTFLU8_tb_pb:
2294 case EXACTFLU8_tb_p8:
2295 case EXACTFU_REQ8_tb_pb:
2296 case EXACTFU_REQ8_tb_p8:
2297 break;
2298
2299 case ANYOFH_t8_pb:
2300 case ANYOFH_t8_p8:
2301 REXEC_FBC_UTF8_CLASS_SCAN(
2302 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2303 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2304 break;
2305
2306 case ANYOFHb_t8_pb:
2307 case ANYOFHb_t8_p8:
2308 {
2309 /* We know what the first byte of any matched string should be. */
2310 U8 first_byte = FLAGS(c);
2311
2312 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2313 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2314 }
2315 break;
2316
2317 case ANYOFHr_t8_pb:
2318 case ANYOFHr_t8_p8:
2319 REXEC_FBC_UTF8_CLASS_SCAN(
2320 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2321 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2322 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2323 && reginclass(prog, c, (U8*)s, (U8*) strend,
2324 1 /* is utf8 */)));
2325 break;
2326
2327 case ANYOFHs_t8_pb:
2328 case ANYOFHs_t8_p8:
2329 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2330 ((struct regnode_anyofhs *) c)->string,
2331 /* Note FLAGS is the string length in this regnode */
2332 ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2333 reginclass(prog, c, (U8*)s, (U8*) strend,
2334 1 /* is utf8 */));
2335 break;
2336
2337 case ANYOFR_tb_pb:
2338 case ANYOFR_tb_p8:
2339 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2340 ANYOFRbase(c), ANYOFRdelta(c)));
2341 break;
2342
2343 case ANYOFR_t8_pb:
2344 case ANYOFR_t8_p8:
2345 REXEC_FBC_UTF8_CLASS_SCAN(
2346 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2347 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2348 (U8 *) strend,
2349 NULL),
2350 ANYOFRbase(c), ANYOFRdelta(c))));
2351 break;
2352
2353 case ANYOFRb_tb_pb:
2354 case ANYOFRb_tb_p8:
2355 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2356 ANYOFRbase(c), ANYOFRdelta(c)));
2357 break;
2358
2359 case ANYOFRb_t8_pb:
2360 case ANYOFRb_t8_p8:
2361 { /* We know what the first byte of any matched string should be */
2362 U8 first_byte = FLAGS(c);
2363
2364 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2365 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2366 (U8 *) strend,
2367 NULL),
2368 ANYOFRbase(c), ANYOFRdelta(c)));
2369 }
2370 break;
2371
2372 case EXACTFAA_tb_pb:
2373
2374 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2375 * which these functions don't handle anyway */
2376 fold_array = PL_fold_latin1;
2377 folder = foldEQ_latin1_s2_folded;
2378 goto do_exactf_non_utf8;
2379
2380 case EXACTF_tb_pb:
2381 fold_array = PL_fold;
2382 folder = foldEQ;
2383 goto do_exactf_non_utf8;
2384
2385 case EXACTFL_tb_pb:
2386 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2387
2388 if (IN_UTF8_CTYPE_LOCALE) {
2389 utf8_fold_flags = FOLDEQ_LOCALE;
2390 goto do_exactf_utf8;
2391 }
2392
2393 fold_array = PL_fold_locale;
2394 folder = foldEQ_locale;
2395 goto do_exactf_non_utf8;
2396
2397 case EXACTFU_tb_pb:
2398 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2399 * don't have to worry here about this single special case in the
2400 * Latin1 range */
2401 fold_array = PL_fold_latin1;
2402 folder = foldEQ_latin1_s2_folded;
2403
2404 /* FALLTHROUGH */
2405
2406 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2407 are no glitches with fold-length differences
2408 between the target string and pattern */
2409
2410 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2411 * character of the EXACTF* node and then, if necessary,
2412 * case-insensitively compare the full text of the node. c1 is the
2413 * first character. c2 is its fold. This logic will not work for
2414 * Unicode semantics and the german sharp ss, which hence should not be
2415 * compiled into a node that gets here. */
2416 pat_string = STRINGs(c);
2417 ln = STR_LENs(c); /* length to match in octets/bytes */
2418
2419 /* We know that we have to match at least 'ln' bytes (which is the same
2420 * as characters, since not utf8). If we have to match 3 characters,
2421 * and there are only 2 availabe, we know without trying that it will
2422 * fail; so don't start a match past the required minimum number from
2423 * the far end */
2424 e = HOP3c(strend, -((SSize_t)ln), s);
2425 if (e < s)
2426 break;
2427
2428 c1 = *pat_string;
2429 c2 = fold_array[c1];
2430 if (c1 == c2) { /* If char and fold are the same */
2431 while (s <= e) {
2432 s = (char *) memchr(s, c1, e + 1 - s);
2433 if (s == NULL) {
2434 break;
2435 }
2436
2437 /* Check that the rest of the node matches */
2438 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2439 && (reginfo->intuit || regtry(reginfo, &s)) )
2440 {
2441 goto got_it;
2442 }
2443 s++;
2444 }
2445 }
2446 else {
2447 U8 bits_differing = c1 ^ c2;
2448
2449 /* If the folds differ in one bit position only, we can mask to
2450 * match either of them, and can use this faster find method. Both
2451 * ASCII and EBCDIC tend to have their case folds differ in only
2452 * one position, so this is very likely */
2453 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2454 bits_differing = ~ bits_differing;
2455 while (s <= e) {
2456 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2457 (c1 & bits_differing), bits_differing);
2458 if (s > e) {
2459 break;
2460 }
2461
2462 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2463 && (reginfo->intuit || regtry(reginfo, &s)) )
2464 {
2465 goto got_it;
2466 }
2467 s++;
2468 }
2469 }
2470 else { /* Otherwise, stuck with looking byte-at-a-time. This
2471 should actually happen only in EXACTFL nodes */
2472 while (s <= e) {
2473 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2474 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2475 && (reginfo->intuit || regtry(reginfo, &s)) )
2476 {
2477 goto got_it;
2478 }
2479 s++;
2480 }
2481 }
2482 }
2483 break;
2484
2485 case EXACTFAA_tb_p8:
2486 case EXACTFAA_t8_p8:
2487 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2488 |FOLDEQ_S2_ALREADY_FOLDED
2489 |FOLDEQ_S2_FOLDS_SANE;
2490 goto do_exactf_utf8;
2491
2492 case EXACTFAA_NO_TRIE_tb_pb:
2493 case EXACTFAA_NO_TRIE_t8_pb:
2494 case EXACTFAA_t8_pb:
2495
2496 /* Here, and elsewhere in this file, the reason we can't consider a
2497 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2498 * is because any MICRO SIGN in the pattern won't be folded. Since the
2499 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2500 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2501 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2502 goto do_exactf_utf8;
2503
2504 case EXACTFL_tb_p8:
2505 case EXACTFL_t8_pb:
2506 case EXACTFL_t8_p8:
2507 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2508 utf8_fold_flags = FOLDEQ_LOCALE;
2509 goto do_exactf_utf8;
2510
2511 case EXACTFLU8_t8_pb:
2512 case EXACTFLU8_t8_p8:
2513 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2514 | FOLDEQ_S2_FOLDS_SANE;
2515 goto do_exactf_utf8;
2516
2517 case EXACTFU_REQ8_t8_p8:
2518 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2519 goto do_exactf_utf8;
2520
2521 case EXACTFU_tb_p8:
2522 case EXACTFU_t8_pb:
2523 case EXACTFU_t8_p8:
2524 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2525 goto do_exactf_utf8;
2526
2527 /* The following are problematic even though pattern isn't UTF-8. Use
2528 * full functionality normally not done except for UTF-8. */
2529 case EXACTF_t8_pb:
2530 case EXACTFUP_tb_pb:
2531 case EXACTFUP_t8_pb:
2532
2533 do_exactf_utf8:
2534 {
2535 unsigned expansion;
2536
2537 /* If one of the operands is in utf8, we can't use the simpler
2538 * folding above, due to the fact that many different characters
2539 * can have the same fold, or portion of a fold, or different-
2540 * length fold */
2541 pat_string = STRINGs(c);
2542 ln = STR_LENs(c); /* length to match in octets/bytes */
2543 pat_end = pat_string + ln;
2544 lnc = is_utf8_pat /* length to match in characters */
2545 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2546 : ln;
2547
2548 /* We have 'lnc' characters to match in the pattern, but because of
2549 * multi-character folding, each character in the target can match
2550 * up to 3 characters (Unicode guarantees it will never exceed
2551 * this) if it is utf8-encoded; and up to 2 if not (based on the
2552 * fact that the Latin 1 folds are already determined, and the only
2553 * multi-char fold in that range is the sharp-s folding to 'ss'.
2554 * Thus, a pattern character can match as little as 1/3 of a string
2555 * character. Adjust lnc accordingly, rounding up, so that if we
2556 * need to match at least 4+1/3 chars, that really is 5. */
2557 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2558 lnc = (lnc + expansion - 1) / expansion;
2559
2560 /* As in the non-UTF8 case, if we have to match 3 characters, and
2561 * only 2 are left, it's guaranteed to fail, so don't start a match
2562 * that would require us to go beyond the end of the string */
2563 e = HOP3c(strend, -((SSize_t)lnc), s);
2564
2565 /* XXX Note that we could recalculate e to stop the loop earlier,
2566 * as the worst case expansion above will rarely be met, and as we
2567 * go along we would usually find that e moves further to the left.
2568 * This would happen only after we reached the point in the loop
2569 * where if there were no expansion we should fail. Unclear if
2570 * worth the expense */
2571
2572 while (s <= e) {
2573 char *my_strend= (char *)strend;
2574 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2575 pat_string, NULL, ln, is_utf8_pat,
2576 utf8_fold_flags)
2577 && (reginfo->intuit || regtry(reginfo, &s)) )
2578 {
2579 goto got_it;
2580 }
2581 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2582 }
2583 }
2584 break;
2585
2586 case BOUNDA_tb_pb:
2587 case BOUNDA_tb_p8:
2588 case BOUND_tb_pb: /* /d without utf8 target is /a */
2589 case BOUND_tb_p8:
2590 /* regcomp.c makes sure that these only have the traditional \b
2591 * meaning. */
2592 assert(FLAGS(c) == TRADITIONAL_BOUND);
2593
2594 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2595 break;
2596
2597 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2598 case BOUNDA_t8_p8:
2599 /* regcomp.c makes sure that these only have the traditional \b
2600 * meaning. */
2601 assert(FLAGS(c) == TRADITIONAL_BOUND);
2602
2603 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2604 break;
2605
2606 case NBOUNDA_tb_pb:
2607 case NBOUNDA_tb_p8:
2608 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2609 case NBOUND_tb_p8:
2610 /* regcomp.c makes sure that these only have the traditional \b
2611 * meaning. */
2612 assert(FLAGS(c) == TRADITIONAL_BOUND);
2613
2614 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2615 break;
2616
2617 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2618 case NBOUNDA_t8_p8:
2619 /* regcomp.c makes sure that these only have the traditional \b
2620 * meaning. */
2621 assert(FLAGS(c) == TRADITIONAL_BOUND);
2622
2623 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2624 break;
2625
2626 case NBOUNDU_tb_pb:
2627 case NBOUNDU_tb_p8:
2628 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2629 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2630 break;
2631 }
2632
2633 to_complement = 1;
2634 goto do_boundu_non_utf8;
2635
2636 case NBOUNDL_tb_pb:
2637 case NBOUNDL_tb_p8:
2638 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2639 if (FLAGS(c) == TRADITIONAL_BOUND) {
2640 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2641 break;
2642 }
2643
2644 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2645
2646 to_complement = 1;
2647 goto do_boundu_non_utf8;
2648
2649 case BOUNDL_tb_pb:
2650 case BOUNDL_tb_p8:
2651 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2652 if (FLAGS(c) == TRADITIONAL_BOUND) {
2653 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2654 break;
2655 }
2656
2657 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2658
2659 goto do_boundu_non_utf8;
2660
2661 case BOUNDU_tb_pb:
2662 case BOUNDU_tb_p8:
2663 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2664 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2665 break;
2666 }
2667
2668 do_boundu_non_utf8:
2669 if (s == reginfo->strbeg) {
2670 if (reginfo->intuit || regtry(reginfo, &s))
2671 {
2672 goto got_it;
2673 }
2674
2675 /* Didn't match. Try at the next position (if there is one) */
2676 s++;
2677 if (UNLIKELY(s >= reginfo->strend)) {
2678 break;
2679 }
2680 }
2681
2682 switch((bound_type) FLAGS(c)) {
2683 case TRADITIONAL_BOUND: /* Should have already been handled */
2684 assert(0);
2685 break;
2686
2687 case GCB_BOUND:
2688 /* Not utf8. Everything is a GCB except between CR and LF */
2689 while (s < strend) {
2690 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2691 || UCHARAT(s) != '\n'))
2692 && (reginfo->intuit || regtry(reginfo, &s)))
2693 {
2694 goto got_it;
2695 }
2696 s++;
2697 }
2698
2699 break;
2700
2701 case LB_BOUND:
2702 {
2703 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2704 while (s < strend) {
2705 LB_enum after = getLB_VAL_CP((U8) *s);
2706 if (to_complement ^ isLB(before,
2707 after,
2708 (U8*) reginfo->strbeg,
2709 (U8*) s,
2710 (U8*) reginfo->strend,
2711 0 /* target not utf8 */ )
2712 && (reginfo->intuit || regtry(reginfo, &s)))
2713 {
2714 goto got_it;
2715 }
2716 before = after;
2717 s++;
2718 }
2719 }
2720
2721 break;
2722
2723 case SB_BOUND:
2724 {
2725 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2726 while (s < strend) {
2727 SB_enum after = getSB_VAL_CP((U8) *s);
2728 if ((to_complement ^ isSB(before,
2729 after,
2730 (U8*) reginfo->strbeg,
2731 (U8*) s,
2732 (U8*) reginfo->strend,
2733 0 /* target not utf8 */ ))
2734 && (reginfo->intuit || regtry(reginfo, &s)))
2735 {
2736 goto got_it;
2737 }
2738 before = after;
2739 s++;
2740 }
2741 }
2742
2743 break;
2744
2745 case WB_BOUND:
2746 {
2747 WB_enum previous = WB_UNKNOWN;
2748 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2749 while (s < strend) {
2750 WB_enum after = getWB_VAL_CP((U8) *s);
2751 if ((to_complement ^ isWB(previous,
2752 before,
2753 after,
2754 (U8*) reginfo->strbeg,
2755 (U8*) s,
2756 (U8*) reginfo->strend,
2757 0 /* target not utf8 */ ))
2758 && (reginfo->intuit || regtry(reginfo, &s)))
2759 {
2760 goto got_it;
2761 }
2762 previous = before;
2763 before = after;
2764 s++;
2765 }
2766 }
2767 }
2768
2769 /* Here are at the final position in the target string, which is a
2770 * boundary by definition, so matches, depending on other constraints.
2771 * */
2772 if ( reginfo->intuit
2773 || (s <= reginfo->strend && regtry(reginfo, &s)))
2774 {
2775 goto got_it;
2776 }
2777
2778 break;
2779
2780 case BOUNDL_t8_pb:
2781 case BOUNDL_t8_p8:
2782 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2783 if (FLAGS(c) == TRADITIONAL_BOUND) {
2784 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2785 isWORDCHAR_LC_utf8_safe);
2786 break;
2787 }
2788
2789 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2790
2791 to_complement = 1;
2792 goto do_boundu_utf8;
2793
2794 case NBOUNDL_t8_pb:
2795 case NBOUNDL_t8_p8:
2796 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2797 if (FLAGS(c) == TRADITIONAL_BOUND) {
2798 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2799 isWORDCHAR_LC_utf8_safe);
2800 break;
2801 }
2802
2803 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2804
2805 to_complement = 1;
2806 goto do_boundu_utf8;
2807
2808 case NBOUND_t8_pb:
2809 case NBOUND_t8_p8:
2810 /* regcomp.c makes sure that these only have the traditional \b
2811 * meaning. */
2812 assert(FLAGS(c) == TRADITIONAL_BOUND);
2813
2814 /* FALLTHROUGH */
2815
2816 case NBOUNDU_t8_pb:
2817 case NBOUNDU_t8_p8:
2818 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2819 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2820 isWORDCHAR_utf8_safe);
2821 break;
2822 }
2823
2824 to_complement = 1;
2825 goto do_boundu_utf8;
2826
2827 case BOUND_t8_pb:
2828 case BOUND_t8_p8:
2829 /* regcomp.c makes sure that these only have the traditional \b
2830 * meaning. */
2831 assert(FLAGS(c) == TRADITIONAL_BOUND);
2832
2833 /* FALLTHROUGH */
2834
2835 case BOUNDU_t8_pb:
2836 case BOUNDU_t8_p8:
2837 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2838 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2839 break;
2840 }
2841
2842 do_boundu_utf8:
2843 if (s == reginfo->strbeg) {
2844 if (reginfo->intuit || regtry(reginfo, &s))
2845 {
2846 goto got_it;
2847 }
2848
2849 /* Didn't match. Try at the next position (if there is one) */
2850 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2851 if (UNLIKELY(s >= reginfo->strend)) {
2852 break;
2853 }
2854 }
2855
2856 switch((bound_type) FLAGS(c)) {
2857 case TRADITIONAL_BOUND: /* Should have already been handled */
2858 assert(0);
2859 break;
2860
2861 case GCB_BOUND:
2862 {
2863 GCB_enum before = getGCB_VAL_UTF8(
2864 reghop3((U8*)s, -1,
2865 (U8*)(reginfo->strbeg)),
2866 (U8*) reginfo->strend);
2867 while (s < strend) {
2868 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2869 (U8*) reginfo->strend);
2870 if ( (to_complement ^ isGCB(before,
2871 after,
2872 (U8*) reginfo->strbeg,
2873 (U8*) s,
2874 1 /* target is utf8 */ ))
2875 && (reginfo->intuit || regtry(reginfo, &s)))
2876 {
2877 goto got_it;
2878 }
2879 before = after;
2880 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2881 }
2882 }
2883 break;
2884
2885 case LB_BOUND:
2886 {
2887 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2888 -1,
2889 (U8*)(reginfo->strbeg)),
2890 (U8*) reginfo->strend);
2891 while (s < strend) {
2892 LB_enum after = getLB_VAL_UTF8((U8*) s,
2893 (U8*) reginfo->strend);
2894 if (to_complement ^ isLB(before,
2895 after,
2896 (U8*) reginfo->strbeg,
2897 (U8*) s,
2898 (U8*) reginfo->strend,
2899 1 /* target is utf8 */ )
2900 && (reginfo->intuit || regtry(reginfo, &s)))
2901 {
2902 goto got_it;
2903 }
2904 before = after;
2905 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2906 }
2907 }
2908
2909 break;
2910
2911 case SB_BOUND:
2912 {
2913 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2914 -1,
2915 (U8*)(reginfo->strbeg)),
2916 (U8*) reginfo->strend);
2917 while (s < strend) {
2918 SB_enum after = getSB_VAL_UTF8((U8*) s,
2919 (U8*) reginfo->strend);
2920 if ((to_complement ^ isSB(before,
2921 after,
2922 (U8*) reginfo->strbeg,
2923 (U8*) s,
2924 (U8*) reginfo->strend,
2925 1 /* target is utf8 */ ))
2926 && (reginfo->intuit || regtry(reginfo, &s)))
2927 {
2928 goto got_it;
2929 }
2930 before = after;
2931 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2932 }
2933 }
2934
2935 break;
2936
2937 case WB_BOUND:
2938 {
2939 /* We are at a boundary between char_sub_0 and char_sub_1.
2940 * We also keep track of the value for char_sub_-1 as we
2941 * loop through the line. Context may be needed to make a
2942 * determination, and if so, this can save having to
2943 * recalculate it */
2944 WB_enum previous = WB_UNKNOWN;
2945 WB_enum before = getWB_VAL_UTF8(
2946 reghop3((U8*)s,
2947 -1,
2948 (U8*)(reginfo->strbeg)),
2949 (U8*) reginfo->strend);
2950 while (s < strend) {
2951 WB_enum after = getWB_VAL_UTF8((U8*) s,
2952 (U8*) reginfo->strend);
2953 if ((to_complement ^ isWB(previous,
2954 before,
2955 after,
2956 (U8*) reginfo->strbeg,
2957 (U8*) s,
2958 (U8*) reginfo->strend,
2959 1 /* target is utf8 */ ))
2960 && (reginfo->intuit || regtry(reginfo, &s)))
2961 {
2962 goto got_it;
2963 }
2964 previous = before;
2965 before = after;
2966 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2967 }
2968 }
2969 }
2970
2971 /* Here are at the final position in the target string, which is a
2972 * boundary by definition, so matches, depending on other constraints.
2973 * */
2974
2975 if ( reginfo->intuit
2976 || (s <= reginfo->strend && regtry(reginfo, &s)))
2977 {
2978 goto got_it;
2979 }
2980 break;
2981
2982 case LNBREAK_t8_pb:
2983 case LNBREAK_t8_p8:
2984 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2985 break;
2986
2987 case LNBREAK_tb_pb:
2988 case LNBREAK_tb_p8:
2989 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2990 break;
2991
2992 /* The argument to all the POSIX node types is the class number to pass
2993 * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2994
2995 case NPOSIXL_t8_pb:
2996 case NPOSIXL_t8_p8:
2997 to_complement = 1;
2998 /* FALLTHROUGH */
2999
3000 case POSIXL_t8_pb:
3001 case POSIXL_t8_p8:
3002 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3003 REXEC_FBC_UTF8_CLASS_SCAN(
3004 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3005 (U8 *) strend)));
3006 break;
3007
3008 case NPOSIXL_tb_pb:
3009 case NPOSIXL_tb_p8:
3010 to_complement = 1;
3011 /* FALLTHROUGH */
3012
3013 case POSIXL_tb_pb:
3014 case POSIXL_tb_p8:
3015 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3016 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3017 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3018 break;
3019
3020 case NPOSIXA_t8_pb:
3021 case NPOSIXA_t8_p8:
3022 /* The complement of something that matches only ASCII matches all
3023 * non-ASCII, plus everything in ASCII that isn't in the class. */
3024 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3025 || ! _generic_isCC_A(*s, FLAGS(c)));
3026 break;
3027
3028 case POSIXA_t8_pb:
3029 case POSIXA_t8_p8:
3030 /* Don't need to worry about utf8, as it can match only a single
3031 * byte invariant character. But we do anyway for performance reasons,
3032 * as otherwise we would have to examine all the continuation
3033 * characters */
3034 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3035 break;
3036
3037 case NPOSIXD_tb_pb:
3038 case NPOSIXD_tb_p8:
3039 case NPOSIXA_tb_pb:
3040 case NPOSIXA_tb_p8:
3041 to_complement = 1;
3042 /* FALLTHROUGH */
3043
3044 case POSIXD_tb_pb:
3045 case POSIXD_tb_p8:
3046 case POSIXA_tb_pb:
3047 case POSIXA_tb_p8:
3048 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3049 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3050 break;
3051
3052 case NPOSIXU_tb_pb:
3053 case NPOSIXU_tb_p8:
3054 to_complement = 1;
3055 /* FALLTHROUGH */
3056
3057 case POSIXU_tb_pb:
3058 case POSIXU_tb_p8:
3059 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3060 to_complement ^ cBOOL(_generic_isCC(*s,
3061 FLAGS(c))));
3062 break;
3063
3064 case NPOSIXD_t8_pb:
3065 case NPOSIXD_t8_p8:
3066 case NPOSIXU_t8_pb:
3067 case NPOSIXU_t8_p8:
3068 to_complement = 1;
3069 /* FALLTHROUGH */
3070
3071 case POSIXD_t8_pb:
3072 case POSIXD_t8_p8:
3073 case POSIXU_t8_pb:
3074 case POSIXU_t8_p8:
3075 classnum = (_char_class_number) FLAGS(c);
3076 switch (classnum) {
3077 default:
3078 REXEC_FBC_UTF8_CLASS_SCAN(
3079 to_complement ^ cBOOL(_invlist_contains_cp(
3080 PL_XPosix_ptrs[classnum],
3081 utf8_to_uvchr_buf((U8 *) s,
3082 (U8 *) strend,
3083 NULL))));
3084 break;
3085
3086 case _CC_ENUM_SPACE:
3087 REXEC_FBC_UTF8_CLASS_SCAN(
3088 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3089 break;
3090
3091 case _CC_ENUM_BLANK:
3092 REXEC_FBC_UTF8_CLASS_SCAN(
3093 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3094 break;
3095
3096 case _CC_ENUM_XDIGIT:
3097 REXEC_FBC_UTF8_CLASS_SCAN(
3098 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3099 break;
3100
3101 case _CC_ENUM_VERTSPACE:
3102 REXEC_FBC_UTF8_CLASS_SCAN(
3103 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3104 break;
3105
3106 case _CC_ENUM_CNTRL:
3107 REXEC_FBC_UTF8_CLASS_SCAN(
3108 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3109 break;
3110 }
3111 break;
3112
3113 case AHOCORASICKC_tb_pb:
3114 case AHOCORASICKC_tb_p8:
3115 case AHOCORASICKC_t8_pb:
3116 case AHOCORASICKC_t8_p8:
3117 case AHOCORASICK_tb_pb:
3118 case AHOCORASICK_tb_p8:
3119 case AHOCORASICK_t8_pb:
3120 case AHOCORASICK_t8_p8:
3121 {
3122 DECL_TRIE_TYPE(c);
3123 /* what trie are we using right now */
3124 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3125 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3126 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3127
3128 const char *last_start = strend - trie->minlen;
3129 #ifdef DEBUGGING
3130 const char *real_start = s;
3131 #endif
3132 STRLEN maxlen = trie->maxlen;
3133 SV *sv_points;
3134 U8 **points; /* map of where we were in the input string
3135 when reading a given char. For ASCII this
3136 is unnecessary overhead as the relationship
3137 is always 1:1, but for Unicode, especially
3138 case folded Unicode this is not true. */
3139 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3140 U8 *bitmap=NULL;
3141
3142
3143 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3144
3145 /* We can't just allocate points here. We need to wrap it in
3146 * an SV so it gets freed properly if there is a croak while
3147 * running the match */
3148 ENTER;
3149 SAVETMPS;
3150 sv_points=newSV(maxlen * sizeof(U8 *));
3151 SvCUR_set(sv_points,
3152 maxlen * sizeof(U8 *));
3153 SvPOK_on(sv_points);
3154 sv_2mortal(sv_points);
3155 points=(U8**)SvPV_nolen(sv_points );
3156 if ( trie_type != trie_utf8_fold
3157 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3158 {
3159 if (trie->bitmap)
3160 bitmap=(U8*)trie->bitmap;
3161 else
3162 bitmap=(U8*)ANYOF_BITMAP(c);
3163 }
3164 /* this is the Aho-Corasick algorithm modified a touch
3165 to include special handling for long "unknown char" sequences.
3166 The basic idea being that we use AC as long as we are dealing
3167 with a possible matching char, when we encounter an unknown char
3168 (and we have not encountered an accepting state) we scan forward
3169 until we find a legal starting char.
3170 AC matching is basically that of trie matching, except that when
3171 we encounter a failing transition, we fall back to the current
3172 states "fail state", and try the current char again, a process
3173 we repeat until we reach the root state, state 1, or a legal
3174 transition. If we fail on the root state then we can either
3175 terminate if we have reached an accepting state previously, or
3176 restart the entire process from the beginning if we have not.
3177
3178 */
3179 while (s <= last_start) {
3180 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3181 U8 *uc = (U8*)s;
3182 U16 charid = 0;
3183 U32 base = 1;
3184 U32 state = 1;
3185 UV uvc = 0;
3186 STRLEN len = 0;
3187 STRLEN foldlen = 0;
3188 U8 *uscan = (U8*)NULL;
3189 U8 *leftmost = NULL;
3190 #ifdef DEBUGGING
3191 U32 accepted_word= 0;
3192 #endif
3193 U32 pointpos = 0;
3194
3195 while ( state && uc <= (U8*)strend ) {
3196 int failed=0;
3197 U32 word = aho->states[ state ].wordnum;
3198
3199 if( state==1 ) {
3200 if ( bitmap ) {
3201 DEBUG_TRIE_EXECUTE_r(
3202 if ( uc <= (U8*)last_start
3203 && !BITMAP_TEST(bitmap,*uc) )
3204 {
3205 dump_exec_pos( (char *)uc, c, strend,
3206 real_start,
3207 (char *)uc, utf8_target, 0 );
3208 Perl_re_printf( aTHX_
3209 " Scanning for legal start char...\n");
3210 }
3211 );
3212 if (utf8_target) {
3213 while ( uc <= (U8*)last_start
3214 && !BITMAP_TEST(bitmap,*uc) )
3215 {
3216 uc += UTF8SKIP(uc);
3217 }
3218 } else {
3219 while ( uc <= (U8*)last_start
3220 && ! BITMAP_TEST(bitmap,*uc) )
3221 {
3222 uc++;
3223 }
3224 }
3225 s= (char *)uc;
3226 }
3227 if (uc >(U8*)last_start) break;
3228 }
3229
3230 if ( word ) {
3231 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3232 % maxlen ];
3233 if (!leftmost || lpos < leftmost) {
3234 DEBUG_r(accepted_word=word);
3235 leftmost= lpos;
3236 }
3237 if (base==0) break;
3238
3239 }
3240 points[pointpos++ % maxlen]= uc;
3241 if (foldlen || uc < (U8*)strend) {
3242 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3243 (U8 *) strend, uscan, len, uvc,
3244 charid, foldlen, foldbuf,
3245 uniflags);
3246 DEBUG_TRIE_EXECUTE_r({
3247 dump_exec_pos( (char *)uc, c, strend,
3248 real_start, s, utf8_target, 0);
3249 Perl_re_printf( aTHX_
3250 " Charid:%3u CP:%4" UVxf " ",
3251 charid, uvc);
3252 });
3253 }
3254 else {
3255 len = 0;
3256 charid = 0;
3257 }
3258
3259
3260 do {
3261 #ifdef DEBUGGING
3262 word = aho->states[ state ].wordnum;
3263 #endif
3264 base = aho->states[ state ].trans.base;
3265
3266 DEBUG_TRIE_EXECUTE_r({
3267 if (failed)
3268 dump_exec_pos((char *)uc, c, strend, real_start,
3269 s, utf8_target, 0 );
3270 Perl_re_printf( aTHX_
3271 "%sState: %4" UVxf ", word=%" UVxf,
3272 failed ? " Fail transition to " : "",
3273 (UV)state, (UV)word);
3274 });
3275 if ( base ) {
3276 U32 tmp;
3277 I32 offset;
3278 if (charid &&
3279 ( ((offset = base + charid
3280 - 1 - trie->uniquecharcount)) >= 0)
3281 && ((U32)offset < trie->lasttrans)
3282 && trie->trans[offset].check == state
3283 && (tmp=trie->trans[offset].next))
3284 {
3285 DEBUG_TRIE_EXECUTE_r(
3286 Perl_re_printf( aTHX_ " - legal\n"));
3287 state = tmp;
3288 break;
3289 }
3290 else {
3291 DEBUG_TRIE_EXECUTE_r(
3292 Perl_re_printf( aTHX_ " - fail\n"));
3293 failed = 1;
3294 state = aho->fail[state];
3295 }
3296 }
3297 else {
3298 /* we must be accepting here */
3299 DEBUG_TRIE_EXECUTE_r(
3300 Perl_re_printf( aTHX_ " - accepting\n"));
3301 failed = 1;
3302 break;
3303 }
3304 } while(state);
3305 uc += len;
3306 if (failed) {
3307 if (leftmost)
3308 break;
3309 if (!state) state = 1;
3310 }
3311 }
3312 if ( aho->states[ state ].wordnum ) {
3313 U8 *lpos = points[ (pointpos
3314 - trie->wordinfo[aho->states[ state ]
3315 .wordnum].len) % maxlen ];
3316 if (!leftmost || lpos < leftmost) {
3317 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3318 leftmost = lpos;
3319 }
3320 }
3321 if (leftmost) {
3322 s = (char*)leftmost;
3323 DEBUG_TRIE_EXECUTE_r({
3324 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3325 " at position %" IVdf ". Trying full"
3326 " pattern...\n",
3327 (UV)accepted_word, (IV)(s - real_start)
3328 );
3329 });
3330 if (reginfo->intuit || regtry(reginfo, &s)) {
3331 FREETMPS;
3332 LEAVE;
3333 goto got_it;
3334 }
3335 if (s < reginfo->strend) {
3336 s = HOPc(s,1);
3337 }
3338 DEBUG_TRIE_EXECUTE_r({
3339 Perl_re_printf( aTHX_
3340 "Pattern failed. Looking for new start"
3341 " point...\n");
3342 });
3343 } else {
3344 DEBUG_TRIE_EXECUTE_r(
3345 Perl_re_printf( aTHX_ "No match.\n"));
3346 break;
3347 }
3348 }
3349 FREETMPS;
3350 LEAVE;
3351 }
3352 break;
3353
3354 case EXACTFU_REQ8_t8_pb:
3355 case EXACTFUP_tb_p8:
3356 case EXACTFUP_t8_p8:
3357 case EXACTF_tb_p8:
3358 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3359 case EXACTFAA_NO_TRIE_tb_p8:
3360 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3361 patterns */
3362 assert(0);
3363
3364 default:
3365 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3366 } /* End of switch on node type */
3367
3368 return 0;
3369
3370 got_it:
3371 return s;
3372 }
3373
3374 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3375 * flags have same meanings as with regexec_flags() */
3376
3377 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3378 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3379 char *strbeg,
3380 char *strend,
3381 SV *sv,
3382 U32 flags,
3383 bool utf8_target)
3384 {
3385 struct regexp *const prog = ReANY(rx);
3386
3387 if (flags & REXEC_COPY_STR) {
3388 #ifdef PERL_ANY_COW
3389 if (SvCANCOW(sv)) {
3390 DEBUG_C(Perl_re_printf( aTHX_
3391 "Copy on write: regexp capture, type %d\n",
3392 (int) SvTYPE(sv)));
3393 /* Create a new COW SV to share the match string and store
3394 * in saved_copy, unless the current COW SV in saved_copy
3395 * is valid and suitable for our purpose */
3396 if (( prog->saved_copy
3397 && SvIsCOW(prog->saved_copy)
3398 && SvPOKp(prog->saved_copy)
3399 && SvIsCOW(sv)
3400 && SvPOKp(sv)
3401 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3402 {
3403 /* just reuse saved_copy SV */
3404 if (RXp_MATCH_COPIED(prog)) {
3405 Safefree(prog->subbeg);
3406 RXp_MATCH_COPIED_off(prog);
3407 }
3408 }
3409 else {
3410 /* create new COW SV to share string */
3411 RXp_MATCH_COPY_FREE(prog);
3412 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3413 }
3414 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3415 assert (SvPOKp(prog->saved_copy));
3416 prog->sublen = strend - strbeg;
3417 prog->suboffset = 0;
3418 prog->subcoffset = 0;
3419 } else
3420 #endif
3421 {
3422 SSize_t min = 0;
3423 SSize_t max = strend - strbeg;
3424 SSize_t sublen;
3425
3426 if ( (flags & REXEC_COPY_SKIP_POST)
3427 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3428 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3429 ) { /* don't copy $' part of string */
3430 U32 n = 0;
3431 max = -1;
3432 /* calculate the right-most part of the string covered
3433 * by a capture. Due to lookahead, this may be to
3434 * the right of $&, so we have to scan all captures */
3435 while (n <= prog->lastparen) {
3436 if (prog->offs[n].end > max)
3437 max = prog->offs[n].end;
3438 n++;
3439 }
3440 if (max == -1)
3441 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3442 ? prog->offs[0].start
3443 : 0;
3444 assert(max >= 0 && max <= strend - strbeg);
3445 }
3446
3447 if ( (flags & REXEC_COPY_SKIP_PRE)
3448 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3449 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3450 ) { /* don't copy $` part of string */
3451 U32 n = 0;
3452 min = max;
3453 /* calculate the left-most part of the string covered
3454 * by a capture. Due to lookbehind, this may be to
3455 * the left of $&, so we have to scan all captures */
3456 while (min && n <= prog->lastparen) {
3457 if ( prog->offs[n].start != -1
3458 && prog->offs[n].start < min)
3459 {
3460 min = prog->offs[n].start;
3461 }
3462 n++;
3463 }
3464 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3465 && min > prog->offs[0].end
3466 )
3467 min = prog->offs[0].end;
3468
3469 }
3470
3471 assert(min >= 0 && min <= max && min <= strend - strbeg);
3472 sublen = max - min;
3473
3474 if (RXp_MATCH_COPIED(prog)) {
3475 if (sublen > prog->sublen)
3476 prog->subbeg =
3477 (char*)saferealloc(prog->subbeg, sublen+1);
3478 }
3479 else
3480 prog->subbeg = (char*)safemalloc(sublen+1);
3481 Copy(strbeg + min, prog->subbeg, sublen, char);
3482 prog->subbeg[sublen] = '\0';
3483 prog->suboffset = min;
3484 prog->sublen = sublen;
3485 RXp_MATCH_COPIED_on(prog);
3486 }
3487 prog->subcoffset = prog->suboffset;
3488 if (prog->suboffset && utf8_target) {
3489 /* Convert byte offset to chars.
3490 * XXX ideally should only compute this if @-/@+
3491 * has been seen, a la PL_sawampersand ??? */
3492
3493 /* If there's a direct correspondence between the
3494 * string which we're matching and the original SV,
3495 * then we can use the utf8 len cache associated with
3496 * the SV. In particular, it means that under //g,
3497 * sv_pos_b2u() will use the previously cached
3498 * position to speed up working out the new length of
3499 * subcoffset, rather than counting from the start of
3500 * the string each time. This stops
3501 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3502 * from going quadratic */
3503 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3504 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3505 SV_GMAGIC|SV_CONST_RETURN);
3506 else
3507 prog->subcoffset = utf8_length((U8*)strbeg,
3508 (U8*)(strbeg+prog->suboffset));
3509 }
3510 }
3511 else {
3512 RXp_MATCH_COPY_FREE(prog);
3513 prog->subbeg = strbeg;
3514 prog->suboffset = 0;
3515 prog->subcoffset = 0;
3516 prog->sublen = strend - strbeg;
3517 }
3518 }
3519
3520
3521
3522
3523 /*
3524 - regexec_flags - match a regexp against a string
3525 */
3526 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3527 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3528 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3529 /* stringarg: the point in the string at which to begin matching */
3530 /* strend: pointer to null at end of string */
3531 /* strbeg: real beginning of string */
3532 /* minend: end of match must be >= minend bytes after stringarg. */
3533 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3534 * itself is accessed via the pointers above */
3535 /* data: May be used for some additional optimizations.
3536 Currently unused. */
3537 /* flags: For optimizations. See REXEC_* in regexp.h */
3538
3539 {
3540 struct regexp *const prog = ReANY(rx);
3541 char *s;
3542 regnode *c;
3543 char *startpos;
3544 SSize_t minlen; /* must match at least this many chars */
3545 SSize_t dontbother = 0; /* how many characters not to try at end */
3546 const bool utf8_target = cBOOL(DO_UTF8(sv));
3547 I32 multiline;
3548 RXi_GET_DECL(prog,progi);
3549 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3550 regmatch_info *const reginfo = ®info_buf;
3551 regexp_paren_pair *swap = NULL;
3552 I32 oldsave;
3553 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3554
3555 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3556 PERL_UNUSED_ARG(data);
3557
3558 /* Be paranoid... */
3559 if (prog == NULL) {
3560 Perl_croak(aTHX_ "NULL regexp parameter");
3561 }
3562
3563 DEBUG_EXECUTE_r(
3564 debug_start_match(rx, utf8_target, stringarg, strend,
3565 "Matching");
3566 );
3567
3568 startpos = stringarg;
3569
3570 /* set these early as they may be used by the HOP macros below */
3571 reginfo->strbeg = strbeg;
3572 reginfo->strend = strend;
3573 reginfo->is_utf8_target = cBOOL(utf8_target);
3574
3575 if (prog->intflags & PREGf_GPOS_SEEN) {
3576 MAGIC *mg;
3577
3578 /* set reginfo->ganch, the position where \G can match */
3579
3580 reginfo->ganch =
3581 (flags & REXEC_IGNOREPOS)
3582 ? stringarg /* use start pos rather than pos() */
3583 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3584 /* Defined pos(): */
3585 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3586 : strbeg; /* pos() not defined; use start of string */
3587
3588 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3589 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3590
3591 /* in the presence of \G, we may need to start looking earlier in
3592 * the string than the suggested start point of stringarg:
3593 * if prog->gofs is set, then that's a known, fixed minimum
3594 * offset, such as
3595 * /..\G/: gofs = 2
3596 * /ab|c\G/: gofs = 1
3597 * or if the minimum offset isn't known, then we have to go back
3598 * to the start of the string, e.g. /w+\G/
3599 */
3600
3601 if (prog->intflags & PREGf_ANCH_GPOS) {
3602 if (prog->gofs) {
3603 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3604 if (!startpos ||
3605 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3606 {
3607 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3608 "fail: ganch-gofs before earliest possible start\n"));
3609 return 0;
3610 }
3611 }
3612 else
3613 startpos = reginfo->ganch;
3614 }
3615 else if (prog->gofs) {
3616 startpos = HOPBACKc(startpos, prog->gofs);
3617 if (!startpos)
3618 startpos = strbeg;
3619 }
3620 else if (prog->intflags & PREGf_GPOS_FLOAT)
3621 startpos = strbeg;
3622 }
3623
3624 minlen = prog->minlen;
3625 if ((startpos + minlen) > strend || startpos < strbeg) {
3626 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3627 "Regex match can't succeed, so not even tried\n"));
3628 return 0;
3629 }
3630
3631 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3632 * which will call destuctors to reset PL_regmatch_state, free higher
3633 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3634 * regmatch_info_aux_eval */
3635
3636 oldsave = PL_savestack_ix;
3637
3638 s = startpos;
3639
3640 if ((prog->extflags & RXf_USE_INTUIT)
3641 && !(flags & REXEC_CHECKED))
3642 {
3643 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3644 flags, NULL);
3645 if (!s)
3646 return 0;
3647
3648 if (prog->extflags & RXf_CHECK_ALL) {
3649 /* we can match based purely on the result of INTUIT.
3650 * Set up captures etc just for $& and $-[0]
3651 * (an intuit-only match wont have $1,$2,..) */
3652 assert(!prog->nparens);
3653
3654 /* s/// doesn't like it if $& is earlier than where we asked it to
3655 * start searching (which can happen on something like /.\G/) */
3656 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3657 && (s < stringarg))
3658 {
3659 /* this should only be possible under \G */
3660 assert(prog->intflags & PREGf_GPOS_SEEN);
3661 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3662 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3663 goto phooey;
3664 }
3665
3666 /* match via INTUIT shouldn't have any captures.
3667 * Let @-, @+, $^N know */
3668 prog->lastparen = prog->lastcloseparen = 0;
3669 RXp_MATCH_UTF8_set(prog, utf8_target);
3670 prog->offs[0].start = s - strbeg;
3671 prog->offs[0].end = utf8_target
3672 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3673 : s - strbeg + prog->minlenret;
3674 if ( !(flags & REXEC_NOT_FIRST) )
3675 S_reg_set_capture_string(aTHX_ rx,
3676 strbeg, strend,
3677 sv, flags, utf8_target);
3678
3679 return 1;
3680 }
3681 }
3682
3683 multiline = prog->extflags & RXf_PMf_MULTILINE;
3684
3685 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3687 "String too short [regexec_flags]...\n"));
3688 goto phooey;
3689 }
3690
3691 /* Check validity of program. */
3692 if (UCHARAT(progi->program) != REG_MAGIC) {
3693 Perl_croak(aTHX_ "corrupted regexp program");
3694 }
3695
3696 RXp_MATCH_TAINTED_off(prog);
3697 RXp_MATCH_UTF8_set(prog, utf8_target);
3698
3699 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3700 reginfo->intuit = 0;
3701 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3702 reginfo->warned = FALSE;
3703 reginfo->sv = sv;
3704 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3705 /* see how far we have to get to not match where we matched before */
3706 reginfo->till = stringarg + minend;
3707
3708 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3709 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3710 S_cleanup_regmatch_info_aux has executed (registered by
3711 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3712 magic belonging to this SV.
3713 Not newSVsv, either, as it does not COW.
3714 */
3715 reginfo->sv = newSV(0);
3716 SvSetSV_nosteal(reginfo->sv, sv);
3717 SAVEFREESV(reginfo->sv);
3718 }
3719
3720 /* reserve next 2 or 3 slots in PL_regmatch_state:
3721 * slot N+0: may currently be in use: skip it
3722 * slot N+1: use for regmatch_info_aux struct
3723 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3724 * slot N+3: ready for use by regmatch()
3725 */
3726
3727 {
3728 regmatch_state *old_regmatch_state;
3729 regmatch_slab *old_regmatch_slab;
3730 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3731
3732 /* on first ever match, allocate first slab */
3733 if (!PL_regmatch_slab) {
3734 Newx(PL_regmatch_slab, 1, regmatch_slab);
3735 PL_regmatch_slab->prev = NULL;
3736 PL_regmatch_slab->next = NULL;
3737 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3738 }
3739
3740 old_regmatch_state = PL_regmatch_state;
3741 old_regmatch_slab = PL_regmatch_slab;
3742
3743 for (i=0; i <= max; i++) {
3744 if (i == 1)
3745 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3746 else if (i ==2)
3747 reginfo->info_aux_eval =
3748 reginfo->info_aux->info_aux_eval =
3749 &(PL_regmatch_state->u.info_aux_eval);
3750
3751 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3752 PL_regmatch_state = S_push_slab(aTHX);
3753 }
3754
3755 /* note initial PL_regmatch_state position; at end of match we'll
3756 * pop back to there and free any higher slabs */
3757
3758 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3759 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3760 reginfo->info_aux->poscache = NULL;
3761
3762 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3763
3764 if ((prog->extflags & RXf_EVAL_SEEN))
3765 S_setup_eval_state(aTHX_ reginfo);
3766 else
3767 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3768 }
3769
3770 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3771 /* We have to be careful. If the previous successful match
3772 was from this regex we don't want a subsequent partially
3773 successful match to clobber the old results.
3774 So when we detect this possibility we add a swap buffer
3775 to the re, and switch the buffer each match. If we fail,
3776 we switch it back; otherwise we leave it swapped.
3777 */
3778 swap = prog->offs;
3779 /* avoid leak if we die, or clean up anyway if match completes */
3780 SAVEFREEPV(swap);
3781 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3782 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3783 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3784 0,
3785 PTR2UV(prog),
3786 PTR2UV(swap),
3787 PTR2UV(prog->offs)
3788 ));
3789 }
3790
3791 if (prog->recurse_locinput)
3792 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3793
3794 /* Simplest case: anchored match (but not \G) need be tried only once,
3795 * or with MBOL, only at the beginning of each line.
3796 *
3797 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3798 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3799 * match at the start of the string then it won't match anywhere else
3800 * either; while with /.*.../, if it doesn't match at the beginning,
3801 * the earliest it could match is at the start of the next line */
3802
3803 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3804 char *end;
3805
3806 if (regtry(reginfo, &s))
3807 goto got_it;
3808
3809 if (!(prog->intflags & PREGf_ANCH_MBOL))
3810 goto phooey;
3811
3812 /* didn't match at start, try at other newline positions */
3813
3814 if (minlen)
3815 dontbother = minlen - 1;
3816 end = HOP3c(strend, -dontbother, strbeg) - 1;
3817
3818 /* skip to next newline */
3819
3820 while (s <= end) { /* note it could be possible to match at the end of the string */
3821 /* NB: newlines are the same in unicode as they are in latin */
3822 if (*s++ != '\n')
3823 continue;
3824 if (prog->check_substr || prog->check_utf8) {
3825 /* note that with PREGf_IMPLICIT, intuit can only fail
3826 * or return the start position, so it's of limited utility.
3827 * Nevertheless, I made the decision that the potential for
3828 * quick fail was still worth it - DAPM */
3829 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3830 if (!s)
3831 goto phooey;
3832 }
3833 if (regtry(reginfo, &s))
3834 goto got_it;
3835 }
3836 goto phooey;
3837 } /* end anchored search */
3838
3839 /* anchored \G match */
3840 if (prog->intflags & PREGf_ANCH_GPOS)
3841 {
3842 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3843 assert(prog->intflags & PREGf_GPOS_SEEN);
3844 /* For anchored \G, the only position it can match from is
3845 * (ganch-gofs); we already set startpos to this above; if intuit
3846 * moved us on from there, we can't possibly succeed */
3847 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3848 if (s == startpos && regtry(reginfo, &s))
3849 goto got_it;
3850 goto phooey;
3851 }
3852
3853 /* Messy cases: unanchored match. */
3854
3855 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3856 /* we have /x+whatever/ */
3857 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3858 char ch;
3859 #ifdef DEBUGGING
3860 int did_match = 0;
3861 #endif
3862 if (utf8_target) {
3863 if (! prog->anchored_utf8) {
3864 to_utf8_substr(prog);
3865 }
3866 ch = SvPVX_const(prog->anchored_utf8)[0];
3867 REXEC_FBC_UTF8_SCAN(
3868 if (*s == ch) {
3869 DEBUG_EXECUTE_r( did_match = 1 );
3870 if (regtry(reginfo, &s)) goto got_it;
3871 s += UTF8_SAFE_SKIP(s, strend);
3872 while (s < strend && *s == ch)
3873 s += UTF8SKIP(s);
3874 }
3875 );
3876
3877 }
3878 else {
3879 if (! prog->anchored_substr) {
3880 if (! to_byte_substr(prog)) {
3881 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3882 }
3883 }
3884 ch = SvPVX_const(prog->anchored_substr)[0];
3885 REXEC_FBC_NON_UTF8_SCAN(
3886 if (*s == ch) {
3887 DEBUG_EXECUTE_r( did_match = 1 );
3888 if (regtry(reginfo, &s)) goto got_it;
3889 s++;
3890 while (s < strend && *s == ch)
3891 s++;
3892 }
3893 );
3894 }
3895 DEBUG_EXECUTE_r(if (!did_match)
3896 Perl_re_printf( aTHX_
3897 "Did not find anchored character...\n")
3898 );
3899 }
3900 else if (prog->anchored_substr != NULL
3901 || prog->anchored_utf8 != NULL
3902 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3903 && prog->float_max_offset < strend - s)) {
3904 SV *must;
3905 SSize_t back_max;
3906 SSize_t back_min;
3907 char *last;
3908 char *last1; /* Last position checked before */
3909 #ifdef DEBUGGING
3910 int did_match = 0;
3911 #endif
3912 if (prog->anchored_substr || prog->anchored_utf8) {
3913 if (utf8_target) {
3914 if (! prog->anchored_utf8) {
3915 to_utf8_substr(prog);
3916 }
3917 must = prog->anchored_utf8;
3918 }
3919 else {
3920 if (! prog->anchored_substr) {
3921 if (! to_byte_substr(prog)) {
3922 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3923 }
3924 }
3925 must = prog->anchored_substr;
3926 }
3927 back_max = back_min = prog->anchored_offset;
3928 } else {
3929 if (utf8_target) {
3930 if (! prog->float_utf8) {
3931 to_utf8_substr(prog);
3932 }
3933 must = prog->float_utf8;
3934 }
3935 else {
3936 if (! prog->float_substr) {
3937 if (! to_byte_substr(prog)) {
3938 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3939 }
3940 }
3941 must = prog->float_substr;
3942 }
3943 back_max = prog->float_max_offset;
3944 back_min = prog->float_min_offset;
3945 }
3946
3947 if (back_min<0) {
3948 last = strend;
3949 } else {
3950 last = HOP3c(strend, /* Cannot start after this */
3951 -(SSize_t)(CHR_SVLEN(must)
3952 - (SvTAIL(must) != 0) + back_min), strbeg);
3953 }
3954 if (s > reginfo->strbeg)
3955 last1 = HOPc(s, -1);
3956 else
3957 last1 = s - 1; /* bogus */
3958
3959 /* XXXX check_substr already used to find "s", can optimize if
3960 check_substr==must. */
3961 dontbother = 0;
3962 strend = HOPc(strend, -dontbother);
3963 while ( (s <= last) &&
3964 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3965 (unsigned char*)strend, must,
3966 multiline ? FBMrf_MULTILINE : 0)) ) {
3967 DEBUG_EXECUTE_r( did_match = 1 );
3968 if (HOPc(s, -back_max) > last1) {
3969 last1 = HOPc(s, -back_min);
3970 s = HOPc(s, -back_max);
3971 }
3972 else {
3973 char * const t = (last1 >= reginfo->strbeg)
3974 ? HOPc(last1, 1) : last1 + 1;
3975
3976 last1 = HOPc(s, -back_min);
3977 s = t;
3978 }
3979 if (utf8_target) {
3980 while (s <= last1) {
3981 if (regtry(reginfo, &s))
3982 goto got_it;
3983 if (s >= last1) {
3984 s++; /* to break out of outer loop */
3985 break;
3986 }
3987 s += UTF8SKIP(s);
3988 }
3989 }
3990 else {
3991 while (s <= last1) {
3992 if (regtry(reginfo, &s))
3993 goto got_it;
3994 s++;
3995 }
3996 }
3997 }
3998 DEBUG_EXECUTE_r(if (!did_match) {
3999 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4000 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4001 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
4002 ((must == prog->anchored_substr || must == prog->anchored_utf8)
4003 ? "anchored" : "floating"),
4004 quoted, RE_SV_TAIL(must));
4005 });
4006 goto phooey;
4007 }
4008 else if ( (c = progi->regstclass) ) {
4009 if (minlen) {
4010 const OPCODE op = OP(progi->regstclass);
4011 /* don't bother with what can't match */
4012 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
4013 strend = HOPc(strend, -(minlen - 1));
4014 }
4015 DEBUG_EXECUTE_r({
4016 SV * const prop = sv_newmortal();
4017 regprop(prog, prop, c, reginfo, NULL);
4018 {
4019 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4020 s,strend-s,PL_dump_re_max_len);
4021 Perl_re_printf( aTHX_
4022 "Matching stclass %.*s against %s (%d bytes)\n",
4023 (int)SvCUR(prop), SvPVX_const(prop),
4024 quoted, (int)(strend - s));
4025 }
4026 });
4027 if (find_byclass(prog, c, s, strend, reginfo))
4028 goto got_it;
4029 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4030 }
4031 else {
4032 dontbother = 0;
4033 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4034 /* Trim the end. */
4035 char *last= NULL;
4036 SV* float_real;
4037 STRLEN len;
4038 const char *little;
4039
4040 if (utf8_target) {
4041 if (! prog->float_utf8) {
4042 to_utf8_substr(prog);
4043 }
4044 float_real = prog->float_utf8;
4045 }
4046 else {
4047 if (! prog->float_substr) {
4048 if (! to_byte_substr(prog)) {
4049 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4050 }
4051 }
4052 float_real = prog->float_substr;
4053 }
4054
4055 little = SvPV_const(float_real, len);
4056 if (SvTAIL(float_real)) {
4057 /* This means that float_real contains an artificial \n on
4058 * the end due to the presence of something like this:
4059 * /foo$/ where we can match both "foo" and "foo\n" at the
4060 * end of the string. So we have to compare the end of the
4061 * string first against the float_real without the \n and
4062 * then against the full float_real with the string. We
4063 * have to watch out for cases where the string might be
4064 * smaller than the float_real or the float_real without
4065 * the \n. */
4066 char *checkpos= strend - len;
4067 DEBUG_OPTIMISE_r(
4068 Perl_re_printf( aTHX_
4069 "%sChecking for float_real.%s\n",
4070 PL_colors[4], PL_colors[5]));
4071 if (checkpos + 1 < strbeg) {
4072 /* can't match, even if we remove the trailing \n
4073 * string is too short to match */
4074 DEBUG_EXECUTE_r(
4075 Perl_re_printf( aTHX_
4076 "%sString shorter than required trailing substring, cannot match.%s\n",
4077 PL_colors[4], PL_colors[5]));
4078 goto phooey;
4079 } else if (memEQ(checkpos + 1, little, len - 1)) {
4080 /* can match, the end of the string matches without the
4081 * "\n" */
4082 last = checkpos + 1;
4083 } else if (checkpos < strbeg) {
4084 /* cant match, string is too short when the "\n" is
4085 * included */
4086 DEBUG_EXECUTE_r(
4087 Perl_re_printf( aTHX_
4088 "%sString does not contain required trailing substring, cannot match.%s\n",
4089 PL_colors[4], PL_colors[5]));
4090 goto phooey;
4091 } else if (!multiline) {
4092 /* non multiline match, so compare with the "\n" at the
4093 * end of the string */
4094 if (memEQ(checkpos, little, len)) {
4095 last= checkpos;
4096 } else {
4097 DEBUG_EXECUTE_r(
4098 Perl_re_printf( aTHX_
4099 "%sString does not contain required trailing substring, cannot match.%s\n",
4100 PL_colors[4], PL_colors[5]));
4101 goto phooey;
4102 }
4103 } else {
4104 /* multiline match, so we have to search for a place
4105 * where the full string is located */
4106 goto find_last;
4107 }
4108 } else {
4109 find_last:
4110 if (len)
4111 last = rninstr(s, strend, little, little + len);
4112 else
4113 last = strend; /* matching "$" */
4114 }
4115 if (!last) {
4116 /* at one point this block contained a comment which was
4117 * probably incorrect, which said that this was a "should not
4118 * happen" case. Even if it was true when it was written I am
4119 * pretty sure it is not anymore, so I have removed the comment
4120 * and replaced it with this one. Yves */
4121 DEBUG_EXECUTE_r(
4122 Perl_re_printf( aTHX_
4123 "%sString does not contain required substring, cannot match.%s\n",
4124 PL_colors[4], PL_colors[5]
4125 ));
4126 goto phooey;
4127 }
4128 dontbother = strend - last + prog->float_min_offset;
4129 }
4130 if (minlen && (dontbother < minlen))
4131 dontbother = minlen - 1;
4132 strend -= dontbother; /* this one's always in bytes! */
4133 /* We don't know much -- general case. */
4134 if (utf8_target) {
4135 for (;;) {
4136 if (regtry(reginfo, &s))
4137 goto got_it;
4138 if (s >= strend)
4139 break;
4140 s += UTF8SKIP(s);
4141 };
4142 }
4143 else {
4144 do {
4145 if (regtry(reginfo, &s))
4146 goto got_it;
4147 } while (s++ < strend);
4148 }
4149 }
4150
4151 /* Failure. */
4152 goto phooey;
4153
4154 got_it:
4155 /* s/// doesn't like it if $& is earlier than where we asked it to
4156 * start searching (which can happen on something like /.\G/) */
4157 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4158 && (prog->offs[0].start < stringarg - strbeg))
4159 {
4160 /* this should only be possible under \G */
4161 assert(prog->intflags & PREGf_GPOS_SEEN);
4162 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4163 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4164 goto phooey;
4165 }
4166
4167 /* clean up; this will trigger destructors that will free all slabs
4168 * above the current one, and cleanup the regmatch_info_aux
4169 * and regmatch_info_aux_eval sructs */
4170
4171 LEAVE_SCOPE(oldsave);
4172
4173 if (RXp_PAREN_NAMES(prog))
4174 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4175
4176 /* make sure $`, $&, $', and $digit will work later */
4177 if ( !(flags & REXEC_NOT_FIRST) )
4178 S_reg_set_capture_string(aTHX_ rx,
4179 strbeg, reginfo->strend,
4180 sv, flags, utf8_target);
4181
4182 return 1;
4183
4184 phooey:
4185 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4186 PL_colors[4], PL_colors[5]));
4187
4188 if (swap) {
4189 /* we failed :-( roll it back.
4190 * Since the swap buffer will be freed on scope exit which follows
4191 * shortly, restore the old captures by copying 'swap's original
4192 * data to the new offs buffer
4193 */
4194 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4195 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4196 0,
4197 PTR2UV(prog),
4198 PTR2UV(prog->offs),
4199 PTR2UV(swap)
4200 ));
4201
4202 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4203 }
4204
4205 /* clean up; this will trigger destructors that will free all slabs
4206 * above the current one, and cleanup the regmatch_info_aux
4207 * and regmatch_info_aux_eval sructs */
4208
4209 LEAVE_SCOPE(oldsave);
4210
4211 return 0;
4212 }
4213
4214
4215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4216 * Do inc before dec, in case old and new rex are the same */
4217 #define SET_reg_curpm(Re2) \
4218 if (reginfo->info_aux_eval) { \
4219 (void)ReREFCNT_inc(Re2); \
4220 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4221 PM_SETRE((PL_reg_curpm), (Re2)); \
4222 }
4223
4224
4225 /*
4226 - regtry - try match at specific point
4227 */
4228 STATIC bool /* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4230 {
4231 CHECKPOINT lastcp;
4232 REGEXP *const rx = reginfo->prog;
4233 regexp *const prog = ReANY(rx);
4234 SSize_t result;
4235 #ifdef DEBUGGING
4236 U32 depth = 0; /* used by REGCP_SET */
4237 #endif
4238 RXi_GET_DECL(prog,progi);
4239 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4240
4241 PERL_ARGS_ASSERT_REGTRY;
4242
4243 reginfo->cutpoint=NULL;
4244
4245 prog->offs[0].start = *startposp - reginfo->strbeg;
4246 prog->lastparen = 0;
4247 prog->lastcloseparen = 0;
4248
4249 /* XXXX What this code is doing here?!!! There should be no need
4250 to do this again and again, prog->lastparen should take care of
4251 this! --ilya*/
4252
4253 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4254 * Actually, the code in regcppop() (which Ilya may be meaning by
4255 * prog->lastparen), is not needed at all by the test suite
4256 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4257 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4258 * Meanwhile, this code *is* needed for the
4259 * above-mentioned test suite tests to succeed. The common theme
4260 * on those tests seems to be returning null fields from matches.
4261 * --jhi updated by dapm */
4262
4263 /* After encountering a variant of the issue mentioned above I think
4264 * the point Ilya was making is that if we properly unwind whenever
4265 * we set lastparen to a smaller value then we should not need to do
4266 * this every time, only when needed. So if we have tests that fail if
4267 * we remove this, then it suggests somewhere else we are improperly
4268 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4269 * places it is called, and related regcp() routines. - Yves */
4270 #if 1
4271 if (prog->nparens) {
4272 regexp_paren_pair *pp = prog->offs;
4273 I32 i;
4274 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4275 ++pp;
4276 pp->start = -1;
4277 pp->end = -1;
4278 }
4279 }
4280 #endif
4281 REGCP_SET(lastcp);
4282 result = regmatch(reginfo, *startposp, progi->program + 1);
4283 if (result != -1) {
4284 prog->offs[0].end = result;
4285 return 1;
4286 }
4287 if (reginfo->cutpoint)
4288 *startposp= reginfo->cutpoint;
4289 REGCP_UNWIND(lastcp);
4290 return 0;
4291 }
4292
4293 /* this is used to determine how far from the left messages like
4294 'failed...' are printed in regexec.c. It should be set such that
4295 messages are inline with the regop output that created them.
4296 */
4297 #define REPORT_CODE_OFF 29
4298 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4299 #ifdef DEBUGGING
4300 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4301 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4302 {
4303 va_list ap;
4304 int result;
4305 PerlIO *f= Perl_debug_log;
4306 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4307 va_start(ap, depth);
4308 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4309 result = PerlIO_vprintf(f, fmt, ap);
4310 va_end(ap);
4311 return result;
4312 }
4313 #endif /* DEBUGGING */
4314
4315 /* grab a new slab and return the first slot in it */
4316
4317 STATIC regmatch_state *
S_push_slab(pTHX)4318 S_push_slab(pTHX)
4319 {
4320 regmatch_slab *s = PL_regmatch_slab->next;
4321 if (!s) {
4322 Newx(s, 1, regmatch_slab);
4323 s->prev = PL_regmatch_slab;
4324 s->next = NULL;
4325 PL_regmatch_slab->next = s;
4326 }
4327 PL_regmatch_slab = s;
4328 return SLAB_FIRST(s);
4329 }
4330
4331 #ifdef DEBUGGING
4332
4333 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4334 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4335 const char *start, const char *end, const char *blurb)
4336 {
4337 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4338
4339 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4340
4341 if (!PL_colorset)
4342 reginitcolors();
4343 {
4344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4345 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4346
4347 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4348 start, end - start, PL_dump_re_max_len);
4349
4350 Perl_re_printf( aTHX_
4351 "%s%s REx%s %s against %s\n",
4352 PL_colors[4], blurb, PL_colors[5], s0, s1);
4353
4354 if (utf8_target||utf8_pat)
4355 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4356 utf8_pat ? "pattern" : "",
4357 utf8_pat && utf8_target ? " and " : "",
4358 utf8_target ? "string" : ""
4359 );
4360 }
4361 }
4362
4363 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4364 S_dump_exec_pos(pTHX_ const char *locinput,
4365 const regnode *scan,
4366 const char *loc_regeol,
4367 const char *loc_bostr,
4368 const char *loc_reg_starttry,
4369 const bool utf8_target,
4370 const U32 depth
4371 )
4372 {
4373 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4374 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4375 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4376 /* The part of the string before starttry has one color
4377 (pref0_len chars), between starttry and current
4378 position another one (pref_len - pref0_len chars),
4379 after the current position the third one.
4380 We assume that pref0_len <= pref_len, otherwise we
4381 decrease pref0_len. */
4382 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4383 ? (5 + taill) - l : locinput - loc_bostr;
4384 int pref0_len;
4385
4386 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4387
4388 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4389 pref_len++;
4390 pref0_len = pref_len - (locinput - loc_reg_starttry);
4391 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4392 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4393 ? (5 + taill) - pref_len : loc_regeol - locinput);
4394 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4395 l--;
4396 if (pref0_len < 0)
4397 pref0_len = 0;
4398 if (pref0_len > pref_len)
4399 pref0_len = pref_len;
4400 {
4401 const int is_uni = utf8_target ? 1 : 0;
4402
4403 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4404 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4405
4406 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4407 (locinput - pref_len + pref0_len),
4408 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4409
4410 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4411 locinput, loc_regeol - locinput, 10, 0, 1);
4412
4413 const STRLEN tlen=len0+len1+len2;
4414 Perl_re_printf( aTHX_
4415 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4416 (IV)(locinput - loc_bostr),
4417 len0, s0,
4418 len1, s1,
4419 (docolor ? "" : "> <"),
4420 len2, s2,
4421 (int)(tlen > 19 ? 0 : 19 - tlen),
4422 "",
4423 depth);
4424 }
4425 }
4426
4427 #endif
4428
4429 /* reg_check_named_buff_matched()
4430 * Checks to see if a named buffer has matched. The data array of
4431 * buffer numbers corresponding to the buffer is expected to reside
4432 * in the regexp->data->data array in the slot stored in the ARG() of
4433 * node involved. Note that this routine doesn't actually care about the
4434 * name, that information is not preserved from compilation to execution.
4435 * Returns the index of the leftmost defined buffer with the given name
4436 * or 0 if non of the buffers matched.
4437 */
4438 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4439 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4440 {
4441 I32 n;
4442 RXi_GET_DECL(rex,rexi);
4443 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4444 I32 *nums=(I32*)SvPVX(sv_dat);
4445
4446 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4447
4448 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4449 if ((I32)rex->lastparen >= nums[n] &&
4450 rex->offs[nums[n]].end != -1)
4451 {
4452 return nums[n];
4453 }
4454 }
4455 return 0;
4456 }
4457
4458 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4459 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4460 struct next_matchable_info * m,
4461 regmatch_info *reginfo)
4462 {
4463 /* This function determines various characteristics about every possible
4464 * initial match of the passed-in EXACTish <text_node>, and stores them in
4465 * <*m>.
4466 *
4467 * That includes a match string and a parallel mask, such that if you AND
4468 * the target string with the mask and compare with the match string,
4469 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4470 * the target matches or not.
4471 *
4472 * The motivation behind this function is to allow the caller to set up
4473 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4474 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4475 * beginning oF B, which is the passed in <text_node> That's where this
4476 * function comes in. The values it returns can quickly be used to rule
4477 * out many, or all, cases of possible matches not actually being the
4478 * beginning of B, <text_node>. It is also used in regrepeat() where we
4479 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4480 * determine where the span of 'A's stop.
4481 *
4482 * If <text_node> is of type EXACT, there is only one possible character
4483 * that can match its first character, and so the situation is quite
4484 * simple. But things can get much more complicated if folding is
4485 * involved. It may be that the first character of an EXACTFish node
4486 * doesn't participate in any possible fold, e.g., punctuation, so it can
4487 * be matched only by itself. The vast majority of characters that are in
4488 * folds match just two things, their lower and upper-case equivalents.
4489 * But not all are like that; some have multiple possible matches, or match
4490 * sequences of more than one character. This function sorts all that out.
4491 *
4492 * It returns information about all possibilities of what the first
4493 * character(s) of <text_node> could look like. Again, if <text_node> is a
4494 * plain EXACT node, that's just the actual first bytes of the first
4495 * character; but otherwise it is the bytes, that when masked, match all
4496 * possible combinations of all the initial bytes of all the characters
4497 * that could match, folded. (Actually, this is a slight over promise. It
4498 * handles only up to the initial 5 bytes, which is enough for all Unicode
4499 * characters, but not for all non-Unicode ones.)
4500 *
4501 * Here's an example to clarify. Suppose the first character of
4502 * <text_node> is the letter 'C', and we are under /i matching. That means
4503 * 'c' also matches. The representations of these two characters differ in
4504 * just one bit, so the mask would be a zero in that position and ones in
4505 * the other 7. And the returned string would be the AND of these two
4506 * characters, and would be one byte long, since these characters are each
4507 * a single byte. ANDing the target <text_node> with this mask will yield
4508 * the returned string if and only if <text_node> begins with one of these
4509 * two characters. So, the function would also return that the definitive
4510 * length matched is 1 byte.
4511 *
4512 * Now, suppose instead of the letter 'C', <text_node> begins with the
4513 * letter 'F'. The situation is much more complicated because there are
4514 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4515 * begins with 'f', and hence could match. We add these into the returned
4516 * string and mask, but the result isn't definitive; the caller has to
4517 * check further if its AND and compare pass. But the failure of that
4518 * compare will quickly rule out most possible inputs.
4519 *
4520 * Much of this could be done in regcomp.c at compile time, except for
4521 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4522 * could be used for one or the other eventualities.
4523 *
4524 * If this function determines that no possible character in the target
4525 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4526 * situation occurs if the first character in <text_node> requires UTF-8 to
4527 * represent, and the target string isn't in UTF-8.)
4528 *
4529 * Some analysis is in GH #18414, located at the time of this writing at:
4530 * https://github.com/Perl/perl5/issues/18414
4531 */
4532
4533 const bool utf8_target = reginfo->is_utf8_target;
4534 bool utf8_pat = reginfo->is_utf8_pat;
4535
4536 PERL_UINT_FAST8_T i;
4537
4538 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4539 */
4540 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4541 U8 lengths[MAX_MATCHES] = { 0 };
4542
4543 U8 index_of_longest = 0;
4544
4545 U8 *pat = (U8*)STRING(text_node);
4546 Size_t pat_len = STR_LEN(text_node);
4547 U8 op = OP(text_node);
4548
4549 U8 byte_mask[5] = {0};
4550 U8 byte_anded[5] = {0};
4551
4552 /* There are some folds in Unicode to multiple characters. This will hold
4553 * such characters that could fold to the beginning of 'text_node' */
4554 UV multi_fold_from = 0;
4555
4556 /* We may have to create a modified copy of the pattern */
4557 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4558
4559 m->max_length = 0;
4560 m->min_length = 255;
4561 m->count = 0;
4562
4563 /* Even if the first character in the node can match something in Latin1,
4564 * if there is anything in the node that can't, the match must fail */
4565 if (! utf8_target && isEXACT_REQ8(op)) {
4566 return FALSE;
4567 }
4568
4569 /* Define a temporary op for use in this function, using an existing one that
4570 * should never be a real op during execution */
4571 #define TURKISH PSEUDO
4572
4573 /* What to do about these two nodes had to be deferred to runtime (which is
4574 * now). If the extra information we now have so indicates, turn them into
4575 * EXACTFU nodes */
4576 if ( (op == EXACTF && utf8_target)
4577 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4578 {
4579 if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4580 op = TURKISH;
4581 }
4582 else {
4583 op = EXACTFU;
4584 }
4585
4586 /* And certain situations are better handled if we create a modified
4587 * version of the pattern */
4588 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4589 specific problematic characters */
4590 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4591
4592 /* The node could start with characters that are the first ones
4593 * of a multi-character fold. */
4594 multi_fold_from
4595 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4596 if (multi_fold_from) {
4597
4598 /* Here, they do form a sequence that matches the fold of a
4599 * single character. That single character then is a
4600 * possible match. Below we will look again at this, but
4601 * the code below is expecting every character in the
4602 * pattern to be folded, which the input isn't required to
4603 * be in this case. So, just fold the single character,
4604 * and the result will be in the expected form. */
4605 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4606 FOLD_FLAGS_FULL);
4607 pat = mod_pat;
4608 }
4609 /* Turkish has a couple extra possibilities. */
4610 else if ( UNLIKELY(op == TURKISH)
4611 && pat_len >= 3
4612 && isALPHA_FOLD_EQ(pat[0], 'f')
4613 && ( memBEGINs(pat + 1, pat_len - 1,
4614 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4615 || ( pat_len >= 4
4616 && isALPHA_FOLD_EQ(pat[1], 'f')
4617 && memBEGINs(pat + 2, pat_len - 2,
4618 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4619 ))) {
4620 /* The macros for finding a multi-char fold don't include
4621 * the Turkish possibilities, in which U+130 folds to 'i'.
4622 * Hard-code these. It's very unlikely that Unicode will
4623 * ever add any others. */
4624 if (pat[1] == 'f') {
4625 pat_len = 3;
4626 Copy("ffi", mod_pat, pat_len, U8);
4627 }
4628 else {
4629 pat_len = 2;
4630 Copy("fi", mod_pat, pat_len, U8);
4631 }
4632 pat = mod_pat;
4633 }
4634 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4635 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4636 && LIKELY(memNEs(pat, pat_len,
4637 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4638 && (LIKELY(op != TURKISH || *pat != 'I')))
4639 {
4640 /* For all cases of things between 0-255, except the ones
4641 * in the conditional above, the fold is just the lower
4642 * case, which is faster than the more general case. */
4643 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4644 pat[1]));
4645 pat_len = 1;
4646 pat = mod_pat;
4647 utf8_pat = FALSE;
4648 }
4649 else { /* Code point above 255, or needs special handling */
4650 _to_utf8_fold_flags(pat, pat + pat_len,
4651 mod_pat, &pat_len,
4652 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4653 pat = mod_pat;
4654 }
4655 }
4656 }
4657 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4658 set of problematic characters */
4659 ((multi_fold_from
4660 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4661 {
4662 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4663 * case */
4664 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4665 FOLD_FLAGS_FULL);
4666 pat = mod_pat;
4667 }
4668 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4669 mod_pat[0] = mod_pat[1] = 's';
4670 pat_len = 2;
4671 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4672 chars, and speeds copying */
4673 pat = mod_pat;
4674 }
4675 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4676 mod_pat[0] = toLOWER_L1(*pat);
4677 pat_len = 1;
4678 pat = mod_pat;
4679 }
4680 }
4681 else if /* Below isn't a node that we convert to UTF-8 */
4682 ( utf8_target
4683 && ! utf8_pat
4684 && op == EXACTFAA_NO_TRIE
4685 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4686 {
4687 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4688 * did this at compile time when the pattern was UTF-8 , but otherwise
4689 * we couldn't do it earlier, because it requires a UTF-8 target for
4690 * this match to be legal. */
4691 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4692 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4693 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4694 pat = mod_pat;
4695 utf8_pat = TRUE;
4696 }
4697
4698 /* Here, we have taken care of the initial work for a few very problematic
4699 * situations, possibly creating a modified pattern.
4700 *
4701 * Now ready for the general case. We build up all the possible things
4702 * that could match the first character of the pattern into the elements of
4703 * 'matches[]'
4704 *
4705 * Everything generally matches at least itself. But if there is a
4706 * UTF8ness mismatch, we have to convert to that of the target string. */
4707 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4708 matches[0][0] = pat[0];
4709 lengths[0] = 1;
4710 m->count++;
4711 }
4712 else if (utf8_target) {
4713 if (utf8_pat) {
4714 lengths[0] = UTF8SKIP(pat);
4715 Copy(pat, matches[0], lengths[0], U8);
4716 m->count++;
4717 }
4718 else { /* target is UTF-8, pattern isn't */
4719 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4720 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4721 lengths[0] = 2;
4722 m->count++;
4723 }
4724 }
4725 else if (! utf8_pat) { /* Neither is UTF-8 */
4726 matches[0][0] = pat[0];
4727 lengths[0] = 1;
4728 m->count++;
4729 }
4730 else /* target isn't UTF-8; pattern is. No match possible unless the
4731 pattern's first character can fit in a byte */
4732 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4733 {
4734 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4735 lengths[0] = 1;
4736 m->count++;
4737 }
4738
4739 /* Here we have taken care of any necessary node-type changes */
4740
4741 if (m->count) {
4742 m->max_length = lengths[0];
4743 m->min_length = lengths[0];
4744 }
4745
4746 /* For non-folding nodes, there are no other possible candidate matches,
4747 * but for foldable ones, we have to look further. */
4748 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4749 UV folded; /* The first character in the pattern, folded */
4750 U32 first_fold_from; /* A character that folds to it */
4751 const U32 * remaining_fold_froms; /* The remaining characters that
4752 fold to it, if any */
4753 Size_t folds_to_count; /* The total number of characters that fold to
4754 'folded' */
4755
4756 /* If the node begins with a sequence of more than one character that
4757 * together form the fold of a single character, it is called a
4758 * 'multi-character fold', and the normal functions don't handle this
4759 * case. We set 'multi_fold_from' to the single folded-from character,
4760 * which is handled in an extra iteration below */
4761 if (utf8_pat) {
4762 folded = valid_utf8_to_uvchr(pat, NULL);
4763 multi_fold_from
4764 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4765 }
4766 else {
4767 folded = *pat;
4768
4769 /* This may generate illegal combinations for things like EXACTF,
4770 * but rather than repeat the logic and exclude them here, all such
4771 * illegalities are checked for and skipped below in the loop */
4772 multi_fold_from
4773 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4774 }
4775
4776 /* Everything matches at least itself; initialize to that because the
4777 * only the branches below that set it are the ones where the number
4778 * isn't 1. */
4779 folds_to_count = 1;
4780
4781 /* There are a few special cases for locale-dependent nodes, where the
4782 * run-time context was needed before we could know what matched */
4783 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4784 first_fold_from = PL_fold_locale[folded];
4785 }
4786 else if ( op == EXACTFL && utf8_target && utf8_pat
4787 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4788 LATIN_SMALL_LETTER_LONG_S_UTF8))
4789 {
4790 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4791 }
4792 else if (UNLIKELY( op == TURKISH
4793 && ( isALPHA_FOLD_EQ(folded, 'i')
4794 || inRANGE(folded,
4795 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4796 LATIN_SMALL_LETTER_DOTLESS_I))))
4797 { /* Turkish folding requires special handling */
4798 if (folded == 'i')
4799 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4800 else if (folded == 'I')
4801 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4802 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4803 first_fold_from = 'i';
4804 else first_fold_from = 'I';
4805 }
4806 else {
4807 /* Here, isn't a special case: use the generic function to
4808 * calculate what folds to this */
4809 redo_multi:
4810 /* Look up what code points (besides itself) fold to 'folded';
4811 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4812 folds_to_count = _inverse_folds(folded, &first_fold_from,
4813 &remaining_fold_froms);
4814 }
4815
4816 /* Add each character that folds to 'folded' to the list of them,
4817 * subject to limitations based on the node type and target UTF8ness.
4818 * If there was a character that folded to multiple characters, do an
4819 * extra iteration for it. (Note the extra iteration if there is a
4820 * multi-character fold) */
4821 for (i = 0; i < folds_to_count
4822 + UNLIKELY(multi_fold_from != 0); i++)
4823 {
4824 UV fold_from = 0;
4825
4826 if (i >= folds_to_count) { /* Final iteration: handle the
4827 multi-char */
4828 fold_from = multi_fold_from;
4829 }
4830 else if (i == 0) {
4831 fold_from = first_fold_from;
4832 }
4833 else if (i < folds_to_count) {
4834 fold_from = remaining_fold_froms[i-1];
4835 }
4836
4837 if (folded == fold_from) { /* We already added the character
4838 itself */
4839 continue;
4840 }
4841
4842 /* EXACTF doesn't have any non-ascii folds */
4843 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4844 continue;
4845 }
4846
4847 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4848 * */
4849 if ( isASCII(folded) != isASCII(fold_from)
4850 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4851
4852 {
4853 continue;
4854 }
4855
4856 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4857 * locale, but those have been converted to EXACTFU above) */
4858 if ( op == EXACTFL
4859 && (folded < 256) != (fold_from < 256))
4860 {
4861 continue;
4862 }
4863
4864 /* If this triggers, it likely is because of the unlikely case
4865 * where a new Unicode standard has changed what MAX_MATCHES should
4866 * be set to */
4867 assert(m->count < MAX_MATCHES);
4868
4869 /* Add this character to the list of possible matches */
4870 if (utf8_target) {
4871 uvchr_to_utf8(matches[m->count], fold_from);
4872 lengths[m->count] = UVCHR_SKIP(fold_from);
4873 m->count++;
4874 }
4875 else { /* Non-UTF8 target: no code point above 255 can appear in it
4876 */
4877 if (fold_from > 255) {
4878 continue;
4879 }
4880
4881 matches[m->count][0] = fold_from;
4882 lengths[m->count] = 1;
4883 m->count++;
4884 }
4885
4886 /* Update min and mlengths */
4887 if (m->min_length > lengths[m->count-1]) {
4888 m->min_length = lengths[m->count-1];
4889 }
4890
4891 if (m->max_length < lengths[m->count-1]) {
4892 index_of_longest = m->count - 1;
4893 m->max_length = lengths[index_of_longest];
4894 }
4895 } /* looped through each potential fold */
4896
4897 /* If there is something that folded to an initial multi-character
4898 * fold, repeat, using it. This catches some edge cases. An example
4899 * of one is /ss/i when UTF-8 encoded. The function
4900 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4901 * (LATIN SMALL SHARP S). If it returned a list of characters, this
4902 * code wouldn't be needed. But since it doesn't, we have to look what
4903 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
4904 * */
4905 if (multi_fold_from) {
4906 folded = multi_fold_from;
4907 multi_fold_from = 0;
4908 goto redo_multi;
4909 }
4910 } /* End of finding things that participate in this fold */
4911
4912 if (m->count == 0) { /* If nothing found, can't match */
4913 m->min_length = 0;
4914 return FALSE;
4915 }
4916
4917 /* Have calculated all possible matches. Now calculate the mask and AND
4918 * values */
4919 m->initial_exact = 0;
4920 m->initial_definitive = 0;
4921
4922 {
4923 unsigned int mask_ones = 0;
4924 unsigned int possible_ones = 0;
4925 U8 j;
4926
4927 /* For each byte that is in all possible matches ... */
4928 for (j = 0; j < MIN(m->min_length, 5); j++) {
4929
4930 /* Initialize the accumulator for this byte */
4931 byte_mask[j] = 0xFF;
4932 byte_anded[j] = matches[0][j];
4933
4934 /* Then the rest of the rows (folds). The mask is based on, like,
4935 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4936 * where they differ. */
4937 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4938 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
4939 byte_anded[j] &= matches[i][j];
4940 }
4941
4942 /* Keep track of the number of initial mask bytes that are all one
4943 * bits. The code calling this can use this number to know that
4944 * a string that matches this number of bytes in the pattern is an
4945 * exact match of that pattern for this number of bytes. But also
4946 * counted are the number of initial bytes that in total have a
4947 * single zero bit. If a string matches those, masked, it must be
4948 * one of two possibilites, both of which this function has
4949 * determined are legal. (But if that single 0 is one of the
4950 * initial bits for masking a UTF-8 start byte, that could
4951 * incorrectly lead to different length strings appearing to be
4952 * equivalent, so only do this optimization when the matchables are
4953 * all the same length. This was uncovered by testing
4954 * /\x{029E}/i.) */
4955 if (m->min_length == m->max_length) {
4956 mask_ones += PL_bitcount[byte_mask[j]];
4957 possible_ones += 8;
4958 if (mask_ones + 1 >= possible_ones) {
4959 m->initial_definitive++;
4960 if (mask_ones >= possible_ones) {
4961 m->initial_exact++;
4962 }
4963 }
4964 }
4965 }
4966 }
4967
4968 /* The first byte is separate for speed */
4969 m->first_byte_mask = byte_mask[0];
4970 m->first_byte_anded = byte_anded[0];
4971
4972 /* Then pack up to the next 4 bytes into a word */
4973 m->mask32 = m->anded32 = 0;
4974 for (i = 1; i < MIN(m->min_length, 5); i++) {
4975 U8 which = i;
4976 U8 shift = (which - 1) * 8;
4977 m->mask32 |= (U32) byte_mask[i] << shift;
4978 m->anded32 |= (U32) byte_anded[i] << shift;
4979 }
4980
4981 /* Finally, take the match strings and place them sequentially into a
4982 * one-dimensional array. (This is done to save significant space in the
4983 * structure.) Sort so the longest (presumably the least likely) is last.
4984 * XXX When this gets moved to regcomp, may want to fully sort shortest
4985 * first, but above we generally used the folded code point first, and
4986 * those tend to be no longer than their upper case values, so this is
4987 * already pretty well sorted by size.
4988 *
4989 * If the asserts fail, it's most likely because a new version of the
4990 * Unicode standard requires more space; simply increase the declaration
4991 * size. */
4992 {
4993 U8 cur_pos = 0;
4994 U8 output_index = 0;
4995
4996 if (m->count > 1) { /* No need to sort a single entry */
4997 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4998
4999 /* Keep the same order for all but the longest. (If the
5000 * asserts fail, it could be because m->matches is declared too
5001 * short, either because of a new Unicode release, or an
5002 * overlooked test case, or it could be a bug.) */
5003 if (i != index_of_longest) {
5004 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5005 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5006 cur_pos += lengths[i];
5007 m->lengths[output_index++] = lengths[i];
5008 }
5009 }
5010 }
5011
5012 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5013 Copy(matches[index_of_longest], m->matches + cur_pos,
5014 lengths[index_of_longest], U8);
5015
5016 /* Place the longest match last */
5017 m->lengths[output_index] = lengths[index_of_longest];
5018 }
5019
5020
5021 return TRUE;
5022 }
5023
5024 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5025 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5026 S_test_EXACTISH_ST(const char * loc,
5027 struct next_matchable_info info)
5028 {
5029 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5030 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5031
5032 U32 input32 = 0;
5033
5034 /* Check the first byte */
5035 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5036 return FALSE;
5037
5038 /* Pack the next up-to-4 bytes into a 32 bit word */
5039 switch (info.min_length) {
5040 default:
5041 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5042 /* FALLTHROUGH */
5043 case 4:
5044 input32 |= (U8) loc[3] << 2 * 8;
5045 /* FALLTHROUGH */
5046 case 3:
5047 input32 |= (U8) loc[2] << 1 * 8;
5048 /* FALLTHROUGH */
5049 case 2:
5050 input32 |= (U8) loc[1];
5051 break;
5052 case 1:
5053 return TRUE; /* We already tested and passed the 0th byte */
5054 case 0:
5055 ASSUME(0);
5056 }
5057
5058 /* And AND that with the mask and compare that with the assembled ANDED
5059 * values */
5060 return (input32 & info.mask32) == info.anded32;
5061 }
5062
5063 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5064 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5065 {
5066 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5067 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5068
5069 PERL_ARGS_ASSERT_ISGCB;
5070
5071 switch (GCB_table[before][after]) {
5072 case GCB_BREAKABLE:
5073 return TRUE;
5074
5075 case GCB_NOBREAK:
5076 return FALSE;
5077
5078 case GCB_RI_then_RI:
5079 {
5080 int RI_count = 1;
5081 U8 * temp_pos = (U8 *) curpos;
5082
5083 /* Do not break within emoji flag sequences. That is, do not
5084 * break between regional indicator (RI) symbols if there is an
5085 * odd number of RI characters before the break point.
5086 * GB12 sot (RI RI)* RI × RI
5087 * GB13 [^RI] (RI RI)* RI × RI */
5088
5089 while (backup_one_GCB(strbeg,
5090 &temp_pos,
5091 utf8_target) == GCB_Regional_Indicator)
5092 {
5093 RI_count++;
5094 }
5095
5096 return RI_count % 2 != 1;
5097 }
5098
5099 case GCB_EX_then_EM:
5100
5101 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5102 {
5103 U8 * temp_pos = (U8 *) curpos;
5104 GCB_enum prev;
5105
5106 do {
5107 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5108 }
5109 while (prev == GCB_Extend);
5110
5111 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5112 }
5113
5114 case GCB_Maybe_Emoji_NonBreak:
5115
5116 {
5117
5118 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5119 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5120 */
5121 U8 * temp_pos = (U8 *) curpos;
5122 GCB_enum prev;
5123
5124 do {
5125 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5126 }
5127 while (prev == GCB_Extend);
5128
5129 return prev != GCB_ExtPict_XX;
5130 }
5131
5132 default:
5133 break;
5134 }
5135
5136 #ifdef DEBUGGING
5137 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5138 before, after, GCB_table[before][after]);
5139 assert(0);
5140 #endif
5141 return TRUE;
5142 }
5143
5144 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5145 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5146 {
5147 GCB_enum gcb;
5148
5149 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5150
5151 if (*curpos < strbeg) {
5152 return GCB_EDGE;
5153 }
5154
5155 if (utf8_target) {
5156 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5157 U8 * prev_prev_char_pos;
5158
5159 if (! prev_char_pos) {
5160 return GCB_EDGE;
5161 }
5162
5163 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5164 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5165 *curpos = prev_char_pos;
5166 prev_char_pos = prev_prev_char_pos;
5167 }
5168 else {
5169 *curpos = (U8 *) strbeg;
5170 return GCB_EDGE;
5171 }
5172 }
5173 else {
5174 if (*curpos - 2 < strbeg) {
5175 *curpos = (U8 *) strbeg;
5176 return GCB_EDGE;
5177 }
5178 (*curpos)--;
5179 gcb = getGCB_VAL_CP(*(*curpos - 1));
5180 }
5181
5182 return gcb;
5183 }
5184
5185 /* Combining marks attach to most classes that precede them, but this defines
5186 * the exceptions (from TR14) */
5187 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5188 || prev == LB_Mandatory_Break \
5189 || prev == LB_Carriage_Return \
5190 || prev == LB_Line_Feed \
5191 || prev == LB_Next_Line \
5192 || prev == LB_Space \
5193 || prev == LB_ZWSpace))
5194
5195 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5196 S_isLB(pTHX_ LB_enum before,
5197 LB_enum after,
5198 const U8 * const strbeg,
5199 const U8 * const curpos,
5200 const U8 * const strend,
5201 const bool utf8_target)
5202 {
5203 U8 * temp_pos = (U8 *) curpos;
5204 LB_enum prev = before;
5205
5206 /* Is the boundary between 'before' and 'after' line-breakable?
5207 * Most of this is just a table lookup of a generated table from Unicode
5208 * rules. But some rules require context to decide, and so have to be
5209 * implemented in code */
5210
5211 PERL_ARGS_ASSERT_ISLB;
5212
5213 /* Rule numbers in the comments below are as of Unicode 9.0 */
5214
5215 redo:
5216 before = prev;
5217 switch (LB_table[before][after]) {
5218 case LB_BREAKABLE:
5219 return TRUE;
5220
5221 case LB_NOBREAK:
5222 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5223 return FALSE;
5224
5225 case LB_SP_foo + LB_BREAKABLE:
5226 case LB_SP_foo + LB_NOBREAK:
5227 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5228
5229 /* When we have something following a SP, we have to look at the
5230 * context in order to know what to do.
5231 *
5232 * SP SP should not reach here because LB7: Do not break before
5233 * spaces. (For two spaces in a row there is nothing that
5234 * overrides that) */
5235 assert(after != LB_Space);
5236
5237 /* Here we have a space followed by a non-space. Mostly this is a
5238 * case of LB18: "Break after spaces". But there are complications
5239 * as the handling of spaces is somewhat tricky. They are in a
5240 * number of rules, which have to be applied in priority order, but
5241 * something earlier in the string can cause a rule to be skipped
5242 * and a lower priority rule invoked. A prime example is LB7 which
5243 * says don't break before a space. But rule LB8 (lower priority)
5244 * says that the first break opportunity after a ZW is after any
5245 * span of spaces immediately after it. If a ZW comes before a SP
5246 * in the input, rule LB8 applies, and not LB7. Other such rules
5247 * involve combining marks which are rules 9 and 10, but they may
5248 * override higher priority rules if they come earlier in the
5249 * string. Since we're doing random access into the middle of the
5250 * string, we have to look for rules that should get applied based
5251 * on both string position and priority. Combining marks do not
5252 * attach to either ZW nor SP, so we don't have to consider them
5253 * until later.
5254 *
5255 * To check for LB8, we have to find the first non-space character
5256 * before this span of spaces */
5257 do {
5258 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5259 }
5260 while (prev == LB_Space);
5261
5262 /* LB8 Break before any character following a zero-width space,
5263 * even if one or more spaces intervene.
5264 * ZW SP* ÷
5265 * So if we have a ZW just before this span, and to get here this
5266 * is the final space in the span. */
5267 if (prev == LB_ZWSpace) {
5268 return TRUE;
5269 }
5270
5271 /* Here, not ZW SP+. There are several rules that have higher
5272 * priority than LB18 and can be resolved now, as they don't depend
5273 * on anything earlier in the string (except ZW, which we have
5274 * already handled). One of these rules is LB11 Do not break
5275 * before Word joiner, but we have specially encoded that in the
5276 * lookup table so it is caught by the single test below which
5277 * catches the other ones. */
5278 if (LB_table[LB_Space][after] - LB_SP_foo
5279 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5280 {
5281 return FALSE;
5282 }
5283
5284 /* If we get here, we have to XXX consider combining marks. */
5285 if (prev == LB_Combining_Mark) {
5286
5287 /* What happens with these depends on the character they
5288 * follow. */
5289 do {
5290 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5291 }
5292 while (prev == LB_Combining_Mark);
5293
5294 /* Most times these attach to and inherit the characteristics
5295 * of that character, but not always, and when not, they are to
5296 * be treated as AL by rule LB10. */
5297 if (! LB_CM_ATTACHES_TO(prev)) {
5298 prev = LB_Alphabetic;
5299 }
5300 }
5301
5302 /* Here, we have the character preceding the span of spaces all set
5303 * up. We follow LB18: "Break after spaces" unless the table shows
5304 * that is overriden */
5305 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5306
5307 case LB_CM_ZWJ_foo:
5308
5309 /* We don't know how to treat the CM except by looking at the first
5310 * non-CM character preceding it. ZWJ is treated as CM */
5311 do {
5312 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5313 }
5314 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5315
5316 /* Here, 'prev' is that first earlier non-CM character. If the CM
5317 * attatches to it, then it inherits the behavior of 'prev'. If it
5318 * doesn't attach, it is to be treated as an AL */
5319 if (! LB_CM_ATTACHES_TO(prev)) {
5320 prev = LB_Alphabetic;
5321 }
5322
5323 goto redo;
5324
5325 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5326 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5327
5328 /* LB21a Don't break after Hebrew + Hyphen.
5329 * HL (HY | BA) × */
5330
5331 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5332 == LB_Hebrew_Letter)
5333 {
5334 return FALSE;
5335 }
5336
5337 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5338
5339 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5340 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5341
5342 /* LB25a (PR | PO) × ( OP | HY )? NU */
5343 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5344 return FALSE;
5345 }
5346
5347 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5348 == LB_BREAKABLE;
5349
5350 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5351 case LB_SY_or_IS_then_various + LB_NOBREAK:
5352 {
5353 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5354
5355 LB_enum temp = prev;
5356 do {
5357 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5358 }
5359 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5360 if (temp == LB_Numeric) {
5361 return FALSE;
5362 }
5363
5364 return LB_table[prev][after] - LB_SY_or_IS_then_various
5365 == LB_BREAKABLE;
5366 }
5367
5368 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5369 case LB_various_then_PO_or_PR + LB_NOBREAK:
5370 {
5371 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5372
5373 LB_enum temp = prev;
5374 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5375 {
5376 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5377 }
5378 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5379 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5380 }
5381 if (temp == LB_Numeric) {
5382 return FALSE;
5383 }
5384 return LB_various_then_PO_or_PR;
5385 }
5386
5387 case LB_RI_then_RI + LB_NOBREAK:
5388 case LB_RI_then_RI + LB_BREAKABLE:
5389 {
5390 int RI_count = 1;
5391
5392 /* LB30a Break between two regional indicator symbols if and
5393 * only if there are an even number of regional indicators
5394 * preceding the position of the break.
5395 *
5396 * sot (RI RI)* RI × RI
5397 * [^RI] (RI RI)* RI × RI */
5398
5399 while (backup_one_LB(strbeg,
5400 &temp_pos,
5401 utf8_target) == LB_Regional_Indicator)
5402 {
5403 RI_count++;
5404 }
5405
5406 return RI_count % 2 == 0;
5407 }
5408
5409 default:
5410 break;
5411 }
5412
5413 #ifdef DEBUGGING
5414 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5415 before, after, LB_table[before][after]);
5416 assert(0);
5417 #endif
5418 return TRUE;
5419 }
5420
5421 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5422 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5423 {
5424
5425 LB_enum lb;
5426
5427 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5428
5429 if (*curpos >= strend) {
5430 return LB_EDGE;
5431 }
5432
5433 if (utf8_target) {
5434 *curpos += UTF8SKIP(*curpos);
5435 if (*curpos >= strend) {
5436 return LB_EDGE;
5437 }
5438 lb = getLB_VAL_UTF8(*curpos, strend);
5439 }
5440 else {
5441 (*curpos)++;
5442 if (*curpos >= strend) {
5443 return LB_EDGE;
5444 }
5445 lb = getLB_VAL_CP(**curpos);
5446 }
5447
5448 return lb;
5449 }
5450
5451 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5452 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5453 {
5454 LB_enum lb;
5455
5456 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5457
5458 if (*curpos < strbeg) {
5459 return LB_EDGE;
5460 }
5461
5462 if (utf8_target) {
5463 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5464 U8 * prev_prev_char_pos;
5465
5466 if (! prev_char_pos) {
5467 return LB_EDGE;
5468 }
5469
5470 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5471 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5472 *curpos = prev_char_pos;
5473 prev_char_pos = prev_prev_char_pos;
5474 }
5475 else {
5476 *curpos = (U8 *) strbeg;
5477 return LB_EDGE;
5478 }
5479 }
5480 else {
5481 if (*curpos - 2 < strbeg) {
5482 *curpos = (U8 *) strbeg;
5483 return LB_EDGE;
5484 }
5485 (*curpos)--;
5486 lb = getLB_VAL_CP(*(*curpos - 1));
5487 }
5488
5489 return lb;
5490 }
5491
5492 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5493 S_isSB(pTHX_ SB_enum before,
5494 SB_enum after,
5495 const U8 * const strbeg,
5496 const U8 * const curpos,
5497 const U8 * const strend,
5498 const bool utf8_target)
5499 {
5500 /* returns a boolean indicating if there is a Sentence Boundary Break
5501 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5502
5503 U8 * lpos = (U8 *) curpos;
5504 bool has_para_sep = FALSE;
5505 bool has_sp = FALSE;
5506
5507 PERL_ARGS_ASSERT_ISSB;
5508
5509 /* Break at the start and end of text.
5510 SB1. sot ÷
5511 SB2. ÷ eot
5512 But unstated in Unicode is don't break if the text is empty */
5513 if (before == SB_EDGE || after == SB_EDGE) {
5514 return before != after;
5515 }
5516
5517 /* SB 3: Do not break within CRLF. */
5518 if (before == SB_CR && after == SB_LF) {
5519 return FALSE;
5520 }
5521
5522 /* Break after paragraph separators. CR and LF are considered
5523 * so because Unicode views text as like word processing text where there
5524 * are no newlines except between paragraphs, and the word processor takes
5525 * care of wrapping without there being hard line-breaks in the text *./
5526 SB4. Sep | CR | LF ÷ */
5527 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5528 return TRUE;
5529 }
5530
5531 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5532 * (See Section 6.2, Replacing Ignore Rules.)
5533 SB5. X (Extend | Format)* → X */
5534 if (after == SB_Extend || after == SB_Format) {
5535
5536 /* Implied is that the these characters attach to everything
5537 * immediately prior to them except for those separator-type
5538 * characters. And the rules earlier have already handled the case
5539 * when one of those immediately precedes the extend char */
5540 return FALSE;
5541 }
5542
5543 if (before == SB_Extend || before == SB_Format) {
5544 U8 * temp_pos = lpos;
5545 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5546 if ( backup != SB_EDGE
5547 && backup != SB_Sep
5548 && backup != SB_CR
5549 && backup != SB_LF)
5550 {
5551 before = backup;
5552 lpos = temp_pos;
5553 }
5554
5555 /* Here, both 'before' and 'backup' are these types; implied is that we
5556 * don't break between them */
5557 if (backup == SB_Extend || backup == SB_Format) {
5558 return FALSE;
5559 }
5560 }
5561
5562 /* Do not break after ambiguous terminators like period, if they are
5563 * immediately followed by a number or lowercase letter, if they are
5564 * between uppercase letters, if the first following letter (optionally
5565 * after certain punctuation) is lowercase, or if they are followed by
5566 * "continuation" punctuation such as comma, colon, or semicolon. For
5567 * example, a period may be an abbreviation or numeric period, and thus may
5568 * not mark the end of a sentence.
5569
5570 * SB6. ATerm × Numeric */
5571 if (before == SB_ATerm && after == SB_Numeric) {
5572 return FALSE;
5573 }
5574
5575 /* SB7. (Upper | Lower) ATerm × Upper */
5576 if (before == SB_ATerm && after == SB_Upper) {
5577 U8 * temp_pos = lpos;
5578 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5579 if (backup == SB_Upper || backup == SB_Lower) {
5580 return FALSE;
5581 }
5582 }
5583
5584 /* The remaining rules that aren't the final one, all require an STerm or
5585 * an ATerm after having backed up over some Close* Sp*, and in one case an
5586 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5587 * So do that backup now, setting flags if either Sp or a paragraph
5588 * separator are found */
5589
5590 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5591 has_para_sep = TRUE;
5592 before = backup_one_SB(strbeg, &lpos, utf8_target);
5593 }
5594
5595 if (before == SB_Sp) {
5596 has_sp = TRUE;
5597 do {
5598 before = backup_one_SB(strbeg, &lpos, utf8_target);
5599 }
5600 while (before == SB_Sp);
5601 }
5602
5603 while (before == SB_Close) {
5604 before = backup_one_SB(strbeg, &lpos, utf8_target);
5605 }
5606
5607 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5608 * most cases an STerm */
5609 if (before == SB_STerm || before == SB_ATerm) {
5610
5611 /* So, here the lhs matches
5612 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5613 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5614 * The rules that apply here are:
5615 *
5616 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5617 | LF | STerm | ATerm) )* Lower
5618 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5619 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5620 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5621 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5622 */
5623
5624 /* And all but SB11 forbid having seen a paragraph separator */
5625 if (! has_para_sep) {
5626 if (before == SB_ATerm) { /* SB8 */
5627 U8 * rpos = (U8 *) curpos;
5628 SB_enum later = after;
5629
5630 while ( later != SB_OLetter
5631 && later != SB_Upper
5632 && later != SB_Lower
5633 && later != SB_Sep
5634 && later != SB_CR
5635 && later != SB_LF
5636 && later != SB_STerm
5637 && later != SB_ATerm
5638 && later != SB_EDGE)
5639 {
5640 later = advance_one_SB(&rpos, strend, utf8_target);
5641 }
5642 if (later == SB_Lower) {
5643 return FALSE;
5644 }
5645 }
5646
5647 if ( after == SB_SContinue /* SB8a */
5648 || after == SB_STerm
5649 || after == SB_ATerm)
5650 {
5651 return FALSE;
5652 }
5653
5654 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5655 if ( after == SB_Close
5656 || after == SB_Sp
5657 || after == SB_Sep
5658 || after == SB_CR
5659 || after == SB_LF)
5660 {
5661 return FALSE;
5662 }
5663 }
5664
5665 /* SB10. This and SB9 could probably be combined some way, but khw
5666 * has decided to follow the Unicode rule book precisely for
5667 * simplified maintenance */
5668 if ( after == SB_Sp
5669 || after == SB_Sep
5670 || after == SB_CR
5671 || after == SB_LF)
5672 {
5673 return FALSE;
5674 }
5675 }
5676
5677 /* SB11. */
5678 return TRUE;
5679 }
5680
5681 /* Otherwise, do not break.
5682 SB12. Any × Any */
5683
5684 return FALSE;
5685 }
5686
5687 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5688 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5689 {
5690 SB_enum sb;
5691
5692 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5693
5694 if (*curpos >= strend) {
5695 return SB_EDGE;
5696 }
5697
5698 if (utf8_target) {
5699 do {
5700 *curpos += UTF8SKIP(*curpos);
5701 if (*curpos >= strend) {
5702 return SB_EDGE;
5703 }
5704 sb = getSB_VAL_UTF8(*curpos, strend);
5705 } while (sb == SB_Extend || sb == SB_Format);
5706 }
5707 else {
5708 do {
5709 (*curpos)++;
5710 if (*curpos >= strend) {
5711 return SB_EDGE;
5712 }
5713 sb = getSB_VAL_CP(**curpos);
5714 } while (sb == SB_Extend || sb == SB_Format);
5715 }
5716
5717 return sb;
5718 }
5719
5720 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5721 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5722 {
5723 SB_enum sb;
5724
5725 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5726
5727 if (*curpos < strbeg) {
5728 return SB_EDGE;
5729 }
5730
5731 if (utf8_target) {
5732 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5733 if (! prev_char_pos) {
5734 return SB_EDGE;
5735 }
5736
5737 /* Back up over Extend and Format. curpos is always just to the right
5738 * of the characater whose value we are getting */
5739 do {
5740 U8 * prev_prev_char_pos;
5741 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5742 strbeg)))
5743 {
5744 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5745 *curpos = prev_char_pos;
5746 prev_char_pos = prev_prev_char_pos;
5747 }
5748 else {
5749 *curpos = (U8 *) strbeg;
5750 return SB_EDGE;
5751 }
5752 } while (sb == SB_Extend || sb == SB_Format);
5753 }
5754 else {
5755 do {
5756 if (*curpos - 2 < strbeg) {
5757 *curpos = (U8 *) strbeg;
5758 return SB_EDGE;
5759 }
5760 (*curpos)--;
5761 sb = getSB_VAL_CP(*(*curpos - 1));
5762 } while (sb == SB_Extend || sb == SB_Format);
5763 }
5764
5765 return sb;
5766 }
5767
5768 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5769 S_isWB(pTHX_ WB_enum previous,
5770 WB_enum before,
5771 WB_enum after,
5772 const U8 * const strbeg,
5773 const U8 * const curpos,
5774 const U8 * const strend,
5775 const bool utf8_target)
5776 {
5777 /* Return a boolean as to if the boundary between 'before' and 'after' is
5778 * a Unicode word break, using their published algorithm, but tailored for
5779 * Perl by treating spans of white space as one unit. Context may be
5780 * needed to make this determination. If the value for the character
5781 * before 'before' is known, it is passed as 'previous'; otherwise that
5782 * should be set to WB_UNKNOWN. The other input parameters give the
5783 * boundaries and current position in the matching of the string. That
5784 * is, 'curpos' marks the position where the character whose wb value is
5785 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5786
5787 U8 * before_pos = (U8 *) curpos;
5788 U8 * after_pos = (U8 *) curpos;
5789 WB_enum prev = before;
5790 WB_enum next;
5791
5792 PERL_ARGS_ASSERT_ISWB;
5793
5794 /* Rule numbers in the comments below are as of Unicode 9.0 */
5795
5796 redo:
5797 before = prev;
5798 switch (WB_table[before][after]) {
5799 case WB_BREAKABLE:
5800 return TRUE;
5801
5802 case WB_NOBREAK:
5803 return FALSE;
5804
5805 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5806 next = advance_one_WB(&after_pos, strend, utf8_target,
5807 FALSE /* Don't skip Extend nor Format */ );
5808 /* A space immediately preceding an Extend or Format is attached
5809 * to by them, and hence gets separated from previous spaces.
5810 * Otherwise don't break between horizontal white space */
5811 return next == WB_Extend || next == WB_Format;
5812
5813 /* WB4 Ignore Format and Extend characters, except when they appear at
5814 * the beginning of a region of text. This code currently isn't
5815 * general purpose, but it works as the rules are currently and likely
5816 * to be laid out. The reason it works is that when 'they appear at
5817 * the beginning of a region of text', the rule is to break before
5818 * them, just like any other character. Therefore, the default rule
5819 * applies and we don't have to look in more depth. Should this ever
5820 * change, we would have to have 2 'case' statements, like in the rules
5821 * below, and backup a single character (not spacing over the extend
5822 * ones) and then see if that is one of the region-end characters and
5823 * go from there */
5824 case WB_Ex_or_FO_or_ZWJ_then_foo:
5825 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5826 goto redo;
5827
5828 case WB_DQ_then_HL + WB_BREAKABLE:
5829 case WB_DQ_then_HL + WB_NOBREAK:
5830
5831 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5832
5833 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5834 == WB_Hebrew_Letter)
5835 {
5836 return FALSE;
5837 }
5838
5839 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5840
5841 case WB_HL_then_DQ + WB_BREAKABLE:
5842 case WB_HL_then_DQ + WB_NOBREAK:
5843
5844 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5845
5846 if (advance_one_WB(&after_pos, strend, utf8_target,
5847 TRUE /* Do skip Extend and Format */ )
5848 == WB_Hebrew_Letter)
5849 {
5850 return FALSE;
5851 }
5852
5853 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5854
5855 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5856 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5857
5858 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5859 * | Single_Quote) (ALetter | Hebrew_Letter) */
5860
5861 next = advance_one_WB(&after_pos, strend, utf8_target,
5862 TRUE /* Do skip Extend and Format */ );
5863
5864 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5865 {
5866 return FALSE;
5867 }
5868
5869 return WB_table[before][after]
5870 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5871
5872 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5873 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5874
5875 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5876 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5877
5878 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5879 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5880 {
5881 return FALSE;
5882 }
5883
5884 return WB_table[before][after]
5885 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5886
5887 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5888 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5889
5890 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5891 * */
5892
5893 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5894 == WB_Numeric)
5895 {
5896 return FALSE;
5897 }
5898
5899 return WB_table[before][after]
5900 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5901
5902 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5903 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5904
5905 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5906
5907 if (advance_one_WB(&after_pos, strend, utf8_target,
5908 TRUE /* Do skip Extend and Format */ )
5909 == WB_Numeric)
5910 {
5911 return FALSE;
5912 }
5913
5914 return WB_table[before][after]
5915 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5916
5917 case WB_RI_then_RI + WB_NOBREAK:
5918 case WB_RI_then_RI + WB_BREAKABLE:
5919 {
5920 int RI_count = 1;
5921
5922 /* Do not break within emoji flag sequences. That is, do not
5923 * break between regional indicator (RI) symbols if there is an
5924 * odd number of RI characters before the potential break
5925 * point.
5926 *
5927 * WB15 sot (RI RI)* RI × RI
5928 * WB16 [^RI] (RI RI)* RI × RI */
5929
5930 while (backup_one_WB(&previous,
5931 strbeg,
5932 &before_pos,
5933 utf8_target) == WB_Regional_Indicator)
5934 {
5935 RI_count++;
5936 }
5937
5938 return RI_count % 2 != 1;
5939 }
5940
5941 default:
5942 break;
5943 }
5944
5945 #ifdef DEBUGGING
5946 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5947 before, after, WB_table[before][after]);
5948 assert(0);
5949 #endif
5950 return TRUE;
5951 }
5952
5953 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)5954 S_advance_one_WB(pTHX_ U8 ** curpos,
5955 const U8 * const strend,
5956 const bool utf8_target,
5957 const bool skip_Extend_Format)
5958 {
5959 WB_enum wb;
5960
5961 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5962
5963 if (*curpos >= strend) {
5964 return WB_EDGE;
5965 }
5966
5967 if (utf8_target) {
5968
5969 /* Advance over Extend and Format */
5970 do {
5971 *curpos += UTF8SKIP(*curpos);
5972 if (*curpos >= strend) {
5973 return WB_EDGE;
5974 }
5975 wb = getWB_VAL_UTF8(*curpos, strend);
5976 } while ( skip_Extend_Format
5977 && (wb == WB_Extend || wb == WB_Format));
5978 }
5979 else {
5980 do {
5981 (*curpos)++;
5982 if (*curpos >= strend) {
5983 return WB_EDGE;
5984 }
5985 wb = getWB_VAL_CP(**curpos);
5986 } while ( skip_Extend_Format
5987 && (wb == WB_Extend || wb == WB_Format));
5988 }
5989
5990 return wb;
5991 }
5992
5993 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5994 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5995 {
5996 WB_enum wb;
5997
5998 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5999
6000 /* If we know what the previous character's break value is, don't have
6001 * to look it up */
6002 if (*previous != WB_UNKNOWN) {
6003 wb = *previous;
6004
6005 /* But we need to move backwards by one */
6006 if (utf8_target) {
6007 *curpos = reghopmaybe3(*curpos, -1, strbeg);
6008 if (! *curpos) {
6009 *previous = WB_EDGE;
6010 *curpos = (U8 *) strbeg;
6011 }
6012 else {
6013 *previous = WB_UNKNOWN;
6014 }
6015 }
6016 else {
6017 (*curpos)--;
6018 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6019 }
6020
6021 /* And we always back up over these three types */
6022 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6023 return wb;
6024 }
6025 }
6026
6027 if (*curpos < strbeg) {
6028 return WB_EDGE;
6029 }
6030
6031 if (utf8_target) {
6032 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6033 if (! prev_char_pos) {
6034 return WB_EDGE;
6035 }
6036
6037 /* Back up over Extend and Format. curpos is always just to the right
6038 * of the characater whose value we are getting */
6039 do {
6040 U8 * prev_prev_char_pos;
6041 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6042 -1,
6043 strbeg)))
6044 {
6045 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6046 *curpos = prev_char_pos;
6047 prev_char_pos = prev_prev_char_pos;
6048 }
6049 else {
6050 *curpos = (U8 *) strbeg;
6051 return WB_EDGE;
6052 }
6053 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6054 }
6055 else {
6056 do {
6057 if (*curpos - 2 < strbeg) {
6058 *curpos = (U8 *) strbeg;
6059 return WB_EDGE;
6060 }
6061 (*curpos)--;
6062 wb = getWB_VAL_CP(*(*curpos - 1));
6063 } while (wb == WB_Extend || wb == WB_Format);
6064 }
6065
6066 return wb;
6067 }
6068
6069 /* Macros for regmatch(), using its internal variables */
6070 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6071 #define NEXTCHR_IS_EOS (nextbyte < 0)
6072
6073 #define SET_nextchr \
6074 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6075
6076 #define SET_locinput(p) \
6077 locinput = (p); \
6078 SET_nextchr
6079
6080 #define sayYES goto yes
6081 #define sayNO goto no
6082 #define sayNO_SILENT goto no_silent
6083
6084 /* we dont use STMT_START/END here because it leads to
6085 "unreachable code" warnings, which are bogus, but distracting. */
6086 #define CACHEsayNO \
6087 if (ST.cache_mask) \
6088 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6089 sayNO
6090
6091 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6092 ( \
6093 ( ( st ) ) && \
6094 ( ( st )->u.eval.close_paren ) && \
6095 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6096 )
6097
6098 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6099 ( \
6100 ( ( st ) ) && \
6101 ( ( st )->u.eval.close_paren ) && \
6102 ( ( expr ) ) && \
6103 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6104 )
6105
6106
6107 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6108 (st)->u.eval.close_paren = ( (expr) + 1 )
6109
6110 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6111 (st)->u.eval.close_paren = 0
6112
6113 /* push a new state then goto it */
6114
6115 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6116 pushinput = input; \
6117 pusheol = eol; \
6118 pushsr0 = sr0; \
6119 scan = node; \
6120 st->resume_state = state; \
6121 goto push_state;
6122
6123 /* push a new state with success backtracking, then goto it */
6124
6125 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6126 pushinput = input; \
6127 pusheol = eol; \
6128 pushsr0 = sr0; \
6129 scan = node; \
6130 st->resume_state = state; \
6131 goto push_yes_state;
6132
6133 #define DEBUG_STATE_pp(pp) \
6134 DEBUG_STATE_r({ \
6135 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6136 Perl_re_printf( aTHX_ \
6137 "%*s" pp " %s%s%s%s%s\n", \
6138 INDENT_CHARS(depth), "", \
6139 PL_reg_name[st->resume_state], \
6140 ((st==yes_state||st==mark_state) ? "[" : ""), \
6141 ((st==yes_state) ? "Y" : ""), \
6142 ((st==mark_state) ? "M" : ""), \
6143 ((st==yes_state||st==mark_state) ? "]" : "") \
6144 ); \
6145 });
6146
6147 /*
6148
6149 regmatch() - main matching routine
6150
6151 This is basically one big switch statement in a loop. We execute an op,
6152 set 'next' to point the next op, and continue. If we come to a point which
6153 we may need to backtrack to on failure such as (A|B|C), we push a
6154 backtrack state onto the backtrack stack. On failure, we pop the top
6155 state, and re-enter the loop at the state indicated. If there are no more
6156 states to pop, we return failure.
6157
6158 Sometimes we also need to backtrack on success; for example /A+/, where
6159 after successfully matching one A, we need to go back and try to
6160 match another one; similarly for lookahead assertions: if the assertion
6161 completes successfully, we backtrack to the state just before the assertion
6162 and then carry on. In these cases, the pushed state is marked as
6163 'backtrack on success too'. This marking is in fact done by a chain of
6164 pointers, each pointing to the previous 'yes' state. On success, we pop to
6165 the nearest yes state, discarding any intermediate failure-only states.
6166 Sometimes a yes state is pushed just to force some cleanup code to be
6167 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6168 it to free the inner regex.
6169
6170 Note that failure backtracking rewinds the cursor position, while
6171 success backtracking leaves it alone.
6172
6173 A pattern is complete when the END op is executed, while a subpattern
6174 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6175 ops trigger the "pop to last yes state if any, otherwise return true"
6176 behaviour.
6177
6178 A common convention in this function is to use A and B to refer to the two
6179 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6180 the subpattern to be matched possibly multiple times, while B is the entire
6181 rest of the pattern. Variable and state names reflect this convention.
6182
6183 The states in the main switch are the union of ops and failure/success of
6184 substates associated with that op. For example, IFMATCH is the op
6185 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6186 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6187 successfully matched A and IFMATCH_A_fail is a state saying that we have
6188 just failed to match A. Resume states always come in pairs. The backtrack
6189 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6190 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6191 on success or failure.
6192
6193 The struct that holds a backtracking state is actually a big union, with
6194 one variant for each major type of op. The variable st points to the
6195 top-most backtrack struct. To make the code clearer, within each
6196 block of code we #define ST to alias the relevant union.
6197
6198 Here's a concrete example of a (vastly oversimplified) IFMATCH
6199 implementation:
6200
6201 switch (state) {
6202 ....
6203
6204 #define ST st->u.ifmatch
6205
6206 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6207 ST.foo = ...; // some state we wish to save
6208 ...
6209 // push a yes backtrack state with a resume value of
6210 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6211 // first node of A:
6212 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6213 // NOTREACHED
6214
6215 case IFMATCH_A: // we have successfully executed A; now continue with B
6216 next = B;
6217 bar = ST.foo; // do something with the preserved value
6218 break;
6219
6220 case IFMATCH_A_fail: // A failed, so the assertion failed
6221 ...; // do some housekeeping, then ...
6222 sayNO; // propagate the failure
6223
6224 #undef ST
6225
6226 ...
6227 }
6228
6229 For any old-timers reading this who are familiar with the old recursive
6230 approach, the code above is equivalent to:
6231
6232 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6233 {
6234 int foo = ...
6235 ...
6236 if (regmatch(A)) {
6237 next = B;
6238 bar = foo;
6239 break;
6240 }
6241 ...; // do some housekeeping, then ...
6242 sayNO; // propagate the failure
6243 }
6244
6245 The topmost backtrack state, pointed to by st, is usually free. If you
6246 want to claim it, populate any ST.foo fields in it with values you wish to
6247 save, then do one of
6248
6249 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6250 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6251
6252 which sets that backtrack state's resume value to 'resume_state', pushes a
6253 new free entry to the top of the backtrack stack, then goes to 'node'.
6254 On backtracking, the free slot is popped, and the saved state becomes the
6255 new free state. An ST.foo field in this new top state can be temporarily
6256 accessed to retrieve values, but once the main loop is re-entered, it
6257 becomes available for reuse.
6258
6259 Note that the depth of the backtrack stack constantly increases during the
6260 left-to-right execution of the pattern, rather than going up and down with
6261 the pattern nesting. For example the stack is at its maximum at Z at the
6262 end of the pattern, rather than at X in the following:
6263
6264 /(((X)+)+)+....(Y)+....Z/
6265
6266 The only exceptions to this are lookahead/behind assertions and the cut,
6267 (?>A), which pop all the backtrack states associated with A before
6268 continuing.
6269
6270 Backtrack state structs are allocated in slabs of about 4K in size.
6271 PL_regmatch_state and st always point to the currently active state,
6272 and PL_regmatch_slab points to the slab currently containing
6273 PL_regmatch_state. The first time regmatch() is called, the first slab is
6274 allocated, and is never freed until interpreter destruction. When the slab
6275 is full, a new one is allocated and chained to the end. At exit from
6276 regmatch(), slabs allocated since entry are freed.
6277
6278 In order to work with variable length lookbehinds, an upper limit is placed on
6279 lookbehinds which is set to where the match position is at the end of where the
6280 lookbehind would get to. Nothing in the lookbehind should match above that,
6281 except we should be able to look beyond if for things like \b, which need the
6282 next character in the string to be able to determine if this is a boundary or
6283 not. We also can't match the end of string/line unless we are also at the end
6284 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6285 that match a width, we have to add a condition that they are within the legal
6286 bounds of our window into the string.
6287
6288 */
6289
6290 /* returns -1 on failure, $+[0] on success */
6291 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6292 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6293 {
6294 const bool utf8_target = reginfo->is_utf8_target;
6295 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6296 REGEXP *rex_sv = reginfo->prog;
6297 regexp *rex = ReANY(rex_sv);
6298 RXi_GET_DECL(rex,rexi);
6299 /* the current state. This is a cached copy of PL_regmatch_state */
6300 regmatch_state *st;
6301 /* cache heavy used fields of st in registers */
6302 regnode *scan;
6303 regnode *next;
6304 U32 n = 0; /* general value; init to avoid compiler warning */
6305 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6306 SSize_t endref = 0; /* offset of end of backref when ln is start */
6307 char *locinput = startpos;
6308 char *loceol = reginfo->strend;
6309 char *pushinput; /* where to continue after a PUSH */
6310 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6311 U8 *pushsr0; /* save starting pos of script run */
6312 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6313 at EOS */
6314
6315 bool result = 0; /* return value of S_regmatch */
6316 U32 depth = 0; /* depth of backtrack stack */
6317 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6318 const U32 max_nochange_depth =
6319 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6320 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6321 regmatch_state *yes_state = NULL; /* state to pop to on success of
6322 subpattern */
6323 /* mark_state piggy backs on the yes_state logic so that when we unwind
6324 the stack on success we can update the mark_state as we go */
6325 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6326 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6327 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6328 U32 state_num;
6329 bool no_final = 0; /* prevent failure from backtracking? */
6330 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6331 char *startpoint = locinput;
6332 SV *popmark = NULL; /* are we looking for a mark? */
6333 SV *sv_commit = NULL; /* last mark name seen in failure */
6334 SV *sv_yes_mark = NULL; /* last mark name we have seen
6335 during a successful match */
6336 U32 lastopen = 0; /* last open we saw */
6337 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6338 SV* const oreplsv = GvSVn(PL_replgv);
6339 /* these three flags are set by various ops to signal information to
6340 * the very next op. They have a useful lifetime of exactly one loop
6341 * iteration, and are not preserved or restored by state pushes/pops
6342 */
6343 bool sw = 0; /* the condition value in (?(cond)a|b) */
6344 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6345 int logical = 0; /* the following EVAL is:
6346 0: (?{...})
6347 1: (?(?{...})X|Y)
6348 2: (??{...})
6349 or the following IFMATCH/UNLESSM is:
6350 false: plain (?=foo)
6351 true: used as a condition: (?(?=foo))
6352 */
6353 PAD* last_pad = NULL;
6354 dMULTICALL;
6355 U8 gimme = G_SCALAR;
6356 CV *caller_cv = NULL; /* who called us */
6357 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6358 U32 maxopenparen = 0; /* max '(' index seen so far */
6359 int to_complement; /* Invert the result? */
6360 _char_class_number classnum;
6361 bool is_utf8_pat = reginfo->is_utf8_pat;
6362 bool match = FALSE;
6363 I32 orig_savestack_ix = PL_savestack_ix;
6364 U8 * script_run_begin = NULL;
6365
6366 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6367 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6368 # define SOLARIS_BAD_OPTIMIZER
6369 const U32 *pl_charclass_dup = PL_charclass;
6370 # define PL_charclass pl_charclass_dup
6371 #endif
6372
6373 #ifdef DEBUGGING
6374 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6375 #endif
6376
6377 /* protect against undef(*^R) */
6378 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6379
6380 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6381 multicall_oldcatch = 0;
6382 PERL_UNUSED_VAR(multicall_cop);
6383
6384 PERL_ARGS_ASSERT_REGMATCH;
6385
6386 st = PL_regmatch_state;
6387
6388 /* Note that nextbyte is a byte even in UTF */
6389 SET_nextchr;
6390 scan = prog;
6391
6392 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6393 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6394 Perl_re_printf( aTHX_ "regmatch start\n" );
6395 }));
6396
6397 while (scan != NULL) {
6398 next = scan + NEXT_OFF(scan);
6399 if (next == scan)
6400 next = NULL;
6401 state_num = OP(scan);
6402
6403 reenter_switch:
6404 DEBUG_EXECUTE_r(
6405 if (state_num <= REGNODE_MAX) {
6406 SV * const prop = sv_newmortal();
6407 regnode *rnext = regnext(scan);
6408
6409 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6410 regprop(rex, prop, scan, reginfo, NULL);
6411 Perl_re_printf( aTHX_
6412 "%*s%" IVdf ":%s(%" IVdf ")\n",
6413 INDENT_CHARS(depth), "",
6414 (IV)(scan - rexi->program),
6415 SvPVX_const(prop),
6416 (PL_regkind[OP(scan)] == END || !rnext) ?
6417 0 : (IV)(rnext - rexi->program));
6418 }
6419 );
6420
6421 to_complement = 0;
6422
6423 SET_nextchr;
6424 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6425
6426 switch (state_num) {
6427 case SBOL: /* /^../ and /\A../ */
6428 if (locinput == reginfo->strbeg)
6429 break;
6430 sayNO;
6431
6432 case MBOL: /* /^../m */
6433 if (locinput == reginfo->strbeg ||
6434 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6435 {
6436 break;
6437 }
6438 sayNO;
6439
6440 case GPOS: /* \G */
6441 if (locinput == reginfo->ganch)
6442 break;
6443 sayNO;
6444
6445 case KEEPS: /* \K */
6446 /* update the startpoint */
6447 st->u.keeper.val = rex->offs[0].start;
6448 rex->offs[0].start = locinput - reginfo->strbeg;
6449 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6450 script_run_begin);
6451 NOT_REACHED; /* NOTREACHED */
6452
6453 case KEEPS_next_fail:
6454 /* rollback the start point change */
6455 rex->offs[0].start = st->u.keeper.val;
6456 sayNO_SILENT;
6457 NOT_REACHED; /* NOTREACHED */
6458
6459 case MEOL: /* /..$/m */
6460 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6461 sayNO;
6462 break;
6463
6464 case SEOL: /* /..$/ */
6465 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6466 sayNO;
6467 if (reginfo->strend - locinput > 1)
6468 sayNO;
6469 break;
6470
6471 case EOS: /* \z */
6472 if (!NEXTCHR_IS_EOS)
6473 sayNO;
6474 break;
6475
6476 case SANY: /* /./s */
6477 if (NEXTCHR_IS_EOS || locinput >= loceol)
6478 sayNO;
6479 goto increment_locinput;
6480
6481 case REG_ANY: /* /./ */
6482 if ( NEXTCHR_IS_EOS
6483 || locinput >= loceol
6484 || nextbyte == '\n')
6485 {
6486 sayNO;
6487 }
6488 goto increment_locinput;
6489
6490
6491 #undef ST
6492 #define ST st->u.trie
6493 case TRIEC: /* (ab|cd) with known charclass */
6494 /* In this case the charclass data is available inline so
6495 we can fail fast without a lot of extra overhead.
6496 */
6497 if ( ! NEXTCHR_IS_EOS
6498 && locinput < loceol
6499 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6500 {
6501 DEBUG_EXECUTE_r(
6502 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6503 depth, PL_colors[4], PL_colors[5])
6504 );
6505 sayNO_SILENT;
6506 NOT_REACHED; /* NOTREACHED */
6507 }
6508 /* FALLTHROUGH */
6509 case TRIE: /* (ab|cd) */
6510 /* the basic plan of execution of the trie is:
6511 * At the beginning, run though all the states, and
6512 * find the longest-matching word. Also remember the position
6513 * of the shortest matching word. For example, this pattern:
6514 * 1 2 3 4 5
6515 * ab|a|x|abcd|abc
6516 * when matched against the string "abcde", will generate
6517 * accept states for all words except 3, with the longest
6518 * matching word being 4, and the shortest being 2 (with
6519 * the position being after char 1 of the string).
6520 *
6521 * Then for each matching word, in word order (i.e. 1,2,4,5),
6522 * we run the remainder of the pattern; on each try setting
6523 * the current position to the character following the word,
6524 * returning to try the next word on failure.
6525 *
6526 * We avoid having to build a list of words at runtime by
6527 * using a compile-time structure, wordinfo[].prev, which
6528 * gives, for each word, the previous accepting word (if any).
6529 * In the case above it would contain the mappings 1->2, 2->0,
6530 * 3->0, 4->5, 5->1. We can use this table to generate, from
6531 * the longest word (4 above), a list of all words, by
6532 * following the list of prev pointers; this gives us the
6533 * unordered list 4,5,1,2. Then given the current word we have
6534 * just tried, we can go through the list and find the
6535 * next-biggest word to try (so if we just failed on word 2,
6536 * the next in the list is 4).
6537 *
6538 * Since at runtime we don't record the matching position in
6539 * the string for each word, we have to work that out for
6540 * each word we're about to process. The wordinfo table holds
6541 * the character length of each word; given that we recorded
6542 * at the start: the position of the shortest word and its
6543 * length in chars, we just need to move the pointer the
6544 * difference between the two char lengths. Depending on
6545 * Unicode status and folding, that's cheap or expensive.
6546 *
6547 * This algorithm is optimised for the case where are only a
6548 * small number of accept states, i.e. 0,1, or maybe 2.
6549 * With lots of accepts states, and having to try all of them,
6550 * it becomes quadratic on number of accept states to find all
6551 * the next words.
6552 */
6553
6554 {
6555 /* what type of TRIE am I? (utf8 makes this contextual) */
6556 DECL_TRIE_TYPE(scan);
6557
6558 /* what trie are we using right now */
6559 reg_trie_data * const trie
6560 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6561 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6562 U32 state = trie->startstate;
6563
6564 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6565 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6566 if (utf8_target
6567 && ! NEXTCHR_IS_EOS
6568 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6569 && scan->flags == EXACTL)
6570 {
6571 /* We only output for EXACTL, as we let the folder
6572 * output this message for EXACTFLU8 to avoid
6573 * duplication */
6574 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6575 reginfo->strend);
6576 }
6577 }
6578 if ( trie->bitmap
6579 && ( NEXTCHR_IS_EOS
6580 || locinput >= loceol
6581 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6582 {
6583 if (trie->states[ state ].wordnum) {
6584 DEBUG_EXECUTE_r(
6585 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6586 depth, PL_colors[4], PL_colors[5])
6587 );
6588 if (!trie->jump)
6589 break;
6590 } else {
6591 DEBUG_EXECUTE_r(
6592 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6593 depth, PL_colors[4], PL_colors[5])
6594 );
6595 sayNO_SILENT;
6596 }
6597 }
6598
6599 {
6600 U8 *uc = ( U8* )locinput;
6601
6602 STRLEN len = 0;
6603 STRLEN foldlen = 0;
6604 U8 *uscan = (U8*)NULL;
6605 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6606 U32 charcount = 0; /* how many input chars we have matched */
6607 U32 accepted = 0; /* have we seen any accepting states? */
6608
6609 ST.jump = trie->jump;
6610 ST.me = scan;
6611 ST.firstpos = NULL;
6612 ST.longfold = FALSE; /* char longer if folded => it's harder */
6613 ST.nextword = 0;
6614
6615 /* fully traverse the TRIE; note the position of the
6616 shortest accept state and the wordnum of the longest
6617 accept state */
6618
6619 while ( state && uc <= (U8*)(loceol) ) {
6620 U32 base = trie->states[ state ].trans.base;
6621 UV uvc = 0;
6622 U16 charid = 0;
6623 U16 wordnum;
6624 wordnum = trie->states[ state ].wordnum;
6625
6626 if (wordnum) { /* it's an accept state */
6627 if (!accepted) {
6628 accepted = 1;
6629 /* record first match position */
6630 if (ST.longfold) {
6631 ST.firstpos = (U8*)locinput;
6632 ST.firstchars = 0;
6633 }
6634 else {
6635 ST.firstpos = uc;
6636 ST.firstchars = charcount;
6637 }
6638 }
6639 if (!ST.nextword || wordnum < ST.nextword)
6640 ST.nextword = wordnum;
6641 ST.topword = wordnum;
6642 }
6643
6644 DEBUG_TRIE_EXECUTE_r({
6645 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6646 /* HERE */
6647 PerlIO_printf( Perl_debug_log,
6648 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6649 INDENT_CHARS(depth), "", PL_colors[4],
6650 (UV)state, (accepted ? 'Y' : 'N'));
6651 });
6652
6653 /* read a char and goto next state */
6654 if ( base && (foldlen || uc < (U8*)(loceol))) {
6655 I32 offset;
6656 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6657 (U8 *) loceol, uscan,
6658 len, uvc, charid, foldlen,
6659 foldbuf, uniflags);
6660 charcount++;
6661 if (foldlen>0)
6662 ST.longfold = TRUE;
6663 if (charid &&
6664 ( ((offset =
6665 base + charid - 1 - trie->uniquecharcount)) >= 0)
6666
6667 && ((U32)offset < trie->lasttrans)
6668 && trie->trans[offset].check == state)
6669 {
6670 state = trie->trans[offset].next;
6671 }
6672 else {
6673 state = 0;
6674 }
6675 uc += len;
6676
6677 }
6678 else {
6679 state = 0;
6680 }
6681 DEBUG_TRIE_EXECUTE_r(
6682 Perl_re_printf( aTHX_
6683 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6684 charid, uvc, (UV)state, PL_colors[5] );
6685 );
6686 }
6687 if (!accepted)
6688 sayNO;
6689
6690 /* calculate total number of accept states */
6691 {
6692 U16 w = ST.topword;
6693 accepted = 0;
6694 while (w) {
6695 w = trie->wordinfo[w].prev;
6696 accepted++;
6697 }
6698 ST.accepted = accepted;
6699 }
6700
6701 DEBUG_EXECUTE_r(
6702 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6703 depth,
6704 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6705 );
6706 goto trie_first_try; /* jump into the fail handler */
6707 }}
6708 NOT_REACHED; /* NOTREACHED */
6709
6710 case TRIE_next_fail: /* we failed - try next alternative */
6711 {
6712 U8 *uc;
6713 if ( ST.jump ) {
6714 /* undo any captures done in the tail part of a branch,
6715 * e.g.
6716 * /(?:X(.)(.)|Y(.)).../
6717 * where the trie just matches X then calls out to do the
6718 * rest of the branch */
6719 REGCP_UNWIND(ST.cp);
6720 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6721 }
6722 if (!--ST.accepted) {
6723 DEBUG_EXECUTE_r({
6724 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6725 depth,
6726 PL_colors[4],
6727 PL_colors[5] );
6728 });
6729 sayNO_SILENT;
6730 }
6731 {
6732 /* Find next-highest word to process. Note that this code
6733 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6734 U16 min = 0;
6735 U16 word;
6736 U16 const nextword = ST.nextword;
6737 reg_trie_wordinfo * const wordinfo
6738 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6739 for (word=ST.topword; word; word=wordinfo[word].prev) {
6740 if (word > nextword && (!min || word < min))
6741 min = word;
6742 }
6743 ST.nextword = min;
6744 }
6745
6746 trie_first_try:
6747 if (do_cutgroup) {
6748 do_cutgroup = 0;
6749 no_final = 0;
6750 }
6751
6752 if ( ST.jump ) {
6753 ST.lastparen = rex->lastparen;
6754 ST.lastcloseparen = rex->lastcloseparen;
6755 REGCP_SET(ST.cp);
6756 }
6757
6758 /* find start char of end of current word */
6759 {
6760 U32 chars; /* how many chars to skip */
6761 reg_trie_data * const trie
6762 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6763
6764 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6765 >= ST.firstchars);
6766 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6767 - ST.firstchars;
6768 uc = ST.firstpos;
6769
6770 if (ST.longfold) {
6771 /* the hard option - fold each char in turn and find
6772 * its folded length (which may be different */
6773 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6774 STRLEN foldlen;
6775 STRLEN len;
6776 UV uvc;
6777 U8 *uscan;
6778
6779 while (chars) {
6780 if (utf8_target) {
6781 /* XXX This assumes the length is well-formed, as
6782 * does the UTF8SKIP below */
6783 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6784 uniflags);
6785 uc += len;
6786 }
6787 else {
6788 uvc = *uc;
6789 uc++;
6790 }
6791 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6792 uscan = foldbuf;
6793 while (foldlen) {
6794 if (!--chars)
6795 break;
6796 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6797 uniflags);
6798 uscan += len;
6799 foldlen -= len;
6800 }
6801 }
6802 }
6803 else {
6804 if (utf8_target)
6805 uc = utf8_hop(uc, chars);
6806 else
6807 uc += chars;
6808 }
6809 }
6810
6811 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6812 ? ST.jump[ST.nextword]
6813 : NEXT_OFF(ST.me));
6814
6815 DEBUG_EXECUTE_r({
6816 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6817 depth,
6818 PL_colors[4],
6819 ST.nextword,
6820 PL_colors[5]
6821 );
6822 });
6823
6824 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6825 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6826 script_run_begin);
6827 NOT_REACHED; /* NOTREACHED */
6828 }
6829 /* only one choice left - just continue */
6830 DEBUG_EXECUTE_r({
6831 AV *const trie_words
6832 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6833 SV ** const tmp = trie_words
6834 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6835 SV *sv= tmp ? sv_newmortal() : NULL;
6836
6837 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6838 depth, PL_colors[4],
6839 ST.nextword,
6840 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6841 PL_colors[0], PL_colors[1],
6842 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6843 )
6844 : "not compiled under -Dr",
6845 PL_colors[5] );
6846 });
6847
6848 locinput = (char*)uc;
6849 continue; /* execute rest of RE */
6850 /* NOTREACHED */
6851 }
6852 #undef ST
6853
6854 case LEXACT_REQ8:
6855 if (! utf8_target) {
6856 sayNO;
6857 }
6858 /* FALLTHROUGH */
6859
6860 case LEXACT:
6861 {
6862 char *s;
6863
6864 s = STRINGl(scan);
6865 ln = STR_LENl(scan);
6866 goto join_short_long_exact;
6867
6868 case EXACTL: /* /abc/l */
6869 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6870
6871 /* Complete checking would involve going through every character
6872 * matched by the string to see if any is above latin1. But the
6873 * comparision otherwise might very well be a fast assembly
6874 * language routine, and I (khw) don't think slowing things down
6875 * just to check for this warning is worth it. So this just checks
6876 * the first character */
6877 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6878 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6879 }
6880 goto do_exact;
6881 case EXACT_REQ8:
6882 if (! utf8_target) {
6883 sayNO;
6884 }
6885 /* FALLTHROUGH */
6886
6887 case EXACT: /* /abc/ */
6888 do_exact:
6889 s = STRINGs(scan);
6890 ln = STR_LENs(scan);
6891
6892 join_short_long_exact:
6893 if (utf8_target != is_utf8_pat) {
6894 /* The target and the pattern have differing utf8ness. */
6895 char *l = locinput;
6896 const char * const e = s + ln;
6897
6898 if (utf8_target) {
6899 /* The target is utf8, the pattern is not utf8.
6900 * Above-Latin1 code points can't match the pattern;
6901 * invariants match exactly, and the other Latin1 ones need
6902 * to be downgraded to a single byte in order to do the
6903 * comparison. (If we could be confident that the target
6904 * is not malformed, this could be refactored to have fewer
6905 * tests by just assuming that if the first bytes match, it
6906 * is an invariant, but there are tests in the test suite
6907 * dealing with (??{...}) which violate this) */
6908 while (s < e) {
6909 if ( l >= loceol
6910 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6911 {
6912 sayNO;
6913 }
6914 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6915 if (*l != *s) {
6916 sayNO;
6917 }
6918 l++;
6919 }
6920 else {
6921 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6922 {
6923 sayNO;
6924 }
6925 l += 2;
6926 }
6927 s++;
6928 }
6929 }
6930 else {
6931 /* The target is not utf8, the pattern is utf8. */
6932 while (s < e) {
6933 if ( l >= loceol
6934 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6935 {
6936 sayNO;
6937 }
6938 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6939 if (*s != *l) {
6940 sayNO;
6941 }
6942 s++;
6943 }
6944 else {
6945 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6946 {
6947 sayNO;
6948 }
6949 s += 2;
6950 }
6951 l++;
6952 }
6953 }
6954 locinput = l;
6955 }
6956 else {
6957 /* The target and the pattern have the same utf8ness. */
6958 /* Inline the first character, for speed. */
6959 if ( loceol - locinput < ln
6960 || UCHARAT(s) != nextbyte
6961 || (ln > 1 && memNE(s, locinput, ln)))
6962 {
6963 sayNO;
6964 }
6965 locinput += ln;
6966 }
6967 break;
6968 }
6969
6970 case EXACTFL: /* /abc/il */
6971 {
6972 re_fold_t folder;
6973 const U8 * fold_array;
6974 const char * s;
6975 U32 fold_utf8_flags;
6976
6977 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6978 folder = foldEQ_locale;
6979 fold_array = PL_fold_locale;
6980 fold_utf8_flags = FOLDEQ_LOCALE;
6981 goto do_exactf;
6982
6983 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
6984 is effectively /u; hence to match, target
6985 must be UTF-8. */
6986 if (! utf8_target) {
6987 sayNO;
6988 }
6989 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6990 | FOLDEQ_S2_FOLDS_SANE;
6991 folder = foldEQ_latin1_s2_folded;
6992 fold_array = PL_fold_latin1;
6993 goto do_exactf;
6994
6995 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
6996 if (! utf8_target) {
6997 sayNO;
6998 }
6999 assert(is_utf8_pat);
7000 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7001 goto do_exactf;
7002
7003 case EXACTFUP: /* /foo/iu, and something is problematic in
7004 'foo' so can't take shortcuts. */
7005 assert(! is_utf8_pat);
7006 folder = foldEQ_latin1;
7007 fold_array = PL_fold_latin1;
7008 fold_utf8_flags = 0;
7009 goto do_exactf;
7010
7011 case EXACTFU: /* /abc/iu */
7012 folder = foldEQ_latin1_s2_folded;
7013 fold_array = PL_fold_latin1;
7014 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7015 goto do_exactf;
7016
7017 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7018 patterns */
7019 assert(! is_utf8_pat);
7020 /* FALLTHROUGH */
7021 case EXACTFAA: /* /abc/iaa */
7022 folder = foldEQ_latin1_s2_folded;
7023 fold_array = PL_fold_latin1;
7024 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7025 if (is_utf8_pat || ! utf8_target) {
7026
7027 /* The possible presence of a MICRO SIGN in the pattern forbids
7028 * us to view a non-UTF-8 pattern as folded when there is a
7029 * UTF-8 target */
7030 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7031 |FOLDEQ_S2_FOLDS_SANE;
7032 }
7033 goto do_exactf;
7034
7035
7036 case EXACTF: /* /abc/i This node only generated for
7037 non-utf8 patterns */
7038 assert(! is_utf8_pat);
7039 folder = foldEQ;
7040 fold_array = PL_fold;
7041 fold_utf8_flags = 0;
7042
7043 do_exactf:
7044 s = STRINGs(scan);
7045 ln = STR_LENs(scan);
7046
7047 if ( utf8_target
7048 || is_utf8_pat
7049 || state_num == EXACTFUP
7050 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7051 {
7052 /* Either target or the pattern are utf8, or has the issue where
7053 * the fold lengths may differ. */
7054 const char * const l = locinput;
7055 char *e = loceol;
7056
7057 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7058 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7059 {
7060 sayNO;
7061 }
7062 locinput = e;
7063 break;
7064 }
7065
7066 /* Neither the target nor the pattern are utf8 */
7067 if (UCHARAT(s) != nextbyte
7068 && !NEXTCHR_IS_EOS
7069 && UCHARAT(s) != fold_array[nextbyte])
7070 {
7071 sayNO;
7072 }
7073 if (loceol - locinput < ln)
7074 sayNO;
7075 if (ln > 1 && ! folder(locinput, s, ln))
7076 sayNO;
7077 locinput += ln;
7078 break;
7079 }
7080
7081 case NBOUNDL: /* /\B/l */
7082 to_complement = 1;
7083 /* FALLTHROUGH */
7084
7085 case BOUNDL: /* /\b/l */
7086 {
7087 bool b1, b2;
7088 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7089
7090 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7091 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7092 goto boundu;
7093 }
7094
7095 if (utf8_target) {
7096 if (locinput == reginfo->strbeg)
7097 b1 = isWORDCHAR_LC('\n');
7098 else {
7099 U8 *p = reghop3((U8*)locinput, -1,
7100 (U8*)(reginfo->strbeg));
7101 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7102 }
7103 b2 = (NEXTCHR_IS_EOS)
7104 ? isWORDCHAR_LC('\n')
7105 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7106 (U8*) reginfo->strend);
7107 }
7108 else { /* Here the string isn't utf8 */
7109 b1 = (locinput == reginfo->strbeg)
7110 ? isWORDCHAR_LC('\n')
7111 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7112 b2 = (NEXTCHR_IS_EOS)
7113 ? isWORDCHAR_LC('\n')
7114 : isWORDCHAR_LC(nextbyte);
7115 }
7116 if (to_complement ^ (b1 == b2)) {
7117 sayNO;
7118 }
7119 break;
7120 }
7121
7122 case NBOUND: /* /\B/ */
7123 to_complement = 1;
7124 /* FALLTHROUGH */
7125
7126 case BOUND: /* /\b/ */
7127 if (utf8_target) {
7128 goto bound_utf8;
7129 }
7130 goto bound_ascii_match_only;
7131
7132 case NBOUNDA: /* /\B/a */
7133 to_complement = 1;
7134 /* FALLTHROUGH */
7135
7136 case BOUNDA: /* /\b/a */
7137 {
7138 bool b1, b2;
7139
7140 bound_ascii_match_only:
7141 /* Here the string isn't utf8, or is utf8 and only ascii characters
7142 * are to match \w. In the latter case looking at the byte just
7143 * prior to the current one may be just the final byte of a
7144 * multi-byte character. This is ok. There are two cases:
7145 * 1) it is a single byte character, and then the test is doing
7146 * just what it's supposed to.
7147 * 2) it is a multi-byte character, in which case the final byte is
7148 * never mistakable for ASCII, and so the test will say it is
7149 * not a word character, which is the correct answer. */
7150 b1 = (locinput == reginfo->strbeg)
7151 ? isWORDCHAR_A('\n')
7152 : isWORDCHAR_A(UCHARAT(locinput - 1));
7153 b2 = (NEXTCHR_IS_EOS)
7154 ? isWORDCHAR_A('\n')
7155 : isWORDCHAR_A(nextbyte);
7156 if (to_complement ^ (b1 == b2)) {
7157 sayNO;
7158 }
7159 break;
7160 }
7161
7162 case NBOUNDU: /* /\B/u */
7163 to_complement = 1;
7164 /* FALLTHROUGH */
7165
7166 case BOUNDU: /* /\b/u */
7167
7168 boundu:
7169 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7170 match = FALSE;
7171 }
7172 else if (utf8_target) {
7173 bound_utf8:
7174 switch((bound_type) FLAGS(scan)) {
7175 case TRADITIONAL_BOUND:
7176 {
7177 bool b1, b2;
7178 if (locinput == reginfo->strbeg) {
7179 b1 = 0 /* isWORDCHAR_L1('\n') */;
7180 }
7181 else {
7182 U8 *p = reghop3((U8*)locinput, -1,
7183 (U8*)(reginfo->strbeg));
7184
7185 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7186 }
7187 b2 = (NEXTCHR_IS_EOS)
7188 ? 0 /* isWORDCHAR_L1('\n') */
7189 : isWORDCHAR_utf8_safe((U8*)locinput,
7190 (U8*) reginfo->strend);
7191 match = cBOOL(b1 != b2);
7192 break;
7193 }
7194 case GCB_BOUND:
7195 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7196 match = TRUE; /* GCB always matches at begin and
7197 end */
7198 }
7199 else {
7200 /* Find the gcb values of previous and current
7201 * chars, then see if is a break point */
7202 match = isGCB(getGCB_VAL_UTF8(
7203 reghop3((U8*)locinput,
7204 -1,
7205 (U8*)(reginfo->strbeg)),
7206 (U8*) reginfo->strend),
7207 getGCB_VAL_UTF8((U8*) locinput,
7208 (U8*) reginfo->strend),
7209 (U8*) reginfo->strbeg,
7210 (U8*) locinput,
7211 utf8_target);
7212 }
7213 break;
7214
7215 case LB_BOUND:
7216 if (locinput == reginfo->strbeg) {
7217 match = FALSE;
7218 }
7219 else if (NEXTCHR_IS_EOS) {
7220 match = TRUE;
7221 }
7222 else {
7223 match = isLB(getLB_VAL_UTF8(
7224 reghop3((U8*)locinput,
7225 -1,
7226 (U8*)(reginfo->strbeg)),
7227 (U8*) reginfo->strend),
7228 getLB_VAL_UTF8((U8*) locinput,
7229 (U8*) reginfo->strend),
7230 (U8*) reginfo->strbeg,
7231 (U8*) locinput,
7232 (U8*) reginfo->strend,
7233 utf8_target);
7234 }
7235 break;
7236
7237 case SB_BOUND: /* Always matches at begin and end */
7238 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7239 match = TRUE;
7240 }
7241 else {
7242 match = isSB(getSB_VAL_UTF8(
7243 reghop3((U8*)locinput,
7244 -1,
7245 (U8*)(reginfo->strbeg)),
7246 (U8*) reginfo->strend),
7247 getSB_VAL_UTF8((U8*) locinput,
7248 (U8*) reginfo->strend),
7249 (U8*) reginfo->strbeg,
7250 (U8*) locinput,
7251 (U8*) reginfo->strend,
7252 utf8_target);
7253 }
7254 break;
7255
7256 case WB_BOUND:
7257 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7258 match = TRUE;
7259 }
7260 else {
7261 match = isWB(WB_UNKNOWN,
7262 getWB_VAL_UTF8(
7263 reghop3((U8*)locinput,
7264 -1,
7265 (U8*)(reginfo->strbeg)),
7266 (U8*) reginfo->strend),
7267 getWB_VAL_UTF8((U8*) locinput,
7268 (U8*) reginfo->strend),
7269 (U8*) reginfo->strbeg,
7270 (U8*) locinput,
7271 (U8*) reginfo->strend,
7272 utf8_target);
7273 }
7274 break;
7275 }
7276 }
7277 else { /* Not utf8 target */
7278 switch((bound_type) FLAGS(scan)) {
7279 case TRADITIONAL_BOUND:
7280 {
7281 bool b1, b2;
7282 b1 = (locinput == reginfo->strbeg)
7283 ? 0 /* isWORDCHAR_L1('\n') */
7284 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7285 b2 = (NEXTCHR_IS_EOS)
7286 ? 0 /* isWORDCHAR_L1('\n') */
7287 : isWORDCHAR_L1(nextbyte);
7288 match = cBOOL(b1 != b2);
7289 break;
7290 }
7291
7292 case GCB_BOUND:
7293 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7294 match = TRUE; /* GCB always matches at begin and
7295 end */
7296 }
7297 else { /* Only CR-LF combo isn't a GCB in 0-255
7298 range */
7299 match = UCHARAT(locinput - 1) != '\r'
7300 || UCHARAT(locinput) != '\n';
7301 }
7302 break;
7303
7304 case LB_BOUND:
7305 if (locinput == reginfo->strbeg) {
7306 match = FALSE;
7307 }
7308 else if (NEXTCHR_IS_EOS) {
7309 match = TRUE;
7310 }
7311 else {
7312 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7313 getLB_VAL_CP(UCHARAT(locinput)),
7314 (U8*) reginfo->strbeg,
7315 (U8*) locinput,
7316 (U8*) reginfo->strend,
7317 utf8_target);
7318 }
7319 break;
7320
7321 case SB_BOUND: /* Always matches at begin and end */
7322 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7323 match = TRUE;
7324 }
7325 else {
7326 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7327 getSB_VAL_CP(UCHARAT(locinput)),
7328 (U8*) reginfo->strbeg,
7329 (U8*) locinput,
7330 (U8*) reginfo->strend,
7331 utf8_target);
7332 }
7333 break;
7334
7335 case WB_BOUND:
7336 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7337 match = TRUE;
7338 }
7339 else {
7340 match = isWB(WB_UNKNOWN,
7341 getWB_VAL_CP(UCHARAT(locinput -1)),
7342 getWB_VAL_CP(UCHARAT(locinput)),
7343 (U8*) reginfo->strbeg,
7344 (U8*) locinput,
7345 (U8*) reginfo->strend,
7346 utf8_target);
7347 }
7348 break;
7349 }
7350 }
7351
7352 if (to_complement ^ ! match) {
7353 sayNO;
7354 }
7355 break;
7356
7357 case ANYOFPOSIXL:
7358 case ANYOFL: /* /[abc]/l */
7359 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7360 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7361
7362 /* FALLTHROUGH */
7363 case ANYOFD: /* /[abc]/d */
7364 case ANYOF: /* /[abc]/ */
7365 if (NEXTCHR_IS_EOS || locinput >= loceol)
7366 sayNO;
7367 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7368 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7369 {
7370 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7371 sayNO;
7372 }
7373 locinput++;
7374 }
7375 else {
7376 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7377 utf8_target))
7378 {
7379 sayNO;
7380 }
7381 goto increment_locinput;
7382 }
7383 break;
7384
7385 case ANYOFM:
7386 if ( NEXTCHR_IS_EOS
7387 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7388 || locinput >= loceol)
7389 {
7390 sayNO;
7391 }
7392 locinput++; /* ANYOFM is always single byte */
7393 break;
7394
7395 case NANYOFM:
7396 if ( NEXTCHR_IS_EOS
7397 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7398 || locinput >= loceol)
7399 {
7400 sayNO;
7401 }
7402 goto increment_locinput;
7403 break;
7404
7405 case ANYOFH:
7406 if ( ! utf8_target
7407 || NEXTCHR_IS_EOS
7408 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7409 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7410 utf8_target))
7411 {
7412 sayNO;
7413 }
7414 goto increment_locinput;
7415 break;
7416
7417 case ANYOFHb:
7418 if ( ! utf8_target
7419 || NEXTCHR_IS_EOS
7420 || ANYOF_FLAGS(scan) != (U8) *locinput
7421 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7422 utf8_target))
7423 {
7424 sayNO;
7425 }
7426 goto increment_locinput;
7427 break;
7428
7429 case ANYOFHr:
7430 if ( ! utf8_target
7431 || NEXTCHR_IS_EOS
7432 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7433 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7434 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7435 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7436 utf8_target))
7437 {
7438 sayNO;
7439 }
7440 goto increment_locinput;
7441 break;
7442
7443 case ANYOFHs:
7444 if ( ! utf8_target
7445 || NEXTCHR_IS_EOS
7446 || loceol - locinput < FLAGS(scan)
7447 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7448 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7449 utf8_target))
7450 {
7451 sayNO;
7452 }
7453 goto increment_locinput;
7454 break;
7455
7456 case ANYOFR:
7457 if (NEXTCHR_IS_EOS) {
7458 sayNO;
7459 }
7460
7461 if (utf8_target) {
7462 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7463 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7464 (U8 *) reginfo->strend,
7465 NULL),
7466 ANYOFRbase(scan), ANYOFRdelta(scan)))
7467 {
7468 sayNO;
7469 }
7470 }
7471 else {
7472 if (! withinCOUNT((U8) *locinput,
7473 ANYOFRbase(scan), ANYOFRdelta(scan)))
7474 {
7475 sayNO;
7476 }
7477 }
7478 goto increment_locinput;
7479 break;
7480
7481 case ANYOFRb:
7482 if (NEXTCHR_IS_EOS) {
7483 sayNO;
7484 }
7485
7486 if (utf8_target) {
7487 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7488 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7489 (U8 *) reginfo->strend,
7490 NULL),
7491 ANYOFRbase(scan), ANYOFRdelta(scan)))
7492 {
7493 sayNO;
7494 }
7495 }
7496 else {
7497 if (! withinCOUNT((U8) *locinput,
7498 ANYOFRbase(scan), ANYOFRdelta(scan)))
7499 {
7500 sayNO;
7501 }
7502 }
7503 goto increment_locinput;
7504 break;
7505
7506 /* The argument (FLAGS) to all the POSIX node types is the class number
7507 * */
7508
7509 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7510 to_complement = 1;
7511 /* FALLTHROUGH */
7512
7513 case POSIXL: /* \w or [:punct:] etc. under /l */
7514 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7515 if (NEXTCHR_IS_EOS || locinput >= loceol)
7516 sayNO;
7517
7518 /* Use isFOO_lc() for characters within Latin1. (Note that
7519 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7520 * wouldn't be invariant) */
7521 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7522 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7523 sayNO;
7524 }
7525
7526 locinput++;
7527 break;
7528 }
7529
7530 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7531 /* An above Latin-1 code point, or malformed */
7532 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7533 reginfo->strend);
7534 goto utf8_posix_above_latin1;
7535 }
7536
7537 /* Here is a UTF-8 variant code point below 256 and the target is
7538 * UTF-8 */
7539 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7540 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7541 *(locinput + 1))))))
7542 {
7543 sayNO;
7544 }
7545
7546 goto increment_locinput;
7547
7548 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7549 to_complement = 1;
7550 /* FALLTHROUGH */
7551
7552 case POSIXD: /* \w or [:punct:] etc. under /d */
7553 if (utf8_target) {
7554 goto utf8_posix;
7555 }
7556 goto posixa;
7557
7558 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7559
7560 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7561 sayNO;
7562 }
7563
7564 /* All UTF-8 variants match */
7565 if (! UTF8_IS_INVARIANT(nextbyte)) {
7566 goto increment_locinput;
7567 }
7568
7569 to_complement = 1;
7570 goto join_nposixa;
7571
7572 case POSIXA: /* \w or [:punct:] etc. under /a */
7573
7574 posixa:
7575 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7576 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7577 * character is a single byte */
7578
7579 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7580 sayNO;
7581 }
7582
7583 join_nposixa:
7584
7585 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7586 FLAGS(scan)))))
7587 {
7588 sayNO;
7589 }
7590
7591 /* Here we are either not in utf8, or we matched a utf8-invariant,
7592 * so the next char is the next byte */
7593 locinput++;
7594 break;
7595
7596 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7597 to_complement = 1;
7598 /* FALLTHROUGH */
7599
7600 case POSIXU: /* \w or [:punct:] etc. under /u */
7601 utf8_posix:
7602 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7603 sayNO;
7604 }
7605
7606 /* Use _generic_isCC() for characters within Latin1. (Note that
7607 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7608 * wouldn't be invariant) */
7609 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7610 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7611 FLAGS(scan)))))
7612 {
7613 sayNO;
7614 }
7615 locinput++;
7616 }
7617 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7618 if (! (to_complement
7619 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7620 *(locinput + 1)),
7621 FLAGS(scan)))))
7622 {
7623 sayNO;
7624 }
7625 locinput += 2;
7626 }
7627 else { /* Handle above Latin-1 code points */
7628 utf8_posix_above_latin1:
7629 classnum = (_char_class_number) FLAGS(scan);
7630 switch (classnum) {
7631 default:
7632 if (! (to_complement
7633 ^ cBOOL(_invlist_contains_cp(
7634 PL_XPosix_ptrs[classnum],
7635 utf8_to_uvchr_buf((U8 *) locinput,
7636 (U8 *) reginfo->strend,
7637 NULL)))))
7638 {
7639 sayNO;
7640 }
7641 break;
7642 case _CC_ENUM_SPACE:
7643 if (! (to_complement
7644 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7645 {
7646 sayNO;
7647 }
7648 break;
7649 case _CC_ENUM_BLANK:
7650 if (! (to_complement
7651 ^ cBOOL(is_HORIZWS_high(locinput))))
7652 {
7653 sayNO;
7654 }
7655 break;
7656 case _CC_ENUM_XDIGIT:
7657 if (! (to_complement
7658 ^ cBOOL(is_XDIGIT_high(locinput))))
7659 {
7660 sayNO;
7661 }
7662 break;
7663 case _CC_ENUM_VERTSPACE:
7664 if (! (to_complement
7665 ^ cBOOL(is_VERTWS_high(locinput))))
7666 {
7667 sayNO;
7668 }
7669 break;
7670 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
7671 case _CC_ENUM_ASCII:
7672 if (! to_complement) {
7673 sayNO;
7674 }
7675 break;
7676 }
7677 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7678 }
7679 break;
7680
7681 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7682 a Unicode extended Grapheme Cluster */
7683 if (NEXTCHR_IS_EOS || locinput >= loceol)
7684 sayNO;
7685 if (! utf8_target) {
7686
7687 /* Match either CR LF or '.', as all the other possibilities
7688 * require utf8 */
7689 locinput++; /* Match the . or CR */
7690 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7691 match the LF */
7692 && locinput < loceol
7693 && UCHARAT(locinput) == '\n')
7694 {
7695 locinput++;
7696 }
7697 }
7698 else {
7699
7700 /* Get the gcb type for the current character */
7701 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7702 (U8*) reginfo->strend);
7703
7704 /* Then scan through the input until we get to the first
7705 * character whose type is supposed to be a gcb with the
7706 * current character. (There is always a break at the
7707 * end-of-input) */
7708 locinput += UTF8SKIP(locinput);
7709 while (locinput < loceol) {
7710 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7711 (U8*) reginfo->strend);
7712 if (isGCB(prev_gcb, cur_gcb,
7713 (U8*) reginfo->strbeg, (U8*) locinput,
7714 utf8_target))
7715 {
7716 break;
7717 }
7718
7719 prev_gcb = cur_gcb;
7720 locinput += UTF8SKIP(locinput);
7721 }
7722
7723
7724 }
7725 break;
7726
7727 case REFFLN: /* /\g{name}/il */
7728 { /* The capture buffer cases. The ones beginning with N for the
7729 named buffers just convert to the equivalent numbered and
7730 pretend they were called as the corresponding numbered buffer
7731 op. */
7732 /* don't initialize these in the declaration, it makes C++
7733 unhappy */
7734 const char *s;
7735 char type;
7736 re_fold_t folder;
7737 const U8 *fold_array;
7738 UV utf8_fold_flags;
7739
7740 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7741 folder = foldEQ_locale;
7742 fold_array = PL_fold_locale;
7743 type = REFFL;
7744 utf8_fold_flags = FOLDEQ_LOCALE;
7745 goto do_nref;
7746
7747 case REFFAN: /* /\g{name}/iaa */
7748 folder = foldEQ_latin1;
7749 fold_array = PL_fold_latin1;
7750 type = REFFA;
7751 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7752 goto do_nref;
7753
7754 case REFFUN: /* /\g{name}/iu */
7755 folder = foldEQ_latin1;
7756 fold_array = PL_fold_latin1;
7757 type = REFFU;
7758 utf8_fold_flags = 0;
7759 goto do_nref;
7760
7761 case REFFN: /* /\g{name}/i */
7762 folder = foldEQ;
7763 fold_array = PL_fold;
7764 type = REFF;
7765 utf8_fold_flags = 0;
7766 goto do_nref;
7767
7768 case REFN: /* /\g{name}/ */
7769 type = REF;
7770 folder = NULL;
7771 fold_array = NULL;
7772 utf8_fold_flags = 0;
7773 do_nref:
7774
7775 /* For the named back references, find the corresponding buffer
7776 * number */
7777 n = reg_check_named_buff_matched(rex,scan);
7778
7779 if ( ! n ) {
7780 sayNO;
7781 }
7782 goto do_nref_ref_common;
7783
7784 case REFFL: /* /\1/il */
7785 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7786 folder = foldEQ_locale;
7787 fold_array = PL_fold_locale;
7788 utf8_fold_flags = FOLDEQ_LOCALE;
7789 goto do_ref;
7790
7791 case REFFA: /* /\1/iaa */
7792 folder = foldEQ_latin1;
7793 fold_array = PL_fold_latin1;
7794 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7795 goto do_ref;
7796
7797 case REFFU: /* /\1/iu */
7798 folder = foldEQ_latin1;
7799 fold_array = PL_fold_latin1;
7800 utf8_fold_flags = 0;
7801 goto do_ref;
7802
7803 case REFF: /* /\1/i */
7804 folder = foldEQ;
7805 fold_array = PL_fold;
7806 utf8_fold_flags = 0;
7807 goto do_ref;
7808
7809 case REF: /* /\1/ */
7810 folder = NULL;
7811 fold_array = NULL;
7812 utf8_fold_flags = 0;
7813
7814 do_ref:
7815 type = OP(scan);
7816 n = ARG(scan); /* which paren pair */
7817
7818 do_nref_ref_common:
7819 ln = rex->offs[n].start;
7820 endref = rex->offs[n].end;
7821 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7822 if (rex->lastparen < n || ln == -1 || endref == -1)
7823 sayNO; /* Do not match unless seen CLOSEn. */
7824 if (ln == endref)
7825 break;
7826
7827 s = reginfo->strbeg + ln;
7828 if (type != REF /* REF can do byte comparison */
7829 && (utf8_target || type == REFFU || type == REFFL))
7830 {
7831 char * limit = loceol;
7832
7833 /* This call case insensitively compares the entire buffer
7834 * at s, with the current input starting at locinput, but
7835 * not going off the end given by loceol, and
7836 * returns in <limit> upon success, how much of the
7837 * current input was matched */
7838 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7839 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7840 {
7841 sayNO;
7842 }
7843 locinput = limit;
7844 break;
7845 }
7846
7847 /* Not utf8: Inline the first character, for speed. */
7848 if ( ! NEXTCHR_IS_EOS
7849 && locinput < loceol
7850 && UCHARAT(s) != nextbyte
7851 && ( type == REF
7852 || UCHARAT(s) != fold_array[nextbyte]))
7853 {
7854 sayNO;
7855 }
7856 ln = endref - ln;
7857 if (locinput + ln > loceol)
7858 sayNO;
7859 if (ln > 1 && (type == REF
7860 ? memNE(s, locinput, ln)
7861 : ! folder(locinput, s, ln)))
7862 sayNO;
7863 locinput += ln;
7864 break;
7865 }
7866
7867 case NOTHING: /* null op; e.g. the 'nothing' following
7868 * the '*' in m{(a+|b)*}' */
7869 break;
7870 case TAIL: /* placeholder while compiling (A|B|C) */
7871 break;
7872
7873 #undef ST
7874 #define ST st->u.eval
7875 #define CUR_EVAL cur_eval->u.eval
7876
7877 {
7878 SV *ret;
7879 REGEXP *re_sv;
7880 regexp *re;
7881 regexp_internal *rei;
7882 regnode *startpoint;
7883 U32 arg;
7884
7885 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7886 arg= (U32)ARG(scan);
7887 if (cur_eval && cur_eval->locinput == locinput) {
7888 if ( ++nochange_depth > max_nochange_depth )
7889 Perl_croak(aTHX_
7890 "Pattern subroutine nesting without pos change"
7891 " exceeded limit in regex");
7892 } else {
7893 nochange_depth = 0;
7894 }
7895 re_sv = rex_sv;
7896 re = rex;
7897 rei = rexi;
7898 startpoint = scan + ARG2L(scan);
7899 EVAL_CLOSE_PAREN_SET( st, arg );
7900 /* Detect infinite recursion
7901 *
7902 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7903 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7904 * So we track the position in the string we are at each time
7905 * we recurse and if we try to enter the same routine twice from
7906 * the same position we throw an error.
7907 */
7908 if ( rex->recurse_locinput[arg] == locinput ) {
7909 /* FIXME: we should show the regop that is failing as part
7910 * of the error message. */
7911 Perl_croak(aTHX_ "Infinite recursion in regex");
7912 } else {
7913 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7914 rex->recurse_locinput[arg]= locinput;
7915
7916 DEBUG_r({
7917 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918 DEBUG_STACK_r({
7919 Perl_re_exec_indentf( aTHX_
7920 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7921 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7922 );
7923 });
7924 });
7925 }
7926
7927 /* Save all the positions seen so far. */
7928 ST.cp = regcppush(rex, 0, maxopenparen);
7929 REGCP_SET(ST.lastcp);
7930
7931 /* and then jump to the code we share with EVAL */
7932 goto eval_recurse_doit;
7933 /* NOTREACHED */
7934
7935 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
7936 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7937 if ( ++nochange_depth > max_nochange_depth )
7938 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7939 } else {
7940 nochange_depth = 0;
7941 }
7942 {
7943 /* execute the code in the {...} */
7944
7945 dSP;
7946 IV before;
7947 OP * const oop = PL_op;
7948 COP * const ocurcop = PL_curcop;
7949 OP *nop;
7950 CV *newcv;
7951
7952 /* save *all* paren positions */
7953 regcppush(rex, 0, maxopenparen);
7954 REGCP_SET(ST.lastcp);
7955
7956 if (!caller_cv)
7957 caller_cv = find_runcv(NULL);
7958
7959 n = ARG(scan);
7960
7961 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7962 newcv = (ReANY(
7963 (REGEXP*)(rexi->data->data[n])
7964 ))->qr_anoncv;
7965 nop = (OP*)rexi->data->data[n+1];
7966 }
7967 else if (rexi->data->what[n] == 'l') { /* literal code */
7968 newcv = caller_cv;
7969 nop = (OP*)rexi->data->data[n];
7970 assert(CvDEPTH(newcv));
7971 }
7972 else {
7973 /* literal with own CV */
7974 assert(rexi->data->what[n] == 'L');
7975 newcv = rex->qr_anoncv;
7976 nop = (OP*)rexi->data->data[n];
7977 }
7978
7979 /* Some notes about MULTICALL and the context and save stacks.
7980 *
7981 * In something like
7982 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7983 * since codeblocks don't introduce a new scope (so that
7984 * local() etc accumulate), at the end of a successful
7985 * match there will be a SAVEt_CLEARSV on the savestack
7986 * for each of $x, $y, $z. If the three code blocks above
7987 * happen to have come from different CVs (e.g. via
7988 * embedded qr//s), then we must ensure that during any
7989 * savestack unwinding, PL_comppad always points to the
7990 * right pad at each moment. We achieve this by
7991 * interleaving SAVEt_COMPPAD's on the savestack whenever
7992 * there is a change of pad.
7993 * In theory whenever we call a code block, we should
7994 * push a CXt_SUB context, then pop it on return from
7995 * that code block. This causes a bit of an issue in that
7996 * normally popping a context also clears the savestack
7997 * back to cx->blk_oldsaveix, but here we specifically
7998 * don't want to clear the save stack on exit from the
7999 * code block.
8000 * Also for efficiency we don't want to keep pushing and
8001 * popping the single SUB context as we backtrack etc.
8002 * So instead, we push a single context the first time
8003 * we need, it, then hang onto it until the end of this
8004 * function. Whenever we encounter a new code block, we
8005 * update the CV etc if that's changed. During the times
8006 * in this function where we're not executing a code
8007 * block, having the SUB context still there is a bit
8008 * naughty - but we hope that no-one notices.
8009 * When the SUB context is initially pushed, we fake up
8010 * cx->blk_oldsaveix to be as if we'd pushed this context
8011 * on first entry to S_regmatch rather than at some random
8012 * point during the regexe execution. That way if we
8013 * croak, popping the context stack will ensure that
8014 * *everything* SAVEd by this function is undone and then
8015 * the context popped, rather than e.g., popping the
8016 * context (and restoring the original PL_comppad) then
8017 * popping more of the savestack and restoring a bad
8018 * PL_comppad.
8019 */
8020
8021 /* If this is the first EVAL, push a MULTICALL. On
8022 * subsequent calls, if we're executing a different CV, or
8023 * if PL_comppad has got messed up from backtracking
8024 * through SAVECOMPPADs, then refresh the context.
8025 */
8026 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8027 {
8028 U8 flags = (CXp_SUB_RE |
8029 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8030 SAVECOMPPAD();
8031 if (last_pushed_cv) {
8032 CHANGE_MULTICALL_FLAGS(newcv, flags);
8033 }
8034 else {
8035 PUSH_MULTICALL_FLAGS(newcv, flags);
8036 }
8037 /* see notes above */
8038 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8039
8040 last_pushed_cv = newcv;
8041 }
8042 else {
8043 /* these assignments are just to silence compiler
8044 * warnings */
8045 multicall_cop = NULL;
8046 }
8047 last_pad = PL_comppad;
8048
8049 /* the initial nextstate you would normally execute
8050 * at the start of an eval (which would cause error
8051 * messages to come from the eval), may be optimised
8052 * away from the execution path in the regex code blocks;
8053 * so manually set PL_curcop to it initially */
8054 {
8055 OP *o = cUNOPx(nop)->op_first;
8056 assert(o->op_type == OP_NULL);
8057 if (o->op_targ == OP_SCOPE) {
8058 o = cUNOPo->op_first;
8059 }
8060 else {
8061 assert(o->op_targ == OP_LEAVE);
8062 o = cUNOPo->op_first;
8063 assert(o->op_type == OP_ENTER);
8064 o = OpSIBLING(o);
8065 }
8066
8067 if (o->op_type != OP_STUB) {
8068 assert( o->op_type == OP_NEXTSTATE
8069 || o->op_type == OP_DBSTATE
8070 || (o->op_type == OP_NULL
8071 && ( o->op_targ == OP_NEXTSTATE
8072 || o->op_targ == OP_DBSTATE
8073 )
8074 )
8075 );
8076 PL_curcop = (COP*)o;
8077 }
8078 }
8079 nop = nop->op_next;
8080
8081 DEBUG_STATE_r( Perl_re_printf( aTHX_
8082 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8083
8084 rex->offs[0].end = locinput - reginfo->strbeg;
8085 if (reginfo->info_aux_eval->pos_magic)
8086 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8087 reginfo->sv, reginfo->strbeg,
8088 locinput - reginfo->strbeg);
8089
8090 if (sv_yes_mark) {
8091 SV *sv_mrk = get_sv("REGMARK", 1);
8092 sv_setsv(sv_mrk, sv_yes_mark);
8093 }
8094
8095 /* we don't use MULTICALL here as we want to call the
8096 * first op of the block of interest, rather than the
8097 * first op of the sub. Also, we don't want to free
8098 * the savestack frame */
8099 before = (IV)(SP-PL_stack_base);
8100 PL_op = nop;
8101 CALLRUNOPS(aTHX); /* Scalar context. */
8102 SPAGAIN;
8103 if ((IV)(SP-PL_stack_base) == before)
8104 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8105 else {
8106 ret = POPs;
8107 PUTBACK;
8108 }
8109
8110 /* before restoring everything, evaluate the returned
8111 * value, so that 'uninit' warnings don't use the wrong
8112 * PL_op or pad. Also need to process any magic vars
8113 * (e.g. $1) *before* parentheses are restored */
8114
8115 PL_op = NULL;
8116
8117 re_sv = NULL;
8118 if (logical == 0) { /* (?{})/ */
8119 SV *replsv = save_scalar(PL_replgv);
8120 sv_setsv(replsv, ret); /* $^R */
8121 SvSETMAGIC(replsv);
8122 }
8123 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8124 sw = cBOOL(SvTRUE_NN(ret));
8125 logical = 0;
8126 }
8127 else { /* /(??{}) */
8128 /* if its overloaded, let the regex compiler handle
8129 * it; otherwise extract regex, or stringify */
8130 if (SvGMAGICAL(ret))
8131 ret = sv_mortalcopy(ret);
8132 if (!SvAMAGIC(ret)) {
8133 SV *sv = ret;
8134 if (SvROK(sv))
8135 sv = SvRV(sv);
8136 if (SvTYPE(sv) == SVt_REGEXP)
8137 re_sv = (REGEXP*) sv;
8138 else if (SvSMAGICAL(ret)) {
8139 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8140 if (mg)
8141 re_sv = (REGEXP *) mg->mg_obj;
8142 }
8143
8144 /* force any undef warnings here */
8145 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8146 ret = sv_mortalcopy(ret);
8147 (void) SvPV_force_nolen(ret);
8148 }
8149 }
8150
8151 }
8152
8153 /* *** Note that at this point we don't restore
8154 * PL_comppad, (or pop the CxSUB) on the assumption it may
8155 * be used again soon. This is safe as long as nothing
8156 * in the regexp code uses the pad ! */
8157 PL_op = oop;
8158 PL_curcop = ocurcop;
8159 regcp_restore(rex, ST.lastcp, &maxopenparen);
8160 PL_curpm_under = PL_curpm;
8161 PL_curpm = PL_reg_curpm;
8162
8163 if (logical != 2) {
8164 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8165 script_run_begin);
8166 /* NOTREACHED */
8167 }
8168 }
8169
8170 /* only /(??{})/ from now on */
8171 logical = 0;
8172 {
8173 /* extract RE object from returned value; compiling if
8174 * necessary */
8175
8176 if (re_sv) {
8177 re_sv = reg_temp_copy(NULL, re_sv);
8178 }
8179 else {
8180 U32 pm_flags = 0;
8181
8182 if (SvUTF8(ret) && IN_BYTES) {
8183 /* In use 'bytes': make a copy of the octet
8184 * sequence, but without the flag on */
8185 STRLEN len;
8186 const char *const p = SvPV(ret, len);
8187 ret = newSVpvn_flags(p, len, SVs_TEMP);
8188 }
8189 if (rex->intflags & PREGf_USE_RE_EVAL)
8190 pm_flags |= PMf_USE_RE_EVAL;
8191
8192 /* if we got here, it should be an engine which
8193 * supports compiling code blocks and stuff */
8194 assert(rex->engine && rex->engine->op_comp);
8195 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8196 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8197 rex->engine, NULL, NULL,
8198 /* copy /msixn etc to inner pattern */
8199 ARG2L(scan),
8200 pm_flags);
8201
8202 if (!(SvFLAGS(ret)
8203 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8204 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8205 /* This isn't a first class regexp. Instead, it's
8206 caching a regexp onto an existing, Perl visible
8207 scalar. */
8208 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8209 }
8210 }
8211 SAVEFREESV(re_sv);
8212 re = ReANY(re_sv);
8213 }
8214 RXp_MATCH_COPIED_off(re);
8215 re->subbeg = rex->subbeg;
8216 re->sublen = rex->sublen;
8217 re->suboffset = rex->suboffset;
8218 re->subcoffset = rex->subcoffset;
8219 re->lastparen = 0;
8220 re->lastcloseparen = 0;
8221 rei = RXi_GET(re);
8222 DEBUG_EXECUTE_r(
8223 debug_start_match(re_sv, utf8_target, locinput,
8224 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8225 );
8226 startpoint = rei->program + 1;
8227 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8228 * close_paren only for GOSUB */
8229 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8230 /* Save all the seen positions so far. */
8231 ST.cp = regcppush(rex, 0, maxopenparen);
8232 REGCP_SET(ST.lastcp);
8233 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8234 maxopenparen = 0;
8235 /* run the pattern returned from (??{...}) */
8236
8237 eval_recurse_doit: /* Share code with GOSUB below this line
8238 * At this point we expect the stack context to be
8239 * set up correctly */
8240
8241 /* invalidate the S-L poscache. We're now executing a
8242 * different set of WHILEM ops (and their associated
8243 * indexes) against the same string, so the bits in the
8244 * cache are meaningless. Setting maxiter to zero forces
8245 * the cache to be invalidated and zeroed before reuse.
8246 * XXX This is too dramatic a measure. Ideally we should
8247 * save the old cache and restore when running the outer
8248 * pattern again */
8249 reginfo->poscache_maxiter = 0;
8250
8251 /* the new regexp might have a different is_utf8_pat than we do */
8252 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8253
8254 ST.prev_rex = rex_sv;
8255 ST.prev_curlyx = cur_curlyx;
8256 rex_sv = re_sv;
8257 SET_reg_curpm(rex_sv);
8258 rex = re;
8259 rexi = rei;
8260 cur_curlyx = NULL;
8261 ST.B = next;
8262 ST.prev_eval = cur_eval;
8263 cur_eval = st;
8264 /* now continue from first node in postoned RE */
8265 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8266 loceol, script_run_begin);
8267 NOT_REACHED; /* NOTREACHED */
8268 }
8269
8270 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8271 /* note: this is called twice; first after popping B, then A */
8272 DEBUG_STACK_r({
8273 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8274 depth, cur_eval, ST.prev_eval);
8275 });
8276
8277 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8278 if ( cur_eval && CUR_EVAL.close_paren ) {\
8279 DEBUG_STACK_r({ \
8280 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8281 depth, \
8282 CUR_EVAL.close_paren - 1,\
8283 cur_eval, \
8284 VAL); \
8285 }); \
8286 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8287 }
8288
8289 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8290
8291 rex_sv = ST.prev_rex;
8292 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8293 SET_reg_curpm(rex_sv);
8294 rex = ReANY(rex_sv);
8295 rexi = RXi_GET(rex);
8296 {
8297 /* preserve $^R across LEAVE's. See Bug 121070. */
8298 SV *save_sv= GvSV(PL_replgv);
8299 SV *replsv;
8300 SvREFCNT_inc(save_sv);
8301 regcpblow(ST.cp); /* LEAVE in disguise */
8302 /* don't move this initialization up */
8303 replsv = GvSV(PL_replgv);
8304 sv_setsv(replsv, save_sv);
8305 SvSETMAGIC(replsv);
8306 SvREFCNT_dec(save_sv);
8307 }
8308 cur_eval = ST.prev_eval;
8309 cur_curlyx = ST.prev_curlyx;
8310
8311 /* Invalidate cache. See "invalidate" comment above. */
8312 reginfo->poscache_maxiter = 0;
8313 if ( nochange_depth )
8314 nochange_depth--;
8315
8316 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8317 sayYES;
8318
8319
8320 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8321 REGCP_UNWIND(ST.lastcp);
8322 sayNO;
8323
8324 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8325 /* note: this is called twice; first after popping B, then A */
8326 DEBUG_STACK_r({
8327 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8328 depth, cur_eval, ST.prev_eval);
8329 });
8330
8331 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8332
8333 rex_sv = ST.prev_rex;
8334 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8335 SET_reg_curpm(rex_sv);
8336 rex = ReANY(rex_sv);
8337 rexi = RXi_GET(rex);
8338
8339 REGCP_UNWIND(ST.lastcp);
8340 regcppop(rex, &maxopenparen);
8341 cur_eval = ST.prev_eval;
8342 cur_curlyx = ST.prev_curlyx;
8343
8344 /* Invalidate cache. See "invalidate" comment above. */
8345 reginfo->poscache_maxiter = 0;
8346 if ( nochange_depth )
8347 nochange_depth--;
8348
8349 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8350 sayNO_SILENT;
8351 #undef ST
8352
8353 case OPEN: /* ( */
8354 n = ARG(scan); /* which paren pair */
8355 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8356 if (n > maxopenparen)
8357 maxopenparen = n;
8358 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8359 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8360 depth,
8361 PTR2UV(rex),
8362 PTR2UV(rex->offs),
8363 (UV)n,
8364 (IV)rex->offs[n].start_tmp,
8365 (UV)maxopenparen
8366 ));
8367 lastopen = n;
8368 break;
8369
8370 case SROPEN: /* (*SCRIPT_RUN: */
8371 script_run_begin = (U8 *) locinput;
8372 break;
8373
8374
8375 case CLOSE: /* ) */
8376 n = ARG(scan); /* which paren pair */
8377 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8378 locinput - reginfo->strbeg);
8379 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8380 goto fake_end;
8381
8382 break;
8383
8384 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8385
8386 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8387 {
8388 sayNO;
8389 }
8390
8391 break;
8392
8393
8394 case ACCEPT: /* (*ACCEPT) */
8395 if (scan->flags)
8396 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8397 if (ARG2L(scan)){
8398 regnode *cursor;
8399 for (cursor=scan;
8400 cursor && OP(cursor)!=END;
8401 cursor=regnext(cursor))
8402 {
8403 if ( OP(cursor)==CLOSE ){
8404 n = ARG(cursor);
8405 if ( n <= lastopen ) {
8406 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8407 locinput - reginfo->strbeg);
8408 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8409 break;
8410 }
8411 }
8412 }
8413 }
8414 goto fake_end;
8415 /* NOTREACHED */
8416
8417 case GROUPP: /* (?(1)) */
8418 n = ARG(scan); /* which paren pair */
8419 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8420 break;
8421
8422 case GROUPPN: /* (?(<name>)) */
8423 /* reg_check_named_buff_matched returns 0 for no match */
8424 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8425 break;
8426
8427 case INSUBP: /* (?(R)) */
8428 n = ARG(scan);
8429 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8430 * of SCAN is already set up as matches a eval.close_paren */
8431 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8432 break;
8433
8434 case DEFINEP: /* (?(DEFINE)) */
8435 sw = 0;
8436 break;
8437
8438 case IFTHEN: /* (?(cond)A|B) */
8439 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8440 if (sw)
8441 next = NEXTOPER(NEXTOPER(scan));
8442 else {
8443 next = scan + ARG(scan);
8444 if (OP(next) == IFTHEN) /* Fake one. */
8445 next = NEXTOPER(NEXTOPER(next));
8446 }
8447 break;
8448
8449 case LOGICAL: /* modifier for EVAL and IFMATCH */
8450 logical = scan->flags;
8451 break;
8452
8453 /*******************************************************************
8454
8455 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8456 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8457 STAR/PLUS/CURLY/CURLYN are used instead.)
8458
8459 A*B is compiled as <CURLYX><A><WHILEM><B>
8460
8461 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8462 state, which contains the current count, initialised to -1. It also sets
8463 cur_curlyx to point to this state, with any previous value saved in the
8464 state block.
8465
8466 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8467 since the pattern may possibly match zero times (i.e. it's a while {} loop
8468 rather than a do {} while loop).
8469
8470 Each entry to WHILEM represents a successful match of A. The count in the
8471 CURLYX block is incremented, another WHILEM state is pushed, and execution
8472 passes to A or B depending on greediness and the current count.
8473
8474 For example, if matching against the string a1a2a3b (where the aN are
8475 substrings that match /A/), then the match progresses as follows: (the
8476 pushed states are interspersed with the bits of strings matched so far):
8477
8478 <CURLYX cnt=-1>
8479 <CURLYX cnt=0><WHILEM>
8480 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8481 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8482 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8483 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8484
8485 (Contrast this with something like CURLYM, which maintains only a single
8486 backtrack state:
8487
8488 <CURLYM cnt=0> a1
8489 a1 <CURLYM cnt=1> a2
8490 a1 a2 <CURLYM cnt=2> a3
8491 a1 a2 a3 <CURLYM cnt=3> b
8492 )
8493
8494 Each WHILEM state block marks a point to backtrack to upon partial failure
8495 of A or B, and also contains some minor state data related to that
8496 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8497 overall state, such as the count, and pointers to the A and B ops.
8498
8499 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8500 must always point to the *current* CURLYX block, the rules are:
8501
8502 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8503 and set cur_curlyx to point the new block.
8504
8505 When popping the CURLYX block after a successful or unsuccessful match,
8506 restore the previous cur_curlyx.
8507
8508 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8509 to the outer one saved in the CURLYX block.
8510
8511 When popping the WHILEM block after a successful or unsuccessful B match,
8512 restore the previous cur_curlyx.
8513
8514 Here's an example for the pattern (AI* BI)*BO
8515 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8516
8517 cur_
8518 curlyx backtrack stack
8519 ------ ---------------
8520 NULL
8521 CO <CO prev=NULL> <WO>
8522 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8523 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8524 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8525
8526 At this point the pattern succeeds, and we work back down the stack to
8527 clean up, restoring as we go:
8528
8529 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8530 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8531 CO <CO prev=NULL> <WO>
8532 NULL
8533
8534 *******************************************************************/
8535
8536 #define ST st->u.curlyx
8537
8538 case CURLYX: /* start of /A*B/ (for complex A) */
8539 {
8540 /* No need to save/restore up to this paren */
8541 I32 parenfloor = scan->flags;
8542
8543 assert(next); /* keep Coverity happy */
8544 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8545 next += ARG(next);
8546
8547 /* XXXX Probably it is better to teach regpush to support
8548 parenfloor > maxopenparen ... */
8549 if (parenfloor > (I32)rex->lastparen)
8550 parenfloor = rex->lastparen; /* Pessimization... */
8551
8552 ST.prev_curlyx= cur_curlyx;
8553 cur_curlyx = st;
8554 ST.cp = PL_savestack_ix;
8555
8556 /* these fields contain the state of the current curly.
8557 * they are accessed by subsequent WHILEMs */
8558 ST.parenfloor = parenfloor;
8559 ST.me = scan;
8560 ST.B = next;
8561 ST.minmod = minmod;
8562 minmod = 0;
8563 ST.count = -1; /* this will be updated by WHILEM */
8564 ST.lastloc = NULL; /* this will be updated by WHILEM */
8565
8566 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8567 script_run_begin);
8568 NOT_REACHED; /* NOTREACHED */
8569 }
8570
8571 case CURLYX_end: /* just finished matching all of A*B */
8572 cur_curlyx = ST.prev_curlyx;
8573 sayYES;
8574 NOT_REACHED; /* NOTREACHED */
8575
8576 case CURLYX_end_fail: /* just failed to match all of A*B */
8577 regcpblow(ST.cp);
8578 cur_curlyx = ST.prev_curlyx;
8579 sayNO;
8580 NOT_REACHED; /* NOTREACHED */
8581
8582
8583 #undef ST
8584 #define ST st->u.whilem
8585
8586 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8587 {
8588 /* see the discussion above about CURLYX/WHILEM */
8589 I32 n;
8590 int min, max;
8591 regnode *A;
8592
8593 assert(cur_curlyx); /* keep Coverity happy */
8594
8595 min = ARG1(cur_curlyx->u.curlyx.me);
8596 max = ARG2(cur_curlyx->u.curlyx.me);
8597 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8598 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8599 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8600 ST.cache_offset = 0;
8601 ST.cache_mask = 0;
8602
8603
8604 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8605 depth, (long)n, min, max)
8606 );
8607
8608 /* First just match a string of min A's. */
8609
8610 if (n < min) {
8611 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8612 cur_curlyx->u.curlyx.lastloc = locinput;
8613 REGCP_SET(ST.lastcp);
8614
8615 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8616 script_run_begin);
8617 NOT_REACHED; /* NOTREACHED */
8618 }
8619
8620 /* If degenerate A matches "", assume A done. */
8621
8622 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8623 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8624 depth)
8625 );
8626 goto do_whilem_B_max;
8627 }
8628
8629 /* super-linear cache processing.
8630 *
8631 * The idea here is that for certain types of CURLYX/WHILEM -
8632 * principally those whose upper bound is infinity (and
8633 * excluding regexes that have things like \1 and other very
8634 * non-regular expresssiony things), then if a pattern like
8635 * /....A*.../ fails and we backtrack to the WHILEM, then we
8636 * make a note that this particular WHILEM op was at string
8637 * position 47 (say) when the rest of pattern failed. Then, if
8638 * we ever find ourselves back at that WHILEM, and at string
8639 * position 47 again, we can just fail immediately rather than
8640 * running the rest of the pattern again.
8641 *
8642 * This is very handy when patterns start to go
8643 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8644 * with a combinatorial explosion of backtracking.
8645 *
8646 * The cache is implemented as a bit array, with one bit per
8647 * string byte position per WHILEM op (up to 16) - so its
8648 * between 0.25 and 2x the string size.
8649 *
8650 * To avoid allocating a poscache buffer every time, we do an
8651 * initially countdown; only after we have executed a WHILEM
8652 * op (string-length x #WHILEMs) times do we allocate the
8653 * cache.
8654 *
8655 * The top 4 bits of scan->flags byte say how many different
8656 * relevant CURLLYX/WHILEM op pairs there are, while the
8657 * bottom 4-bits is the identifying index number of this
8658 * WHILEM.
8659 */
8660
8661 if (scan->flags) {
8662
8663 if (!reginfo->poscache_maxiter) {
8664 /* start the countdown: Postpone detection until we
8665 * know the match is not *that* much linear. */
8666 reginfo->poscache_maxiter
8667 = (reginfo->strend - reginfo->strbeg + 1)
8668 * (scan->flags>>4);
8669 /* possible overflow for long strings and many CURLYX's */
8670 if (reginfo->poscache_maxiter < 0)
8671 reginfo->poscache_maxiter = I32_MAX;
8672 reginfo->poscache_iter = reginfo->poscache_maxiter;
8673 }
8674
8675 if (reginfo->poscache_iter-- == 0) {
8676 /* initialise cache */
8677 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8678 regmatch_info_aux *const aux = reginfo->info_aux;
8679 if (aux->poscache) {
8680 if ((SSize_t)reginfo->poscache_size < size) {
8681 Renew(aux->poscache, size, char);
8682 reginfo->poscache_size = size;
8683 }
8684 Zero(aux->poscache, size, char);
8685 }
8686 else {
8687 reginfo->poscache_size = size;
8688 Newxz(aux->poscache, size, char);
8689 }
8690 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8691 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8692 PL_colors[4], PL_colors[5])
8693 );
8694 }
8695
8696 if (reginfo->poscache_iter < 0) {
8697 /* have we already failed at this position? */
8698 SSize_t offset, mask;
8699
8700 reginfo->poscache_iter = -1; /* stop eventual underflow */
8701 offset = (scan->flags & 0xf) - 1
8702 + (locinput - reginfo->strbeg)
8703 * (scan->flags>>4);
8704 mask = 1 << (offset % 8);
8705 offset /= 8;
8706 if (reginfo->info_aux->poscache[offset] & mask) {
8707 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8708 depth)
8709 );
8710 cur_curlyx->u.curlyx.count--;
8711 sayNO; /* cache records failure */
8712 }
8713 ST.cache_offset = offset;
8714 ST.cache_mask = mask;
8715 }
8716 }
8717
8718 /* Prefer B over A for minimal matching. */
8719
8720 if (cur_curlyx->u.curlyx.minmod) {
8721 ST.save_curlyx = cur_curlyx;
8722 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8723 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8724 locinput, loceol, script_run_begin);
8725 NOT_REACHED; /* NOTREACHED */
8726 }
8727
8728 /* Prefer A over B for maximal matching. */
8729
8730 if (n < max) { /* More greed allowed? */
8731 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8732 maxopenparen);
8733 cur_curlyx->u.curlyx.lastloc = locinput;
8734 REGCP_SET(ST.lastcp);
8735 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8736 script_run_begin);
8737 NOT_REACHED; /* NOTREACHED */
8738 }
8739 goto do_whilem_B_max;
8740 }
8741 NOT_REACHED; /* NOTREACHED */
8742
8743 case WHILEM_B_min: /* just matched B in a minimal match */
8744 case WHILEM_B_max: /* just matched B in a maximal match */
8745 cur_curlyx = ST.save_curlyx;
8746 sayYES;
8747 NOT_REACHED; /* NOTREACHED */
8748
8749 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8750 cur_curlyx = ST.save_curlyx;
8751 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8752 cur_curlyx->u.curlyx.count--;
8753 CACHEsayNO;
8754 NOT_REACHED; /* NOTREACHED */
8755
8756 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8757 /* FALLTHROUGH */
8758 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8759 REGCP_UNWIND(ST.lastcp);
8760 regcppop(rex, &maxopenparen);
8761 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8762 cur_curlyx->u.curlyx.count--;
8763 CACHEsayNO;
8764 NOT_REACHED; /* NOTREACHED */
8765
8766 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8767 REGCP_UNWIND(ST.lastcp);
8768 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8769 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8770 depth)
8771 );
8772 do_whilem_B_max:
8773 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8774 && ckWARN(WARN_REGEXP)
8775 && !reginfo->warned)
8776 {
8777 reginfo->warned = TRUE;
8778 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8779 "Complex regular subexpression recursion limit (%d) "
8780 "exceeded",
8781 REG_INFTY - 1);
8782 }
8783
8784 /* now try B */
8785 ST.save_curlyx = cur_curlyx;
8786 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8787 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8788 locinput, loceol, script_run_begin);
8789 NOT_REACHED; /* NOTREACHED */
8790
8791 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8792 cur_curlyx = ST.save_curlyx;
8793
8794 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8795 /* Maximum greed exceeded */
8796 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8797 && ckWARN(WARN_REGEXP)
8798 && !reginfo->warned)
8799 {
8800 reginfo->warned = TRUE;
8801 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8802 "Complex regular subexpression recursion "
8803 "limit (%d) exceeded",
8804 REG_INFTY - 1);
8805 }
8806 cur_curlyx->u.curlyx.count--;
8807 CACHEsayNO;
8808 }
8809
8810 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8811 );
8812 /* Try grabbing another A and see if it helps. */
8813 cur_curlyx->u.curlyx.lastloc = locinput;
8814 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8815 maxopenparen);
8816 REGCP_SET(ST.lastcp);
8817 PUSH_STATE_GOTO(WHILEM_A_min,
8818 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8819 locinput, loceol, script_run_begin);
8820 NOT_REACHED; /* NOTREACHED */
8821
8822 #undef ST
8823 #define ST st->u.branch
8824
8825 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8826 next = scan + ARG(scan);
8827 if (next == scan)
8828 next = NULL;
8829 scan = NEXTOPER(scan);
8830 /* FALLTHROUGH */
8831
8832 case BRANCH: /* /(...|A|...)/ */
8833 scan = NEXTOPER(scan); /* scan now points to inner node */
8834 ST.lastparen = rex->lastparen;
8835 ST.lastcloseparen = rex->lastcloseparen;
8836 ST.next_branch = next;
8837 REGCP_SET(ST.cp);
8838
8839 /* Now go into the branch */
8840 if (has_cutgroup) {
8841 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8842 script_run_begin);
8843 } else {
8844 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8845 script_run_begin);
8846 }
8847 NOT_REACHED; /* NOTREACHED */
8848
8849 case CUTGROUP: /* /(*THEN)/ */
8850 sv_yes_mark = st->u.mark.mark_name = scan->flags
8851 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8852 : NULL;
8853 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8854 script_run_begin);
8855 NOT_REACHED; /* NOTREACHED */
8856
8857 case CUTGROUP_next_fail:
8858 do_cutgroup = 1;
8859 no_final = 1;
8860 if (st->u.mark.mark_name)
8861 sv_commit = st->u.mark.mark_name;
8862 sayNO;
8863 NOT_REACHED; /* NOTREACHED */
8864
8865 case BRANCH_next:
8866 sayYES;
8867 NOT_REACHED; /* NOTREACHED */
8868
8869 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8870 if (do_cutgroup) {
8871 do_cutgroup = 0;
8872 no_final = 0;
8873 }
8874 REGCP_UNWIND(ST.cp);
8875 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8876 scan = ST.next_branch;
8877 /* no more branches? */
8878 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8879 DEBUG_EXECUTE_r({
8880 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8881 depth,
8882 PL_colors[4],
8883 PL_colors[5] );
8884 });
8885 sayNO_SILENT;
8886 }
8887 continue; /* execute next BRANCH[J] op */
8888 /* NOTREACHED */
8889
8890 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8891 minmod = 1;
8892 break;
8893
8894 #undef ST
8895 #define ST st->u.curlym
8896
8897 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8898
8899 /* This is an optimisation of CURLYX that enables us to push
8900 * only a single backtracking state, no matter how many matches
8901 * there are in {m,n}. It relies on the pattern being constant
8902 * length, with no parens to influence future backrefs
8903 */
8904
8905 ST.me = scan;
8906 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8907
8908 ST.lastparen = rex->lastparen;
8909 ST.lastcloseparen = rex->lastcloseparen;
8910
8911 /* if paren positive, emulate an OPEN/CLOSE around A */
8912 if (ST.me->flags) {
8913 U32 paren = ST.me->flags;
8914 if (paren > maxopenparen)
8915 maxopenparen = paren;
8916 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8917 }
8918 ST.A = scan;
8919 ST.B = next;
8920 ST.alen = 0;
8921 ST.count = 0;
8922 ST.minmod = minmod;
8923 minmod = 0;
8924 ST.Binfo.count = -1;
8925 REGCP_SET(ST.cp);
8926
8927 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8928 goto curlym_do_B;
8929
8930 curlym_do_A: /* execute the A in /A{m,n}B/ */
8931 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8932 script_run_begin);
8933 NOT_REACHED; /* NOTREACHED */
8934
8935 case CURLYM_A: /* we've just matched an A */
8936 ST.count++;
8937 /* after first match, determine A's length: u.curlym.alen */
8938 if (ST.count == 1) {
8939 if (reginfo->is_utf8_target) {
8940 char *s = st->locinput;
8941 while (s < locinput) {
8942 ST.alen++;
8943 s += UTF8SKIP(s);
8944 }
8945 }
8946 else {
8947 ST.alen = locinput - st->locinput;
8948 }
8949 if (ST.alen == 0)
8950 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8951 }
8952 DEBUG_EXECUTE_r(
8953 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8954 depth, (IV) ST.count, (IV)ST.alen)
8955 );
8956
8957 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8958 goto fake_end;
8959
8960 {
8961 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8962 if ( max == REG_INFTY || ST.count < max )
8963 goto curlym_do_A; /* try to match another A */
8964 }
8965 goto curlym_do_B; /* try to match B */
8966
8967 case CURLYM_A_fail: /* just failed to match an A */
8968 REGCP_UNWIND(ST.cp);
8969
8970
8971 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8972 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8973 sayNO;
8974
8975 curlym_do_B: /* execute the B in /A{m,n}B/ */
8976 if (ST.Binfo.count < 0) {
8977 /* calculate possible match of 1st char following curly */
8978 assert(ST.B);
8979 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8980 regnode *text_node = ST.B;
8981 if (! HAS_TEXT(text_node))
8982 FIND_NEXT_IMPT(text_node);
8983 if (PL_regkind[OP(text_node)] == EXACT) {
8984 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8985 &ST.Binfo, reginfo))
8986 {
8987 sayNO;
8988 }
8989 }
8990 }
8991 }
8992
8993 DEBUG_EXECUTE_r(
8994 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
8995 depth, (IV)ST.count)
8996 );
8997 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8998 assert(ST.Binfo.count > 0);
8999
9000 /* Do a quick test to hopefully rule out most non-matches */
9001 if ( locinput + ST.Binfo.min_length > loceol
9002 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9003 {
9004 DEBUG_OPTIMISE_r(
9005 Perl_re_exec_indentf( aTHX_
9006 "CURLYM Fast bail next target=0x%X anded==0x%X"
9007 " mask=0x%X\n",
9008 depth,
9009 (int) nextbyte, ST.Binfo.first_byte_anded,
9010 ST.Binfo.first_byte_mask)
9011 );
9012 state_num = CURLYM_B_fail;
9013 goto reenter_switch;
9014 }
9015 }
9016
9017 if (ST.me->flags) {
9018 /* emulate CLOSE: mark current A as captured */
9019 U32 paren = (U32)ST.me->flags;
9020 if (ST.count) {
9021 CLOSE_CAPTURE(paren,
9022 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9023 locinput - reginfo->strbeg);
9024 }
9025 else
9026 rex->offs[paren].end = -1;
9027
9028 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9029 {
9030 if (ST.count)
9031 goto fake_end;
9032 else
9033 sayNO;
9034 }
9035 }
9036
9037 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9038 script_run_begin);
9039 NOT_REACHED; /* NOTREACHED */
9040
9041 case CURLYM_B_fail: /* just failed to match a B */
9042 REGCP_UNWIND(ST.cp);
9043 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9044 if (ST.minmod) {
9045 I32 max = ARG2(ST.me);
9046 if (max != REG_INFTY && ST.count == max)
9047 sayNO;
9048 goto curlym_do_A; /* try to match a further A */
9049 }
9050 /* backtrack one A */
9051 if (ST.count == ARG1(ST.me) /* min */)
9052 sayNO;
9053 ST.count--;
9054 SET_locinput(HOPc(locinput, -ST.alen));
9055 goto curlym_do_B; /* try to match B */
9056
9057 #undef ST
9058 #define ST st->u.curly
9059
9060 #define CURLY_SETPAREN(paren, success) \
9061 if (paren) { \
9062 if (success) { \
9063 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9064 locinput - reginfo->strbeg); \
9065 } \
9066 else { \
9067 rex->offs[paren].end = -1; \
9068 rex->lastparen = ST.lastparen; \
9069 rex->lastcloseparen = ST.lastcloseparen; \
9070 } \
9071 }
9072
9073 case STAR: /* /A*B/ where A is width 1 char */
9074 ST.paren = 0;
9075 ST.min = 0;
9076 ST.max = REG_INFTY;
9077 scan = NEXTOPER(scan);
9078 goto repeat;
9079
9080 case PLUS: /* /A+B/ where A is width 1 char */
9081 ST.paren = 0;
9082 ST.min = 1;
9083 ST.max = REG_INFTY;
9084 scan = NEXTOPER(scan);
9085 goto repeat;
9086
9087 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9088 ST.paren = scan->flags; /* Which paren to set */
9089 ST.lastparen = rex->lastparen;
9090 ST.lastcloseparen = rex->lastcloseparen;
9091 if (ST.paren > maxopenparen)
9092 maxopenparen = ST.paren;
9093 ST.min = ARG1(scan); /* min to match */
9094 ST.max = ARG2(scan); /* max to match */
9095 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9096
9097 /* handle the single-char capture called as a GOSUB etc */
9098 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9099 {
9100 char *li = locinput;
9101 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9102 sayNO;
9103 SET_locinput(li);
9104 goto fake_end;
9105 }
9106
9107 goto repeat;
9108
9109 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9110 ST.paren = 0;
9111 ST.min = ARG1(scan); /* min to match */
9112 ST.max = ARG2(scan); /* max to match */
9113 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9114 repeat:
9115 /*
9116 * Lookahead to avoid useless match attempts
9117 * when we know what character comes next.
9118 *
9119 * Used to only do .*x and .*?x, but now it allows
9120 * for )'s, ('s and (?{ ... })'s to be in the way
9121 * of the quantifier and the EXACT-like node. -- japhy
9122 */
9123
9124 assert(ST.min <= ST.max);
9125 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9126 ST.Binfo.count = 0;
9127 }
9128 else {
9129 regnode *text_node = next;
9130
9131 if (! HAS_TEXT(text_node))
9132 FIND_NEXT_IMPT(text_node);
9133
9134 if (! HAS_TEXT(text_node))
9135 ST.Binfo.count = 0;
9136 else {
9137 if ( PL_regkind[OP(text_node)] != EXACT ) {
9138 ST.Binfo.count = 0;
9139 }
9140 else {
9141 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9142 &ST.Binfo, reginfo))
9143 {
9144 sayNO;
9145 }
9146 }
9147 }
9148 }
9149
9150 ST.A = scan;
9151 ST.B = next;
9152 if (minmod) {
9153 char *li = locinput;
9154 minmod = 0;
9155 if (ST.min &&
9156 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9157 < ST.min)
9158 sayNO;
9159 SET_locinput(li);
9160 ST.count = ST.min;
9161 REGCP_SET(ST.cp);
9162
9163 if (ST.Binfo.count <= 0)
9164 goto curly_try_B_min;
9165
9166 ST.oldloc = locinput;
9167
9168 /* set ST.maxpos to the furthest point along the
9169 * string that could possibly match, i.e., that a match could
9170 * start at. */
9171 if (ST.max == REG_INFTY) {
9172 ST.maxpos = loceol - 1;
9173 if (utf8_target)
9174 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9175 ST.maxpos--;
9176 }
9177 else if (utf8_target) {
9178 int m = ST.max - ST.min;
9179 for (ST.maxpos = locinput;
9180 m >0 && ST.maxpos < loceol; m--)
9181 ST.maxpos += UTF8SKIP(ST.maxpos);
9182 }
9183 else {
9184 ST.maxpos = locinput + ST.max - ST.min;
9185 if (ST.maxpos >= loceol)
9186 ST.maxpos = loceol - 1;
9187 }
9188 goto curly_try_B_min_known;
9189
9190 }
9191 else {
9192 /* avoid taking address of locinput, so it can remain
9193 * a register var */
9194 char *li = locinput;
9195 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9196 if (ST.count < ST.min)
9197 sayNO;
9198 SET_locinput(li);
9199 if ((ST.count > ST.min)
9200 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9201 {
9202 /* A{m,n} must come at the end of the string, there's
9203 * no point in backing off ... */
9204 ST.min = ST.count;
9205 /* ...except that $ and \Z can match before *and* after
9206 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9207 We may back off by one in this case. */
9208 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9209 ST.min--;
9210 }
9211 REGCP_SET(ST.cp);
9212 goto curly_try_B_max;
9213 }
9214 NOT_REACHED; /* NOTREACHED */
9215
9216 case CURLY_B_min_fail:
9217 /* failed to find B in a non-greedy match. */
9218
9219 REGCP_UNWIND(ST.cp);
9220 if (ST.paren) {
9221 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9222 }
9223
9224 if (ST.Binfo.count == 0) {
9225 /* failed -- move forward one */
9226 char *li = locinput;
9227 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9228 sayNO;
9229 }
9230 locinput = li;
9231 ST.count++;
9232 if (!( ST.count <= ST.max
9233 /* count overflow ? */
9234 || (ST.max == REG_INFTY && ST.count > 0))
9235 )
9236 sayNO;
9237 }
9238 else {
9239 int n;
9240 /* Couldn't or didn't -- move forward. */
9241 ST.oldloc = locinput;
9242 if (utf8_target)
9243 locinput += UTF8SKIP(locinput);
9244 else
9245 locinput++;
9246 ST.count++;
9247
9248 curly_try_B_min_known:
9249 /* find the next place where 'B' could work, then call B */
9250 if (locinput + ST.Binfo.initial_exact < loceol) {
9251 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9252
9253 /* Here, the mask is all 1's for the entire length of
9254 * any possible match. (That actually means that there
9255 * is only one possible match.) Look for the next
9256 * occurrence */
9257 locinput = ninstr(locinput, loceol,
9258 (char *) ST.Binfo.matches,
9259 (char *) ST.Binfo.matches
9260 + ST.Binfo.initial_exact);
9261 if (locinput == NULL) {
9262 sayNO;
9263 }
9264 }
9265 else do {
9266 /* If the first byte(s) of the mask are all ones, it
9267 * means those bytes must match identically, so can use
9268 * ninstr() to find the next possible matchpoint */
9269 if (ST.Binfo.initial_exact > 0) {
9270 locinput = ninstr(locinput, loceol,
9271 (char *) ST.Binfo.matches,
9272 (char *) ST.Binfo.matches
9273 + ST.Binfo.initial_exact);
9274 }
9275 else { /* Otherwise find the next byte that matches,
9276 masked */
9277 locinput = (char *) find_next_masked(
9278 (U8 *) locinput, (U8 *) loceol,
9279 ST.Binfo.first_byte_anded,
9280 ST.Binfo.first_byte_mask);
9281 /* Advance to the end of a multi-byte character */
9282 if (utf8_target) {
9283 while ( locinput < loceol
9284 && UTF8_IS_CONTINUATION(*locinput))
9285 {
9286 locinput++;
9287 }
9288 }
9289 }
9290 if ( locinput == NULL
9291 || locinput + ST.Binfo.min_length > loceol)
9292 {
9293 sayNO;
9294 }
9295
9296 /* Here, we have found a possible match point; if can't
9297 * rule it out, quit the loop so can check fully */
9298 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9299 break;
9300 }
9301
9302 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9303
9304 } while (locinput <= ST.maxpos);
9305 }
9306
9307 if (locinput > ST.maxpos)
9308 sayNO;
9309
9310 n = (utf8_target)
9311 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9312 : (STRLEN) (locinput - ST.oldloc);
9313
9314
9315 /* Here is at the beginning of a character that meets the mask
9316 * criteria. Need to make sure that some real possibility */
9317
9318 if (n) {
9319 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9320 * at what may be the beginning of b; check that everything
9321 * between oldloc and locinput matches */
9322 char *li = ST.oldloc;
9323 ST.count += n;
9324 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9325 sayNO;
9326 assert(n == REG_INFTY || locinput == li);
9327 }
9328 }
9329
9330 curly_try_B_min:
9331 CURLY_SETPAREN(ST.paren, ST.count);
9332 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9333 script_run_begin);
9334 NOT_REACHED; /* NOTREACHED */
9335
9336
9337 curly_try_B_max:
9338 /* a successful greedy match: now try to match B */
9339 if ( ST.Binfo.count <= 0
9340 || ( ST.Binfo.count > 0
9341 && locinput + ST.Binfo.min_length <= loceol
9342 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9343 {
9344 CURLY_SETPAREN(ST.paren, ST.count);
9345 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9346 script_run_begin);
9347 NOT_REACHED; /* NOTREACHED */
9348 }
9349 /* FALLTHROUGH */
9350
9351 case CURLY_B_max_fail:
9352 /* failed to find B in a greedy match */
9353
9354 REGCP_UNWIND(ST.cp);
9355 if (ST.paren) {
9356 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9357 }
9358 /* back up. */
9359 if (--ST.count < ST.min)
9360 sayNO;
9361 locinput = HOPc(locinput, -1);
9362 goto curly_try_B_max;
9363
9364 #undef ST
9365
9366 case END: /* last op of main pattern */
9367 fake_end:
9368 if (cur_eval) {
9369 /* we've just finished A in /(??{A})B/; now continue with B */
9370 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9371 st->u.eval.prev_rex = rex_sv; /* inner */
9372
9373 /* Save *all* the positions. */
9374 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9375 rex_sv = CUR_EVAL.prev_rex;
9376 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9377 SET_reg_curpm(rex_sv);
9378 rex = ReANY(rex_sv);
9379 rexi = RXi_GET(rex);
9380
9381 st->u.eval.prev_curlyx = cur_curlyx;
9382 cur_curlyx = CUR_EVAL.prev_curlyx;
9383
9384 REGCP_SET(st->u.eval.lastcp);
9385
9386 /* Restore parens of the outer rex without popping the
9387 * savestack */
9388 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9389
9390 st->u.eval.prev_eval = cur_eval;
9391 cur_eval = CUR_EVAL.prev_eval;
9392 DEBUG_EXECUTE_r(
9393 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9394 depth, cur_eval););
9395 if ( nochange_depth )
9396 nochange_depth--;
9397
9398 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9399
9400 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9401 st->u.eval.prev_eval->u.eval.B,
9402 locinput, loceol, script_run_begin);
9403 }
9404
9405 if (locinput < reginfo->till) {
9406 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9407 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9408 PL_colors[4],
9409 (long)(locinput - startpos),
9410 (long)(reginfo->till - startpos),
9411 PL_colors[5]));
9412
9413 sayNO_SILENT; /* Cannot match: too short. */
9414 }
9415 sayYES; /* Success! */
9416
9417 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
9418 DEBUG_EXECUTE_r(
9419 Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n",
9420 depth, PL_colors[4], PL_colors[5]));
9421 sayYES; /* Success! */
9422
9423 #undef ST
9424 #define ST st->u.ifmatch
9425
9426 case SUSPEND: /* (?>A) */
9427 ST.wanted = 1;
9428 ST.start = locinput;
9429 ST.end = loceol;
9430 ST.count = 1;
9431 goto do_ifmatch;
9432
9433 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9434 ST.wanted = 0;
9435 goto ifmatch_trivial_fail_test;
9436
9437 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9438 ST.wanted = 1;
9439 ifmatch_trivial_fail_test:
9440 ST.count = scan->next_off + 1; /* next_off repurposed to be
9441 lookbehind count, requires
9442 non-zero flags */
9443 if (! scan->flags) { /* 'flags' zero means lookahed */
9444
9445 /* Lookahead starts here and ends at the normal place */
9446 ST.start = locinput;
9447 ST.end = loceol;
9448 }
9449 else {
9450 PERL_UINT_FAST8_T back_count = scan->flags;
9451 char * s;
9452
9453 /* Lookbehind can look beyond the current position */
9454 ST.end = loceol;
9455
9456 /* ... and starts at the first place in the input that is in
9457 * the range of the possible start positions */
9458 for (; ST.count > 0; ST.count--, back_count--) {
9459 s = HOPBACKc(locinput, back_count);
9460 if (s) {
9461 ST.start = s;
9462 goto do_ifmatch;
9463 }
9464 }
9465
9466 /* If the lookbehind doesn't start in the actual string, is a
9467 * trivial match failure */
9468 if (logical) {
9469 logical = 0;
9470 sw = 1 - cBOOL(ST.wanted);
9471 }
9472 else if (ST.wanted)
9473 sayNO;
9474
9475 /* Here, we didn't want it to match, so is actually success */
9476 next = scan + ARG(scan);
9477 if (next == scan)
9478 next = NULL;
9479 break;
9480 }
9481
9482 do_ifmatch:
9483 ST.me = scan;
9484 ST.logical = logical;
9485 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9486
9487 /* execute body of (?...A) */
9488 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9489 ST.end, script_run_begin);
9490 NOT_REACHED; /* NOTREACHED */
9491
9492 {
9493 bool matched;
9494
9495 case IFMATCH_A_fail: /* body of (?...A) failed */
9496 if (! ST.logical && ST.count > 1) {
9497
9498 /* It isn't a real failure until we've tried all starting
9499 * positions. Move to the next starting position and retry */
9500 ST.count--;
9501 ST.start = HOPc(ST.start, 1);
9502 scan = ST.me;
9503 logical = ST.logical;
9504 goto do_ifmatch;
9505 }
9506
9507 /* Here, all starting positions have been tried. */
9508 matched = FALSE;
9509 goto ifmatch_done;
9510
9511 case IFMATCH_A: /* body of (?...A) succeeded */
9512 matched = TRUE;
9513 ifmatch_done:
9514 sw = matched == ST.wanted;
9515 if (! ST.logical && !sw) {
9516 sayNO;
9517 }
9518
9519 if (OP(ST.me) != SUSPEND) {
9520 /* restore old position except for (?>...) */
9521 locinput = st->locinput;
9522 loceol = st->loceol;
9523 script_run_begin = st->sr0;
9524 }
9525 scan = ST.me + ARG(ST.me);
9526 if (scan == ST.me)
9527 scan = NULL;
9528 continue; /* execute B */
9529 }
9530
9531 #undef ST
9532
9533 case LONGJMP: /* alternative with many branches compiles to
9534 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9535 next = scan + ARG(scan);
9536 if (next == scan)
9537 next = NULL;
9538 break;
9539
9540 case COMMIT: /* (*COMMIT) */
9541 reginfo->cutpoint = loceol;
9542 /* FALLTHROUGH */
9543
9544 case PRUNE: /* (*PRUNE) */
9545 if (scan->flags)
9546 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9547 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9548 script_run_begin);
9549 NOT_REACHED; /* NOTREACHED */
9550
9551 case COMMIT_next_fail:
9552 no_final = 1;
9553 /* FALLTHROUGH */
9554 sayNO;
9555 NOT_REACHED; /* NOTREACHED */
9556
9557 case OPFAIL: /* (*FAIL) */
9558 if (scan->flags)
9559 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9560 if (logical) {
9561 /* deal with (?(?!)X|Y) properly,
9562 * make sure we trigger the no branch
9563 * of the trailing IFTHEN structure*/
9564 sw= 0;
9565 break;
9566 } else {
9567 sayNO;
9568 }
9569 NOT_REACHED; /* NOTREACHED */
9570
9571 #define ST st->u.mark
9572 case MARKPOINT: /* (*MARK:foo) */
9573 ST.prev_mark = mark_state;
9574 ST.mark_name = sv_commit = sv_yes_mark
9575 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9576 mark_state = st;
9577 ST.mark_loc = locinput;
9578 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9579 script_run_begin);
9580 NOT_REACHED; /* NOTREACHED */
9581
9582 case MARKPOINT_next:
9583 mark_state = ST.prev_mark;
9584 sayYES;
9585 NOT_REACHED; /* NOTREACHED */
9586
9587 case MARKPOINT_next_fail:
9588 if (popmark && sv_eq(ST.mark_name,popmark))
9589 {
9590 if (ST.mark_loc > startpoint)
9591 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9592 popmark = NULL; /* we found our mark */
9593 sv_commit = ST.mark_name;
9594
9595 DEBUG_EXECUTE_r({
9596 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9597 depth,
9598 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9599 });
9600 }
9601 mark_state = ST.prev_mark;
9602 sv_yes_mark = mark_state ?
9603 mark_state->u.mark.mark_name : NULL;
9604 sayNO;
9605 NOT_REACHED; /* NOTREACHED */
9606
9607 case SKIP: /* (*SKIP) */
9608 if (!scan->flags) {
9609 /* (*SKIP) : if we fail we cut here*/
9610 ST.mark_name = NULL;
9611 ST.mark_loc = locinput;
9612 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9613 script_run_begin);
9614 } else {
9615 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9616 otherwise do nothing. Meaning we need to scan
9617 */
9618 regmatch_state *cur = mark_state;
9619 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9620
9621 while (cur) {
9622 if ( sv_eq( cur->u.mark.mark_name,
9623 find ) )
9624 {
9625 ST.mark_name = find;
9626 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9627 script_run_begin);
9628 }
9629 cur = cur->u.mark.prev_mark;
9630 }
9631 }
9632 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9633 break;
9634
9635 case SKIP_next_fail:
9636 if (ST.mark_name) {
9637 /* (*CUT:NAME) - Set up to search for the name as we
9638 collapse the stack*/
9639 popmark = ST.mark_name;
9640 } else {
9641 /* (*CUT) - No name, we cut here.*/
9642 if (ST.mark_loc > startpoint)
9643 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9644 /* but we set sv_commit to latest mark_name if there
9645 is one so they can test to see how things lead to this
9646 cut */
9647 if (mark_state)
9648 sv_commit=mark_state->u.mark.mark_name;
9649 }
9650 no_final = 1;
9651 sayNO;
9652 NOT_REACHED; /* NOTREACHED */
9653 #undef ST
9654
9655 case LNBREAK: /* \R */
9656 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9657 locinput += n;
9658 } else
9659 sayNO;
9660 break;
9661
9662 default:
9663 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9664 PTR2UV(scan), OP(scan));
9665 Perl_croak(aTHX_ "regexp memory corruption");
9666
9667 /* this is a point to jump to in order to increment
9668 * locinput by one character */
9669 increment_locinput:
9670 assert(!NEXTCHR_IS_EOS);
9671 if (utf8_target) {
9672 locinput += PL_utf8skip[nextbyte];
9673 /* locinput is allowed to go 1 char off the end (signifying
9674 * EOS), but not 2+ */
9675 if (locinput > loceol)
9676 sayNO;
9677 }
9678 else
9679 locinput++;
9680 break;
9681
9682 } /* end switch */
9683
9684 /* switch break jumps here */
9685 scan = next; /* prepare to execute the next op and ... */
9686 continue; /* ... jump back to the top, reusing st */
9687 /* NOTREACHED */
9688
9689 push_yes_state:
9690 /* push a state that backtracks on success */
9691 st->u.yes.prev_yes_state = yes_state;
9692 yes_state = st;
9693 /* FALLTHROUGH */
9694 push_state:
9695 /* push a new regex state, then continue at scan */
9696 {
9697 regmatch_state *newst;
9698 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9699
9700 DEBUG_r( /* DEBUG_STACK_r */
9701 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9702 regmatch_state *cur = st;
9703 regmatch_state *curyes = yes_state;
9704 U32 i;
9705 regmatch_slab *slab = PL_regmatch_slab;
9706 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9707 if (cur < SLAB_FIRST(slab)) {
9708 slab = slab->prev;
9709 cur = SLAB_LAST(slab);
9710 }
9711 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9712 depth,
9713 i ? " " : "push",
9714 depth - i, PL_reg_name[cur->resume_state],
9715 (curyes == cur) ? "yes" : ""
9716 );
9717 if (curyes == cur)
9718 curyes = cur->u.yes.prev_yes_state;
9719 }
9720 } else {
9721 DEBUG_STATE_pp("push")
9722 });
9723 depth++;
9724 st->locinput = locinput;
9725 st->loceol = loceol;
9726 st->sr0 = script_run_begin;
9727 newst = st+1;
9728 if (newst > SLAB_LAST(PL_regmatch_slab))
9729 newst = S_push_slab(aTHX);
9730 PL_regmatch_state = newst;
9731
9732 locinput = pushinput;
9733 loceol = pusheol;
9734 script_run_begin = pushsr0;
9735 st = newst;
9736 continue;
9737 /* NOTREACHED */
9738 }
9739 }
9740 #ifdef SOLARIS_BAD_OPTIMIZER
9741 # undef PL_charclass
9742 #endif
9743
9744 /*
9745 * We get here only if there's trouble -- normally "case END" is
9746 * the terminating point.
9747 */
9748 Perl_croak(aTHX_ "corrupted regexp pointers");
9749 NOT_REACHED; /* NOTREACHED */
9750
9751 yes:
9752 if (yes_state) {
9753 /* we have successfully completed a subexpression, but we must now
9754 * pop to the state marked by yes_state and continue from there */
9755 assert(st != yes_state);
9756 #ifdef DEBUGGING
9757 while (st != yes_state) {
9758 st--;
9759 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9760 PL_regmatch_slab = PL_regmatch_slab->prev;
9761 st = SLAB_LAST(PL_regmatch_slab);
9762 }
9763 DEBUG_STATE_r({
9764 if (no_final) {
9765 DEBUG_STATE_pp("pop (no final)");
9766 } else {
9767 DEBUG_STATE_pp("pop (yes)");
9768 }
9769 });
9770 depth--;
9771 }
9772 #else
9773 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9774 || yes_state > SLAB_LAST(PL_regmatch_slab))
9775 {
9776 /* not in this slab, pop slab */
9777 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9778 PL_regmatch_slab = PL_regmatch_slab->prev;
9779 st = SLAB_LAST(PL_regmatch_slab);
9780 }
9781 depth -= (st - yes_state);
9782 #endif
9783 st = yes_state;
9784 yes_state = st->u.yes.prev_yes_state;
9785 PL_regmatch_state = st;
9786
9787 if (no_final) {
9788 locinput= st->locinput;
9789 loceol= st->loceol;
9790 script_run_begin = st->sr0;
9791 }
9792 state_num = st->resume_state + no_final;
9793 goto reenter_switch;
9794 }
9795
9796 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9797 PL_colors[4], PL_colors[5]));
9798
9799 if (reginfo->info_aux_eval) {
9800 /* each successfully executed (?{...}) block does the equivalent of
9801 * local $^R = do {...}
9802 * When popping the save stack, all these locals would be undone;
9803 * bypass this by setting the outermost saved $^R to the latest
9804 * value */
9805 /* I dont know if this is needed or works properly now.
9806 * see code related to PL_replgv elsewhere in this file.
9807 * Yves
9808 */
9809 if (oreplsv != GvSV(PL_replgv)) {
9810 sv_setsv(oreplsv, GvSV(PL_replgv));
9811 SvSETMAGIC(oreplsv);
9812 }
9813 }
9814 result = 1;
9815 goto final_exit;
9816
9817 no:
9818 DEBUG_EXECUTE_r(
9819 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9820 depth,
9821 PL_colors[4], PL_colors[5])
9822 );
9823
9824 no_silent:
9825 if (no_final) {
9826 if (yes_state) {
9827 goto yes;
9828 } else {
9829 goto final_exit;
9830 }
9831 }
9832 if (depth) {
9833 /* there's a previous state to backtrack to */
9834 st--;
9835 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9836 PL_regmatch_slab = PL_regmatch_slab->prev;
9837 st = SLAB_LAST(PL_regmatch_slab);
9838 }
9839 PL_regmatch_state = st;
9840 locinput= st->locinput;
9841 loceol= st->loceol;
9842 script_run_begin = st->sr0;
9843
9844 DEBUG_STATE_pp("pop");
9845 depth--;
9846 if (yes_state == st)
9847 yes_state = st->u.yes.prev_yes_state;
9848
9849 state_num = st->resume_state + 1; /* failure = success + 1 */
9850 PERL_ASYNC_CHECK();
9851 goto reenter_switch;
9852 }
9853 result = 0;
9854
9855 final_exit:
9856 if (rex->intflags & PREGf_VERBARG_SEEN) {
9857 SV *sv_err = get_sv("REGERROR", 1);
9858 SV *sv_mrk = get_sv("REGMARK", 1);
9859 if (result) {
9860 sv_commit = &PL_sv_no;
9861 if (!sv_yes_mark)
9862 sv_yes_mark = &PL_sv_yes;
9863 } else {
9864 if (!sv_commit)
9865 sv_commit = &PL_sv_yes;
9866 sv_yes_mark = &PL_sv_no;
9867 }
9868 assert(sv_err);
9869 assert(sv_mrk);
9870 sv_setsv(sv_err, sv_commit);
9871 sv_setsv(sv_mrk, sv_yes_mark);
9872 }
9873
9874
9875 if (last_pushed_cv) {
9876 dSP;
9877 /* see "Some notes about MULTICALL" above */
9878 POP_MULTICALL;
9879 PERL_UNUSED_VAR(SP);
9880 }
9881 else
9882 LEAVE_SCOPE(orig_savestack_ix);
9883
9884 assert(!result || locinput - reginfo->strbeg >= 0);
9885 return result ? locinput - reginfo->strbeg : -1;
9886 }
9887
9888 /*
9889 - regrepeat - repeatedly match something simple, report how many
9890 *
9891 * What 'simple' means is a node which can be the operand of a quantifier like
9892 * '+', or {1,3}
9893 *
9894 * startposp - pointer to a pointer to the start position. This is updated
9895 * to point to the byte following the highest successful
9896 * match.
9897 * p - the regnode to be repeatedly matched against.
9898 * loceol - pointer to the end position beyond which we aren't supposed to
9899 * look.
9900 * reginfo - struct holding match state, such as utf8_target
9901 * max - maximum number of things to match.
9902 * depth - (for debugging) backtracking depth.
9903 */
9904 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max _pDEPTH)9905 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9906 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9907 {
9908 char *scan; /* Pointer to current position in target string */
9909 I32 c;
9910 char *this_eol = loceol; /* potentially adjusted version. */
9911 I32 hardcount = 0; /* How many matches so far */
9912 bool utf8_target = reginfo->is_utf8_target;
9913 unsigned int to_complement = 0; /* Invert the result? */
9914 _char_class_number classnum;
9915
9916 PERL_ARGS_ASSERT_REGREPEAT;
9917
9918 /* This routine is structured so that we switch on the input OP. Each OP
9919 * case: statement contains a loop to repeatedly apply the OP, advancing
9920 * the input until it fails, or reaches the end of the input, or until it
9921 * reaches the upper limit of matches. */
9922
9923 scan = *startposp;
9924 if (max == REG_INFTY) /* This is a special marker to go to the platform's
9925 max */
9926 max = I32_MAX;
9927 else if (! utf8_target && this_eol - scan > max)
9928 this_eol = scan + max;
9929
9930 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
9931 * down to the maximum of how far we should go in it (but leaving it set to
9932 * the real end if the maximum permissible would take us beyond that).
9933 * This allows us to make the loop exit condition that we haven't gone past
9934 * <this_eol> to also mean that we haven't exceeded the max permissible
9935 * count, saving a test each time through the loop. But it assumes that
9936 * the OP matches a single byte, which is true for most of the OPs below
9937 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
9938 * have this characteristic have to compensate.
9939 *
9940 * There is no such adjustment for UTF-8 targets, since the number of bytes
9941 * per character can vary. OPs will have to test both that the count is
9942 * less than the max permissible (using <hardcount> to keep track), and
9943 * that we are still within the bounds of the string (using <this_eol>. A
9944 * few OPs match a single byte no matter what the encoding. They can omit
9945 * the max test if, for the UTF-8 case, they do the adjustment that was
9946 * skipped above.
9947 *
9948 * Thus, the code above sets things up for the common case; and exceptional
9949 * cases need extra work; the common case is to make sure <scan> doesn't go
9950 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9951 * count doesn't exceed the maximum permissible */
9952
9953 switch (with_t_UTF8ness(OP(p), utf8_target)) {
9954 case REG_ANY_t8:
9955 while (scan < this_eol && hardcount < max && *scan != '\n') {
9956 scan += UTF8SKIP(scan);
9957 hardcount++;
9958 }
9959 break;
9960
9961 case REG_ANY_tb:
9962 scan = (char *) memchr(scan, '\n', this_eol - scan);
9963 if (! scan) {
9964 scan = this_eol;
9965 }
9966 break;
9967
9968 case SANY_t8:
9969 while (scan < this_eol && hardcount < max) {
9970 scan += UTF8SKIP(scan);
9971 hardcount++;
9972 }
9973 break;
9974
9975 case SANY_tb:
9976 scan = this_eol;
9977 break;
9978
9979 case EXACT_REQ8_tb:
9980 case LEXACT_REQ8_tb:
9981 case EXACTFU_REQ8_tb:
9982 break;
9983
9984 case EXACTL_t8:
9985 if (UTF8_IS_ABOVE_LATIN1(*scan)) {
9986 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9987 }
9988 /* FALLTHROUGH */
9989
9990 case EXACTL_tb:
9991 case EXACTFL_t8:
9992 case EXACTFL_tb:
9993 case EXACTFLU8_t8:
9994 case EXACTFLU8_tb:
9995 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9996 /* FALLTHROUGH */
9997
9998 case EXACT_REQ8_t8:
9999 case LEXACT_REQ8_t8:
10000 case EXACTFU_REQ8_t8:
10001 case LEXACT_t8:
10002 case LEXACT_tb:
10003 case EXACT_t8:
10004 case EXACT_tb:
10005 case EXACTF_t8:
10006 case EXACTF_tb:
10007 case EXACTFAA_NO_TRIE_t8:
10008 case EXACTFAA_NO_TRIE_tb:
10009 case EXACTFAA_t8:
10010 case EXACTFAA_tb:
10011 case EXACTFU_t8:
10012 case EXACTFU_tb:
10013 case EXACTFUP_t8:
10014 case EXACTFUP_tb:
10015
10016 {
10017 struct next_matchable_info Binfo;
10018 PERL_UINT_FAST8_T definitive_len;
10019
10020 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10021
10022 /* Set up termination info, and quit if we can rule out that we've
10023 * gotten a match of the termination criteria */
10024 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10025 || scan + Binfo.min_length > this_eol
10026 || ! S_test_EXACTISH_ST(scan, Binfo))
10027 {
10028 break;
10029 }
10030
10031 definitive_len = Binfo.initial_definitive;
10032
10033 /* Here there are potential matches, and the first byte(s) matched our
10034 * filter
10035 *
10036 * If we got a definitive match of some initial bytes, there is no
10037 * possibility of false positives as far as it got */
10038 if (definitive_len > 0) {
10039
10040 /* If as far as it got is the maximum possible, there were no false
10041 * positives at all. Since we have everything set up, see how many
10042 * repeats there are. */
10043 if (definitive_len >= Binfo.max_length) {
10044
10045 /* We've already found one match */
10046 scan += definitive_len;
10047 hardcount++;
10048
10049 /* If want more than the one match, and there is room for more,
10050 * see if there are any */
10051 if (hardcount < max && scan + definitive_len <= this_eol) {
10052
10053 /* If the character is only a single byte long, just span
10054 * all such bytes. */
10055 if (definitive_len == 1) {
10056 const char * orig_scan = scan;
10057
10058 if (this_eol - (scan - hardcount) > max) {
10059 this_eol = scan - hardcount + max;
10060 }
10061
10062 /* Use different routines depending on whether it's an
10063 * exact match or matches with a mask */
10064 if (Binfo.initial_exact == 1) {
10065 scan = (char *) find_span_end((U8 *) scan,
10066 (U8 *) this_eol,
10067 Binfo.matches[0]);
10068 }
10069 else {
10070 scan = (char *) find_span_end_mask(
10071 (U8 *) scan,
10072 (U8 *) this_eol,
10073 Binfo.first_byte_anded,
10074 Binfo.first_byte_mask);
10075 }
10076
10077 hardcount += scan - orig_scan;
10078 }
10079 else { /* Here, the full character definitive match is more
10080 than one byte */
10081 while ( hardcount < max
10082 && scan + definitive_len <= this_eol
10083 && S_test_EXACTISH_ST(scan, Binfo))
10084 {
10085 scan += definitive_len;
10086 hardcount++;
10087 }
10088 }
10089 }
10090
10091 break;
10092 } /* End of a full character is definitively matched */
10093
10094 /* Here, an initial portion of the character matched definitively,
10095 * and the rest matched as well, but could have false positives */
10096
10097 do {
10098 PERL_INT_FAST8_T i;
10099 U8 * matches = Binfo.matches;
10100
10101 /* The first bytes were definitive. Look at the remaining */
10102 for (i = 0; i < Binfo.count; i++) {
10103 if (memEQ(scan + definitive_len,
10104 matches + definitive_len,
10105 Binfo.lengths[i] - definitive_len))
10106 {
10107 goto found_a_completion;
10108 }
10109
10110 matches += Binfo.lengths[i];
10111 }
10112
10113 /* Didn't find anything to complete our initial match. Stop
10114 * here */
10115 break;
10116
10117 found_a_completion:
10118
10119 /* Here, matched a full character, Include it in the result,
10120 * and then look to see if the next char matches */
10121 hardcount++;
10122 scan += Binfo.lengths[i];
10123
10124 } while ( hardcount < max
10125 && scan + definitive_len < this_eol
10126 && S_test_EXACTISH_ST(scan, Binfo));
10127
10128 /* Here, have advanced as far as possible */
10129 break;
10130 } /* End of found some initial bytes that definitively matched */
10131
10132 /* Here, we can't rule out that we have found the beginning of 'B', but
10133 * there were no initial bytes that could rule out anything
10134 * definitively. Use brute force to examine all the possibilities */
10135 while (scan < this_eol && hardcount < max) {
10136 PERL_INT_FAST8_T i;
10137 U8 * matches = Binfo.matches;
10138
10139 for (i = 0; i < Binfo.count; i++) {
10140 if (memEQ(scan, matches, Binfo.lengths[i])) {
10141 goto found1;
10142 }
10143
10144 matches += Binfo.lengths[i];
10145 }
10146
10147 break;
10148
10149 found1:
10150 hardcount++;
10151 scan += Binfo.lengths[i];
10152 }
10153
10154 break;
10155 }
10156
10157 case ANYOFPOSIXL_t8:
10158 case ANYOFL_t8:
10159 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10160 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10161
10162 /* FALLTHROUGH */
10163 case ANYOFD_t8:
10164 case ANYOF_t8:
10165 while ( hardcount < max
10166 && scan < this_eol
10167 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10168 {
10169 scan += UTF8SKIP(scan);
10170 hardcount++;
10171 }
10172 break;
10173
10174 case ANYOFPOSIXL_tb:
10175 case ANYOFL_tb:
10176 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10177 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10178 /* FALLTHROUGH */
10179
10180 case ANYOFD_tb:
10181 case ANYOF_tb:
10182 if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10183 while ( scan < this_eol
10184 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10185 scan++;
10186 }
10187 else {
10188 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10189 scan++;
10190 }
10191 break;
10192
10193 case ANYOFM_t8:
10194 if (this_eol - scan > max) {
10195
10196 /* We didn't adjust <this_eol> at the beginning of this routine
10197 * because is UTF-8, but it is actually ok to do so, since here, to
10198 * match, 1 char == 1 byte. */
10199 this_eol = scan + max;
10200 }
10201 /* FALLTHROUGH */
10202
10203 case ANYOFM_tb:
10204 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10205 (U8) ARG(p), FLAGS(p));
10206 break;
10207
10208 case NANYOFM_t8:
10209 while ( hardcount < max
10210 && scan < this_eol
10211 && (*scan & FLAGS(p)) != ARG(p))
10212 {
10213 scan += UTF8SKIP(scan);
10214 hardcount++;
10215 }
10216 break;
10217
10218 case NANYOFM_tb:
10219 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10220 (U8) ARG(p), FLAGS(p));
10221 break;
10222
10223 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10224 case ANYOFHb_tb:
10225 case ANYOFHr_tb:
10226 case ANYOFHs_tb:
10227 break;
10228
10229 case ANYOFH_t8:
10230 while ( hardcount < max
10231 && scan < this_eol
10232 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10233 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10234 {
10235 scan += UTF8SKIP(scan);
10236 hardcount++;
10237 }
10238 break;
10239
10240 case ANYOFHb_t8:
10241 /* we know the first byte must be the FLAGS field */
10242 while ( hardcount < max
10243 && scan < this_eol
10244 && (U8) *scan == ANYOF_FLAGS(p)
10245 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10246 {
10247 scan += UTF8SKIP(scan);
10248 hardcount++;
10249 }
10250 break;
10251
10252 case ANYOFHr_t8:
10253 while ( hardcount < max
10254 && scan < this_eol
10255 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10256 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10257 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10258 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10259 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10260 {
10261 scan += UTF8SKIP(scan);
10262 hardcount++;
10263 }
10264 break;
10265
10266 case ANYOFHs_t8:
10267 while ( hardcount < max
10268 && scan + FLAGS(p) < this_eol
10269 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10270 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10271 {
10272 scan += UTF8SKIP(scan);
10273 hardcount++;
10274 }
10275 break;
10276
10277 case ANYOFR_t8:
10278 while ( hardcount < max
10279 && scan < this_eol
10280 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10281 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10282 (U8 *) this_eol,
10283 NULL),
10284 ANYOFRbase(p), ANYOFRdelta(p)))
10285 {
10286 scan += UTF8SKIP(scan);
10287 hardcount++;
10288 }
10289 break;
10290
10291 case ANYOFR_tb:
10292 while ( hardcount < max
10293 && scan < this_eol
10294 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10295 {
10296 scan++;
10297 hardcount++;
10298 }
10299 break;
10300
10301 case ANYOFRb_t8:
10302 while ( hardcount < max
10303 && scan < this_eol
10304 && (U8) *scan == ANYOF_FLAGS(p)
10305 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10306 (U8 *) this_eol,
10307 NULL),
10308 ANYOFRbase(p), ANYOFRdelta(p)))
10309 {
10310 scan += UTF8SKIP(scan);
10311 hardcount++;
10312 }
10313 break;
10314
10315 case ANYOFRb_tb:
10316 while ( hardcount < max
10317 && scan < this_eol
10318 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10319 {
10320 scan++;
10321 hardcount++;
10322 }
10323 break;
10324
10325 /* The argument (FLAGS) to all the POSIX node types is the class number */
10326
10327 case NPOSIXL_tb:
10328 to_complement = 1;
10329 /* FALLTHROUGH */
10330
10331 case POSIXL_tb:
10332 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10333 while ( scan < this_eol
10334 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10335 {
10336 scan++;
10337 }
10338 break;
10339
10340 case NPOSIXL_t8:
10341 to_complement = 1;
10342 /* FALLTHROUGH */
10343
10344 case POSIXL_t8:
10345 while ( hardcount < max && scan < this_eol
10346 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10347 (U8 *) scan,
10348 (U8 *) this_eol)))
10349 {
10350 scan += UTF8SKIP(scan);
10351 hardcount++;
10352 }
10353 break;
10354
10355 case POSIXD_tb:
10356 /* FALLTHROUGH */
10357
10358 case POSIXA_t8:
10359 if (this_eol - scan > max) {
10360
10361 /* We didn't adjust <this_eol> at the beginning of this routine
10362 * because is UTF-8, but it is actually ok to do so, since here, to
10363 * match, 1 char == 1 byte. */
10364 this_eol = scan + max;
10365 }
10366 /* FALLTHROUGH */
10367
10368 case POSIXA_tb:
10369 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10370 scan++;
10371 }
10372 break;
10373
10374 case NPOSIXD_tb:
10375 /* FALLTHROUGH */
10376
10377 case NPOSIXA_tb:
10378 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10379 scan++;
10380 }
10381 break;
10382
10383 case NPOSIXA_t8:
10384
10385 /* The complement of something that matches only ASCII matches all
10386 * non-ASCII, plus everything in ASCII that isn't in the class. */
10387 while ( hardcount < max && scan < this_eol
10388 && ( ! isASCII_utf8_safe(scan, loceol)
10389 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10390 {
10391 scan += UTF8SKIP(scan);
10392 hardcount++;
10393 }
10394 break;
10395
10396 case NPOSIXU_tb:
10397 to_complement = 1;
10398 /* FALLTHROUGH */
10399
10400 case POSIXU_tb:
10401 while ( scan < this_eol
10402 && to_complement ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10403 {
10404 scan++;
10405 }
10406 break;
10407
10408 case NPOSIXU_t8:
10409 case NPOSIXD_t8:
10410 to_complement = 1;
10411 /* FALLTHROUGH */
10412
10413 case POSIXD_t8:
10414 case POSIXU_t8:
10415 classnum = (_char_class_number) FLAGS(p);
10416 switch (classnum) {
10417 default:
10418 while ( hardcount < max && scan < this_eol
10419 && to_complement
10420 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10421 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10422 {
10423 scan += UTF8SKIP(scan);
10424 hardcount++;
10425 }
10426 break;
10427
10428 /* For the classes below, the knowledge of how to handle every code
10429 * point is compiled into Perl via a macro. This code is written
10430 * for making the loops as tight as possible. It could be
10431 * refactored to save space instead. */
10432
10433 case _CC_ENUM_SPACE:
10434 while ( hardcount < max
10435 && scan < this_eol
10436 && (to_complement
10437 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10438 {
10439 scan += UTF8SKIP(scan);
10440 hardcount++;
10441 }
10442 break;
10443 case _CC_ENUM_BLANK:
10444 while ( hardcount < max
10445 && scan < this_eol
10446 && (to_complement
10447 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10448 {
10449 scan += UTF8SKIP(scan);
10450 hardcount++;
10451 }
10452 break;
10453 case _CC_ENUM_XDIGIT:
10454 while ( hardcount < max
10455 && scan < this_eol
10456 && (to_complement
10457 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10458 {
10459 scan += UTF8SKIP(scan);
10460 hardcount++;
10461 }
10462 break;
10463 case _CC_ENUM_VERTSPACE:
10464 while ( hardcount < max
10465 && scan < this_eol
10466 && (to_complement
10467 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10468 {
10469 scan += UTF8SKIP(scan);
10470 hardcount++;
10471 }
10472 break;
10473 case _CC_ENUM_CNTRL:
10474 while ( hardcount < max
10475 && scan < this_eol
10476 && (to_complement
10477 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10478 {
10479 scan += UTF8SKIP(scan);
10480 hardcount++;
10481 }
10482 break;
10483 }
10484 break;
10485
10486 case LNBREAK_t8:
10487 while ( hardcount < max && scan < this_eol
10488 && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10489 {
10490 scan += c;
10491 hardcount++;
10492 }
10493 break;
10494
10495 case LNBREAK_tb:
10496 /* LNBREAK can match one or two latin chars, which is ok, but we have
10497 * to use hardcount in this situation, and throw away the adjustment to
10498 * <this_eol> done before the switch statement */
10499 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10500 scan+=c;
10501 hardcount++;
10502 }
10503 break;
10504
10505 default:
10506 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10507 " node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10508 NOT_REACHED; /* NOTREACHED */
10509
10510 }
10511
10512 if (hardcount)
10513 c = hardcount;
10514 else
10515 c = scan - *startposp;
10516 *startposp = scan;
10517
10518 DEBUG_r({
10519 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10520 DEBUG_EXECUTE_r({
10521 SV * const prop = sv_newmortal();
10522 regprop(prog, prop, p, reginfo, NULL);
10523 Perl_re_exec_indentf( aTHX_
10524 "%s can match %" IVdf " times out of %" IVdf "...\n",
10525 depth, SvPVX_const(prop),(IV)c,(IV)max);
10526 });
10527 });
10528
10529 return(c);
10530 }
10531
10532 /*
10533 - reginclass - determine if a character falls into a character class
10534
10535 n is the ANYOF-type regnode
10536 p is the target string
10537 p_end points to one byte beyond the end of the target string
10538 utf8_target tells whether p is in UTF-8.
10539
10540 Returns true if matched; false otherwise.
10541
10542 Note that this can be a synthetic start class, a combination of various
10543 nodes, so things you think might be mutually exclusive, such as locale,
10544 aren't. It can match both locale and non-locale
10545
10546 */
10547
10548 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10549 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10550 {
10551 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10552 ? 0
10553 : ANYOF_FLAGS(n);
10554 bool match = FALSE;
10555 UV c = *p;
10556
10557 PERL_ARGS_ASSERT_REGINCLASS;
10558
10559 /* If c is not already the code point, get it. Note that
10560 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10561 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10562 STRLEN c_len = 0;
10563 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10564 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10565 if (c_len == (STRLEN)-1) {
10566 _force_out_malformed_utf8_message(p, p_end,
10567 utf8n_flags,
10568 1 /* 1 means die */ );
10569 NOT_REACHED; /* NOTREACHED */
10570 }
10571 if ( c > 255
10572 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10573 && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10574 {
10575 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10576 }
10577 }
10578
10579 /* If this character is potentially in the bitmap, check it */
10580 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10581 if (ANYOF_BITMAP_TEST(n, c))
10582 match = TRUE;
10583 else if ((flags
10584 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10585 && OP(n) == ANYOFD
10586 && ! utf8_target
10587 && ! isASCII(c))
10588 {
10589 match = TRUE;
10590 }
10591 else if (flags & ANYOF_LOCALE_FLAGS) {
10592 if ( (flags & ANYOFL_FOLD)
10593 && c < 256
10594 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10595 {
10596 match = TRUE;
10597 }
10598 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10599 && c <= U8_MAX /* param to isFOO_lc() */
10600 ) {
10601 /* The data structure is arranged so bits 0, 2, 4, ... are set
10602 * if the class includes the Posix character class given by
10603 * bit/2; and 1, 3, 5, ... are set if the class includes the
10604 * complemented Posix class given by int(bit/2), so the
10605 * remainder modulo 2 tells us if to complement or not.
10606 *
10607 * Note that this code assumes that all the classes are closed
10608 * under folding. For example, if a character matches \w, then
10609 * its fold does too; and vice versa. This should be true for
10610 * any well-behaved locale for all the currently defined Posix
10611 * classes, except for :lower: and :upper:, which are handled
10612 * by the pseudo-class :cased: which matches if either of the
10613 * other two does. To get rid of this assumption, an outer
10614 * loop could be used below to iterate over both the source
10615 * character, and its fold (if different) */
10616
10617 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10618
10619 do {
10620 /* Find the next set bit indicating a class to try matching
10621 * against */
10622 U8 bit_pos = lsbit_pos32(posixl_bits);
10623
10624 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10625 match = TRUE;
10626 break;
10627 }
10628
10629 /* Remove this class from consideration; repeat */
10630 POSIXL_CLEAR(posixl_bits, bit_pos);
10631 } while(posixl_bits != 0);
10632 }
10633 }
10634 }
10635
10636
10637 /* If the bitmap didn't (or couldn't) match, and something outside the
10638 * bitmap could match, try that. */
10639 if (!match) {
10640 if (c >= NUM_ANYOF_CODE_POINTS
10641 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10642 {
10643 match = TRUE; /* Everything above the bitmap matches */
10644 }
10645 /* Here doesn't match everything above the bitmap. If there is
10646 * some information available beyond the bitmap, we may find a
10647 * match in it. If so, this is most likely because the code point
10648 * is outside the bitmap range. But rarely, it could be because of
10649 * some other reason. If so, various flags are set to indicate
10650 * this possibility. On ANYOFD nodes, there may be matches that
10651 * happen only when the target string is UTF-8; or for other node
10652 * types, because runtime lookup is needed, regardless of the
10653 * UTF-8ness of the target string. Finally, under /il, there may
10654 * be some matches only possible if the locale is a UTF-8 one. */
10655 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
10656 && ( c >= NUM_ANYOF_CODE_POINTS
10657 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10658 && ( UNLIKELY(OP(n) != ANYOFD)
10659 || (utf8_target && ! isASCII_uni(c)
10660 # if NUM_ANYOF_CODE_POINTS > 256
10661 && c < 256
10662 # endif
10663 )))
10664 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10665 && IN_UTF8_CTYPE_LOCALE)))
10666 {
10667 SV* only_utf8_locale = NULL;
10668 SV * const definition =
10669 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10670 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10671 &only_utf8_locale, NULL);
10672 #else
10673 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10674 &only_utf8_locale, NULL);
10675 #endif
10676 if (definition) {
10677 U8 utf8_buffer[2];
10678 U8 * utf8_p;
10679 if (utf8_target) {
10680 utf8_p = (U8 *) p;
10681 } else { /* Convert to utf8 */
10682 utf8_p = utf8_buffer;
10683 append_utf8_from_native_byte(*p, &utf8_p);
10684 utf8_p = utf8_buffer;
10685 }
10686
10687 /* Turkish locales have these hard-coded rules overriding
10688 * normal ones */
10689 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10690 && isALPHA_FOLD_EQ(*p, 'i'))
10691 {
10692 if (*p == 'i') {
10693 if (_invlist_contains_cp(definition,
10694 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10695 {
10696 match = TRUE;
10697 }
10698 }
10699 else if (*p == 'I') {
10700 if (_invlist_contains_cp(definition,
10701 LATIN_SMALL_LETTER_DOTLESS_I))
10702 {
10703 match = TRUE;
10704 }
10705 }
10706 }
10707 else if (_invlist_contains_cp(definition, c)) {
10708 match = TRUE;
10709 }
10710 }
10711 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10712 match = _invlist_contains_cp(only_utf8_locale, c);
10713 }
10714 }
10715
10716 /* In a Turkic locale under folding, hard-code the I i case pair
10717 * matches */
10718 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10719 && ! match
10720 && (flags & ANYOFL_FOLD)
10721 && utf8_target)
10722 {
10723 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10724 if (ANYOF_BITMAP_TEST(n, 'i')) {
10725 match = TRUE;
10726 }
10727 }
10728 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10729 if (ANYOF_BITMAP_TEST(n, 'I')) {
10730 match = TRUE;
10731 }
10732 }
10733 }
10734
10735 if (UNICODE_IS_SUPER(c)
10736 && (flags
10737 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10738 && OP(n) != ANYOFD
10739 && ckWARN_d(WARN_NON_UNICODE))
10740 {
10741 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10742 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10743 }
10744 }
10745
10746 #if ANYOF_INVERT != 1
10747 /* Depending on compiler optimization cBOOL takes time, so if don't have to
10748 * use it, don't */
10749 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10750 #endif
10751
10752 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10753 return (flags & ANYOF_INVERT) ^ match;
10754 }
10755
10756 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)10757 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10758 {
10759 /* return the position 'off' UTF-8 characters away from 's', forward if
10760 * 'off' >= 0, backwards if negative. But don't go outside of position
10761 * 'lim', which better be < s if off < 0 */
10762
10763 PERL_ARGS_ASSERT_REGHOP3;
10764
10765 if (off >= 0) {
10766 while (off-- && s < lim) {
10767 /* XXX could check well-formedness here */
10768 U8 *new_s = s + UTF8SKIP(s);
10769 if (new_s > lim) /* lim may be in the middle of a long character */
10770 return s;
10771 s = new_s;
10772 }
10773 }
10774 else {
10775 while (off++ && s > lim) {
10776 s--;
10777 if (UTF8_IS_CONTINUED(*s)) {
10778 while (s > lim && UTF8_IS_CONTINUATION(*s))
10779 s--;
10780 if (! UTF8_IS_START(*s)) {
10781 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10782 }
10783 }
10784 /* XXX could check well-formedness here */
10785 }
10786 }
10787 return s;
10788 }
10789
10790 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)10791 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10792 {
10793 PERL_ARGS_ASSERT_REGHOP4;
10794
10795 if (off >= 0) {
10796 while (off-- && s < rlim) {
10797 /* XXX could check well-formedness here */
10798 s += UTF8SKIP(s);
10799 }
10800 }
10801 else {
10802 while (off++ && s > llim) {
10803 s--;
10804 if (UTF8_IS_CONTINUED(*s)) {
10805 while (s > llim && UTF8_IS_CONTINUATION(*s))
10806 s--;
10807 if (! UTF8_IS_START(*s)) {
10808 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10809 }
10810 }
10811 /* XXX could check well-formedness here */
10812 }
10813 }
10814 return s;
10815 }
10816
10817 /* like reghop3, but returns NULL on overrun, rather than returning last
10818 * char pos */
10819
10820 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)10821 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10822 {
10823 PERL_ARGS_ASSERT_REGHOPMAYBE3;
10824
10825 if (off >= 0) {
10826 while (off-- && s < lim) {
10827 /* XXX could check well-formedness here */
10828 s += UTF8SKIP(s);
10829 }
10830 if (off >= 0)
10831 return NULL;
10832 }
10833 else {
10834 while (off++ && s > lim) {
10835 s--;
10836 if (UTF8_IS_CONTINUED(*s)) {
10837 while (s > lim && UTF8_IS_CONTINUATION(*s))
10838 s--;
10839 if (! UTF8_IS_START(*s)) {
10840 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10841 }
10842 }
10843 /* XXX could check well-formedness here */
10844 }
10845 if (off <= 0)
10846 return NULL;
10847 }
10848 return s;
10849 }
10850
10851
10852 /* when executing a regex that may have (?{}), extra stuff needs setting
10853 up that will be visible to the called code, even before the current
10854 match has finished. In particular:
10855
10856 * $_ is localised to the SV currently being matched;
10857 * pos($_) is created if necessary, ready to be updated on each call-out
10858 to code;
10859 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10860 isn't set until the current pattern is successfully finished), so that
10861 $1 etc of the match-so-far can be seen;
10862 * save the old values of subbeg etc of the current regex, and set then
10863 to the current string (again, this is normally only done at the end
10864 of execution)
10865 */
10866
10867 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)10868 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10869 {
10870 MAGIC *mg;
10871 regexp *const rex = ReANY(reginfo->prog);
10872 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10873
10874 eval_state->rex = rex;
10875 eval_state->sv = reginfo->sv;
10876
10877 if (reginfo->sv) {
10878 /* Make $_ available to executed code. */
10879 if (reginfo->sv != DEFSV) {
10880 SAVE_DEFSV;
10881 DEFSV_set(reginfo->sv);
10882 }
10883 /* will be dec'd by S_cleanup_regmatch_info_aux */
10884 SvREFCNT_inc_NN(reginfo->sv);
10885
10886 if (!(mg = mg_find_mglob(reginfo->sv))) {
10887 /* prepare for quick setting of pos */
10888 mg = sv_magicext_mglob(reginfo->sv);
10889 mg->mg_len = -1;
10890 }
10891 eval_state->pos_magic = mg;
10892 eval_state->pos = mg->mg_len;
10893 eval_state->pos_flags = mg->mg_flags;
10894 }
10895 else
10896 eval_state->pos_magic = NULL;
10897
10898 if (!PL_reg_curpm) {
10899 /* PL_reg_curpm is a fake PMOP that we can attach the current
10900 * regex to and point PL_curpm at, so that $1 et al are visible
10901 * within a /(?{})/. It's just allocated once per interpreter the
10902 * first time its needed */
10903 Newxz(PL_reg_curpm, 1, PMOP);
10904 #ifdef USE_ITHREADS
10905 {
10906 SV* const repointer = &PL_sv_undef;
10907 /* this regexp is also owned by the new PL_reg_curpm, which
10908 will try to free it. */
10909 av_push(PL_regex_padav, repointer);
10910 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10911 PL_regex_pad = AvARRAY(PL_regex_padav);
10912 }
10913 #endif
10914 }
10915 SET_reg_curpm(reginfo->prog);
10916 eval_state->curpm = PL_curpm;
10917 PL_curpm_under = PL_curpm;
10918 PL_curpm = PL_reg_curpm;
10919 if (RXp_MATCH_COPIED(rex)) {
10920 /* Here is a serious problem: we cannot rewrite subbeg,
10921 since it may be needed if this match fails. Thus
10922 $` inside (?{}) could fail... */
10923 eval_state->subbeg = rex->subbeg;
10924 eval_state->sublen = rex->sublen;
10925 eval_state->suboffset = rex->suboffset;
10926 eval_state->subcoffset = rex->subcoffset;
10927 #ifdef PERL_ANY_COW
10928 eval_state->saved_copy = rex->saved_copy;
10929 #endif
10930 RXp_MATCH_COPIED_off(rex);
10931 }
10932 else
10933 eval_state->subbeg = NULL;
10934 rex->subbeg = (char *)reginfo->strbeg;
10935 rex->suboffset = 0;
10936 rex->subcoffset = 0;
10937 rex->sublen = reginfo->strend - reginfo->strbeg;
10938 }
10939
10940
10941 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10942
10943 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)10944 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10945 {
10946 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10947 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
10948 regmatch_slab *s;
10949
10950 Safefree(aux->poscache);
10951
10952 if (eval_state) {
10953
10954 /* undo the effects of S_setup_eval_state() */
10955
10956 if (eval_state->subbeg) {
10957 regexp * const rex = eval_state->rex;
10958 rex->subbeg = eval_state->subbeg;
10959 rex->sublen = eval_state->sublen;
10960 rex->suboffset = eval_state->suboffset;
10961 rex->subcoffset = eval_state->subcoffset;
10962 #ifdef PERL_ANY_COW
10963 rex->saved_copy = eval_state->saved_copy;
10964 #endif
10965 RXp_MATCH_COPIED_on(rex);
10966 }
10967 if (eval_state->pos_magic)
10968 {
10969 eval_state->pos_magic->mg_len = eval_state->pos;
10970 eval_state->pos_magic->mg_flags =
10971 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10972 | (eval_state->pos_flags & MGf_BYTES);
10973 }
10974
10975 PL_curpm = eval_state->curpm;
10976 SvREFCNT_dec(eval_state->sv);
10977 }
10978
10979 PL_regmatch_state = aux->old_regmatch_state;
10980 PL_regmatch_slab = aux->old_regmatch_slab;
10981
10982 /* free all slabs above current one - this must be the last action
10983 * of this function, as aux and eval_state are allocated within
10984 * slabs and may be freed here */
10985
10986 s = PL_regmatch_slab->next;
10987 if (s) {
10988 PL_regmatch_slab->next = NULL;
10989 while (s) {
10990 regmatch_slab * const osl = s;
10991 s = s->next;
10992 Safefree(osl);
10993 }
10994 }
10995 }
10996
10997
10998 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)10999 S_to_utf8_substr(pTHX_ regexp *prog)
11000 {
11001 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11002 * on the converted value */
11003
11004 int i = 1;
11005
11006 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11007
11008 do {
11009 if (prog->substrs->data[i].substr
11010 && !prog->substrs->data[i].utf8_substr) {
11011 SV* const sv = newSVsv(prog->substrs->data[i].substr);
11012 prog->substrs->data[i].utf8_substr = sv;
11013 sv_utf8_upgrade(sv);
11014 if (SvVALID(prog->substrs->data[i].substr)) {
11015 if (SvTAIL(prog->substrs->data[i].substr)) {
11016 /* Trim the trailing \n that fbm_compile added last
11017 time. */
11018 SvCUR_set(sv, SvCUR(sv) - 1);
11019 /* Whilst this makes the SV technically "invalid" (as its
11020 buffer is no longer followed by "\0") when fbm_compile()
11021 adds the "\n" back, a "\0" is restored. */
11022 fbm_compile(sv, FBMcf_TAIL);
11023 } else
11024 fbm_compile(sv, 0);
11025 }
11026 if (prog->substrs->data[i].substr == prog->check_substr)
11027 prog->check_utf8 = sv;
11028 }
11029 } while (i--);
11030 }
11031
11032 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11033 S_to_byte_substr(pTHX_ regexp *prog)
11034 {
11035 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11036 * on the converted value; returns FALSE if can't be converted. */
11037
11038 int i = 1;
11039
11040 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11041
11042 do {
11043 if (prog->substrs->data[i].utf8_substr
11044 && !prog->substrs->data[i].substr) {
11045 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11046 if (! sv_utf8_downgrade(sv, TRUE)) {
11047 SvREFCNT_dec_NN(sv);
11048 return FALSE;
11049 }
11050 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11051 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11052 /* Trim the trailing \n that fbm_compile added last
11053 time. */
11054 SvCUR_set(sv, SvCUR(sv) - 1);
11055 fbm_compile(sv, FBMcf_TAIL);
11056 } else
11057 fbm_compile(sv, 0);
11058 }
11059 prog->substrs->data[i].substr = sv;
11060 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11061 prog->check_substr = sv;
11062 }
11063 } while (i--);
11064
11065 return TRUE;
11066 }
11067
11068 #ifndef PERL_IN_XSUB_RE
11069
11070 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11071 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11072 {
11073 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11074 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11075 * the larger string bounded by 'strbeg' and 'strend'.
11076 *
11077 * 'cp' needs to be assigned (if not, a future version of the Unicode
11078 * Standard could make it something that combines with adjacent characters,
11079 * so code using it would then break), and there has to be a GCB break
11080 * before and after the character. */
11081
11082
11083 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11084 const U8 * prev_cp_start;
11085
11086 PERL_ARGS_ASSERT_IS_GRAPHEME;
11087
11088 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11089 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11090 {
11091 /* These are considered graphemes */
11092 return TRUE;
11093 }
11094
11095 /* Otherwise, unassigned code points are forbidden */
11096 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11097 _invlist_search(PL_Assigned_invlist, cp))))
11098 {
11099 return FALSE;
11100 }
11101
11102 cp_gcb_val = getGCB_VAL_CP(cp);
11103
11104 /* Find the GCB value of the previous code point in the input */
11105 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11106 if (UNLIKELY(prev_cp_start == s)) {
11107 prev_cp_gcb_val = GCB_EDGE;
11108 }
11109 else {
11110 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11111 }
11112
11113 /* And check that is a grapheme boundary */
11114 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11115 TRUE /* is UTF-8 encoded */ ))
11116 {
11117 return FALSE;
11118 }
11119
11120 /* Similarly verify there is a break between the current character and the
11121 * following one */
11122 s += UTF8SKIP(s);
11123 if (s >= strend) {
11124 next_cp_gcb_val = GCB_EDGE;
11125 }
11126 else {
11127 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11128 }
11129
11130 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11131 }
11132
11133 /*
11134 =for apidoc_section $unicode
11135
11136 =for apidoc isSCRIPT_RUN
11137
11138 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11139 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11140 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11141 two degenerate cases given below, this function returns TRUE iff all code
11142 points in it come from any combination of three "scripts" given by the Unicode
11143 "Script Extensions" property: Common, Inherited, and possibly one other.
11144 Additionally all decimal digits must come from the same consecutive sequence of
11145 10.
11146
11147 For example, if all the characters in the sequence are Greek, or Common, or
11148 Inherited, this function will return TRUE, provided any decimal digits in it
11149 are from the same block of digits in Common. (These are the ASCII digits
11150 "0".."9" and additionally a block for full width forms of these, and several
11151 others used in mathematical notation.) For scripts (unlike Greek) that have
11152 their own digits defined this will accept either digits from that set or from
11153 one of the Common digit sets, but not a combination of the two. Some scripts,
11154 such as Arabic, have more than one set of digits. All digits must come from
11155 the same set for this function to return TRUE.
11156
11157 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11158 contain the script found, using the C<SCX_enum> typedef. Its value will be
11159 C<SCX_INVALID> if the function returns FALSE.
11160
11161 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11162 will be C<SCX_INVALID>.
11163
11164 If the sequence contains a single code point which is unassigned to a character
11165 in the version of Unicode being used, the function will return TRUE, and the
11166 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11167 in the input sequence will result in the function treating the input as not
11168 being a script run.
11169
11170 The returned script will be C<SCX_Inherited> iff all the code points in it are
11171 from the Inherited script.
11172
11173 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11174 it are from the Inherited or Common scripts.
11175
11176 =cut
11177
11178 */
11179
11180 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11181 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11182 {
11183 /* Basically, it looks at each character in the sequence to see if the
11184 * above conditions are met; if not it fails. It uses an inversion map to
11185 * find the enum corresponding to the script of each character. But this
11186 * is complicated by the fact that a few code points can be in any of
11187 * several scripts. The data has been constructed so that there are
11188 * additional enum values (all negative) for these situations. The
11189 * absolute value of those is an index into another table which contains
11190 * pointers to auxiliary tables for each such situation. Each aux array
11191 * lists all the scripts for the given situation. There is another,
11192 * parallel, table that gives the number of entries in each aux table.
11193 * These are all defined in charclass_invlists.h */
11194
11195 /* XXX Here are the additional things UTS 39 says could be done:
11196 *
11197 * Forbid sequences of the same nonspacing mark
11198 *
11199 * Check to see that all the characters are in the sets of exemplar
11200 * characters for at least one language in the Unicode Common Locale Data
11201 * Repository [CLDR]. */
11202
11203
11204 /* Things that match /\d/u */
11205 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11206 UV * decimals_array = invlist_array(decimals_invlist);
11207
11208 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11209 * not currently known) */
11210 UV zero_of_run = 0;
11211
11212 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11213 SCX_enum script_of_char = SCX_INVALID;
11214
11215 /* If the script remains not fully determined from iteration to iteration,
11216 * this is the current intersection of the possiblities. */
11217 SCX_enum * intersection = NULL;
11218 PERL_UINT_FAST8_T intersection_len = 0;
11219
11220 bool retval = TRUE;
11221 SCX_enum * ret_script = NULL;
11222
11223 assert(send >= s);
11224
11225 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11226
11227 /* All code points in 0..255 are either Common or Latin, so must be a
11228 * script run. We can return immediately unless we need to know which
11229 * script it is. */
11230 if (! utf8_target && LIKELY(send > s)) {
11231 if (ret_script == NULL) {
11232 return TRUE;
11233 }
11234
11235 /* If any character is Latin, the run is Latin */
11236 while (s < send) {
11237 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11238 *ret_script = SCX_Latin;
11239 return TRUE;
11240 }
11241 }
11242
11243 /* Here, all are Common */
11244 *ret_script = SCX_Common;
11245 return TRUE;
11246 }
11247
11248 /* Look at each character in the sequence */
11249 while (s < send) {
11250 /* If the current character being examined is a digit, this is the code
11251 * point of the zero for its sequence of 10 */
11252 UV zero_of_char;
11253
11254 UV cp;
11255
11256 /* The code allows all scripts to use the ASCII digits. This is
11257 * because they are in the Common script. Hence any ASCII ones found
11258 * are ok, unless and until a digit from another set has already been
11259 * encountered. digit ranges in Common are not similarly blessed) */
11260 if (UNLIKELY(isDIGIT(*s))) {
11261 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11262 retval = FALSE;
11263 break;
11264 }
11265 if (zero_of_run) {
11266 if (zero_of_run != '0') {
11267 retval = FALSE;
11268 break;
11269 }
11270 }
11271 else {
11272 zero_of_run = '0';
11273 }
11274 s++;
11275 continue;
11276 }
11277
11278 /* Here, isn't an ASCII digit. Find the code point of the character */
11279 if (! UTF8_IS_INVARIANT(*s)) {
11280 Size_t len;
11281 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11282 s += len;
11283 }
11284 else {
11285 cp = *(s++);
11286 }
11287
11288 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11289 * digit in that script. We can skip the rest of this code for this
11290 * character. */
11291 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11292 continue;
11293 }
11294
11295 /* Find the character's script. The correct values are hard-coded here
11296 * for small-enough code points. */
11297 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11298 unlikely to change */
11299 if ( cp > 255
11300 || ( isALPHA_L1(cp)
11301 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11302 {
11303 script_of_char = SCX_Latin;
11304 }
11305 else {
11306 script_of_char = SCX_Common;
11307 }
11308 }
11309 else {
11310 script_of_char = _Perl_SCX_invmap[
11311 _invlist_search(PL_SCX_invlist, cp)];
11312 }
11313
11314 /* We arbitrarily accept a single unassigned character, but not in
11315 * combination with anything else, and not a run of them. */
11316 if ( UNLIKELY(script_of_run == SCX_Unknown)
11317 || UNLIKELY( script_of_run != SCX_INVALID
11318 && script_of_char == SCX_Unknown))
11319 {
11320 retval = FALSE;
11321 break;
11322 }
11323
11324 /* For the first character, or the run is inherited, the run's script
11325 * is set to the char's */
11326 if ( UNLIKELY(script_of_run == SCX_INVALID)
11327 || UNLIKELY(script_of_run == SCX_Inherited))
11328 {
11329 script_of_run = script_of_char;
11330 }
11331
11332 /* For the character's script to be Unknown, it must be the first
11333 * character in the sequence (for otherwise a test above would have
11334 * prevented us from reaching here), and we have set the run's script
11335 * to it. Nothing further to be done for this character */
11336 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11337 continue;
11338 }
11339
11340 /* We accept 'inherited' script characters currently even at the
11341 * beginning. (We know that no characters in Inherited are digits, or
11342 * we'd have to check for that) */
11343 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11344 continue;
11345 }
11346
11347 /* If the run so far is Common, and the new character isn't, change the
11348 * run's script to that of this character */
11349 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11350 script_of_run = script_of_char;
11351 }
11352
11353 /* Now we can see if the script of the new character is the same as
11354 * that of the run */
11355 if (LIKELY(script_of_char == script_of_run)) {
11356 /* By far the most common case */
11357 goto scripts_match;
11358 }
11359
11360 /* Here, the script of the run isn't Common. But characters in Common
11361 * match any script */
11362 if (script_of_char == SCX_Common) {
11363 goto scripts_match;
11364 }
11365
11366 #ifndef HAS_SCX_AUX_TABLES
11367
11368 /* Too early a Unicode version to have a code point belonging to more
11369 * than one script, so, if the scripts don't exactly match, fail */
11370 PERL_UNUSED_VAR(intersection_len);
11371 retval = FALSE;
11372 break;
11373
11374 #else
11375
11376 /* Here there is no exact match between the character's script and the
11377 * run's. And we've handled the special cases of scripts Unknown,
11378 * Inherited, and Common.
11379 *
11380 * Negative script numbers signify that the value may be any of several
11381 * scripts, and we need to look at auxiliary information to make our
11382 * deterimination. But if both are non-negative, we can fail now */
11383 if (LIKELY(script_of_char >= 0)) {
11384 const SCX_enum * search_in;
11385 PERL_UINT_FAST8_T search_in_len;
11386 PERL_UINT_FAST8_T i;
11387
11388 if (LIKELY(script_of_run >= 0)) {
11389 retval = FALSE;
11390 break;
11391 }
11392
11393 /* Use the previously constructed set of possible scripts, if any.
11394 * */
11395 if (intersection) {
11396 search_in = intersection;
11397 search_in_len = intersection_len;
11398 }
11399 else {
11400 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11401 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11402 }
11403
11404 for (i = 0; i < search_in_len; i++) {
11405 if (search_in[i] == script_of_char) {
11406 script_of_run = script_of_char;
11407 goto scripts_match;
11408 }
11409 }
11410
11411 retval = FALSE;
11412 break;
11413 }
11414 else if (LIKELY(script_of_run >= 0)) {
11415 /* script of character could be one of several, but run is a single
11416 * script */
11417 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11418 const PERL_UINT_FAST8_T search_in_len
11419 = SCX_AUX_TABLE_lengths[-script_of_char];
11420 PERL_UINT_FAST8_T i;
11421
11422 for (i = 0; i < search_in_len; i++) {
11423 if (search_in[i] == script_of_run) {
11424 script_of_char = script_of_run;
11425 goto scripts_match;
11426 }
11427 }
11428
11429 retval = FALSE;
11430 break;
11431 }
11432 else {
11433 /* Both run and char could be in one of several scripts. If the
11434 * intersection is empty, then this character isn't in this script
11435 * run. Otherwise, we need to calculate the intersection to use
11436 * for future iterations of the loop, unless we are already at the
11437 * final character */
11438 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11439 const PERL_UINT_FAST8_T char_len
11440 = SCX_AUX_TABLE_lengths[-script_of_char];
11441 const SCX_enum * search_run;
11442 PERL_UINT_FAST8_T run_len;
11443
11444 SCX_enum * new_overlap = NULL;
11445 PERL_UINT_FAST8_T i, j;
11446
11447 if (intersection) {
11448 search_run = intersection;
11449 run_len = intersection_len;
11450 }
11451 else {
11452 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11453 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11454 }
11455
11456 intersection_len = 0;
11457
11458 for (i = 0; i < run_len; i++) {
11459 for (j = 0; j < char_len; j++) {
11460 if (search_run[i] == search_char[j]) {
11461
11462 /* Here, the script at i,j matches. That means this
11463 * character is in the run. But continue on to find
11464 * the complete intersection, for the next loop
11465 * iteration, and for the digit check after it.
11466 *
11467 * On the first found common script, we malloc space
11468 * for the intersection list for the worst case of the
11469 * intersection, which is the minimum of the number of
11470 * scripts remaining in each set. */
11471 if (intersection_len == 0) {
11472 Newx(new_overlap,
11473 MIN(run_len - i, char_len - j),
11474 SCX_enum);
11475 }
11476 new_overlap[intersection_len++] = search_run[i];
11477 }
11478 }
11479 }
11480
11481 /* Here we've looked through everything. If they have no scripts
11482 * in common, not a run */
11483 if (intersection_len == 0) {
11484 retval = FALSE;
11485 break;
11486 }
11487
11488 /* If there is only a single script in common, set to that.
11489 * Otherwise, use the intersection going forward */
11490 Safefree(intersection);
11491 intersection = NULL;
11492 if (intersection_len == 1) {
11493 script_of_run = script_of_char = new_overlap[0];
11494 Safefree(new_overlap);
11495 new_overlap = NULL;
11496 }
11497 else {
11498 intersection = new_overlap;
11499 }
11500 }
11501
11502 #endif
11503
11504 scripts_match:
11505
11506 /* Here, the script of the character is compatible with that of the
11507 * run. That means that in most cases, it continues the script run.
11508 * Either it and the run match exactly, or one or both can be in any of
11509 * several scripts, and the intersection is not empty. However, if the
11510 * character is a decimal digit, it could still mean failure if it is
11511 * from the wrong sequence of 10. So, we need to look at if it's a
11512 * digit. We've already handled the 10 digits [0-9], and the next
11513 * lowest one is this one: */
11514 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11515 continue; /* Not a digit; this character is part of the run */
11516 }
11517
11518 /* If we have a definitive '0' for the script of this character, we
11519 * know that for this to be a digit, it must be in the range of +0..+9
11520 * of that zero. */
11521 if ( script_of_char >= 0
11522 && (zero_of_char = script_zeros[script_of_char]))
11523 {
11524 if (! withinCOUNT(cp, zero_of_char, 9)) {
11525 continue; /* Not a digit; this character is part of the run
11526 */
11527 }
11528
11529 }
11530 else { /* Need to look up if this character is a digit or not */
11531 SSize_t index_of_zero_of_char;
11532 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11533 if ( UNLIKELY(index_of_zero_of_char < 0)
11534 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11535 {
11536 continue; /* Not a digit; this character is part of the run.
11537 */
11538 }
11539
11540 zero_of_char = decimals_array[index_of_zero_of_char];
11541 }
11542
11543 /* Here, the character is a decimal digit, and the zero of its sequence
11544 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11545 * they better be the same. */
11546 if (zero_of_run) {
11547 if (zero_of_run != zero_of_char) {
11548 retval = FALSE;
11549 break;
11550 }
11551 }
11552 else { /* Otherwise we now have a zero for this run */
11553 zero_of_run = zero_of_char;
11554 }
11555 } /* end of looping through CLOSESR text */
11556
11557 Safefree(intersection);
11558
11559 if (ret_script != NULL) {
11560 if (retval) {
11561 *ret_script = script_of_run;
11562 }
11563 else {
11564 *ret_script = SCX_INVALID;
11565 }
11566 }
11567
11568 return retval;
11569 }
11570
11571 #endif /* ifndef PERL_IN_XSUB_RE */
11572
11573 /*
11574 * ex: set ts=8 sts=4 sw=4 et:
11575 */
11576