1 /* regexec.c
2 */
3
4 /*
5 * One Ring to rule them all, One Ring to find them
6 *
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34 */
35
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39
40 /*
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
65 ****
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEX_ENGINE
75 #define PERL_IN_REGEXEC_C
76 #include "perl.h"
77
78 #ifdef PERL_IN_XSUB_RE
79 # include "re_comp.h"
80 #else
81 # include "regcomp.h"
82 #endif
83
84 #include "invlist_inline.h"
85 #include "unicode_constants.h"
86
87 static const char b_utf8_locale_required[] =
88 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
89 " Assuming a UTF-8 locale";
90
91 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 STMT_START { \
93 if (! IN_UTF8_CTYPE_LOCALE) { \
94 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
95 b_utf8_locale_required); \
96 } \
97 } STMT_END
98
99 static const char sets_utf8_locale_required[] =
100 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101
102 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 STMT_START { \
104 if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
105 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
106 sets_utf8_locale_required); \
107 } \
108 } STMT_END
109
110 #ifdef DEBUGGING
111 /* At least one required character in the target string is expressible only in
112 * UTF-8. */
113 static const char non_utf8_target_but_utf8_required[]
114 = "Can't match, because target string needs to be in UTF-8\n";
115 #endif
116
117 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
119 goto target; \
120 } STMT_END
121
122 #ifndef STATIC
123 #define STATIC static
124 #endif
125
126 /*
127 * Forwards.
128 */
129
130 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131
132 #define HOPc(pos,off) \
133 (char *)(reginfo->is_utf8_target \
134 ? reghop3((U8*)pos, off, \
135 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
136 : (U8*)(pos + off))
137
138 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
139 #define HOPBACK3(pos, off, lim) \
140 (reginfo->is_utf8_target \
141 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
142 : (pos - off >= lim) \
143 ? (U8*)pos - off \
144 : NULL)
145
146 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147
148 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
149 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150
151 /* lim must be +ve. Returns NULL on overshoot */
152 #define HOPMAYBE3(pos,off,lim) \
153 (reginfo->is_utf8_target \
154 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
155 : ((U8*)pos + off <= lim) \
156 ? (U8*)pos + off \
157 : NULL)
158
159 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
160 * off must be >=0; args should be vars rather than expressions */
161 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
162 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
163 : (U8*)((pos + off) > lim ? lim : (pos + off)))
164 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165
166 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
167 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 : (U8*)(pos + off))
169 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170
171 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
172 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173
174 /* for use after a quantifier and before an EXACT-like node -- japhy */
175 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 *
177 * NOTE that *nothing* that affects backtracking should be in here, specifically
178 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
179 * node that is in between two EXACT like nodes when ascertaining what the required
180 * "follow" character is. This should probably be moved to regex compile time
181 * although it may be done at run time because of the REF possibility - more
182 * investigation required. -- demerphq
183 */
184 #define JUMPABLE(rn) ( \
185 OP(rn) == OPEN || \
186 (OP(rn) == CLOSE && \
187 !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) || \
188 OP(rn) == EVAL || \
189 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
190 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 OP(rn) == KEEPS || \
192 (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0) \
193 )
194 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195
196 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
197
198 /*
199 Search for mandatory following text node; for lookahead, the text must
200 follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step.
201 */
202 #define FIND_NEXT_IMPT(rn) STMT_START { \
203 while (JUMPABLE(rn)) { \
204 const OPCODE type = OP(rn); \
205 if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
206 rn = REGNODE_AFTER_opcode(rn,type); \
207 else if (type == PLUS) \
208 rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
209 else if (type == IFMATCH) \
210 rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \
211 else rn += NEXT_OFF(rn); \
212 } \
213 } STMT_END
214
215 #define SLAB_FIRST(s) (&(s)->states[0])
216 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217
218 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
219 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
220 static regmatch_state * S_push_slab(pTHX);
221
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
226
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen comma_pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH)
229 {
230 const int retval = PL_savestack_ix;
231 /* Number of bytes about to be stored in the stack */
232 const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor);
233 /* Number of savestack[] entries to be filled by the paren data */
234 /* Rounding is performed in case we are few elements short */
235 const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack);
236 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
237 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
238
239 DECLARE_AND_GET_RE_DEBUG_FLAGS;
240
241 PERL_ARGS_ASSERT_REGCPPUSH;
242
243 if (paren_elems_to_push < 0)
244 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i",
245 (int)paren_elems_to_push, (int)maxopenparen,
246 (int)parenfloor);
247
248 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
249 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
250 " out of range (%lu-%ld)",
251 total_elems,
252 (unsigned long)maxopenparen,
253 (long)parenfloor);
254
255 DEBUG_BUFFERS_r(
256 if ((int)maxopenparen > (int)parenfloor)
257 Perl_re_exec_indentf( aTHX_
258 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
259 depth,
260 PTR2UV(rex),
261 PTR2UV(RXp_OFFSp(rex))
262 );
263 );
264
265 SSGROW(total_elems + REGCP_FRAME_ELEMS);
266 assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS));
267
268 /* memcpy the offs inside the stack - it's faster than for loop */
269 memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push);
270 PL_savestack_ix += paren_elems_to_push;
271
272 DEBUG_BUFFERS_r({
273 I32 p;
274 for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) {
275 Perl_re_exec_indentf(aTHX_
276 " \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n",
277 depth,
278 (UV)p,
279 (IV)RXp_OFFSp(rex)[p].start,
280 (IV)RXp_OFFSp(rex)[p].start_tmp,
281 (IV)RXp_OFFSp(rex)[p].end
282 );
283 }
284 });
285
286 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
287 SSPUSHINT(maxopenparen);
288 SSPUSHINT(RXp_LASTPAREN(rex));
289 SSPUSHINT(RXp_LASTCLOSEPAREN(rex));
290 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
291
292
293 DEBUG_BUFFERS_r({
294 Perl_re_exec_indentf(aTHX_
295 "finished regcppush returning %" IVdf " cur: %" IVdf "\n",
296 depth, retval, PL_savestack_ix);
297 });
298
299 return retval;
300 }
301
302 /* These are needed since we do not localize EVAL nodes: */
303 #define REGCP_SET(cp) \
304 DEBUG_STATE_r( \
305 Perl_re_exec_indentf( aTHX_ \
306 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
307 depth, (IV)PL_savestack_ix \
308 ) \
309 ); \
310 cp = PL_savestack_ix
311
312 #define REGCP_UNWIND(cp) \
313 DEBUG_STATE_r( \
314 if (cp != PL_savestack_ix) \
315 Perl_re_exec_indentf( aTHX_ \
316 "Clearing an EVAL scope, savestack=%" \
317 IVdf "..%" IVdf "\n", \
318 depth, (IV)(cp), (IV)PL_savestack_ix \
319 ) \
320 ); \
321 regcpblow(cp)
322
323 /* set the start and end positions of capture ix */
324 #define CLOSE_ANY_CAPTURE(rex, ix, s, e) \
325 RXp_OFFSp(rex)[(ix)].start = (s); \
326 RXp_OFFSp(rex)[(ix)].end = (e)
327
328 #define CLOSE_CAPTURE(rex, ix, s, e) \
329 CLOSE_ANY_CAPTURE(rex, ix, s, e); \
330 if (ix > RXp_LASTPAREN(rex)) \
331 RXp_LASTPAREN(rex) = (ix); \
332 RXp_LASTCLOSEPAREN(rex) = (ix); \
333 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
334 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \
335 depth, \
336 PTR2UV(rex), \
337 PTR2UV(RXp_OFFSp(rex)), \
338 (UV)(ix), \
339 (IV)RXp_OFFSp(rex)[ix].start, \
340 (IV)RXp_OFFSp(rex)[ix].end, \
341 (UV)RXp_LASTPAREN(rex) \
342 ))
343
344 /* the lp and lcp args match the relevant members of the
345 * regexp structure, but in practice they should all be U16
346 * instead as we have a hard limit of U16_MAX parens. See
347 * line 4003 or so of regcomp.c where we parse OPEN parens
348 * of various types. */
349 PERL_STATIC_INLINE void
S_unwind_paren(pTHX_ regexp * rex,U32 lp,U32 lcp comma_pDEPTH)350 S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) {
351 PERL_ARGS_ASSERT_UNWIND_PAREN;
352 U32 n;
353 DECLARE_AND_GET_RE_DEBUG_FLAGS;
354 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
355 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf
356 ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n",
357 depth,
358 PTR2UV(rex),
359 PTR2UV(RXp_OFFSp(rex)),
360 (UV)(lp),
361 (UV)(RXp_LASTPAREN(rex)),
362 (UV)(lcp)
363 ));
364 for (n = RXp_LASTPAREN(rex); n > lp; n--) {
365 RXp_OFFSp(rex)[n].end = -1;
366 }
367 RXp_LASTPAREN(rex) = n;
368 RXp_LASTCLOSEPAREN(rex) = lcp;
369 }
370 #define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp)
371
372 PERL_STATIC_INLINE void
S_capture_clear(pTHX_ regexp * rex,U16 from_ix,U16 to_ix,const char * str comma_pDEPTH)373 S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) {
374 PERL_ARGS_ASSERT_CAPTURE_CLEAR;
375 PERL_UNUSED_ARG(str); /* only used for debugging */
376 U16 my_ix;
377 DECLARE_AND_GET_RE_DEBUG_FLAGS;
378 for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) {
379 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
380 "CAPTURE_CLEAR %s \\%" IVdf ": "
381 "%" IVdf "(%" IVdf ") .. %" IVdf
382 " => "
383 "%" IVdf "(%" IVdf ") .. %" IVdf
384 "\n",
385 depth, str, (IV)my_ix,
386 (IV)RXp_OFFSp(rex)[my_ix].start,
387 (IV)RXp_OFFSp(rex)[my_ix].start_tmp,
388 (IV)RXp_OFFSp(rex)[my_ix].end,
389 (IV)-1, (IV)-1, (IV)-1));
390 RXp_OFFSp(rex)[my_ix].start = -1;
391 RXp_OFFSp(rex)[my_ix].start_tmp = -1;
392 RXp_OFFSp(rex)[my_ix].end = -1;
393 }
394 }
395
396 #define CAPTURE_CLEAR(from_ix, to_ix, str) \
397 if (from_ix) capture_clear(rex,from_ix, to_ix, str)
398
399 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p comma_pDEPTH)400 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH)
401 {
402 UV i;
403 U32 paren;
404 DECLARE_AND_GET_RE_DEBUG_FLAGS;
405
406 PERL_ARGS_ASSERT_REGCPPOP;
407
408
409 DEBUG_BUFFERS_r({
410 Perl_re_exec_indentf(aTHX_
411 "starting regcppop at %" IVdf "\n",
412 depth, PL_savestack_ix);
413 });
414
415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
416 i = SSPOPUV;
417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419 RXp_LASTCLOSEPAREN(rex) = SSPOPINT;
420 RXp_LASTPAREN(rex) = SSPOPINT;
421 *maxopenparen_p = SSPOPINT;
422
423 i -= REGCP_OTHER_ELEMS;
424 /* Now restore the parentheses context. */
425 DEBUG_BUFFERS_r(
426 if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens)
427 Perl_re_exec_indentf( aTHX_
428 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
429 depth,
430 PTR2UV(rex),
431 PTR2UV(RXp_OFFSp(rex))
432 );
433 );
434 /* substract remaining elements from the stack */
435 PL_savestack_ix -= i;
436
437 /* static assert that offs struc size is not less than stack elem size */
438 STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack));
439
440 /* calculate actual number of offs/capture groups stored */
441 /* by doing integer division (leaving potential alignment aside) */
442 i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex));
443
444 /* calculate paren starting point */
445 /* i is our number of entries which we are subtracting from *maxopenparen_p */
446 /* and we are storing + 1 this to get the beginning */
447 paren = *maxopenparen_p - i + 1;
448
449 /* restore them */
450 memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex)));
451
452 DEBUG_BUFFERS_r(
453 for (; paren <= *maxopenparen_p; ++paren) {
454 Perl_re_exec_indentf(aTHX_
455 " \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n",
456 depth,
457 (UV)paren,
458 (IV)RXp_OFFSp(rex)[paren].start,
459 (IV)RXp_OFFSp(rex)[paren].start_tmp,
460 (IV)RXp_OFFSp(rex)[paren].end,
461 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : ""));
462 }
463 );
464 #if 1
465 /* It would seem that the similar code in regtry()
466 * already takes care of this, and in fact it is in
467 * a better location to since this code can #if 0-ed out
468 * but the code in regtry() is needed or otherwise tests
469 * requiring null fields (pat.t#187 and split.t#{13,14}
470 * (as of patchlevel 7877) will fail. Then again,
471 * this code seems to be necessary or otherwise
472 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473 * --jhi updated by dapm */
474 for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) {
475 if (i > *maxopenparen_p) {
476 RXp_OFFSp(rex)[i].start = -1;
477 }
478 RXp_OFFSp(rex)[i].end = -1;
479 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
480 " \\%" UVuf ": %s ..-1 undeffing (regcppop)\n",
481 depth,
482 (UV)i,
483 (i > *maxopenparen_p) ? "-1" : " "
484 ));
485 }
486 #endif
487 DEBUG_BUFFERS_r({
488 Perl_re_exec_indentf(aTHX_
489 "finished regcppop at %" IVdf "\n",
490 depth, PL_savestack_ix);
491 });
492 }
493
494 /* restore the parens and associated vars at savestack position ix,
495 * but without popping the stack */
496
497 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p comma_pDEPTH)498 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH)
499 {
500 I32 tmpix = PL_savestack_ix;
501 PERL_ARGS_ASSERT_REGCP_RESTORE;
502
503 PL_savestack_ix = ix;
504 regcppop(rex, maxopenparen_p);
505 PL_savestack_ix = tmpix;
506 }
507
508 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
509
510 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)511 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
512 {
513 /* Returns a boolean as to whether or not 'character' is a member of the
514 * Posix character class given by 'classnum' that should be equivalent to a
515 * value in the typedef 'char_class_number_'.
516 *
517 * Ideally this could be replaced by a just an array of function pointers
518 * to the C library functions that implement the macros this calls.
519 * However, to compile, the precise function signatures are required, and
520 * these may vary from platform to platform. To avoid having to figure
521 * out what those all are on each platform, I (khw) am using this method,
522 * which adds an extra layer of function call overhead (unless the C
523 * optimizer strips it away). But we don't particularly care about
524 * performance with locales anyway. */
525
526 if (IN_UTF8_CTYPE_LOCALE) {
527 return cBOOL(generic_isCC_(character, classnum));
528 }
529
530 switch ((char_class_number_) classnum) {
531 case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
532 case CC_ENUM_ALPHA_: return isU8_ALPHA_LC(character);
533 case CC_ENUM_ASCII_: return isU8_ASCII_LC(character);
534 case CC_ENUM_BLANK_: return isU8_BLANK_LC(character);
535 case CC_ENUM_CASED_: return isU8_CASED_LC(character);
536 case CC_ENUM_CNTRL_: return isU8_CNTRL_LC(character);
537 case CC_ENUM_DIGIT_: return isU8_DIGIT_LC(character);
538 case CC_ENUM_GRAPH_: return isU8_GRAPH_LC(character);
539 case CC_ENUM_LOWER_: return isU8_LOWER_LC(character);
540 case CC_ENUM_PRINT_: return isU8_PRINT_LC(character);
541 case CC_ENUM_PUNCT_: return isU8_PUNCT_LC(character);
542 case CC_ENUM_SPACE_: return isU8_SPACE_LC(character);
543 case CC_ENUM_UPPER_: return isU8_UPPER_LC(character);
544 case CC_ENUM_WORDCHAR_: return isU8_WORDCHAR_LC(character);
545 case CC_ENUM_XDIGIT_: return isU8_XDIGIT_LC(character);
546 default: /* VERTSPACE should never occur in locales */
547 break;
548 }
549
550 Perl_croak(aTHX_
551 "panic: isFOO_lc() has an unexpected character class '%d'",
552 classnum);
553
554 NOT_REACHED; /* NOTREACHED */
555 return FALSE;
556 }
557
558 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(pTHX_ const char * s1,const char * s2,I32 len)559 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
560 {
561 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
562 * folded. Works on all folds representable without UTF-8, except for
563 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
564 * check that the strings each have at least 'len' characters.
565 *
566 * There is almost an identical API function where s2 need not be folded:
567 * Perl_foldEQ_latin1() */
568
569 const U8 *a = (const U8 *)s1;
570 const U8 *b = (const U8 *)s2;
571
572 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
573
574 assert(len >= 0);
575
576 while (len--) {
577 assert(! isUPPER_L1(*b));
578 if (toLOWER_L1(*a) != *b) {
579 return 0;
580 }
581 a++, b++;
582 }
583 return 1;
584 }
585
586 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)587 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
588 {
589 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
590 * 'character' is a member of the Posix character class given by 'classnum'
591 * that should be equivalent to a value in the typedef
592 * 'char_class_number_'.
593 *
594 * This just calls isFOO_lc on the code point for the character if it is in
595 * the range 0-255. Outside that range, all characters use Unicode
596 * rules, ignoring any locale. So use the Unicode function if this class
597 * requires an inversion list, and use the Unicode macro otherwise. */
598
599
600 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
601
602 if (UTF8_IS_INVARIANT(*character)) {
603 return isFOO_lc(classnum, *character);
604 }
605 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
606 return isFOO_lc(classnum,
607 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
608 }
609
610 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
611
612 switch ((char_class_number_) classnum) {
613 case CC_ENUM_SPACE_: return is_XPERLSPACE_high(character);
614 case CC_ENUM_BLANK_: return is_HORIZWS_high(character);
615 case CC_ENUM_XDIGIT_: return is_XDIGIT_high(character);
616 case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
617 default:
618 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
619 utf8_to_uvchr_buf(character, e, NULL));
620 }
621 NOT_REACHED; /* NOTREACHED */
622 }
623
624 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)625 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
626 {
627 /* Returns the position of the first byte in the sequence between 's' and
628 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
629 * */
630
631 PERL_ARGS_ASSERT_FIND_SPAN_END;
632
633 assert(send >= s);
634
635 if ((STRLEN) (send - s) >= PERL_WORDSIZE
636 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638 {
639 PERL_UINTMAX_T span_word;
640
641 /* Process per-byte until reach word boundary. XXX This loop could be
642 * eliminated if we knew that this platform had fast unaligned reads */
643 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
644 if (*s != span_byte) {
645 return s;
646 }
647 s++;
648 }
649
650 /* Create a word filled with the bytes we are spanning */
651 span_word = PERL_COUNT_MULTIPLIER * span_byte;
652
653 /* Process per-word as long as we have at least a full word left */
654 do {
655
656 /* Keep going if the whole word is composed of 'span_byte's */
657 if ((* (PERL_UINTMAX_T *) s) == span_word) {
658 s += PERL_WORDSIZE;
659 continue;
660 }
661
662 /* Here, at least one byte in the word isn't 'span_byte'. */
663
664 #ifdef EBCDIC
665
666 break;
667
668 #else
669
670 /* This xor leaves 1 bits only in those non-matching bytes */
671 span_word ^= * (PERL_UINTMAX_T *) s;
672
673 /* Make sure the upper bit of each non-matching byte is set. This
674 * makes each such byte look like an ASCII platform variant byte */
675 span_word |= span_word << 1;
676 span_word |= span_word << 2;
677 span_word |= span_word << 4;
678
679 /* That reduces the problem to what this function solves */
680 return s + variant_byte_number(span_word);
681
682 #endif
683
684 } while (s + PERL_WORDSIZE <= send);
685 }
686
687 /* Process the straggler bytes beyond the final word boundary */
688 while (s < send) {
689 if (*s != span_byte) {
690 return s;
691 }
692 s++;
693 }
694
695 return s;
696 }
697
698 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)699 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
700 {
701 /* Returns the position of the first byte in the sequence between 's'
702 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
703 * returns 'send' if none found. It uses word-level operations instead of
704 * byte to speed up the process */
705
706 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
707
708 assert(send >= s);
709 assert((byte & mask) == byte);
710
711 #ifndef EBCDIC
712
713 if ((STRLEN) (send - s) >= PERL_WORDSIZE
714 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
715 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
716 {
717 PERL_UINTMAX_T word, mask_word;
718
719 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
720 if (((*s) & mask) == byte) {
721 return s;
722 }
723 s++;
724 }
725
726 word = PERL_COUNT_MULTIPLIER * byte;
727 mask_word = PERL_COUNT_MULTIPLIER * mask;
728
729 do {
730 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
731
732 /* If 'masked' contains bytes with the bit pattern of 'byte' within
733 * it, xoring with 'word' will leave each of the 8 bits in such
734 * bytes be 0, and no byte containing any other bit pattern will be
735 * 0. */
736 masked ^= word;
737
738 /* This causes the most significant bit to be set to 1 for any
739 * bytes in the word that aren't completely 0 */
740 masked |= masked << 1;
741 masked |= masked << 2;
742 masked |= masked << 4;
743
744 /* The msbits are the same as what marks a byte as variant, so we
745 * can use this mask. If all msbits are 1, the word doesn't
746 * contain 'byte' */
747 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
748 s += PERL_WORDSIZE;
749 continue;
750 }
751
752 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
753 * and any that are, are 0. Complement and re-AND to swap that */
754 masked = ~ masked;
755 masked &= PERL_VARIANTS_WORD_MASK;
756
757 /* This reduces the problem to that solved by this function */
758 s += variant_byte_number(masked);
759 return s;
760
761 } while (s + PERL_WORDSIZE <= send);
762 }
763
764 #endif
765
766 while (s < send) {
767 if (((*s) & mask) == byte) {
768 return s;
769 }
770 s++;
771 }
772
773 return s;
774 }
775
776 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)777 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
778 {
779 /* Returns the position of the first byte in the sequence between 's' and
780 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
781 * 'span_byte' should have been ANDed with 'mask' in the call of this
782 * function. Returns 'send' if none found. Works like find_span_end(),
783 * except for the AND */
784
785 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
786
787 assert(send >= s);
788 assert((span_byte & mask) == span_byte);
789
790 if ((STRLEN) (send - s) >= PERL_WORDSIZE
791 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
792 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
793 {
794 PERL_UINTMAX_T span_word, mask_word;
795
796 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
797 if (((*s) & mask) != span_byte) {
798 return s;
799 }
800 s++;
801 }
802
803 span_word = PERL_COUNT_MULTIPLIER * span_byte;
804 mask_word = PERL_COUNT_MULTIPLIER * mask;
805
806 do {
807 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
808
809 if (masked == span_word) {
810 s += PERL_WORDSIZE;
811 continue;
812 }
813
814 #ifdef EBCDIC
815
816 break;
817
818 #else
819
820 masked ^= span_word;
821 masked |= masked << 1;
822 masked |= masked << 2;
823 masked |= masked << 4;
824 return s + variant_byte_number(masked);
825
826 #endif
827
828 } while (s + PERL_WORDSIZE <= send);
829 }
830
831 while (s < send) {
832 if (((*s) & mask) != span_byte) {
833 return s;
834 }
835 s++;
836 }
837
838 return s;
839 }
840
841 /*
842 * pregexec and friends
843 */
844
845 #ifndef PERL_IN_XSUB_RE
846 /*
847 - pregexec - match a regexp against a string
848 */
849 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)850 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
851 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
852 /* stringarg: the point in the string at which to begin matching */
853 /* strend: pointer to null at end of string */
854 /* strbeg: real beginning of string */
855 /* minend: end of match must be >= minend bytes after stringarg. */
856 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
857 * itself is accessed via the pointers above */
858 /* nosave: For optimizations. */
859 {
860 PERL_ARGS_ASSERT_PREGEXEC;
861
862 return
863 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
864 nosave ? 0 : REXEC_COPY_STR);
865 }
866 #endif
867
868
869
870 /* re_intuit_start():
871 *
872 * Based on some optimiser hints, try to find the earliest position in the
873 * string where the regex could match.
874 *
875 * rx: the regex to match against
876 * sv: the SV being matched: only used for utf8 flag; the string
877 * itself is accessed via the pointers below. Note that on
878 * something like an overloaded SV, SvPOK(sv) may be false
879 * and the string pointers may point to something unrelated to
880 * the SV itself.
881 * strbeg: real beginning of string
882 * strpos: the point in the string at which to begin matching
883 * strend: pointer to the byte following the last char of the string
884 * flags currently unused; set to 0
885 * data: currently unused; set to NULL
886 *
887 * The basic idea of re_intuit_start() is to use some known information
888 * about the pattern, namely:
889 *
890 * a) the longest known anchored substring (i.e. one that's at a
891 * constant offset from the beginning of the pattern; but not
892 * necessarily at a fixed offset from the beginning of the
893 * string);
894 * b) the longest floating substring (i.e. one that's not at a constant
895 * offset from the beginning of the pattern);
896 * c) Whether the pattern is anchored to the string; either
897 * an absolute anchor: /^../, or anchored to \n: /^.../m,
898 * or anchored to pos(): /\G/;
899 * d) A start class: a real or synthetic character class which
900 * represents which characters are legal at the start of the pattern;
901 *
902 * to either quickly reject the match, or to find the earliest position
903 * within the string at which the pattern might match, thus avoiding
904 * running the full NFA engine at those earlier locations, only to
905 * eventually fail and retry further along.
906 *
907 * Returns NULL if the pattern can't match, or returns the address within
908 * the string which is the earliest place the match could occur.
909 *
910 * The longest of the anchored and floating substrings is called 'check'
911 * and is checked first. The other is called 'other' and is checked
912 * second. The 'other' substring may not be present. For example,
913 *
914 * /(abc|xyz)ABC\d{0,3}DEFG/
915 *
916 * will have
917 *
918 * check substr (float) = "DEFG", offset 6..9 chars
919 * other substr (anchored) = "ABC", offset 3..3 chars
920 * stclass = [ax]
921 *
922 * Be aware that during the course of this function, sometimes 'anchored'
923 * refers to a substring being anchored relative to the start of the
924 * pattern, and sometimes to the pattern itself being anchored relative to
925 * the string. For example:
926 *
927 * /\dabc/: "abc" is anchored to the pattern;
928 * /^\dabc/: "abc" is anchored to the pattern and the string;
929 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
930 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
931 * but the pattern is anchored to the string.
932 */
933
934 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)935 Perl_re_intuit_start(pTHX_
936 REGEXP * const rx,
937 SV *sv,
938 const char * const strbeg,
939 char *strpos,
940 char *strend,
941 const U32 flags,
942 re_scream_pos_data *data)
943 {
944 struct regexp *const prog = ReANY(rx);
945 SSize_t start_shift = prog->check_offset_min;
946 /* Should be nonnegative! */
947 SSize_t end_shift = 0;
948 /* current lowest pos in string where the regex can start matching */
949 char *rx_origin = strpos;
950 SV *check;
951 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
952 U8 other_ix = 1 - prog->substrs->check_ix;
953 bool ml_anch = 0;
954 char *other_last = strpos;/* latest pos 'other' substr already checked to */
955 char *check_at = NULL; /* check substr found at this pos */
956 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
957 RXi_GET_DECL(prog,progi);
958 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
959 regmatch_info *const reginfo = ®info_buf;
960 DECLARE_AND_GET_RE_DEBUG_FLAGS;
961
962 PERL_ARGS_ASSERT_RE_INTUIT_START;
963 PERL_UNUSED_ARG(flags);
964 PERL_UNUSED_ARG(data);
965
966 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
967 "Intuit: trying to determine minimum start position...\n"));
968
969 /* for now, assume that all substr offsets are positive. If at some point
970 * in the future someone wants to do clever things with lookbehind and
971 * -ve offsets, they'll need to fix up any code in this function
972 * which uses these offsets. See the thread beginning
973 * <20140113145929.GF27210@iabyn.com>
974 */
975 assert(prog->substrs->data[0].min_offset >= 0);
976 assert(prog->substrs->data[0].max_offset >= 0);
977 assert(prog->substrs->data[1].min_offset >= 0);
978 assert(prog->substrs->data[1].max_offset >= 0);
979 assert(prog->substrs->data[2].min_offset >= 0);
980 assert(prog->substrs->data[2].max_offset >= 0);
981
982 /* for now, assume that if both present, that the floating substring
983 * doesn't start before the anchored substring.
984 * If you break this assumption (e.g. doing better optimisations
985 * with lookahead/behind), then you'll need to audit the code in this
986 * function carefully first
987 */
988 assert(
989 ! ( (prog->anchored_utf8 || prog->anchored_substr)
990 && (prog->float_utf8 || prog->float_substr))
991 || (prog->float_min_offset >= prog->anchored_offset));
992
993 /* byte rather than char calculation for efficiency. It fails
994 * to quickly reject some cases that can't match, but will reject
995 * them later after doing full char arithmetic */
996 if (prog->minlen > strend - strpos) {
997 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998 " String too short...\n"));
999 goto fail;
1000 }
1001
1002 RXp_MATCH_UTF8_set(prog, utf8_target);
1003 reginfo->is_utf8_target = cBOOL(utf8_target);
1004 reginfo->info_aux = NULL;
1005 reginfo->strbeg = strbeg;
1006 reginfo->strend = strend;
1007 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1008 reginfo->intuit = 1;
1009 /* not actually used within intuit, but zero for safety anyway */
1010 reginfo->poscache_maxiter = 0;
1011
1012 if (utf8_target) {
1013 if ((!prog->anchored_utf8 && prog->anchored_substr)
1014 || (!prog->float_utf8 && prog->float_substr))
1015 to_utf8_substr(prog);
1016 check = prog->check_utf8;
1017 } else {
1018 if (!prog->check_substr && prog->check_utf8) {
1019 if (! to_byte_substr(prog)) {
1020 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1021 }
1022 }
1023 check = prog->check_substr;
1024 }
1025
1026 /* dump the various substring data */
1027 DEBUG_OPTIMISE_MORE_r({
1028 int i;
1029 for (i=0; i<=2; i++) {
1030 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1031 : prog->substrs->data[i].substr);
1032 if (!sv)
1033 continue;
1034
1035 Perl_re_printf( aTHX_
1036 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1037 " useful=%" IVdf " utf8=%d [%s]\n",
1038 i,
1039 (IV)prog->substrs->data[i].min_offset,
1040 (IV)prog->substrs->data[i].max_offset,
1041 (IV)prog->substrs->data[i].end_shift,
1042 BmUSEFUL(sv),
1043 utf8_target ? 1 : 0,
1044 SvPEEK(sv));
1045 }
1046 });
1047
1048 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1049
1050 /* ml_anch: check after \n?
1051 *
1052 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1053 * with /.*.../, these flags will have been added by the
1054 * compiler:
1055 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
1056 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
1057 */
1058 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
1059 && !(prog->intflags & PREGf_IMPLICIT);
1060
1061 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1062 /* we are only allowed to match at BOS or \G */
1063
1064 /* trivially reject if there's a BOS anchor and we're not at BOS.
1065 *
1066 * Note that we don't try to do a similar quick reject for
1067 * \G, since generally the caller will have calculated strpos
1068 * based on pos() and gofs, so the string is already correctly
1069 * anchored by definition; and handling the exceptions would
1070 * be too fiddly (e.g. REXEC_IGNOREPOS).
1071 */
1072 if ( strpos != strbeg
1073 && (prog->intflags & PREGf_ANCH_SBOL))
1074 {
1075 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1076 " Not at start...\n"));
1077 goto fail;
1078 }
1079
1080 /* in the presence of an anchor, the anchored (relative to the
1081 * start of the regex) substr must also be anchored relative
1082 * to strpos. So quickly reject if substr isn't found there.
1083 * This works for \G too, because the caller will already have
1084 * subtracted gofs from pos, and gofs is the offset from the
1085 * \G to the start of the regex. For example, in /.abc\Gdef/,
1086 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1087 * caller will have set strpos=pos()-4; we look for the substr
1088 * at position pos()-4+1, which lines up with the "a" */
1089
1090 if (prog->check_offset_min == prog->check_offset_max) {
1091 /* Substring at constant offset from beg-of-str... */
1092 SSize_t slen = SvCUR(check);
1093 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1094
1095 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096 " Looking for check substr at fixed offset %" IVdf "...\n",
1097 (IV)prog->check_offset_min));
1098
1099 if (SvTAIL(check)) {
1100 /* In this case, the regex is anchored at the end too.
1101 * Unless it's a multiline match, the lengths must match
1102 * exactly, give or take a \n. NB: slen >= 1 since
1103 * the last char of check is \n */
1104 if (!multiline
1105 && ( strend - s > slen
1106 || strend - s < slen - 1
1107 || (strend - s == slen && strend[-1] != '\n')))
1108 {
1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1110 " String too long...\n"));
1111 goto fail_finish;
1112 }
1113 /* Now should match s[0..slen-2] */
1114 slen--;
1115 }
1116 if (slen && (strend - s < slen
1117 || *SvPVX_const(check) != *s
1118 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1119 {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 " String not equal...\n"));
1122 goto fail_finish;
1123 }
1124
1125 check_at = s;
1126 goto success_at_start;
1127 }
1128 }
1129 }
1130
1131 end_shift = prog->check_end_shift;
1132
1133 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1134 if (end_shift < 0)
1135 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1136 (IV)end_shift, RX_PRECOMP(rx));
1137 #endif
1138
1139 restart:
1140
1141 /* This is the (re)entry point of the main loop in this function.
1142 * The goal of this loop is to:
1143 * 1) find the "check" substring in the region rx_origin..strend
1144 * (adjusted by start_shift / end_shift). If not found, reject
1145 * immediately.
1146 * 2) If it exists, look for the "other" substr too if defined; for
1147 * example, if the check substr maps to the anchored substr, then
1148 * check the floating substr, and vice-versa. If not found, go
1149 * back to (1) with rx_origin suitably incremented.
1150 * 3) If we find an rx_origin position that doesn't contradict
1151 * either of the substrings, then check the possible additional
1152 * constraints on rx_origin of /^.../m or a known start class.
1153 * If these fail, then depending on which constraints fail, jump
1154 * back to here, or to various other re-entry points further along
1155 * that skip some of the first steps.
1156 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1157 * substring. If the start position was determined to be at the
1158 * beginning of the string - so, not rejected, but not optimised,
1159 * since we have to run regmatch from position 0 - decrement the
1160 * BmUSEFUL() count. Otherwise increment it.
1161 */
1162
1163
1164 /* first, look for the 'check' substring */
1165
1166 {
1167 U8* start_point;
1168 U8* end_point;
1169
1170 DEBUG_OPTIMISE_MORE_r({
1171 Perl_re_printf( aTHX_
1172 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1173 " Start shift: %" IVdf " End shift %" IVdf
1174 " Real end Shift: %" IVdf "\n",
1175 (IV)(rx_origin - strbeg),
1176 (IV)prog->check_offset_min,
1177 (IV)start_shift,
1178 (IV)end_shift,
1179 (IV)prog->check_end_shift);
1180 });
1181
1182 end_point = HOPBACK3(strend, end_shift, rx_origin);
1183 if (!end_point)
1184 goto fail_finish;
1185 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1186 if (!start_point)
1187 goto fail_finish;
1188
1189
1190 /* If the regex is absolutely anchored to either the start of the
1191 * string (SBOL) or to pos() (ANCH_GPOS), then
1192 * check_offset_max represents an upper bound on the string where
1193 * the substr could start. For the ANCH_GPOS case, we assume that
1194 * the caller of intuit will have already set strpos to
1195 * pos()-gofs, so in this case strpos + offset_max will still be
1196 * an upper bound on the substr.
1197 */
1198 if (!ml_anch
1199 && prog->intflags & PREGf_ANCH
1200 && prog->check_offset_max != SSize_t_MAX)
1201 {
1202 SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1203 const char * const anchor =
1204 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1205 SSize_t targ_len = (char*)end_point - anchor;
1206
1207 if (check_len > targ_len) {
1208 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209 "Target string too short to match required substring...\n"));
1210 goto fail_finish;
1211 }
1212
1213 /* do a bytes rather than chars comparison. It's conservative;
1214 * so it skips doing the HOP if the result can't possibly end
1215 * up earlier than the old value of end_point.
1216 */
1217 assert(anchor + check_len <= (char *)end_point);
1218 if (prog->check_offset_max + check_len < targ_len) {
1219 end_point = HOP3lim((U8*)anchor,
1220 prog->check_offset_max,
1221 end_point - check_len
1222 )
1223 + check_len;
1224 if (end_point < start_point)
1225 goto fail_finish;
1226 }
1227 }
1228
1229 check_at = fbm_instr( start_point, end_point,
1230 check, multiline ? FBMrf_MULTILINE : 0);
1231
1232 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1234 (IV)((char*)start_point - strbeg),
1235 (IV)((char*)end_point - strbeg),
1236 (IV)(check_at ? check_at - strbeg : -1)
1237 ));
1238
1239 /* Update the count-of-usability, remove useless subpatterns,
1240 unshift s. */
1241
1242 DEBUG_EXECUTE_r({
1243 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1244 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1245 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1246 (check_at ? "Found" : "Did not find"),
1247 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1248 ? "anchored" : "floating"),
1249 quoted,
1250 RE_SV_TAIL(check),
1251 (check_at ? " at offset " : "...\n") );
1252 });
1253
1254 if (!check_at)
1255 goto fail_finish;
1256 /* set rx_origin to the minimum position where the regex could start
1257 * matching, given the constraint of the just-matched check substring.
1258 * But don't set it lower than previously.
1259 */
1260
1261 if (check_at - rx_origin > prog->check_offset_max)
1262 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1263 /* Finish the diagnostic message */
1264 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265 "%ld (rx_origin now %" IVdf ")...\n",
1266 (long)(check_at - strbeg),
1267 (IV)(rx_origin - strbeg)
1268 ));
1269 }
1270
1271
1272 /* now look for the 'other' substring if defined */
1273
1274 if (prog->substrs->data[other_ix].utf8_substr
1275 || prog->substrs->data[other_ix].substr)
1276 {
1277 /* Take into account the "other" substring. */
1278 char *last, *last1;
1279 char *s;
1280 SV* must;
1281 struct reg_substr_datum *other;
1282
1283 do_other_substr:
1284 other = &prog->substrs->data[other_ix];
1285 if (!utf8_target && !other->substr) {
1286 if (!to_byte_substr(prog)) {
1287 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1288 }
1289 }
1290
1291 /* if "other" is anchored:
1292 * we've previously found a floating substr starting at check_at.
1293 * This means that the regex origin must lie somewhere
1294 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1295 * and max: HOP3(check_at, -check_offset_min)
1296 * (except that min will be >= strpos)
1297 * So the fixed substr must lie somewhere between
1298 * HOP3(min, anchored_offset)
1299 * HOP3(max, anchored_offset) + SvCUR(substr)
1300 */
1301
1302 /* if "other" is floating
1303 * Calculate last1, the absolute latest point where the
1304 * floating substr could start in the string, ignoring any
1305 * constraints from the earlier fixed match. It is calculated
1306 * as follows:
1307 *
1308 * strend - prog->minlen (in chars) is the absolute latest
1309 * position within the string where the origin of the regex
1310 * could appear. The latest start point for the floating
1311 * substr is float_min_offset(*) on from the start of the
1312 * regex. last1 simply combines thee two offsets.
1313 *
1314 * (*) You might think the latest start point should be
1315 * float_max_offset from the regex origin, and technically
1316 * you'd be correct. However, consider
1317 * /a\d{2,4}bcd\w/
1318 * Here, float min, max are 3,5 and minlen is 7.
1319 * This can match either
1320 * /a\d\dbcd\w/
1321 * /a\d\d\dbcd\w/
1322 * /a\d\d\d\dbcd\w/
1323 * In the first case, the regex matches minlen chars; in the
1324 * second, minlen+1, in the third, minlen+2.
1325 * In the first case, the floating offset is 3 (which equals
1326 * float_min), in the second, 4, and in the third, 5 (which
1327 * equals float_max). In all cases, the floating string bcd
1328 * can never start more than 4 chars from the end of the
1329 * string, which equals minlen - float_min. As the substring
1330 * starts to match more than float_min from the start of the
1331 * regex, it makes the regex match more than minlen chars,
1332 * and the two cancel each other out. So we can always use
1333 * float_min - minlen, rather than float_max - minlen for the
1334 * latest position in the string.
1335 *
1336 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1337 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1338 */
1339
1340 assert(prog->minlen >= other->min_offset);
1341 last1 = HOP3c(strend,
1342 other->min_offset - prog->minlen, strbeg);
1343
1344 if (other_ix) {/* i.e. if (other-is-float) */
1345 /* last is the latest point where the floating substr could
1346 * start, *given* any constraints from the earlier fixed
1347 * match. This constraint is that the floating string starts
1348 * <= float_max_offset chars from the regex origin (rx_origin).
1349 * If this value is less than last1, use it instead.
1350 */
1351 assert(rx_origin <= last1);
1352 last =
1353 /* this condition handles the offset==infinity case, and
1354 * is a short-cut otherwise. Although it's comparing a
1355 * byte offset to a char length, it does so in a safe way,
1356 * since 1 char always occupies 1 or more bytes,
1357 * so if a string range is (last1 - rx_origin) bytes,
1358 * it will be less than or equal to (last1 - rx_origin)
1359 * chars; meaning it errs towards doing the accurate HOP3
1360 * rather than just using last1 as a short-cut */
1361 (last1 - rx_origin) < other->max_offset
1362 ? last1
1363 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1364 }
1365 else {
1366 assert(strpos + start_shift <= check_at);
1367 last = HOP4c(check_at, other->min_offset - start_shift,
1368 strbeg, strend);
1369 }
1370
1371 s = HOP3c(rx_origin, other->min_offset, strend);
1372 if (s < other_last) /* These positions already checked */
1373 s = other_last;
1374
1375 must = utf8_target ? other->utf8_substr : other->substr;
1376 assert(SvPOK(must));
1377 {
1378 char *from = s;
1379 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1380
1381 if (to > strend)
1382 to = strend;
1383 if (from > to) {
1384 s = NULL;
1385 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1386 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1387 (IV)(from - strbeg),
1388 (IV)(to - strbeg)
1389 ));
1390 }
1391 else {
1392 s = fbm_instr(
1393 (unsigned char*)from,
1394 (unsigned char*)to,
1395 must,
1396 multiline ? FBMrf_MULTILINE : 0
1397 );
1398 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1399 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1400 (IV)(from - strbeg),
1401 (IV)(to - strbeg),
1402 (IV)(s ? s - strbeg : -1)
1403 ));
1404 }
1405 }
1406
1407 DEBUG_EXECUTE_r({
1408 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1409 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1410 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1411 s ? "Found" : "Contradicts",
1412 other_ix ? "floating" : "anchored",
1413 quoted, RE_SV_TAIL(must));
1414 });
1415
1416
1417 if (!s) {
1418 /* last1 is latest possible substr location. If we didn't
1419 * find it before there, we never will */
1420 if (last >= last1) {
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 "; giving up...\n"));
1423 goto fail_finish;
1424 }
1425
1426 /* try to find the check substr again at a later
1427 * position. Maybe next time we'll find the "other" substr
1428 * in range too */
1429 other_last = HOP3c(last, 1, strend) /* highest failure */;
1430 rx_origin =
1431 other_ix /* i.e. if other-is-float */
1432 ? HOP3c(rx_origin, 1, strend)
1433 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1434 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1435 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1436 (other_ix ? "floating" : "anchored"),
1437 (long)(HOP3c(check_at, 1, strend) - strbeg),
1438 (IV)(rx_origin - strbeg)
1439 ));
1440 goto restart;
1441 }
1442 else {
1443 if (other_ix) { /* if (other-is-float) */
1444 /* other_last is set to s, not s+1, since its possible for
1445 * a floating substr to fail first time, then succeed
1446 * second time at the same floating position; e.g.:
1447 * "-AB--AABZ" =~ /\wAB\d*Z/
1448 * The first time round, anchored and float match at
1449 * "-(AB)--AAB(Z)" then fail on the initial \w character
1450 * class. Second time round, they match at "-AB--A(AB)(Z)".
1451 */
1452 other_last = s;
1453 }
1454 else {
1455 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1456 other_last = HOP3c(s, 1, strend);
1457 }
1458 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1459 " at offset %ld (rx_origin now %" IVdf ")...\n",
1460 (long)(s - strbeg),
1461 (IV)(rx_origin - strbeg)
1462 ));
1463
1464 }
1465 }
1466 else {
1467 DEBUG_OPTIMISE_MORE_r(
1468 Perl_re_printf( aTHX_
1469 " Check-only match: offset min:%" IVdf " max:%" IVdf
1470 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1471 " strend:%" IVdf "\n",
1472 (IV)prog->check_offset_min,
1473 (IV)prog->check_offset_max,
1474 (IV)(check_at-strbeg),
1475 (IV)(rx_origin-strbeg),
1476 (IV)(rx_origin-check_at),
1477 (IV)(strend-strbeg)
1478 )
1479 );
1480 }
1481
1482 postprocess_substr_matches:
1483
1484 /* handle the extra constraint of /^.../m if present */
1485
1486 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1487 char *s;
1488
1489 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1490 " looking for /^/m anchor"));
1491
1492 /* we have failed the constraint of a \n before rx_origin.
1493 * Find the next \n, if any, even if it's beyond the current
1494 * anchored and/or floating substrings. Whether we should be
1495 * scanning ahead for the next \n or the next substr is debatable.
1496 * On the one hand you'd expect rare substrings to appear less
1497 * often than \n's. On the other hand, searching for \n means
1498 * we're effectively flipping between check_substr and "\n" on each
1499 * iteration as the current "rarest" candidate string, which
1500 * means for example that we'll quickly reject the whole string if
1501 * hasn't got a \n, rather than trying every substr position
1502 * first
1503 */
1504
1505 s = HOP3c(strend, - prog->minlen, strpos);
1506 if (s <= rx_origin ||
1507 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1508 {
1509 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1510 " Did not find /%s^%s/m...\n",
1511 PL_colors[0], PL_colors[1]));
1512 goto fail_finish;
1513 }
1514
1515 /* earliest possible origin is 1 char after the \n.
1516 * (since *rx_origin == '\n', it's safe to ++ here rather than
1517 * HOP(rx_origin, 1)) */
1518 rx_origin++;
1519
1520 if (prog->substrs->check_ix == 0 /* check is anchored */
1521 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1522 {
1523 /* Position contradicts check-string; either because
1524 * check was anchored (and thus has no wiggle room),
1525 * or check was float and rx_origin is above the float range */
1526 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1527 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1528 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1529 goto restart;
1530 }
1531
1532 /* if we get here, the check substr must have been float,
1533 * is in range, and we may or may not have had an anchored
1534 * "other" substr which still contradicts */
1535 assert(prog->substrs->check_ix); /* check is float */
1536
1537 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1538 /* whoops, the anchored "other" substr exists, so we still
1539 * contradict. On the other hand, the float "check" substr
1540 * didn't contradict, so just retry the anchored "other"
1541 * substr */
1542 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1543 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1544 PL_colors[0], PL_colors[1],
1545 (IV)(rx_origin - strbeg + prog->anchored_offset),
1546 (IV)(rx_origin - strbeg)
1547 ));
1548 goto do_other_substr;
1549 }
1550
1551 /* success: we don't contradict the found floating substring
1552 * (and there's no anchored substr). */
1553 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554 " Found /%s^%s/m with rx_origin %ld...\n",
1555 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556 }
1557 else {
1558 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1559 " (multiline anchor test skipped)\n"));
1560 }
1561
1562 success_at_start:
1563
1564
1565 /* if we have a starting character class, then test that extra constraint.
1566 * (trie stclasses are too expensive to use here, we are better off to
1567 * leave it to regmatch itself) */
1568
1569 if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1570 const U8* const str = (U8*)STRING(progi->regstclass);
1571
1572 /* XXX this value could be pre-computed */
1573 const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1574 ? (reginfo->is_utf8_pat
1575 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1576 : (SSize_t)STR_LEN(progi->regstclass))
1577 : 1);
1578 char * endpos;
1579 char *s;
1580 /* latest pos that a matching float substr constrains rx start to */
1581 char *rx_max_float = NULL;
1582
1583 /* if the current rx_origin is anchored, either by satisfying an
1584 * anchored substring constraint, or a /^.../m constraint, then we
1585 * can reject the current origin if the start class isn't found
1586 * at the current position. If we have a float-only match, then
1587 * rx_origin is constrained to a range; so look for the start class
1588 * in that range. if neither, then look for the start class in the
1589 * whole rest of the string */
1590
1591 /* XXX DAPM it's not clear what the minlen test is for, and why
1592 * it's not used in the floating case. Nothing in the test suite
1593 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1594 * Here are some old comments, which may or may not be correct:
1595 *
1596 * minlen == 0 is possible if regstclass is \b or \B,
1597 * and the fixed substr is ''$.
1598 * Since minlen is already taken into account, rx_origin+1 is
1599 * before strend; accidentally, minlen >= 1 guaranties no false
1600 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1601 * 0) below assumes that regstclass does not come from lookahead...
1602 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1603 * This leaves EXACTF-ish only, which are dealt with in
1604 * find_byclass().
1605 */
1606
1607 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1608 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1609 else if (prog->float_substr || prog->float_utf8) {
1610 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1611 endpos = HOP3clim(rx_max_float, cl_l, strend);
1612 }
1613 else
1614 endpos= strend;
1615
1616 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1617 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1618 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1619 (IV)start_shift, (IV)(check_at - strbeg),
1620 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1621
1622 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1623 reginfo);
1624 if (!s) {
1625 if (endpos == strend) {
1626 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1627 " Could not match STCLASS...\n") );
1628 goto fail;
1629 }
1630 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1631 " This position contradicts STCLASS...\n") );
1632 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1633 && !(prog->intflags & PREGf_IMPLICIT))
1634 goto fail;
1635
1636 /* Contradict one of substrings */
1637 if (prog->anchored_substr || prog->anchored_utf8) {
1638 if (prog->substrs->check_ix == 1) { /* check is float */
1639 /* Have both, check_string is floating */
1640 assert(rx_origin + start_shift <= check_at);
1641 if (rx_origin + start_shift != check_at) {
1642 /* not at latest position float substr could match:
1643 * Recheck anchored substring, but not floating.
1644 * The condition above is in bytes rather than
1645 * chars for efficiency. It's conservative, in
1646 * that it errs on the side of doing 'goto
1647 * do_other_substr'. In this case, at worst,
1648 * an extra anchored search may get done, but in
1649 * practice the extra fbm_instr() is likely to
1650 * get skipped anyway. */
1651 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1652 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1653 (long)(other_last - strbeg),
1654 (IV)(rx_origin - strbeg)
1655 ));
1656 goto do_other_substr;
1657 }
1658 }
1659 }
1660 else {
1661 /* float-only */
1662
1663 if (ml_anch) {
1664 /* In the presence of ml_anch, we might be able to
1665 * find another \n without breaking the current float
1666 * constraint. */
1667
1668 /* strictly speaking this should be HOP3c(..., 1, ...),
1669 * but since we goto a block of code that's going to
1670 * search for the next \n if any, its safe here */
1671 rx_origin++;
1672 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1673 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1674 PL_colors[0], PL_colors[1],
1675 (long)(rx_origin - strbeg)) );
1676 goto postprocess_substr_matches;
1677 }
1678
1679 /* strictly speaking this can never be true; but might
1680 * be if we ever allow intuit without substrings */
1681 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1682 goto fail;
1683
1684 rx_origin = rx_max_float;
1685 }
1686
1687 /* at this point, any matching substrings have been
1688 * contradicted. Start again... */
1689
1690 rx_origin = HOP3c(rx_origin, 1, strend);
1691
1692 /* uses bytes rather than char calculations for efficiency.
1693 * It's conservative: it errs on the side of doing 'goto restart',
1694 * where there is code that does a proper char-based test */
1695 if (rx_origin + start_shift + end_shift > strend) {
1696 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1697 " Could not match STCLASS...\n") );
1698 goto fail;
1699 }
1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1702 (prog->substrs->check_ix ? "floating" : "anchored"),
1703 (long)(rx_origin + start_shift - strbeg),
1704 (IV)(rx_origin - strbeg)
1705 ));
1706 goto restart;
1707 }
1708
1709 /* Success !!! */
1710
1711 if (rx_origin != s) {
1712 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1713 " By STCLASS: moving %ld --> %ld\n",
1714 (long)(rx_origin - strbeg), (long)(s - strbeg))
1715 );
1716 }
1717 else {
1718 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1719 " Does not contradict STCLASS...\n");
1720 );
1721 }
1722 }
1723
1724 /* Decide whether using the substrings helped */
1725
1726 if (rx_origin != strpos) {
1727 /* Fixed substring is found far enough so that the match
1728 cannot start at strpos. */
1729
1730 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1731 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1732 }
1733 else {
1734 /* The found rx_origin position does not prohibit matching at
1735 * strpos, so calling intuit didn't gain us anything. Decrement
1736 * the BmUSEFUL() count on the check substring, and if we reach
1737 * zero, free it. */
1738 if (!(prog->intflags & PREGf_NAUGHTY)
1739 && (utf8_target ? (
1740 prog->check_utf8 /* Could be deleted already */
1741 && --BmUSEFUL(prog->check_utf8) < 0
1742 && (prog->check_utf8 == prog->float_utf8)
1743 ) : (
1744 prog->check_substr /* Could be deleted already */
1745 && --BmUSEFUL(prog->check_substr) < 0
1746 && (prog->check_substr == prog->float_substr)
1747 )))
1748 {
1749 /* If flags & SOMETHING - do not do it many times on the same match */
1750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1751 /* XXX Does the destruction order has to change with utf8_target? */
1752 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1753 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1754 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1755 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1756 check = NULL; /* abort */
1757 /* XXXX This is a remnant of the old implementation. It
1758 looks wasteful, since now INTUIT can use many
1759 other heuristics. */
1760 prog->extflags &= ~RXf_USE_INTUIT;
1761 }
1762 }
1763
1764 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1765 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1766 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1767
1768 return rx_origin;
1769
1770 fail_finish: /* Substring not found */
1771 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1772 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1773 fail:
1774 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1775 PL_colors[4], PL_colors[5]));
1776 return NULL;
1777 }
1778
1779
1780 #define DECL_TRIE_TYPE(scan) \
1781 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1782 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1783 trie_utf8l, trie_flu8, trie_flu8_latin } \
1784 trie_type = ((FLAGS(scan) == EXACT) \
1785 ? (utf8_target ? trie_utf8 : trie_plain) \
1786 : (FLAGS(scan) == EXACTL) \
1787 ? (utf8_target ? trie_utf8l : trie_plain) \
1788 : (FLAGS(scan) == EXACTFAA) \
1789 ? (utf8_target \
1790 ? trie_utf8_exactfa_fold \
1791 : trie_latin_utf8_exactfa_fold) \
1792 : (FLAGS(scan) == EXACTFLU8 \
1793 ? (utf8_target \
1794 ? trie_flu8 \
1795 : trie_flu8_latin) \
1796 : (utf8_target \
1797 ? trie_utf8_fold \
1798 : trie_latin_utf8_fold)))
1799
1800 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1801 * 'foldbuf+sizeof(foldbuf)' */
1802 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1803 STMT_START { \
1804 STRLEN skiplen; \
1805 U8 flags = FOLD_FLAGS_FULL; \
1806 switch (trie_type) { \
1807 case trie_flu8: \
1808 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1809 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1810 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1811 } \
1812 goto do_trie_utf8_fold; \
1813 case trie_utf8_exactfa_fold: \
1814 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1815 /* FALLTHROUGH */ \
1816 case trie_utf8_fold: \
1817 do_trie_utf8_fold: \
1818 if ( foldlen>0 ) { \
1819 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1820 foldlen -= len; \
1821 uscan += len; \
1822 len=0; \
1823 } else { \
1824 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1825 flags); \
1826 len = UTF8_SAFE_SKIP(uc, uc_end); \
1827 skiplen = UVCHR_SKIP( uvc ); \
1828 foldlen -= skiplen; \
1829 uscan = foldbuf + skiplen; \
1830 } \
1831 break; \
1832 case trie_flu8_latin: \
1833 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1834 goto do_trie_latin_utf8_fold; \
1835 case trie_latin_utf8_exactfa_fold: \
1836 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1837 /* FALLTHROUGH */ \
1838 case trie_latin_utf8_fold: \
1839 do_trie_latin_utf8_fold: \
1840 if ( foldlen>0 ) { \
1841 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1842 foldlen -= len; \
1843 uscan += len; \
1844 len=0; \
1845 } else { \
1846 len = 1; \
1847 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1848 skiplen = UVCHR_SKIP( uvc ); \
1849 foldlen -= skiplen; \
1850 uscan = foldbuf + skiplen; \
1851 } \
1852 break; \
1853 case trie_utf8l: \
1854 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1855 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1856 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1857 } \
1858 /* FALLTHROUGH */ \
1859 case trie_utf8: \
1860 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1861 break; \
1862 case trie_plain: \
1863 uvc = (UV)*uc; \
1864 len = 1; \
1865 } \
1866 if (uvc < 256) { \
1867 charid = trie->charmap[ uvc ]; \
1868 } \
1869 else { \
1870 charid = 0; \
1871 if (widecharmap) { \
1872 SV** const svpp = hv_fetch(widecharmap, \
1873 (char*)&uvc, sizeof(UV), 0); \
1874 if (svpp) \
1875 charid = (U16)SvIV(*svpp); \
1876 } \
1877 } \
1878 } STMT_END
1879
1880 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1881 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1882 startpos, doutf8, depth)
1883
1884 #define GET_ANYOFH_INVLIST(prog, n) \
1885 GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1886
1887 #define REXEC_FBC_UTF8_SCAN(CODE) \
1888 STMT_START { \
1889 while (s < strend) { \
1890 CODE \
1891 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1892 } \
1893 } STMT_END
1894
1895 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1896 STMT_START { \
1897 while (s < strend) { \
1898 CODE \
1899 s++; \
1900 } \
1901 } STMT_END
1902
1903 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1904 STMT_START { \
1905 while (s < strend) { \
1906 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1907 } \
1908 } STMT_END
1909
1910 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1911 STMT_START { \
1912 while (s < strend) { \
1913 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1914 } \
1915 } STMT_END
1916
1917 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1918 if (COND) { \
1919 FBC_CHECK_AND_TRY \
1920 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1921 previous_occurrence_end = s; \
1922 } \
1923 else { \
1924 s += UTF8SKIP(s); \
1925 }
1926
1927 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1928 if (COND) { \
1929 FBC_CHECK_AND_TRY \
1930 s++; \
1931 previous_occurrence_end = s; \
1932 } \
1933 else { \
1934 s++; \
1935 }
1936
1937 #define LAST_REGTRY_SKIPPED_FORWARD(reginfo) (reginfo->cutpoint)
1938
1939 /* We keep track of where the next character should start after an occurrence
1940 * of the one we're looking for. Knowing that, we can see right away if the
1941 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1942 * don't accept the 2nd and succeeding adjacent occurrences */
1943 #define FBC_CHECK_AND_TRY \
1944 if ( ( doevery \
1945 || s != previous_occurrence_end \
1946 || LAST_REGTRY_SKIPPED_FORWARD(reginfo) ) \
1947 && ( reginfo->intuit \
1948 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1949 { \
1950 goto got_it; \
1951 }
1952
1953
1954 /* These differ from the above macros in that they call a function which
1955 * returns the next occurrence of the thing being looked for in 's'; and
1956 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c)
1957 * */
1958 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1959 while (s < strend) { \
1960 s = (char *) (f); \
1961 if (s >= strend) { \
1962 break; \
1963 } \
1964 \
1965 FBC_CHECK_AND_TRY \
1966 s += UTF8SKIP(s); \
1967 previous_occurrence_end = s; \
1968 }
1969
1970 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1971 while (s < strend) { \
1972 s = (char *) (f); \
1973 if (s >= strend) { \
1974 break; \
1975 } \
1976 \
1977 FBC_CHECK_AND_TRY \
1978 s++; \
1979 previous_occurrence_end = s; \
1980 }
1981
1982 /* This is like the above macro except the function returns NULL if there is no
1983 * occurrence, and there is a further condition that must be matched besides
1984 * the function */
1985 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \
1986 while (s < strend) { \
1987 s = (char *) (f); \
1988 if (s == NULL) { \
1989 s = (char *) strend; \
1990 break; \
1991 } \
1992 \
1993 if (COND) { \
1994 FBC_CHECK_AND_TRY \
1995 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1996 previous_occurrence_end = s; \
1997 } \
1998 else { \
1999 s += UTF8SKIP(s); \
2000 } \
2001 }
2002
2003 /* This differs from the above macros in that it is passed a single byte that
2004 * is known to begin the next occurrence of the thing being looked for in 's'.
2005 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
2006 * at that position. */
2007 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
2008 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \
2009 COND)
2010
2011 /* This is like the function above, but takes an entire string to look for
2012 * instead of a single byte */
2013 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \
2014 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \
2015 ninstr(s, strend, substr, substr_end), \
2016 COND)
2017
2018 /* The four macros below are slightly different versions of the same logic.
2019 *
2020 * The first is for /a and /aa when the target string is UTF-8. This can only
2021 * match ascii, but it must advance based on UTF-8. The other three handle
2022 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
2023 * looking for the boundary (or non-boundary) between a word and non-word
2024 * character. The utf8 and non-utf8 cases have the same logic, but the details
2025 * must be different. Find the "wordness" of the character just prior to this
2026 * one, and compare it with the wordness of this one. If they differ, we have
2027 * a boundary. At the beginning of the string, pretend that the previous
2028 * character was a new-line.
2029 *
2030 * All these macros uncleanly have side-effects with each other and outside
2031 * variables. So far it's been too much trouble to clean-up
2032 *
2033 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
2034 * a word character or not.
2035 * IF_SUCCESS is code to do if it finds that we are at a boundary between
2036 * word/non-word
2037 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
2038 *
2039 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
2040 * are looking for a boundary or for a non-boundary. If we are looking for a
2041 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
2042 * see if this tentative match actually works, and if so, to quit the loop
2043 * here. And vice-versa if we are looking for a non-boundary.
2044 *
2045 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
2046 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
2047 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
2048 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2049 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2050 * complement. But in that branch we complement tmp, meaning that at the
2051 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2052 * which means at the top of the loop in the next iteration, it is
2053 * TEST_NON_UTF8(s-1) */
2054 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2055 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2056 tmp = TEST_NON_UTF8(tmp); \
2057 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
2058 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2059 tmp = !tmp; \
2060 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
2061 } \
2062 else { \
2063 IF_FAIL; \
2064 } \
2065 ); \
2066
2067 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2068 * TEST_UTF8 is a macro that for the same input code points returns identically
2069 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
2070 * end pointer as well) */
2071 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
2072 if (s == reginfo->strbeg) { \
2073 tmp = '\n'; \
2074 } \
2075 else { /* Back-up to the start of the previous character */ \
2076 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
2077 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
2078 0, UTF8_ALLOW_DEFAULT); \
2079 } \
2080 tmp = TEST_UV(tmp); \
2081 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
2082 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
2083 tmp = !tmp; \
2084 IF_SUCCESS; \
2085 } \
2086 else { \
2087 IF_FAIL; \
2088 } \
2089 );
2090
2091 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
2092 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
2093 * set-up by the FBC_BOUND, etc macros below */
2094 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2095 UTF8_CODE; \
2096 /* Here, things have been set up by the previous code so that tmp is the \
2097 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
2098 * against the EOS, which we treat as a \n */ \
2099 if (tmp == ! TEST_NON_UTF8('\n')) { \
2100 IF_SUCCESS; \
2101 } \
2102 else { \
2103 IF_FAIL; \
2104 }
2105
2106 /* Same as the macro above, but the target isn't UTF-8 */
2107 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2108 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2109 tmp = TEST_NON_UTF8(tmp); \
2110 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2111 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2112 IF_SUCCESS; \
2113 tmp = !tmp; \
2114 } \
2115 else { \
2116 IF_FAIL; \
2117 } \
2118 ); \
2119 /* Here, things have been set up by the previous code so that tmp is \
2120 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2121 * matches against the EOS, which we treat as a \n */ \
2122 if (tmp == ! TEST_NON_UTF8('\n')) { \
2123 IF_SUCCESS; \
2124 } \
2125 else { \
2126 IF_FAIL; \
2127 }
2128
2129 /* This is the macro to use when we want to see if something that looks like it
2130 * could match, actually does, and if so exits the loop. It needs to be used
2131 * only for bounds checking macros, as it allows for matching beyond the end of
2132 * string (which should be zero length without having to look at the string
2133 * contents) */
2134 #define REXEC_FBC_TRYIT \
2135 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2136 goto got_it
2137
2138 /* The only difference between the BOUND and NBOUND cases is that
2139 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2140 * NBOUND. This is accomplished by passing it as either the if or else clause,
2141 * with the other one being empty (PLACEHOLDER is defined as empty).
2142 *
2143 * The TEST_FOO parameters are for operating on different forms of input, but
2144 * all should be ones that return identically for the same underlying code
2145 * points */
2146
2147 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2148 FBC_BOUND_COMMON_UTF8( \
2149 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2150 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2151
2152 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2153 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2154
2155 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2156 FBC_BOUND_COMMON_UTF8( \
2157 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2158 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2159
2160 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2161 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2162
2163 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2164 FBC_BOUND_COMMON_UTF8( \
2165 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2166 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2167
2168 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2169 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2170
2171 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2172 FBC_BOUND_COMMON_UTF8( \
2173 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2174 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2175
2176 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2177 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2178
2179 #ifdef DEBUGGING
2180 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2181 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2182 IV cp_out = _invlist_search(invlist, cp_in);
2183 assert(cp_out >= 0);
2184 return cp_out;
2185 }
2186 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2187 invmap[S_get_break_val_cp_checked(invlist, cp)]
2188 #else
2189 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2190 invmap[_invlist_search(invlist, cp)]
2191 #endif
2192
2193 /* Takes a pointer to an inversion list, a pointer to its corresponding
2194 * inversion map, and a code point, and returns the code point's value
2195 * according to the two arrays. It assumes that all code points have a value.
2196 * This is used as the base macro for macros for particular properties */
2197 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2198 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2199
2200 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2201 * of a code point, returning the value for the first code point in the string.
2202 * And it takes the particular macro name that finds the desired value given a
2203 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2204 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2205 (__ASSERT_(pos < strend) \
2206 /* Note assumes is valid UTF-8 */ \
2207 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2208
2209 /* Returns the GCB value for the input code point */
2210 #define getGCB_VAL_CP(cp) \
2211 _generic_GET_BREAK_VAL_CP( \
2212 PL_GCB_invlist, \
2213 _Perl_GCB_invmap, \
2214 (cp))
2215
2216 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2217 * bounded by pos and strend */
2218 #define getGCB_VAL_UTF8(pos, strend) \
2219 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2220
2221 /* Returns the LB value for the input code point */
2222 #define getLB_VAL_CP(cp) \
2223 _generic_GET_BREAK_VAL_CP( \
2224 PL_LB_invlist, \
2225 _Perl_LB_invmap, \
2226 (cp))
2227
2228 /* Returns the LB value for the first code point in the UTF-8 encoded string
2229 * bounded by pos and strend */
2230 #define getLB_VAL_UTF8(pos, strend) \
2231 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2232
2233
2234 /* Returns the SB value for the input code point */
2235 #define getSB_VAL_CP(cp) \
2236 _generic_GET_BREAK_VAL_CP( \
2237 PL_SB_invlist, \
2238 _Perl_SB_invmap, \
2239 (cp))
2240
2241 /* Returns the SB value for the first code point in the UTF-8 encoded string
2242 * bounded by pos and strend */
2243 #define getSB_VAL_UTF8(pos, strend) \
2244 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2245
2246 /* Returns the WB value for the input code point */
2247 #define getWB_VAL_CP(cp) \
2248 _generic_GET_BREAK_VAL_CP( \
2249 PL_WB_invlist, \
2250 _Perl_WB_invmap, \
2251 (cp))
2252
2253 /* Returns the WB value for the first code point in the UTF-8 encoded string
2254 * bounded by pos and strend */
2255 #define getWB_VAL_UTF8(pos, strend) \
2256 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2257
2258 /* We know what class REx starts with. Try to find this position... */
2259 /* if reginfo->intuit, its a dryrun */
2260 /* annoyingly all the vars in this routine have different names from their counterparts
2261 in regmatch. /grrr */
2262 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2263 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2264 const char *strend, regmatch_info *reginfo)
2265 {
2266
2267 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2268 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2269
2270 char *pat_string; /* The pattern's exactish string */
2271 char *pat_end; /* ptr to end char of pat_string */
2272 re_fold_t folder; /* Function for computing non-utf8 folds */
2273 const U8 *fold_array; /* array for folding ords < 256 */
2274 STRLEN ln;
2275 STRLEN lnc;
2276 U8 c1;
2277 U8 c2;
2278 char *e = NULL;
2279
2280 /* In some cases we accept only the first occurrence of 'x' in a sequence of
2281 * them. This variable points to just beyond the end of the previous
2282 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2283 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2284 * hop back.) */
2285 char * previous_occurrence_end = 0;
2286
2287 I32 tmp; /* Scratch variable */
2288 const bool utf8_target = reginfo->is_utf8_target;
2289 UV utf8_fold_flags = 0;
2290 const bool is_utf8_pat = reginfo->is_utf8_pat;
2291 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2292 with a result inverts that result, as 0^1 =
2293 1 and 1^1 = 0 */
2294 char_class_number_ classnum;
2295
2296 RXi_GET_DECL(prog,progi);
2297
2298 PERL_ARGS_ASSERT_FIND_BYCLASS;
2299
2300 /* We know what class it must start with. The case statements below have
2301 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2302 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2303 * ('p8' and 'pb'. */
2304 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2305 SV * anyofh_list;
2306
2307 case ANYOFPOSIXL_t8_pb:
2308 case ANYOFPOSIXL_t8_p8:
2309 case ANYOFL_t8_pb:
2310 case ANYOFL_t8_p8:
2311 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2312 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2313
2314 /* FALLTHROUGH */
2315
2316 case ANYOFD_t8_pb:
2317 case ANYOFD_t8_p8:
2318 case ANYOF_t8_pb:
2319 case ANYOF_t8_p8:
2320 REXEC_FBC_UTF8_CLASS_SCAN(
2321 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2322 break;
2323
2324 case ANYOFPOSIXL_tb_pb:
2325 case ANYOFPOSIXL_tb_p8:
2326 case ANYOFL_tb_pb:
2327 case ANYOFL_tb_p8:
2328 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2329 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2330
2331 /* FALLTHROUGH */
2332
2333 case ANYOFD_tb_pb:
2334 case ANYOFD_tb_p8:
2335 case ANYOF_tb_pb:
2336 case ANYOF_tb_p8:
2337 if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2338 /* We know that s is in the bitmap range since the target isn't
2339 * UTF-8, so what happens for out-of-range values is not relevant,
2340 * so exclude that from the flags */
2341 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2342 }
2343 else {
2344 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2345 0));
2346 }
2347 break;
2348
2349 case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */
2350 case ANYOFM_tb_p8:
2351 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2352 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2353 break;
2354
2355 case ANYOFM_t8_pb:
2356 case ANYOFM_t8_p8:
2357 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2358 * we do anyway for performance reasons, as otherwise we would have to
2359 * examine all the continuation characters */
2360 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2361 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2362 break;
2363
2364 case NANYOFM_tb_pb:
2365 case NANYOFM_tb_p8:
2366 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2367 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2368 break;
2369
2370 case NANYOFM_t8_pb:
2371 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2372 variants. */
2373 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2374 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2375 (U8) ARG1u(c), FLAGS(c)));
2376 break;
2377
2378 /* These nodes all require at least one code point to be in UTF-8 to
2379 * match */
2380 case ANYOFH_tb_pb:
2381 case ANYOFH_tb_p8:
2382 case ANYOFHb_tb_pb:
2383 case ANYOFHb_tb_p8:
2384 case ANYOFHbbm_tb_pb:
2385 case ANYOFHbbm_tb_p8:
2386 case ANYOFHr_tb_pb:
2387 case ANYOFHr_tb_p8:
2388 case ANYOFHs_tb_pb:
2389 case ANYOFHs_tb_p8:
2390 case EXACTFLU8_tb_pb:
2391 case EXACTFLU8_tb_p8:
2392 case EXACTFU_REQ8_tb_pb:
2393 case EXACTFU_REQ8_tb_p8:
2394 break;
2395
2396 case ANYOFH_t8_pb:
2397 case ANYOFH_t8_p8:
2398 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2399 REXEC_FBC_UTF8_CLASS_SCAN(
2400 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2401 && _invlist_contains_cp(anyofh_list,
2402 utf8_to_uvchr_buf((U8 *) s,
2403 (U8 *) strend,
2404 NULL))));
2405 break;
2406
2407 case ANYOFHb_t8_pb:
2408 case ANYOFHb_t8_p8:
2409 {
2410 /* We know what the first byte of any matched string should be. */
2411 U8 first_byte = FLAGS(c);
2412
2413 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2414 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2415 _invlist_contains_cp(anyofh_list,
2416 utf8_to_uvchr_buf((U8 *) s,
2417 (U8 *) strend,
2418 NULL)));
2419 }
2420 break;
2421
2422 case ANYOFHbbm_t8_pb:
2423 case ANYOFHbbm_t8_p8:
2424 {
2425 /* We know what the first byte of any matched string should be. */
2426 U8 first_byte = FLAGS(c);
2427
2428 /* And a bitmap defines all the legal 2nd byte matches */
2429 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2430 ( s < strend
2431 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2432 (U8) s[1] & UTF_CONTINUATION_MASK)));
2433 }
2434 break;
2435
2436 case ANYOFHr_t8_pb:
2437 case ANYOFHr_t8_p8:
2438 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2439 REXEC_FBC_UTF8_CLASS_SCAN(
2440 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2441 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2442 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2443 && _invlist_contains_cp(anyofh_list,
2444 utf8_to_uvchr_buf((U8 *) s,
2445 (U8 *) strend,
2446 NULL))));
2447 break;
2448
2449 case ANYOFHs_t8_pb:
2450 case ANYOFHs_t8_p8:
2451 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2452 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2453 ((struct regnode_anyofhs *) c)->string,
2454 /* Note FLAGS is the string length in this regnode */
2455 ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2456 _invlist_contains_cp(anyofh_list,
2457 utf8_to_uvchr_buf((U8 *) s,
2458 (U8 *) strend,
2459 NULL)));
2460 break;
2461
2462 case ANYOFR_tb_pb:
2463 case ANYOFR_tb_p8:
2464 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2465 ANYOFRbase(c), ANYOFRdelta(c)));
2466 break;
2467
2468 case ANYOFR_t8_pb:
2469 case ANYOFR_t8_p8:
2470 REXEC_FBC_UTF8_CLASS_SCAN(
2471 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2472 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2473 (U8 *) strend,
2474 NULL),
2475 ANYOFRbase(c), ANYOFRdelta(c))));
2476 break;
2477
2478 case ANYOFRb_tb_pb:
2479 case ANYOFRb_tb_p8:
2480 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2481 ANYOFRbase(c), ANYOFRdelta(c)));
2482 break;
2483
2484 case ANYOFRb_t8_pb:
2485 case ANYOFRb_t8_p8:
2486 { /* We know what the first byte of any matched string should be */
2487 U8 first_byte = FLAGS(c);
2488
2489 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2490 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2491 (U8 *) strend,
2492 NULL),
2493 ANYOFRbase(c), ANYOFRdelta(c)));
2494 }
2495 break;
2496
2497 case EXACTFAA_tb_pb:
2498
2499 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2500 * which these functions don't handle anyway */
2501 fold_array = PL_fold_latin1;
2502 folder = S_foldEQ_latin1_s2_folded;
2503 goto do_exactf_non_utf8;
2504
2505 case EXACTF_tb_pb:
2506 fold_array = PL_fold;
2507 folder = Perl_foldEQ;
2508 goto do_exactf_non_utf8;
2509
2510 case EXACTFL_tb_pb:
2511 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2512
2513 if (IN_UTF8_CTYPE_LOCALE) {
2514 utf8_fold_flags = FOLDEQ_LOCALE;
2515 goto do_exactf_utf8;
2516 }
2517
2518 fold_array = PL_fold_locale;
2519 folder = Perl_foldEQ_locale;
2520 goto do_exactf_non_utf8;
2521
2522 case EXACTFU_tb_pb:
2523 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2524 * don't have to worry here about this single special case in the
2525 * Latin1 range */
2526 fold_array = PL_fold_latin1;
2527 folder = S_foldEQ_latin1_s2_folded;
2528
2529 /* FALLTHROUGH */
2530
2531 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2532 are no glitches with fold-length differences
2533 between the target string and pattern */
2534
2535 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2536 * character of the EXACTF* node and then, if necessary,
2537 * case-insensitively compare the full text of the node. c1 is the
2538 * first character. c2 is its fold. This logic will not work for
2539 * Unicode semantics and the german sharp ss, which hence should not be
2540 * compiled into a node that gets here. */
2541 pat_string = STRINGs(c);
2542 ln = STR_LENs(c); /* length to match in octets/bytes */
2543
2544 /* We know that we have to match at least 'ln' bytes (which is the same
2545 * as characters, since not utf8). If we have to match 3 characters,
2546 * and there are only 2 available, we know without trying that it will
2547 * fail; so don't start a match past the required minimum number from
2548 * the far end */
2549 e = HOP3c(strend, -((SSize_t)ln), s);
2550 if (e < s)
2551 break;
2552
2553 c1 = *pat_string;
2554 c2 = fold_array[c1];
2555 if (c1 == c2) { /* If char and fold are the same */
2556 while (s <= e) {
2557 s = (char *) memchr(s, c1, e + 1 - s);
2558 if (s == NULL) {
2559 break;
2560 }
2561
2562 /* Check that the rest of the node matches */
2563 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2564 && (reginfo->intuit || regtry(reginfo, &s)) )
2565 {
2566 goto got_it;
2567 }
2568 s++;
2569 }
2570 }
2571 else {
2572 U8 bits_differing = c1 ^ c2;
2573
2574 /* If the folds differ in one bit position only, we can mask to
2575 * match either of them, and can use this faster find method. Both
2576 * ASCII and EBCDIC tend to have their case folds differ in only
2577 * one position, so this is very likely */
2578 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2579 bits_differing = ~ bits_differing;
2580 while (s <= e) {
2581 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2582 (c1 & bits_differing), bits_differing);
2583 if (s > e) {
2584 break;
2585 }
2586
2587 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2588 && (reginfo->intuit || regtry(reginfo, &s)) )
2589 {
2590 goto got_it;
2591 }
2592 s++;
2593 }
2594 }
2595 else { /* Otherwise, stuck with looking byte-at-a-time. This
2596 should actually happen only in EXACTFL nodes */
2597 while (s <= e) {
2598 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2599 && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2600 && (reginfo->intuit || regtry(reginfo, &s)) )
2601 {
2602 goto got_it;
2603 }
2604 s++;
2605 }
2606 }
2607 }
2608 break;
2609
2610 case EXACTFAA_tb_p8:
2611 case EXACTFAA_t8_p8:
2612 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2613 |FOLDEQ_S2_ALREADY_FOLDED
2614 |FOLDEQ_S2_FOLDS_SANE;
2615 goto do_exactf_utf8;
2616
2617 case EXACTFAA_NO_TRIE_tb_pb:
2618 case EXACTFAA_NO_TRIE_t8_pb:
2619 case EXACTFAA_t8_pb:
2620
2621 /* Here, and elsewhere in this file, the reason we can't consider a
2622 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2623 * is because any MICRO SIGN in the pattern won't be folded. Since the
2624 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2625 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2626 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2627 goto do_exactf_utf8;
2628
2629 case EXACTFL_tb_p8:
2630 case EXACTFL_t8_pb:
2631 case EXACTFL_t8_p8:
2632 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2633 utf8_fold_flags = FOLDEQ_LOCALE;
2634 goto do_exactf_utf8;
2635
2636 case EXACTFLU8_t8_pb:
2637 case EXACTFLU8_t8_p8:
2638 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2639 | FOLDEQ_S2_FOLDS_SANE;
2640 goto do_exactf_utf8;
2641
2642 case EXACTFU_REQ8_t8_p8:
2643 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2644 goto do_exactf_utf8;
2645
2646 case EXACTFU_tb_p8:
2647 case EXACTFU_t8_pb:
2648 case EXACTFU_t8_p8:
2649 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2650 goto do_exactf_utf8;
2651
2652 /* The following are problematic even though pattern isn't UTF-8. Use
2653 * full functionality normally not done except for UTF-8. */
2654 case EXACTF_t8_pb:
2655 case EXACTFUP_tb_pb:
2656 case EXACTFUP_t8_pb:
2657
2658 do_exactf_utf8:
2659 {
2660 unsigned expansion;
2661
2662 /* If one of the operands is in utf8, we can't use the simpler
2663 * folding above, due to the fact that many different characters
2664 * can have the same fold, or portion of a fold, or different-
2665 * length fold */
2666 pat_string = STRINGs(c);
2667 ln = STR_LENs(c); /* length to match in octets/bytes */
2668 pat_end = pat_string + ln;
2669 lnc = is_utf8_pat /* length to match in characters */
2670 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2671 : ln;
2672
2673 /* We have 'lnc' characters to match in the pattern, but because of
2674 * multi-character folding, each character in the target can match
2675 * up to 3 characters (Unicode guarantees it will never exceed
2676 * this) if it is utf8-encoded; and up to 2 if not (based on the
2677 * fact that the Latin 1 folds are already determined, and the only
2678 * multi-char fold in that range is the sharp-s folding to 'ss'.
2679 * Thus, a pattern character can match as little as 1/3 of a string
2680 * character. Adjust lnc accordingly, rounding up, so that if we
2681 * need to match at least 4+1/3 chars, that really is 5. */
2682 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2683 lnc = (lnc + expansion - 1) / expansion;
2684
2685 /* As in the non-UTF8 case, if we have to match 3 characters, and
2686 * only 2 are left, it's guaranteed to fail, so don't start a match
2687 * that would require us to go beyond the end of the string */
2688 e = HOP3c(strend, -((SSize_t)lnc), s);
2689
2690 /* XXX Note that we could recalculate e to stop the loop earlier,
2691 * as the worst case expansion above will rarely be met, and as we
2692 * go along we would usually find that e moves further to the left.
2693 * This would happen only after we reached the point in the loop
2694 * where if there were no expansion we should fail. Unclear if
2695 * worth the expense */
2696
2697 while (s <= e) {
2698 char *my_strend= (char *)strend;
2699 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2700 pat_string, NULL, ln, is_utf8_pat,
2701 utf8_fold_flags)
2702 && (reginfo->intuit || regtry(reginfo, &s)) )
2703 {
2704 goto got_it;
2705 }
2706 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2707 }
2708 }
2709 break;
2710
2711 case BOUNDA_tb_pb:
2712 case BOUNDA_tb_p8:
2713 case BOUND_tb_pb: /* /d without utf8 target is /a */
2714 case BOUND_tb_p8:
2715 /* regcomp.c makes sure that these only have the traditional \b
2716 * meaning. */
2717 assert(FLAGS(c) == TRADITIONAL_BOUND);
2718
2719 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2720 break;
2721
2722 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2723 case BOUNDA_t8_p8:
2724 /* regcomp.c makes sure that these only have the traditional \b
2725 * meaning. */
2726 assert(FLAGS(c) == TRADITIONAL_BOUND);
2727
2728 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2729 break;
2730
2731 case NBOUNDA_tb_pb:
2732 case NBOUNDA_tb_p8:
2733 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2734 case NBOUND_tb_p8:
2735 /* regcomp.c makes sure that these only have the traditional \b
2736 * meaning. */
2737 assert(FLAGS(c) == TRADITIONAL_BOUND);
2738
2739 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2740 break;
2741
2742 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2743 case NBOUNDA_t8_p8:
2744 /* regcomp.c makes sure that these only have the traditional \b
2745 * meaning. */
2746 assert(FLAGS(c) == TRADITIONAL_BOUND);
2747
2748 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2749 break;
2750
2751 case NBOUNDU_tb_pb:
2752 case NBOUNDU_tb_p8:
2753 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2754 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2755 break;
2756 }
2757
2758 to_complement = 1;
2759 goto do_boundu_non_utf8;
2760
2761 case NBOUNDL_tb_pb:
2762 case NBOUNDL_tb_p8:
2763 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2764 if (FLAGS(c) == TRADITIONAL_BOUND) {
2765 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2766 break;
2767 }
2768
2769 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2770
2771 to_complement = 1;
2772 goto do_boundu_non_utf8;
2773
2774 case BOUNDL_tb_pb:
2775 case BOUNDL_tb_p8:
2776 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2777 if (FLAGS(c) == TRADITIONAL_BOUND) {
2778 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2779 break;
2780 }
2781
2782 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2783
2784 goto do_boundu_non_utf8;
2785
2786 case BOUNDU_tb_pb:
2787 case BOUNDU_tb_p8:
2788 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2789 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2790 break;
2791 }
2792
2793 do_boundu_non_utf8:
2794 if (s == reginfo->strbeg) {
2795 if (reginfo->intuit || regtry(reginfo, &s))
2796 {
2797 goto got_it;
2798 }
2799
2800 /* Didn't match. Try at the next position (if there is one) */
2801 s++;
2802 if (UNLIKELY(s >= reginfo->strend)) {
2803 break;
2804 }
2805 }
2806
2807 switch((bound_type) FLAGS(c)) {
2808 case TRADITIONAL_BOUND: /* Should have already been handled */
2809 assert(0);
2810 break;
2811
2812 case GCB_BOUND:
2813 /* Not utf8. Everything is a GCB except between CR and LF */
2814 while (s < strend) {
2815 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2816 || UCHARAT(s) != '\n'))
2817 && (reginfo->intuit || regtry(reginfo, &s)))
2818 {
2819 goto got_it;
2820 }
2821 s++;
2822 }
2823
2824 break;
2825
2826 case LB_BOUND:
2827 {
2828 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2829 while (s < strend) {
2830 LB_enum after = getLB_VAL_CP((U8) *s);
2831 if (to_complement ^ isLB(before,
2832 after,
2833 (U8*) reginfo->strbeg,
2834 (U8*) s,
2835 (U8*) reginfo->strend,
2836 0 /* target not utf8 */ )
2837 && (reginfo->intuit || regtry(reginfo, &s)))
2838 {
2839 goto got_it;
2840 }
2841 before = after;
2842 s++;
2843 }
2844 }
2845
2846 break;
2847
2848 case SB_BOUND:
2849 {
2850 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2851 while (s < strend) {
2852 SB_enum after = getSB_VAL_CP((U8) *s);
2853 if ((to_complement ^ isSB(before,
2854 after,
2855 (U8*) reginfo->strbeg,
2856 (U8*) s,
2857 (U8*) reginfo->strend,
2858 0 /* target not utf8 */ ))
2859 && (reginfo->intuit || regtry(reginfo, &s)))
2860 {
2861 goto got_it;
2862 }
2863 before = after;
2864 s++;
2865 }
2866 }
2867
2868 break;
2869
2870 case WB_BOUND:
2871 {
2872 WB_enum previous = WB_UNKNOWN;
2873 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2874 while (s < strend) {
2875 WB_enum after = getWB_VAL_CP((U8) *s);
2876 if ((to_complement ^ isWB(previous,
2877 before,
2878 after,
2879 (U8*) reginfo->strbeg,
2880 (U8*) s,
2881 (U8*) reginfo->strend,
2882 0 /* target not utf8 */ ))
2883 && (reginfo->intuit || regtry(reginfo, &s)))
2884 {
2885 goto got_it;
2886 }
2887 previous = before;
2888 before = after;
2889 s++;
2890 }
2891 }
2892 }
2893
2894 /* Here are at the final position in the target string, which is a
2895 * boundary by definition, so matches, depending on other constraints.
2896 * */
2897 if ( reginfo->intuit
2898 || (s <= reginfo->strend && regtry(reginfo, &s)))
2899 {
2900 goto got_it;
2901 }
2902
2903 break;
2904
2905 case BOUNDL_t8_pb:
2906 case BOUNDL_t8_p8:
2907 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2908 if (FLAGS(c) == TRADITIONAL_BOUND) {
2909 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2910 isWORDCHAR_LC_utf8_safe);
2911 break;
2912 }
2913
2914 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2915
2916 to_complement = 1;
2917 goto do_boundu_utf8;
2918
2919 case NBOUNDL_t8_pb:
2920 case NBOUNDL_t8_p8:
2921 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2922 if (FLAGS(c) == TRADITIONAL_BOUND) {
2923 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2924 isWORDCHAR_LC_utf8_safe);
2925 break;
2926 }
2927
2928 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2929
2930 to_complement = 1;
2931 goto do_boundu_utf8;
2932
2933 case NBOUND_t8_pb:
2934 case NBOUND_t8_p8:
2935 /* regcomp.c makes sure that these only have the traditional \b
2936 * meaning. */
2937 assert(FLAGS(c) == TRADITIONAL_BOUND);
2938
2939 /* FALLTHROUGH */
2940
2941 case NBOUNDU_t8_pb:
2942 case NBOUNDU_t8_p8:
2943 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2944 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2945 isWORDCHAR_utf8_safe);
2946 break;
2947 }
2948
2949 to_complement = 1;
2950 goto do_boundu_utf8;
2951
2952 case BOUND_t8_pb:
2953 case BOUND_t8_p8:
2954 /* regcomp.c makes sure that these only have the traditional \b
2955 * meaning. */
2956 assert(FLAGS(c) == TRADITIONAL_BOUND);
2957
2958 /* FALLTHROUGH */
2959
2960 case BOUNDU_t8_pb:
2961 case BOUNDU_t8_p8:
2962 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2963 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2964 break;
2965 }
2966
2967 do_boundu_utf8:
2968 if (s == reginfo->strbeg) {
2969 if (reginfo->intuit || regtry(reginfo, &s))
2970 {
2971 goto got_it;
2972 }
2973
2974 /* Didn't match. Try at the next position (if there is one) */
2975 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2976 if (UNLIKELY(s >= reginfo->strend)) {
2977 break;
2978 }
2979 }
2980
2981 switch((bound_type) FLAGS(c)) {
2982 case TRADITIONAL_BOUND: /* Should have already been handled */
2983 assert(0);
2984 break;
2985
2986 case GCB_BOUND:
2987 {
2988 GCB_enum before = getGCB_VAL_UTF8(
2989 reghop3((U8*)s, -1,
2990 (U8*)(reginfo->strbeg)),
2991 (U8*) reginfo->strend);
2992 while (s < strend) {
2993 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2994 (U8*) reginfo->strend);
2995 if ( (to_complement ^ isGCB(before,
2996 after,
2997 (U8*) reginfo->strbeg,
2998 (U8*) s,
2999 1 /* target is utf8 */ ))
3000 && (reginfo->intuit || regtry(reginfo, &s)))
3001 {
3002 goto got_it;
3003 }
3004 before = after;
3005 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3006 }
3007 }
3008 break;
3009
3010 case LB_BOUND:
3011 {
3012 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
3013 -1,
3014 (U8*)(reginfo->strbeg)),
3015 (U8*) reginfo->strend);
3016 while (s < strend) {
3017 LB_enum after = getLB_VAL_UTF8((U8*) s,
3018 (U8*) reginfo->strend);
3019 if (to_complement ^ isLB(before,
3020 after,
3021 (U8*) reginfo->strbeg,
3022 (U8*) s,
3023 (U8*) reginfo->strend,
3024 1 /* target is utf8 */ )
3025 && (reginfo->intuit || regtry(reginfo, &s)))
3026 {
3027 goto got_it;
3028 }
3029 before = after;
3030 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3031 }
3032 }
3033
3034 break;
3035
3036 case SB_BOUND:
3037 {
3038 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
3039 -1,
3040 (U8*)(reginfo->strbeg)),
3041 (U8*) reginfo->strend);
3042 while (s < strend) {
3043 SB_enum after = getSB_VAL_UTF8((U8*) s,
3044 (U8*) reginfo->strend);
3045 if ((to_complement ^ isSB(before,
3046 after,
3047 (U8*) reginfo->strbeg,
3048 (U8*) s,
3049 (U8*) reginfo->strend,
3050 1 /* target is utf8 */ ))
3051 && (reginfo->intuit || regtry(reginfo, &s)))
3052 {
3053 goto got_it;
3054 }
3055 before = after;
3056 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3057 }
3058 }
3059
3060 break;
3061
3062 case WB_BOUND:
3063 {
3064 /* We are at a boundary between char_sub_0 and char_sub_1.
3065 * We also keep track of the value for char_sub_-1 as we
3066 * loop through the line. Context may be needed to make a
3067 * determination, and if so, this can save having to
3068 * recalculate it */
3069 WB_enum previous = WB_UNKNOWN;
3070 WB_enum before = getWB_VAL_UTF8(
3071 reghop3((U8*)s,
3072 -1,
3073 (U8*)(reginfo->strbeg)),
3074 (U8*) reginfo->strend);
3075 while (s < strend) {
3076 WB_enum after = getWB_VAL_UTF8((U8*) s,
3077 (U8*) reginfo->strend);
3078 if ((to_complement ^ isWB(previous,
3079 before,
3080 after,
3081 (U8*) reginfo->strbeg,
3082 (U8*) s,
3083 (U8*) reginfo->strend,
3084 1 /* target is utf8 */ ))
3085 && (reginfo->intuit || regtry(reginfo, &s)))
3086 {
3087 goto got_it;
3088 }
3089 previous = before;
3090 before = after;
3091 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3092 }
3093 }
3094 }
3095
3096 /* Here are at the final position in the target string, which is a
3097 * boundary by definition, so matches, depending on other constraints.
3098 * */
3099
3100 if ( reginfo->intuit
3101 || (s <= reginfo->strend && regtry(reginfo, &s)))
3102 {
3103 goto got_it;
3104 }
3105 break;
3106
3107 case LNBREAK_t8_pb:
3108 case LNBREAK_t8_p8:
3109 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3110 break;
3111
3112 case LNBREAK_tb_pb:
3113 case LNBREAK_tb_p8:
3114 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3115 break;
3116
3117 /* The argument to all the POSIX node types is the class number to pass
3118 * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3119
3120 case NPOSIXL_t8_pb:
3121 case NPOSIXL_t8_p8:
3122 to_complement = 1;
3123 /* FALLTHROUGH */
3124
3125 case POSIXL_t8_pb:
3126 case POSIXL_t8_p8:
3127 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3128 REXEC_FBC_UTF8_CLASS_SCAN(
3129 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3130 (U8 *) strend)));
3131 break;
3132
3133 case NPOSIXL_tb_pb:
3134 case NPOSIXL_tb_p8:
3135 to_complement = 1;
3136 /* FALLTHROUGH */
3137
3138 case POSIXL_tb_pb:
3139 case POSIXL_tb_p8:
3140 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3141 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3142 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3143 break;
3144
3145 case NPOSIXA_t8_pb:
3146 case NPOSIXA_t8_p8:
3147 /* The complement of something that matches only ASCII matches all
3148 * non-ASCII, plus everything in ASCII that isn't in the class. */
3149 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3150 || ! generic_isCC_A_(*s, FLAGS(c)));
3151 break;
3152
3153 case POSIXA_t8_pb:
3154 case POSIXA_t8_p8:
3155 /* Don't need to worry about utf8, as it can match only a single
3156 * byte invariant character. But we do anyway for performance reasons,
3157 * as otherwise we would have to examine all the continuation
3158 * characters */
3159 REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3160 break;
3161
3162 case NPOSIXD_tb_pb:
3163 case NPOSIXD_tb_p8:
3164 case NPOSIXA_tb_pb:
3165 case NPOSIXA_tb_p8:
3166 to_complement = 1;
3167 /* FALLTHROUGH */
3168
3169 case POSIXD_tb_pb:
3170 case POSIXD_tb_p8:
3171 case POSIXA_tb_pb:
3172 case POSIXA_tb_p8:
3173 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3174 to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3175 break;
3176
3177 case NPOSIXU_tb_pb:
3178 case NPOSIXU_tb_p8:
3179 to_complement = 1;
3180 /* FALLTHROUGH */
3181
3182 case POSIXU_tb_pb:
3183 case POSIXU_tb_p8:
3184 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3185 to_complement ^ cBOOL(generic_isCC_(*s,
3186 FLAGS(c))));
3187 break;
3188
3189 case NPOSIXD_t8_pb:
3190 case NPOSIXD_t8_p8:
3191 case NPOSIXU_t8_pb:
3192 case NPOSIXU_t8_p8:
3193 to_complement = 1;
3194 /* FALLTHROUGH */
3195
3196 case POSIXD_t8_pb:
3197 case POSIXD_t8_p8:
3198 case POSIXU_t8_pb:
3199 case POSIXU_t8_p8:
3200 classnum = (char_class_number_) FLAGS(c);
3201 switch (classnum) {
3202 default:
3203 REXEC_FBC_UTF8_CLASS_SCAN(
3204 to_complement ^ cBOOL(_invlist_contains_cp(
3205 PL_XPosix_ptrs[classnum],
3206 utf8_to_uvchr_buf((U8 *) s,
3207 (U8 *) strend,
3208 NULL))));
3209 break;
3210
3211 case CC_ENUM_SPACE_:
3212 REXEC_FBC_UTF8_CLASS_SCAN(
3213 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3214 break;
3215
3216 case CC_ENUM_BLANK_:
3217 REXEC_FBC_UTF8_CLASS_SCAN(
3218 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3219 break;
3220
3221 case CC_ENUM_XDIGIT_:
3222 REXEC_FBC_UTF8_CLASS_SCAN(
3223 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3224 break;
3225
3226 case CC_ENUM_VERTSPACE_:
3227 REXEC_FBC_UTF8_CLASS_SCAN(
3228 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3229 break;
3230
3231 case CC_ENUM_CNTRL_:
3232 REXEC_FBC_UTF8_CLASS_SCAN(
3233 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3234 break;
3235 }
3236 break;
3237
3238 case AHOCORASICKC_tb_pb:
3239 case AHOCORASICKC_tb_p8:
3240 case AHOCORASICKC_t8_pb:
3241 case AHOCORASICKC_t8_p8:
3242 case AHOCORASICK_tb_pb:
3243 case AHOCORASICK_tb_p8:
3244 case AHOCORASICK_t8_pb:
3245 case AHOCORASICK_t8_p8:
3246 {
3247 DECL_TRIE_TYPE(c);
3248 /* what trie are we using right now */
3249 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ];
3250 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3251 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3252
3253 const char *last_start = strend - trie->minlen;
3254 #ifdef DEBUGGING
3255 const char *real_start = s;
3256 #endif
3257 STRLEN maxlen = trie->maxlen;
3258 SV *sv_points;
3259 U8 **points; /* map of where we were in the input string
3260 when reading a given char. For ASCII this
3261 is unnecessary overhead as the relationship
3262 is always 1:1, but for Unicode, especially
3263 case folded Unicode this is not true. */
3264 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3265 U8 *bitmap=NULL;
3266
3267
3268 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3269
3270 /* We can't just allocate points here. We need to wrap it in
3271 * an SV so it gets freed properly if there is a croak while
3272 * running the match */
3273 ENTER;
3274 SAVETMPS;
3275 sv_points=newSV(maxlen * sizeof(U8 *));
3276 SvCUR_set(sv_points,
3277 maxlen * sizeof(U8 *));
3278 SvPOK_on(sv_points);
3279 sv_2mortal(sv_points);
3280 points=(U8**)SvPV_nolen(sv_points );
3281 if ( trie_type != trie_utf8_fold
3282 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3283 {
3284 if (trie->bitmap)
3285 bitmap=(U8*)trie->bitmap;
3286 else
3287 bitmap=(U8*)ANYOF_BITMAP(c);
3288 }
3289 /* this is the Aho-Corasick algorithm modified a touch
3290 to include special handling for long "unknown char" sequences.
3291 The basic idea being that we use AC as long as we are dealing
3292 with a possible matching char, when we encounter an unknown char
3293 (and we have not encountered an accepting state) we scan forward
3294 until we find a legal starting char.
3295 AC matching is basically that of trie matching, except that when
3296 we encounter a failing transition, we fall back to the current
3297 states "fail state", and try the current char again, a process
3298 we repeat until we reach the root state, state 1, or a legal
3299 transition. If we fail on the root state then we can either
3300 terminate if we have reached an accepting state previously, or
3301 restart the entire process from the beginning if we have not.
3302
3303 */
3304 while (s <= last_start) {
3305 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3306 U8 *uc = (U8*)s;
3307 U16 charid = 0;
3308 U32 base = 1;
3309 U32 state = 1;
3310 UV uvc = 0;
3311 STRLEN len = 0;
3312 STRLEN foldlen = 0;
3313 U8 *uscan = (U8*)NULL;
3314 U8 *leftmost = NULL;
3315 #ifdef DEBUGGING
3316 U32 accepted_word= 0;
3317 #endif
3318 U32 pointpos = 0;
3319
3320 while ( state && uc <= (U8*)strend ) {
3321 int failed=0;
3322 U32 word = aho->states[ state ].wordnum;
3323
3324 if( state==1 ) {
3325 if ( bitmap ) {
3326 DEBUG_TRIE_EXECUTE_r(
3327 if ( uc <= (U8*)last_start
3328 && !BITMAP_TEST(bitmap,*uc) )
3329 {
3330 dump_exec_pos( (char *)uc, c, strend,
3331 real_start,
3332 (char *)uc, utf8_target, 0 );
3333 Perl_re_printf( aTHX_
3334 " Scanning for legal start char...\n");
3335 }
3336 );
3337 if (utf8_target) {
3338 while ( uc <= (U8*)last_start
3339 && !BITMAP_TEST(bitmap,*uc) )
3340 {
3341 uc += UTF8SKIP(uc);
3342 }
3343 } else {
3344 while ( uc <= (U8*)last_start
3345 && ! BITMAP_TEST(bitmap,*uc) )
3346 {
3347 uc++;
3348 }
3349 }
3350 s= (char *)uc;
3351 }
3352 if (uc >(U8*)last_start) break;
3353 }
3354
3355 if ( word ) {
3356 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3357 % maxlen ];
3358 if (!leftmost || lpos < leftmost) {
3359 DEBUG_r(accepted_word=word);
3360 leftmost= lpos;
3361 }
3362 if (base==0) break;
3363
3364 }
3365 points[pointpos++ % maxlen]= uc;
3366 if (foldlen || uc < (U8*)strend) {
3367 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3368 (U8 *) strend, uscan, len, uvc,
3369 charid, foldlen, foldbuf,
3370 uniflags);
3371 DEBUG_TRIE_EXECUTE_r({
3372 dump_exec_pos( (char *)uc, c, strend,
3373 real_start, s, utf8_target, 0);
3374 Perl_re_printf( aTHX_
3375 " Charid:%3u CP:%4" UVxf " ",
3376 charid, uvc);
3377 });
3378 }
3379 else {
3380 len = 0;
3381 charid = 0;
3382 }
3383
3384
3385 do {
3386 #ifdef DEBUGGING
3387 word = aho->states[ state ].wordnum;
3388 #endif
3389 base = aho->states[ state ].trans.base;
3390
3391 DEBUG_TRIE_EXECUTE_r({
3392 if (failed)
3393 dump_exec_pos((char *)uc, c, strend, real_start,
3394 s, utf8_target, 0 );
3395 Perl_re_printf( aTHX_
3396 "%sState: %4" UVxf ", word=%" UVxf,
3397 failed ? " Fail transition to " : "",
3398 (UV)state, (UV)word);
3399 });
3400 if ( base ) {
3401 U32 tmp;
3402 I32 offset;
3403 if (charid &&
3404 ( ((offset = base + charid
3405 - 1 - trie->uniquecharcount)) >= 0)
3406 && ((U32)offset < trie->lasttrans)
3407 && trie->trans[offset].check == state
3408 && (tmp=trie->trans[offset].next))
3409 {
3410 DEBUG_TRIE_EXECUTE_r(
3411 Perl_re_printf( aTHX_ " - legal\n"));
3412 state = tmp;
3413 break;
3414 }
3415 else {
3416 DEBUG_TRIE_EXECUTE_r(
3417 Perl_re_printf( aTHX_ " - fail\n"));
3418 failed = 1;
3419 state = aho->fail[state];
3420 }
3421 }
3422 else {
3423 /* we must be accepting here */
3424 DEBUG_TRIE_EXECUTE_r(
3425 Perl_re_printf( aTHX_ " - accepting\n"));
3426 failed = 1;
3427 break;
3428 }
3429 } while(state);
3430 uc += len;
3431 if (failed) {
3432 if (leftmost)
3433 break;
3434 if (!state) state = 1;
3435 }
3436 }
3437 if ( aho->states[ state ].wordnum ) {
3438 U8 *lpos = points[ (pointpos
3439 - trie->wordinfo[aho->states[ state ]
3440 .wordnum].len) % maxlen ];
3441 if (!leftmost || lpos < leftmost) {
3442 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3443 leftmost = lpos;
3444 }
3445 }
3446 if (leftmost) {
3447 s = (char*)leftmost;
3448 DEBUG_TRIE_EXECUTE_r({
3449 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3450 " at position %" IVdf ". Trying full"
3451 " pattern...\n",
3452 (UV)accepted_word, (IV)(s - real_start)
3453 );
3454 });
3455 if (reginfo->intuit || regtry(reginfo, &s)) {
3456 FREETMPS;
3457 LEAVE;
3458 goto got_it;
3459 }
3460 if (s < reginfo->strend) {
3461 s = HOPc(s,1);
3462 }
3463 DEBUG_TRIE_EXECUTE_r({
3464 Perl_re_printf( aTHX_
3465 "Pattern failed. Looking for new start"
3466 " point...\n");
3467 });
3468 } else {
3469 DEBUG_TRIE_EXECUTE_r(
3470 Perl_re_printf( aTHX_ "No match.\n"));
3471 break;
3472 }
3473 }
3474 FREETMPS;
3475 LEAVE;
3476 }
3477 break;
3478
3479 case EXACTFU_REQ8_t8_pb:
3480 case EXACTFUP_tb_p8:
3481 case EXACTFUP_t8_p8:
3482 case EXACTF_tb_p8:
3483 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3484 case EXACTFAA_NO_TRIE_tb_p8:
3485 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3486 patterns */
3487 assert(0);
3488
3489 default:
3490 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3491 } /* End of switch on node type */
3492
3493 return 0;
3494
3495 got_it:
3496 return s;
3497 }
3498
3499 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3500 * flags have same meanings as with regexec_flags() */
3501
3502 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3503 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3504 char *strbeg,
3505 char *strend,
3506 SV *sv,
3507 U32 flags,
3508 bool utf8_target)
3509 {
3510 struct regexp *const prog = ReANY(rx);
3511
3512 if (flags & REXEC_COPY_STR) {
3513 #ifdef PERL_ANY_COW
3514 if (SvCANCOW(sv)) {
3515 DEBUG_C(Perl_re_printf( aTHX_
3516 "Copy on write: regexp capture, type %d\n",
3517 (int) SvTYPE(sv)));
3518 /* Create a new COW SV to share the match string and store
3519 * in saved_copy, unless the current COW SV in saved_copy
3520 * is valid and suitable for our purpose */
3521 if (( RXp_SAVED_COPY(prog)
3522 && SvIsCOW(RXp_SAVED_COPY(prog))
3523 && SvPOKp(RXp_SAVED_COPY(prog))
3524 && SvIsCOW(sv)
3525 && SvPOKp(sv)
3526 && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog))))
3527 {
3528 /* just reuse saved_copy SV */
3529 if (RXp_MATCH_COPIED(prog)) {
3530 Safefree(RXp_SUBBEG(prog));
3531 RXp_MATCH_COPIED_off(prog);
3532 }
3533 }
3534 else {
3535 /* create new COW SV to share string */
3536 RXp_MATCH_COPY_FREE(prog);
3537 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv);
3538 }
3539 RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog));
3540 assert (SvPOKp(RXp_SAVED_COPY(prog)));
3541 RXp_SUBLEN(prog) = strend - strbeg;
3542 RXp_SUBOFFSET(prog) = 0;
3543 RXp_SUBCOFFSET(prog) = 0;
3544 } else
3545 #endif
3546 {
3547 SSize_t min = 0;
3548 SSize_t max = strend - strbeg;
3549 SSize_t sublen;
3550
3551 /* NOTE: the following if block is not used or tested
3552 * in standard builds. It is only used when PERL_SAWAMPERSAND is
3553 * defined */
3554
3555 if ( (flags & REXEC_COPY_SKIP_POST)
3556 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3557 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3558 ) { /* don't copy $' part of string */
3559 SSize_t offs_end;
3560 U32 n = 0;
3561 max = -1;
3562 /* calculate the right-most part of the string covered
3563 * by a capture. Due to lookahead, this may be to
3564 * the right of $&, so we have to scan all captures */
3565 while (n <= RXp_LASTPAREN(prog)) {
3566 if ((offs_end = RXp_OFFS_END(prog,n)) > max)
3567 max = offs_end;
3568 n++;
3569 }
3570 if (max == -1)
3571 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3572 ? RXp_OFFS_START(prog,0)
3573 : 0;
3574 assert(max >= 0 && max <= strend - strbeg);
3575 }
3576
3577 /* NOTE: the following if block is not used or tested
3578 * in standard builds. It is only used when PERL_SAWAMPERSAND is
3579 * defined */
3580 if ( (flags & REXEC_COPY_SKIP_PRE)
3581 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3582 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3583 ) { /* don't copy $` part of string */
3584 U32 n = 0;
3585 min = max;
3586 /* calculate the left-most part of the string covered
3587 * by a capture. Due to lookbehind, this may be to
3588 * the left of $&, so we have to scan all captures */
3589 while (min && n <= RXp_LASTPAREN(prog)) {
3590 SSize_t start = RXp_OFFS_START(prog,n);
3591 if ( start != -1
3592 && start < min)
3593 {
3594 min = start;
3595 }
3596 n++;
3597 }
3598 if (PL_sawampersand & SAWAMPERSAND_RIGHT) {
3599 SSize_t end = RXp_OFFS_END(prog,0);
3600 if ( min > end )
3601 min = end;
3602 }
3603 }
3604
3605 assert(min >= 0 && min <= max && min <= strend - strbeg);
3606 sublen = max - min;
3607
3608 if (RXp_MATCH_COPIED(prog)) {
3609 if (sublen > RXp_SUBLEN(prog))
3610 RXp_SUBBEG(prog) =
3611 (char*)saferealloc(RXp_SUBBEG(prog), sublen+1);
3612 }
3613 else
3614 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1);
3615 Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char);
3616 RXp_SUBBEG(prog)[sublen] = '\0';
3617 RXp_SUBOFFSET(prog) = min;
3618 RXp_SUBLEN(prog) = sublen;
3619 RXp_MATCH_COPIED_on(prog);
3620 }
3621 RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog);
3622 if (RXp_SUBOFFSET(prog) && utf8_target) {
3623 /* Convert byte offset to chars.
3624 * XXX ideally should only compute this if @-/@+
3625 * has been seen, a la PL_sawampersand ??? */
3626
3627 /* If there's a direct correspondence between the
3628 * string which we're matching and the original SV,
3629 * then we can use the utf8 len cache associated with
3630 * the SV. In particular, it means that under //g,
3631 * sv_pos_b2u() will use the previously cached
3632 * position to speed up working out the new length of
3633 * subcoffset, rather than counting from the start of
3634 * the string each time. This stops
3635 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3636 * from going quadratic */
3637 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3638 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog),
3639 SV_GMAGIC|SV_CONST_RETURN);
3640 else
3641 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg,
3642 (U8*)(strbeg+RXp_SUBOFFSET(prog)));
3643 }
3644 }
3645 else {
3646 RXp_MATCH_COPY_FREE(prog);
3647 RXp_SUBBEG(prog) = strbeg;
3648 RXp_SUBOFFSET(prog) = 0;
3649 RXp_SUBCOFFSET(prog) = 0;
3650 RXp_SUBLEN(prog) = strend - strbeg;
3651 }
3652 }
3653
3654
3655
3656
3657 /*
3658 - regexec_flags - match a regexp against a string
3659 */
3660 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3661 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3662 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3663 /* stringarg: the point in the string at which to begin matching */
3664 /* strend: pointer to null at end of string */
3665 /* strbeg: real beginning of string */
3666 /* minend: end of match must be >= minend bytes after stringarg. */
3667 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3668 * itself is accessed via the pointers above */
3669 /* data: May be used for some additional optimizations.
3670 Currently unused. */
3671 /* flags: For optimizations. See REXEC_* in regexp.h */
3672
3673 {
3674 struct regexp *const prog = ReANY(rx);
3675 char *s;
3676 regnode *c;
3677 char *startpos;
3678 SSize_t minlen; /* must match at least this many chars */
3679 SSize_t dontbother = 0; /* how many characters not to try at end */
3680 const bool utf8_target = cBOOL(DO_UTF8(sv));
3681 I32 multiline;
3682 RXi_GET_DECL(prog,progi);
3683 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3684 regmatch_info *const reginfo = ®info_buf;
3685 regexp_paren_pair *swap = NULL;
3686 I32 oldsave;
3687 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3688
3689 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3690 PERL_UNUSED_ARG(data);
3691
3692 /* Be paranoid... */
3693 if (prog == NULL) {
3694 Perl_croak(aTHX_ "NULL regexp parameter");
3695 }
3696
3697 DEBUG_EXECUTE_r(
3698 debug_start_match(rx, utf8_target, stringarg, strend,
3699 "Matching");
3700 );
3701
3702 startpos = stringarg;
3703
3704 /* set these early as they may be used by the HOP macros below */
3705 reginfo->strbeg = strbeg;
3706 reginfo->strend = strend;
3707 reginfo->is_utf8_target = cBOOL(utf8_target);
3708
3709 if (prog->intflags & PREGf_GPOS_SEEN) {
3710 MAGIC *mg;
3711
3712 /* set reginfo->ganch, the position where \G can match */
3713
3714 reginfo->ganch =
3715 (flags & REXEC_IGNOREPOS)
3716 ? stringarg /* use start pos rather than pos() */
3717 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3718 /* Defined pos(): */
3719 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3720 : strbeg; /* pos() not defined; use start of string */
3721
3722 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3723 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3724
3725 /* in the presence of \G, we may need to start looking earlier in
3726 * the string than the suggested start point of stringarg:
3727 * if prog->gofs is set, then that's a known, fixed minimum
3728 * offset, such as
3729 * /..\G/: gofs = 2
3730 * /ab|c\G/: gofs = 1
3731 * or if the minimum offset isn't known, then we have to go back
3732 * to the start of the string, e.g. /w+\G/
3733 */
3734
3735 if (prog->intflags & PREGf_ANCH_GPOS) {
3736 if (prog->gofs) {
3737 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3738 if (!startpos ||
3739 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3740 {
3741 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3742 "fail: ganch-gofs before earliest possible start\n"));
3743 return 0;
3744 }
3745 }
3746 else
3747 startpos = reginfo->ganch;
3748 }
3749 else if (prog->gofs) {
3750 startpos = HOPBACKc(startpos, prog->gofs);
3751 if (!startpos)
3752 startpos = strbeg;
3753 }
3754 else if (prog->intflags & PREGf_GPOS_FLOAT)
3755 startpos = strbeg;
3756 }
3757
3758 minlen = prog->minlen;
3759 if ((startpos + minlen) > strend || startpos < strbeg) {
3760 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3761 "Regex match can't succeed, so not even tried\n"));
3762 return 0;
3763 }
3764
3765 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3766 * which will call destuctors to reset PL_regmatch_state, free higher
3767 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3768 * regmatch_info_aux_eval */
3769
3770 oldsave = PL_savestack_ix;
3771
3772 s = startpos;
3773
3774 if ((prog->extflags & RXf_USE_INTUIT)
3775 && !(flags & REXEC_CHECKED))
3776 {
3777 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3778 flags, NULL);
3779 if (!s)
3780 return 0;
3781
3782 if (prog->extflags & RXf_CHECK_ALL) {
3783 /* we can match based purely on the result of INTUIT.
3784 * Set up captures etc just for $& and $-[0]
3785 * (an intuit-only match wont have $1,$2,..) */
3786 assert(!prog->nparens);
3787
3788 /* s/// doesn't like it if $& is earlier than where we asked it to
3789 * start searching (which can happen on something like /.\G/) */
3790 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3791 && (s < stringarg))
3792 {
3793 /* this should only be possible under \G */
3794 assert(prog->intflags & PREGf_GPOS_SEEN);
3795 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3796 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3797 goto phooey;
3798 }
3799
3800 /* match via INTUIT shouldn't have any captures.
3801 * Let @-, @+, $^N know */
3802 RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0;
3803 RXp_MATCH_UTF8_set(prog, utf8_target);
3804 SSize_t match_start = s - strbeg;
3805 SSize_t match_end = utf8_target
3806 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3807 : s - strbeg + prog->minlenret;
3808 CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end);
3809 if ( !(flags & REXEC_NOT_FIRST) )
3810 S_reg_set_capture_string(aTHX_ rx,
3811 strbeg, strend,
3812 sv, flags, utf8_target);
3813
3814 return 1;
3815 }
3816 }
3817
3818 multiline = prog->extflags & RXf_PMf_MULTILINE;
3819
3820 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3821 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3822 "String too short [regexec_flags]...\n"));
3823 goto phooey;
3824 }
3825
3826 /* Check validity of program. */
3827 if (UCHARAT(progi->program) != REG_MAGIC) {
3828 Perl_croak(aTHX_ "corrupted regexp program");
3829 }
3830
3831 RXp_MATCH_TAINTED_off(prog);
3832 RXp_MATCH_UTF8_set(prog, utf8_target);
3833
3834 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3835 reginfo->intuit = 0;
3836 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3837 reginfo->warned = FALSE;
3838 reginfo->sv = sv;
3839 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3840 /* see how far we have to get to not match where we matched before */
3841 reginfo->till = stringarg + minend;
3842
3843 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3844 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3845 S_cleanup_regmatch_info_aux has executed (registered by
3846 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3847 magic belonging to this SV.
3848 Not newSVsv, either, as it does not COW.
3849 */
3850 reginfo->sv = newSV_type(SVt_NULL);
3851 SvSetSV_nosteal(reginfo->sv, sv);
3852 SAVEFREESV(reginfo->sv);
3853 }
3854
3855 /* reserve next 2 or 3 slots in PL_regmatch_state:
3856 * slot N+0: may currently be in use: skip it
3857 * slot N+1: use for regmatch_info_aux struct
3858 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3859 * slot N+3: ready for use by regmatch()
3860 */
3861
3862 {
3863 regmatch_state *old_regmatch_state;
3864 regmatch_slab *old_regmatch_slab;
3865 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3866
3867 /* on first ever match, allocate first slab */
3868 if (!PL_regmatch_slab) {
3869 Newx(PL_regmatch_slab, 1, regmatch_slab);
3870 PL_regmatch_slab->prev = NULL;
3871 PL_regmatch_slab->next = NULL;
3872 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3873 }
3874
3875 old_regmatch_state = PL_regmatch_state;
3876 old_regmatch_slab = PL_regmatch_slab;
3877
3878 for (i=0; i <= max; i++) {
3879 if (i == 1)
3880 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3881 else if (i ==2)
3882 reginfo->info_aux_eval =
3883 reginfo->info_aux->info_aux_eval =
3884 &(PL_regmatch_state->u.info_aux_eval);
3885
3886 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3887 PL_regmatch_state = S_push_slab(aTHX);
3888 }
3889
3890 /* note initial PL_regmatch_state position; at end of match we'll
3891 * pop back to there and free any higher slabs */
3892
3893 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3894 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3895 reginfo->info_aux->poscache = NULL;
3896
3897 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3898
3899 if ((prog->extflags & RXf_EVAL_SEEN))
3900 S_setup_eval_state(aTHX_ reginfo);
3901 else
3902 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3903 }
3904
3905 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3906 /* We have to be careful. If the previous successful match
3907 was from this regex we don't want a subsequent partially
3908 successful match to clobber the old results.
3909 So when we detect this possibility we add a swap buffer
3910 to the re, and switch the buffer each match. If we fail,
3911 we switch it back; otherwise we leave it swapped.
3912 */
3913 swap = RXp_OFFSp(prog);
3914 /* avoid leak if we die, or clean up anyway if match completes */
3915 SAVEFREEPV(swap);
3916 Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair);
3917 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3918 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3919 0,
3920 PTR2UV(prog),
3921 PTR2UV(swap),
3922 PTR2UV(RXp_OFFSp(prog))
3923 ));
3924 }
3925
3926 if (prog->recurse_locinput)
3927 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3928
3929 /* Simplest case: anchored match (but not \G) need be tried only once,
3930 * or with MBOL, only at the beginning of each line.
3931 *
3932 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3933 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3934 * match at the start of the string then it won't match anywhere else
3935 * either; while with /.*.../, if it doesn't match at the beginning,
3936 * the earliest it could match is at the start of the next line */
3937
3938 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3939 char *end;
3940
3941 if (regtry(reginfo, &s))
3942 goto got_it;
3943
3944 if (!(prog->intflags & PREGf_ANCH_MBOL))
3945 goto phooey;
3946
3947 /* didn't match at start, try at other newline positions */
3948
3949 if (minlen)
3950 dontbother = minlen - 1;
3951 end = HOP3c(strend, -dontbother, strbeg) - 1;
3952
3953 /* skip to next newline */
3954
3955 while (s <= end) { /* note it could be possible to match at the end of the string */
3956 /* NB: newlines are the same in unicode as they are in latin */
3957 if (*s++ != '\n')
3958 continue;
3959 if (prog->check_substr || prog->check_utf8) {
3960 /* note that with PREGf_IMPLICIT, intuit can only fail
3961 * or return the start position, so it's of limited utility.
3962 * Nevertheless, I made the decision that the potential for
3963 * quick fail was still worth it - DAPM */
3964 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3965 if (!s)
3966 goto phooey;
3967 }
3968 if (regtry(reginfo, &s))
3969 goto got_it;
3970 }
3971 goto phooey;
3972 } /* end anchored search */
3973
3974 /* anchored \G match */
3975 if (prog->intflags & PREGf_ANCH_GPOS)
3976 {
3977 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3978 assert(prog->intflags & PREGf_GPOS_SEEN);
3979 /* For anchored \G, the only position it can match from is
3980 * (ganch-gofs); we already set startpos to this above; if intuit
3981 * moved us on from there, we can't possibly succeed */
3982 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3983 if (s == startpos && regtry(reginfo, &s))
3984 goto got_it;
3985 goto phooey;
3986 }
3987
3988 /* Messy cases: unanchored match. */
3989
3990 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3991 /* we have /x+whatever/ */
3992 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3993 char ch;
3994 #ifdef DEBUGGING
3995 int did_match = 0;
3996 #endif
3997 if (utf8_target) {
3998 if (! prog->anchored_utf8) {
3999 to_utf8_substr(prog);
4000 }
4001 ch = SvPVX_const(prog->anchored_utf8)[0];
4002 REXEC_FBC_UTF8_SCAN(
4003 if (*s == ch) {
4004 DEBUG_EXECUTE_r( did_match = 1 );
4005 if (regtry(reginfo, &s)) goto got_it;
4006 s += UTF8_SAFE_SKIP(s, strend);
4007 while (s < strend && *s == ch)
4008 s += UTF8SKIP(s);
4009 }
4010 );
4011
4012 }
4013 else {
4014 if (! prog->anchored_substr) {
4015 if (! to_byte_substr(prog)) {
4016 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4017 }
4018 }
4019 ch = SvPVX_const(prog->anchored_substr)[0];
4020 REXEC_FBC_NON_UTF8_SCAN(
4021 if (*s == ch) {
4022 DEBUG_EXECUTE_r( did_match = 1 );
4023 if (regtry(reginfo, &s)) goto got_it;
4024 s++;
4025 while (s < strend && *s == ch)
4026 s++;
4027 }
4028 );
4029 }
4030 DEBUG_EXECUTE_r(if (!did_match)
4031 Perl_re_printf( aTHX_
4032 "Did not find anchored character...\n")
4033 );
4034 }
4035 else if (prog->anchored_substr != NULL
4036 || prog->anchored_utf8 != NULL
4037 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
4038 && prog->float_max_offset < strend - s)) {
4039 SV *must;
4040 SSize_t back_max;
4041 SSize_t back_min;
4042 char *last;
4043 char *last1; /* Last position checked before */
4044 #ifdef DEBUGGING
4045 int did_match = 0;
4046 #endif
4047 if (prog->anchored_substr || prog->anchored_utf8) {
4048 if (utf8_target) {
4049 if (! prog->anchored_utf8) {
4050 to_utf8_substr(prog);
4051 }
4052 must = prog->anchored_utf8;
4053 }
4054 else {
4055 if (! prog->anchored_substr) {
4056 if (! to_byte_substr(prog)) {
4057 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4058 }
4059 }
4060 must = prog->anchored_substr;
4061 }
4062 back_max = back_min = prog->anchored_offset;
4063 } else {
4064 if (utf8_target) {
4065 if (! prog->float_utf8) {
4066 to_utf8_substr(prog);
4067 }
4068 must = prog->float_utf8;
4069 }
4070 else {
4071 if (! prog->float_substr) {
4072 if (! to_byte_substr(prog)) {
4073 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4074 }
4075 }
4076 must = prog->float_substr;
4077 }
4078 back_max = prog->float_max_offset;
4079 back_min = prog->float_min_offset;
4080 }
4081
4082 if (back_min<0) {
4083 last = strend;
4084 } else {
4085 last = HOP3c(strend, /* Cannot start after this */
4086 -(SSize_t)(CHR_SVLEN(must)
4087 - (SvTAIL(must) != 0) + back_min), strbeg);
4088 }
4089 if (s > reginfo->strbeg)
4090 last1 = HOPc(s, -1);
4091 else
4092 last1 = s - 1; /* bogus */
4093
4094 /* XXXX check_substr already used to find "s", can optimize if
4095 check_substr==must. */
4096 dontbother = 0;
4097 strend = HOPc(strend, -dontbother);
4098 while ( (s <= last) &&
4099 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
4100 (unsigned char*)strend, must,
4101 multiline ? FBMrf_MULTILINE : 0)) ) {
4102 DEBUG_EXECUTE_r( did_match = 1 );
4103 if (HOPc(s, -back_max) > last1) {
4104 last1 = HOPc(s, -back_min);
4105 s = HOPc(s, -back_max);
4106 }
4107 else {
4108 char * const t = (last1 >= reginfo->strbeg)
4109 ? HOPc(last1, 1) : last1 + 1;
4110
4111 last1 = HOPc(s, -back_min);
4112 s = t;
4113 }
4114 if (utf8_target) {
4115 while (s <= last1) {
4116 if (regtry(reginfo, &s))
4117 goto got_it;
4118 if (s >= last1) {
4119 s++; /* to break out of outer loop */
4120 break;
4121 }
4122 s += UTF8SKIP(s);
4123 }
4124 }
4125 else {
4126 while (s <= last1) {
4127 if (regtry(reginfo, &s))
4128 goto got_it;
4129 s++;
4130 }
4131 }
4132 }
4133 DEBUG_EXECUTE_r(if (!did_match) {
4134 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4135 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4136 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
4137 ((must == prog->anchored_substr || must == prog->anchored_utf8)
4138 ? "anchored" : "floating"),
4139 quoted, RE_SV_TAIL(must));
4140 });
4141 goto phooey;
4142 }
4143 else if ( (c = progi->regstclass) ) {
4144 if (minlen) {
4145 const OPCODE op = OP(progi->regstclass);
4146 /* don't bother with what can't match */
4147 if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4148 strend = HOPc(strend, -(minlen - 1));
4149 }
4150 DEBUG_EXECUTE_r({
4151 SV * const prop = sv_newmortal();
4152 regprop(prog, prop, c, reginfo, NULL);
4153 {
4154 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4155 s,strend-s,PL_dump_re_max_len);
4156 Perl_re_printf( aTHX_
4157 "Matching stclass %.*s against %s (%d bytes)\n",
4158 (int)SvCUR(prop), SvPVX_const(prop),
4159 quoted, (int)(strend - s));
4160 }
4161 });
4162 if (find_byclass(prog, c, s, strend, reginfo))
4163 goto got_it;
4164 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4165 }
4166 else {
4167 dontbother = 0;
4168 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4169 /* Trim the end. */
4170 char *last= NULL;
4171 SV* float_real;
4172 STRLEN len;
4173 const char *little;
4174
4175 if (utf8_target) {
4176 if (! prog->float_utf8) {
4177 to_utf8_substr(prog);
4178 }
4179 float_real = prog->float_utf8;
4180 }
4181 else {
4182 if (! prog->float_substr) {
4183 if (! to_byte_substr(prog)) {
4184 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4185 }
4186 }
4187 float_real = prog->float_substr;
4188 }
4189
4190 little = SvPV_const(float_real, len);
4191 if (SvTAIL(float_real)) {
4192 /* This means that float_real contains an artificial \n on
4193 * the end due to the presence of something like this:
4194 * /foo$/ where we can match both "foo" and "foo\n" at the
4195 * end of the string. So we have to compare the end of the
4196 * string first against the float_real without the \n and
4197 * then against the full float_real with the string. We
4198 * have to watch out for cases where the string might be
4199 * smaller than the float_real or the float_real without
4200 * the \n. */
4201 char *checkpos= strend - len;
4202 DEBUG_OPTIMISE_r(
4203 Perl_re_printf( aTHX_
4204 "%sChecking for float_real.%s\n",
4205 PL_colors[4], PL_colors[5]));
4206 if (checkpos + 1 < strbeg) {
4207 /* can't match, even if we remove the trailing \n
4208 * string is too short to match */
4209 DEBUG_EXECUTE_r(
4210 Perl_re_printf( aTHX_
4211 "%sString shorter than required trailing substring, cannot match.%s\n",
4212 PL_colors[4], PL_colors[5]));
4213 goto phooey;
4214 } else if (memEQ(checkpos + 1, little, len - 1)) {
4215 /* can match, the end of the string matches without the
4216 * "\n" */
4217 last = checkpos + 1;
4218 } else if (checkpos < strbeg) {
4219 /* cant match, string is too short when the "\n" is
4220 * included */
4221 DEBUG_EXECUTE_r(
4222 Perl_re_printf( aTHX_
4223 "%sString does not contain required trailing substring, cannot match.%s\n",
4224 PL_colors[4], PL_colors[5]));
4225 goto phooey;
4226 } else if (!multiline) {
4227 /* non multiline match, so compare with the "\n" at the
4228 * end of the string */
4229 if (memEQ(checkpos, little, len)) {
4230 last= checkpos;
4231 } else {
4232 DEBUG_EXECUTE_r(
4233 Perl_re_printf( aTHX_
4234 "%sString does not contain required trailing substring, cannot match.%s\n",
4235 PL_colors[4], PL_colors[5]));
4236 goto phooey;
4237 }
4238 } else {
4239 /* multiline match, so we have to search for a place
4240 * where the full string is located */
4241 goto find_last;
4242 }
4243 } else {
4244 find_last:
4245 if (len)
4246 last = rninstr(s, strend, little, little + len);
4247 else
4248 last = strend; /* matching "$" */
4249 }
4250 if (!last) {
4251 /* at one point this block contained a comment which was
4252 * probably incorrect, which said that this was a "should not
4253 * happen" case. Even if it was true when it was written I am
4254 * pretty sure it is not anymore, so I have removed the comment
4255 * and replaced it with this one. Yves */
4256 DEBUG_EXECUTE_r(
4257 Perl_re_printf( aTHX_
4258 "%sString does not contain required substring, cannot match.%s\n",
4259 PL_colors[4], PL_colors[5]
4260 ));
4261 goto phooey;
4262 }
4263 dontbother = strend - last + prog->float_min_offset;
4264 }
4265 if (minlen && (dontbother < minlen))
4266 dontbother = minlen - 1;
4267 strend -= dontbother; /* this one's always in bytes! */
4268 /* We don't know much -- general case. */
4269 if (utf8_target) {
4270 for (;;) {
4271 if (regtry(reginfo, &s))
4272 goto got_it;
4273 if (s >= strend)
4274 break;
4275 s += UTF8SKIP(s);
4276 };
4277 }
4278 else {
4279 do {
4280 if (regtry(reginfo, &s))
4281 goto got_it;
4282 } while (s++ < strend);
4283 }
4284 }
4285
4286 /* Failure. */
4287 goto phooey;
4288
4289 got_it:
4290 /* s/// doesn't like it if $& is earlier than where we asked it to
4291 * start searching (which can happen on something like /.\G/) */
4292 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4293 && (RXp_OFFS_START(prog,0) < stringarg - strbeg))
4294 {
4295 /* this should only be possible under \G */
4296 assert(prog->intflags & PREGf_GPOS_SEEN);
4297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4298 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4299 goto phooey;
4300 }
4301
4302 /* clean up; this will trigger destructors that will free all slabs
4303 * above the current one, and cleanup the regmatch_info_aux
4304 * and regmatch_info_aux_eval sructs */
4305
4306 LEAVE_SCOPE(oldsave);
4307
4308 if (RXp_PAREN_NAMES(prog))
4309 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4310
4311 /* make sure $`, $&, $', and $digit will work later */
4312 if ( !(flags & REXEC_NOT_FIRST) )
4313 S_reg_set_capture_string(aTHX_ rx,
4314 strbeg, reginfo->strend,
4315 sv, flags, utf8_target);
4316
4317 return 1;
4318
4319 phooey:
4320 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4321 PL_colors[4], PL_colors[5]));
4322
4323 if (swap) {
4324 /* we failed :-( roll it back.
4325 * Since the swap buffer will be freed on scope exit which follows
4326 * shortly, restore the old captures by copying 'swap's original
4327 * data to the new offs buffer
4328 */
4329 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4330 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4331 0,
4332 PTR2UV(prog),
4333 PTR2UV(RXp_OFFSp(prog)),
4334 PTR2UV(swap)
4335 ));
4336
4337 Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair);
4338 }
4339
4340 /* clean up; this will trigger destructors that will free all slabs
4341 * above the current one, and cleanup the regmatch_info_aux
4342 * and regmatch_info_aux_eval sructs */
4343
4344 LEAVE_SCOPE(oldsave);
4345
4346 return 0;
4347 }
4348
4349
4350 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4351 * Do inc before dec, in case old and new rex are the same */
4352 #define SET_reg_curpm(Re2) \
4353 if (reginfo->info_aux_eval) { \
4354 (void)ReREFCNT_inc(Re2); \
4355 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4356 PM_SETRE((PL_reg_curpm), (Re2)); \
4357 }
4358
4359
4360 /*
4361 - regtry - try match at specific point
4362
4363 NOTE: *startpos may be modifed by regtry() to signal to the caller
4364 that the next match should start at a specific position in the
4365 string. The macro LAST_REGTRY_SKIPPED_FORWARD(reginfo) can be
4366 used to detect when this has happened.
4367
4368 */
4369 STATIC bool /* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4370 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4371 {
4372 CHECKPOINT lastcp;
4373 REGEXP *const rx = reginfo->prog;
4374 regexp *const prog = ReANY(rx);
4375 SSize_t result;
4376 #ifdef DEBUGGING
4377 U32 depth = 0; /* used by REGCP_SET */
4378 #endif
4379 RXi_GET_DECL(prog,progi);
4380 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4381
4382 PERL_ARGS_ASSERT_REGTRY;
4383
4384 reginfo->cutpoint=NULL;
4385
4386 RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg;
4387 RXp_LASTPAREN(prog) = 0;
4388 RXp_LASTCLOSEPAREN(prog) = 0;
4389
4390 /* XXXX What this code is doing here?!!! There should be no need
4391 to do this again and again, RXp_LASTPAREN(prog) should take care of
4392 this! --ilya*/
4393
4394 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4395 * Actually, the code in regcppop() (which Ilya may be meaning by
4396 * RXp_LASTPAREN(prog)), is not needed at all by the test suite
4397 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4398 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4399 * Meanwhile, this code *is* needed for the
4400 * above-mentioned test suite tests to succeed. The common theme
4401 * on those tests seems to be returning null fields from matches.
4402 * --jhi updated by dapm */
4403
4404 /* After encountering a variant of the issue mentioned above I think
4405 * the point Ilya was making is that if we properly unwind whenever
4406 * we set lastparen to a smaller value then we should not need to do
4407 * this every time, only when needed. So if we have tests that fail if
4408 * we remove this, then it suggests somewhere else we are improperly
4409 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4410 * places it is called, and related regcp() routines. - Yves */
4411 #if 1
4412 if (prog->nparens) {
4413 regexp_paren_pair *pp = RXp_OFFSp(prog);
4414 I32 i;
4415 for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) {
4416 ++pp;
4417 pp->start = -1;
4418 pp->end = -1;
4419 }
4420 }
4421 #endif
4422 REGCP_SET(lastcp);
4423 result = regmatch(reginfo, *startposp, progi->program + 1);
4424 if (result != -1) {
4425 RXp_OFFSp(prog)[0].end = result;
4426 return 1;
4427 }
4428 if (reginfo->cutpoint)
4429 *startposp= reginfo->cutpoint;
4430 REGCP_UNWIND(lastcp);
4431 return 0;
4432 }
4433
4434 /* this is used to determine how far from the left messages like
4435 'failed...' are printed in regexec.c. It should be set such that
4436 messages are inline with the regop output that created them.
4437 */
4438 #define REPORT_CODE_OFF 29
4439 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4440
4441 #ifdef PERL_RE_BUILD_DEBUG
4442 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4443 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4444 {
4445 va_list ap;
4446 int result;
4447 PerlIO *f= Perl_debug_log;
4448 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4449 va_start(ap, depth);
4450 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4451 result = PerlIO_vprintf(f, fmt, ap);
4452 va_end(ap);
4453 return result;
4454 }
4455 #endif /* DEBUGGING */
4456
4457 /* grab a new slab and return the first slot in it */
4458
4459 STATIC regmatch_state *
S_push_slab(pTHX)4460 S_push_slab(pTHX)
4461 {
4462 regmatch_slab *s = PL_regmatch_slab->next;
4463 if (!s) {
4464 Newx(s, 1, regmatch_slab);
4465 s->prev = PL_regmatch_slab;
4466 s->next = NULL;
4467 PL_regmatch_slab->next = s;
4468 }
4469 PL_regmatch_slab = s;
4470 return SLAB_FIRST(s);
4471 }
4472
4473 #ifdef DEBUGGING
4474
4475 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4476 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4477 const char *start, const char *end, const char *blurb)
4478 {
4479 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4480
4481 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4482
4483 if (!PL_colorset)
4484 reginitcolors();
4485 {
4486 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4487 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4488
4489 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4490 start, end - start, PL_dump_re_max_len);
4491
4492 Perl_re_printf( aTHX_
4493 "%s%s REx%s %s against %s\n",
4494 PL_colors[4], blurb, PL_colors[5], s0, s1);
4495
4496 if (utf8_target||utf8_pat)
4497 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4498 utf8_pat ? "pattern" : "",
4499 utf8_pat && utf8_target ? " and " : "",
4500 utf8_target ? "string" : ""
4501 );
4502 }
4503 }
4504
4505 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4506 S_dump_exec_pos(pTHX_ const char *locinput,
4507 const regnode *scan,
4508 const char *loc_regeol,
4509 const char *loc_bostr,
4510 const char *loc_reg_starttry,
4511 const bool utf8_target,
4512 const U32 depth
4513 )
4514 {
4515 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4516 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4517 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4518 /* The part of the string before starttry has one color
4519 (pref0_len chars), between starttry and current
4520 position another one (pref_len - pref0_len chars),
4521 after the current position the third one.
4522 We assume that pref0_len <= pref_len, otherwise we
4523 decrease pref0_len. */
4524 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4525 ? (5 + taill) - l : locinput - loc_bostr;
4526 int pref0_len;
4527
4528 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4529
4530 if (utf8_target) {
4531 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4532 pref_len++;
4533 }
4534 }
4535 pref0_len = pref_len - (locinput - loc_reg_starttry);
4536 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4537 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4538 ? (5 + taill) - pref_len : loc_regeol - locinput);
4539 if (utf8_target) {
4540 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4541 l--;
4542 }
4543 }
4544 if (pref0_len < 0)
4545 pref0_len = 0;
4546 if (pref0_len > pref_len)
4547 pref0_len = pref_len;
4548 {
4549 const int is_uni = utf8_target ? 1 : 0;
4550
4551 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4552 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4553
4554 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4555 (locinput - pref_len + pref0_len),
4556 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4557
4558 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4559 locinput, loc_regeol - locinput, 10, 0, 1);
4560
4561 const STRLEN tlen=len0+len1+len2;
4562 Perl_re_printf( aTHX_
4563 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4564 (IV)(locinput - loc_bostr),
4565 len0, s0,
4566 len1, s1,
4567 (docolor ? "" : "> <"),
4568 len2, s2,
4569 (int)(tlen > 19 ? 0 : 19 - tlen),
4570 "",
4571 (UV)depth);
4572 }
4573 }
4574
4575 #endif
4576
4577 /* reg_check_named_buff_matched()
4578 * Checks to see if a named buffer has matched. The data array of
4579 * buffer numbers corresponding to the buffer is expected to reside
4580 * in the regexp->data->data array in the slot stored in the ARG1u() of
4581 * node involved. Note that this routine doesn't actually care about the
4582 * name, that information is not preserved from compilation to execution.
4583 * Returns the index of the leftmost defined buffer with the given name
4584 * or 0 if non of the buffers matched.
4585 */
4586 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4587 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4588 {
4589 I32 n;
4590 RXi_GET_DECL(rex,rexi);
4591 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
4592 I32 *nums=(I32*)SvPVX(sv_dat);
4593
4594 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4595
4596 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4597 if ((I32)RXp_LASTPAREN(rex) >= nums[n] &&
4598 RXp_OFFS_END(rex,nums[n]) != -1)
4599 {
4600 return nums[n];
4601 }
4602 }
4603 return 0;
4604 }
4605
4606 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4607 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4608 struct next_matchable_info * m,
4609 regmatch_info *reginfo)
4610 {
4611 /* This function determines various characteristics about every possible
4612 * initial match of the passed-in EXACTish <text_node>, and stores them in
4613 * <*m>.
4614 *
4615 * That includes a match string and a parallel mask, such that if you AND
4616 * the target string with the mask and compare with the match string,
4617 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4618 * the target matches or not.
4619 *
4620 * The motivation behind this function is to allow the caller to set up
4621 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4622 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4623 * beginning oF B, which is the passed in <text_node> That's where this
4624 * function comes in. The values it returns can quickly be used to rule
4625 * out many, or all, cases of possible matches not actually being the
4626 * beginning of B, <text_node>. It is also used in regrepeat() where we
4627 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4628 * determine where the span of 'A's stop.
4629 *
4630 * If <text_node> is of type EXACT, there is only one possible character
4631 * that can match its first character, and so the situation is quite
4632 * simple. But things can get much more complicated if folding is
4633 * involved. It may be that the first character of an EXACTFish node
4634 * doesn't participate in any possible fold, e.g., punctuation, so it can
4635 * be matched only by itself. The vast majority of characters that are in
4636 * folds match just two things, their lower and upper-case equivalents.
4637 * But not all are like that; some have multiple possible matches, or match
4638 * sequences of more than one character. This function sorts all that out.
4639 *
4640 * It returns information about all possibilities of what the first
4641 * character(s) of <text_node> could look like. Again, if <text_node> is a
4642 * plain EXACT node, that's just the actual first bytes of the first
4643 * character; but otherwise it is the bytes, that when masked, match all
4644 * possible combinations of all the initial bytes of all the characters
4645 * that could match, folded. (Actually, this is a slight over promise. It
4646 * handles only up to the initial 5 bytes, which is enough for all Unicode
4647 * characters, but not for all non-Unicode ones.)
4648 *
4649 * Here's an example to clarify. Suppose the first character of
4650 * <text_node> is the letter 'C', and we are under /i matching. That means
4651 * 'c' also matches. The representations of these two characters differ in
4652 * just one bit, so the mask would be a zero in that position and ones in
4653 * the other 7. And the returned string would be the AND of these two
4654 * characters, and would be one byte long, since these characters are each
4655 * a single byte. ANDing the target <text_node> with this mask will yield
4656 * the returned string if and only if <text_node> begins with one of these
4657 * two characters. So, the function would also return that the definitive
4658 * length matched is 1 byte.
4659 *
4660 * Now, suppose instead of the letter 'C', <text_node> begins with the
4661 * letter 'F'. The situation is much more complicated because there are
4662 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4663 * begins with 'f', and hence could match. We add these into the returned
4664 * string and mask, but the result isn't definitive; the caller has to
4665 * check further if its AND and compare pass. But the failure of that
4666 * compare will quickly rule out most possible inputs.
4667 *
4668 * Much of this could be done in regcomp.c at compile time, except for
4669 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4670 * could be used for one or the other eventualities.
4671 *
4672 * If this function determines that no possible character in the target
4673 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4674 * situation occurs if the first character in <text_node> requires UTF-8 to
4675 * represent, and the target string isn't in UTF-8.)
4676 *
4677 * Some analysis is in GH #18414, located at the time of this writing at:
4678 * https://github.com/Perl/perl5/issues/18414
4679 */
4680
4681 const bool utf8_target = reginfo->is_utf8_target;
4682 bool utf8_pat = reginfo->is_utf8_pat;
4683
4684 PERL_UINT_FAST8_T i;
4685
4686 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4687 */
4688 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4689 U8 lengths[MAX_MATCHES] = { 0 };
4690
4691 U8 index_of_longest = 0;
4692
4693 U8 *pat = (U8*)STRING(text_node);
4694 Size_t pat_len = STR_LEN(text_node);
4695 U8 op = OP(text_node);
4696
4697 U8 byte_mask[5] = {0};
4698 U8 byte_anded[5] = {0};
4699
4700 /* There are some folds in Unicode to multiple characters. This will hold
4701 * such characters that could fold to the beginning of 'text_node' */
4702 UV multi_fold_from = 0;
4703
4704 /* We may have to create a modified copy of the pattern */
4705 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4706
4707 m->max_length = 0;
4708 m->min_length = 255;
4709 m->count = 0;
4710
4711 /* Even if the first character in the node can match something in Latin1,
4712 * if there is anything in the node that can't, the match must fail */
4713 if (! utf8_target && isEXACT_REQ8(op)) {
4714 return FALSE;
4715 }
4716
4717 /* Define a temporary op for use in this function, using an existing one that
4718 * should never be a real op during execution */
4719 #define TURKISH PSEUDO
4720
4721 /* What to do about these two nodes had to be deferred to runtime (which is
4722 * now). If the extra information we now have so indicates, turn them into
4723 * EXACTFU nodes */
4724 if ( (op == EXACTF && utf8_target)
4725 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4726 {
4727 if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) {
4728 op = TURKISH;
4729 }
4730 else {
4731 op = EXACTFU;
4732 }
4733
4734 /* And certain situations are better handled if we create a modified
4735 * version of the pattern */
4736 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4737 specific problematic characters */
4738 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4739
4740 /* The node could start with characters that are the first ones
4741 * of a multi-character fold. */
4742 multi_fold_from
4743 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4744 if (multi_fold_from) {
4745
4746 /* Here, they do form a sequence that matches the fold of a
4747 * single character. That single character then is a
4748 * possible match. Below we will look again at this, but
4749 * the code below is expecting every character in the
4750 * pattern to be folded, which the input isn't required to
4751 * be in this case. So, just fold the single character,
4752 * and the result will be in the expected form. */
4753 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4754 FOLD_FLAGS_FULL);
4755 pat = mod_pat;
4756 }
4757 /* Turkish has a couple extra possibilities. */
4758 else if ( UNLIKELY(op == TURKISH)
4759 && pat_len >= 3
4760 && isALPHA_FOLD_EQ(pat[0], 'f')
4761 && ( memBEGINs(pat + 1, pat_len - 1,
4762 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4763 || ( pat_len >= 4
4764 && isALPHA_FOLD_EQ(pat[1], 'f')
4765 && memBEGINs(pat + 2, pat_len - 2,
4766 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4767 ))) {
4768 /* The macros for finding a multi-char fold don't include
4769 * the Turkish possibilities, in which U+130 folds to 'i'.
4770 * Hard-code these. It's very unlikely that Unicode will
4771 * ever add any others. */
4772 if (pat[1] == 'f') {
4773 pat_len = 3;
4774 Copy("ffi", mod_pat, pat_len, U8);
4775 }
4776 else {
4777 pat_len = 2;
4778 Copy("fi", mod_pat, pat_len, U8);
4779 }
4780 pat = mod_pat;
4781 }
4782 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4783 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4784 && LIKELY(memNEs(pat, pat_len,
4785 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4786 && (LIKELY(op != TURKISH || *pat != 'I')))
4787 {
4788 /* For all cases of things between 0-255, except the ones
4789 * in the conditional above, the fold is just the lower
4790 * case, which is faster than the more general case. */
4791 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4792 pat[1]));
4793 pat_len = 1;
4794 pat = mod_pat;
4795 utf8_pat = FALSE;
4796 }
4797 else { /* Code point above 255, or needs special handling */
4798 _to_utf8_fold_flags(pat, pat + pat_len,
4799 mod_pat, &pat_len,
4800 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4801 pat = mod_pat;
4802 }
4803 }
4804 }
4805 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4806 set of problematic characters */
4807 ((multi_fold_from
4808 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4809 {
4810 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4811 * case */
4812 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4813 FOLD_FLAGS_FULL);
4814 pat = mod_pat;
4815 }
4816 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4817 mod_pat[0] = mod_pat[1] = 's';
4818 pat_len = 2;
4819 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4820 chars, and speeds copying */
4821 pat = mod_pat;
4822 }
4823 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4824 mod_pat[0] = toLOWER_L1(*pat);
4825 pat_len = 1;
4826 pat = mod_pat;
4827 }
4828 }
4829 else if /* Below isn't a node that we convert to UTF-8 */
4830 ( utf8_target
4831 && ! utf8_pat
4832 && op == EXACTFAA_NO_TRIE
4833 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4834 {
4835 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4836 * did this at compile time when the pattern was UTF-8 , but otherwise
4837 * we couldn't do it earlier, because it requires a UTF-8 target for
4838 * this match to be legal. */
4839 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4840 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4841 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4842 pat = mod_pat;
4843 utf8_pat = TRUE;
4844 }
4845
4846 /* Here, we have taken care of the initial work for a few very problematic
4847 * situations, possibly creating a modified pattern.
4848 *
4849 * Now ready for the general case. We build up all the possible things
4850 * that could match the first character of the pattern into the elements of
4851 * 'matches[]'
4852 *
4853 * Everything generally matches at least itself. But if there is a
4854 * UTF8ness mismatch, we have to convert to that of the target string. */
4855 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4856 matches[0][0] = pat[0];
4857 lengths[0] = 1;
4858 m->count++;
4859 }
4860 else if (utf8_target) {
4861 if (utf8_pat) {
4862 lengths[0] = UTF8SKIP(pat);
4863 Copy(pat, matches[0], lengths[0], U8);
4864 m->count++;
4865 }
4866 else { /* target is UTF-8, pattern isn't */
4867 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4868 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4869 lengths[0] = 2;
4870 m->count++;
4871 }
4872 }
4873 else if (! utf8_pat) { /* Neither is UTF-8 */
4874 matches[0][0] = pat[0];
4875 lengths[0] = 1;
4876 m->count++;
4877 }
4878 else /* target isn't UTF-8; pattern is. No match possible unless the
4879 pattern's first character can fit in a byte */
4880 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4881 {
4882 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4883 lengths[0] = 1;
4884 m->count++;
4885 }
4886
4887 /* Here we have taken care of any necessary node-type changes */
4888
4889 if (m->count) {
4890 m->max_length = lengths[0];
4891 m->min_length = lengths[0];
4892 }
4893
4894 /* For non-folding nodes, there are no other possible candidate matches,
4895 * but for foldable ones, we have to look further. */
4896 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4897 UV folded; /* The first character in the pattern, folded */
4898 U32 first_fold_from; /* A character that folds to it */
4899 const U32 * remaining_fold_froms; /* The remaining characters that
4900 fold to it, if any */
4901 Size_t folds_to_count; /* The total number of characters that fold to
4902 'folded' */
4903
4904 /* If the node begins with a sequence of more than one character that
4905 * together form the fold of a single character, it is called a
4906 * 'multi-character fold', and the normal functions don't handle this
4907 * case. We set 'multi_fold_from' to the single folded-from character,
4908 * which is handled in an extra iteration below */
4909 if (utf8_pat) {
4910 folded = valid_utf8_to_uvchr(pat, NULL);
4911 multi_fold_from
4912 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4913 }
4914 else {
4915 folded = *pat;
4916
4917 /* This may generate illegal combinations for things like EXACTF,
4918 * but rather than repeat the logic and exclude them here, all such
4919 * illegalities are checked for and skipped below in the loop */
4920 multi_fold_from
4921 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4922 }
4923
4924 /* Everything matches at least itself; initialize to that because the
4925 * only the branches below that set it are the ones where the number
4926 * isn't 1. */
4927 folds_to_count = 1;
4928
4929 /* There are a few special cases for locale-dependent nodes, where the
4930 * run-time context was needed before we could know what matched */
4931 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4932 first_fold_from = PL_fold_locale[folded];
4933 }
4934 else if ( op == EXACTFL && utf8_target && utf8_pat
4935 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4936 LATIN_SMALL_LETTER_LONG_S_UTF8))
4937 {
4938 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4939 }
4940 else if (UNLIKELY( op == TURKISH
4941 && ( isALPHA_FOLD_EQ(folded, 'i')
4942 || inRANGE(folded,
4943 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4944 LATIN_SMALL_LETTER_DOTLESS_I))))
4945 { /* Turkish folding requires special handling */
4946 if (folded == 'i')
4947 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4948 else if (folded == 'I')
4949 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4950 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4951 first_fold_from = 'i';
4952 else first_fold_from = 'I';
4953 }
4954 else {
4955 /* Here, isn't a special case: use the generic function to
4956 * calculate what folds to this */
4957 redo_multi:
4958 /* Look up what code points (besides itself) fold to 'folded';
4959 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4960 folds_to_count = _inverse_folds(folded, &first_fold_from,
4961 &remaining_fold_froms);
4962 }
4963
4964 /* Add each character that folds to 'folded' to the list of them,
4965 * subject to limitations based on the node type and target UTF8ness.
4966 * If there was a character that folded to multiple characters, do an
4967 * extra iteration for it. (Note the extra iteration if there is a
4968 * multi-character fold) */
4969 for (i = 0; i < folds_to_count
4970 + UNLIKELY(multi_fold_from != 0); i++)
4971 {
4972 UV fold_from = 0;
4973
4974 if (i >= folds_to_count) { /* Final iteration: handle the
4975 multi-char */
4976 fold_from = multi_fold_from;
4977 }
4978 else if (i == 0) {
4979 fold_from = first_fold_from;
4980 }
4981 else if (i < folds_to_count) {
4982 fold_from = remaining_fold_froms[i-1];
4983 }
4984
4985 if (folded == fold_from) { /* We already added the character
4986 itself */
4987 continue;
4988 }
4989
4990 /* EXACTF doesn't have any non-ascii folds */
4991 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4992 continue;
4993 }
4994
4995 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4996 * */
4997 if ( isASCII(folded) != isASCII(fold_from)
4998 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4999
5000 {
5001 continue;
5002 }
5003
5004 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
5005 * locale, but those have been converted to EXACTFU above) */
5006 if ( op == EXACTFL
5007 && (folded < 256) != (fold_from < 256))
5008 {
5009 continue;
5010 }
5011
5012 /* If this triggers, it likely is because of the unlikely case
5013 * where a new Unicode standard has changed what MAX_MATCHES should
5014 * be set to */
5015 assert(m->count < MAX_MATCHES);
5016
5017 /* Add this character to the list of possible matches */
5018 if (utf8_target) {
5019 uvchr_to_utf8(matches[(U8) m->count], fold_from);
5020 lengths[m->count] = UVCHR_SKIP(fold_from);
5021 m->count++;
5022 }
5023 else { /* Non-UTF8 target: no code point above 255 can appear in it
5024 */
5025 if (fold_from > 255) {
5026 continue;
5027 }
5028
5029 matches[m->count][0] = fold_from;
5030 lengths[m->count] = 1;
5031 m->count++;
5032 }
5033
5034 /* Update min and mlengths */
5035 if (m->min_length > lengths[m->count-1]) {
5036 m->min_length = lengths[m->count-1];
5037 }
5038
5039 if (m->max_length < lengths[m->count-1]) {
5040 index_of_longest = m->count - 1;
5041 m->max_length = lengths[index_of_longest];
5042 }
5043 } /* looped through each potential fold */
5044
5045 /* If there is something that folded to an initial multi-character
5046 * fold, repeat, using it. This catches some edge cases. An example
5047 * of one is /ss/i when UTF-8 encoded. The function
5048 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
5049 * (LATIN SMALL SHARP S). If it returned a list of characters, this
5050 * code wouldn't be needed. But since it doesn't, we have to look what
5051 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
5052 * */
5053 if (multi_fold_from) {
5054 folded = multi_fold_from;
5055 multi_fold_from = 0;
5056 goto redo_multi;
5057 }
5058 } /* End of finding things that participate in this fold */
5059
5060 if (m->count == 0) { /* If nothing found, can't match */
5061 m->min_length = 0;
5062 return FALSE;
5063 }
5064
5065 /* Have calculated all possible matches. Now calculate the mask and AND
5066 * values */
5067 m->initial_exact = 0;
5068 m->initial_definitive = 0;
5069
5070 {
5071 unsigned int mask_ones = 0;
5072 unsigned int possible_ones = 0;
5073 U8 j;
5074
5075 /* For each byte that is in all possible matches ... */
5076 for (j = 0; j < MIN(m->min_length, 5); j++) {
5077
5078 /* Initialize the accumulator for this byte */
5079 byte_mask[j] = 0xFF;
5080 byte_anded[j] = matches[0][j];
5081
5082 /* Then the rest of the rows (folds). The mask is based on, like,
5083 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
5084 * where they differ. */
5085 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
5086 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
5087 byte_anded[j] &= matches[i][j];
5088 }
5089
5090 /* Keep track of the number of initial mask bytes that are all one
5091 * bits. The code calling this can use this number to know that
5092 * a string that matches this number of bytes in the pattern is an
5093 * exact match of that pattern for this number of bytes. But also
5094 * counted are the number of initial bytes that in total have a
5095 * single zero bit. If a string matches those, masked, it must be
5096 * one of two possibilites, both of which this function has
5097 * determined are legal. (But if that single 0 is one of the
5098 * initial bits for masking a UTF-8 start byte, that could
5099 * incorrectly lead to different length strings appearing to be
5100 * equivalent, so only do this optimization when the matchables are
5101 * all the same length. This was uncovered by testing
5102 * /\x{029E}/i.) */
5103 if (m->min_length == m->max_length) {
5104 mask_ones += PL_bitcount[byte_mask[j]];
5105 possible_ones += 8;
5106 if (mask_ones + 1 >= possible_ones) {
5107 m->initial_definitive++;
5108 if (mask_ones >= possible_ones) {
5109 m->initial_exact++;
5110 }
5111 }
5112 }
5113 }
5114 }
5115
5116 /* The first byte is separate for speed */
5117 m->first_byte_mask = byte_mask[0];
5118 m->first_byte_anded = byte_anded[0];
5119
5120 /* Then pack up to the next 4 bytes into a word */
5121 m->mask32 = m->anded32 = 0;
5122 for (i = 1; i < MIN(m->min_length, 5); i++) {
5123 U8 which = i;
5124 U8 shift = (which - 1) * 8;
5125 m->mask32 |= (U32) byte_mask[i] << shift;
5126 m->anded32 |= (U32) byte_anded[i] << shift;
5127 }
5128
5129 /* Finally, take the match strings and place them sequentially into a
5130 * one-dimensional array. (This is done to save significant space in the
5131 * structure.) Sort so the longest (presumably the least likely) is last.
5132 * XXX When this gets moved to regcomp, may want to fully sort shortest
5133 * first, but above we generally used the folded code point first, and
5134 * those tend to be no longer than their upper case values, so this is
5135 * already pretty well sorted by size.
5136 *
5137 * If the asserts fail, it's most likely because a new version of the
5138 * Unicode standard requires more space; simply increase the declaration
5139 * size. */
5140 {
5141 U8 cur_pos = 0;
5142 U8 output_index = 0;
5143
5144 if (m->count > 1) { /* No need to sort a single entry */
5145 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5146
5147 /* Keep the same order for all but the longest. (If the
5148 * asserts fail, it could be because m->matches is declared too
5149 * short, either because of a new Unicode release, or an
5150 * overlooked test case, or it could be a bug.) */
5151 if (i != index_of_longest) {
5152 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5153 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5154 cur_pos += lengths[i];
5155 m->lengths[output_index++] = lengths[i];
5156 }
5157 }
5158 }
5159
5160 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5161 Copy(matches[index_of_longest], m->matches + cur_pos,
5162 lengths[index_of_longest], U8);
5163
5164 /* Place the longest match last */
5165 m->lengths[output_index] = lengths[index_of_longest];
5166 }
5167
5168
5169 return TRUE;
5170 }
5171
5172 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5173 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5174 S_test_EXACTISH_ST(const char * loc,
5175 struct next_matchable_info info)
5176 {
5177 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5178 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5179
5180 U32 input32 = 0;
5181
5182 /* Check the first byte */
5183 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5184 return FALSE;
5185
5186 /* Pack the next up-to-4 bytes into a 32 bit word */
5187 switch (info.min_length) {
5188 default:
5189 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5190 /* FALLTHROUGH */
5191 case 4:
5192 input32 |= (U8) loc[3] << 2 * 8;
5193 /* FALLTHROUGH */
5194 case 3:
5195 input32 |= (U8) loc[2] << 1 * 8;
5196 /* FALLTHROUGH */
5197 case 2:
5198 input32 |= (U8) loc[1];
5199 break;
5200 case 1:
5201 return TRUE; /* We already tested and passed the 0th byte */
5202 case 0:
5203 ASSUME(0);
5204 }
5205
5206 /* And AND that with the mask and compare that with the assembled ANDED
5207 * values */
5208 return (input32 & info.mask32) == info.anded32;
5209 }
5210
5211 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5212 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5213 {
5214 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5215 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5216
5217 PERL_ARGS_ASSERT_ISGCB;
5218
5219 switch (GCB_table[before][after]) {
5220 case GCB_BREAKABLE:
5221 return TRUE;
5222
5223 case GCB_NOBREAK:
5224 return FALSE;
5225
5226 case GCB_RI_then_RI:
5227 {
5228 int RI_count = 1;
5229 U8 * temp_pos = (U8 *) curpos;
5230
5231 /* Do not break within emoji flag sequences. That is, do not
5232 * break between regional indicator (RI) symbols if there is an
5233 * odd number of RI characters before the break point.
5234 * GB12 sot (RI RI)* RI × RI
5235 * GB13 [^RI] (RI RI)* RI × RI */
5236
5237 while (backup_one_GCB(strbeg,
5238 &temp_pos,
5239 utf8_target) == GCB_Regional_Indicator)
5240 {
5241 RI_count++;
5242 }
5243
5244 return RI_count % 2 != 1;
5245 }
5246
5247 case GCB_EX_then_EM:
5248
5249 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5250 {
5251 U8 * temp_pos = (U8 *) curpos;
5252 GCB_enum prev;
5253
5254 do {
5255 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5256 }
5257 while (prev == GCB_Extend);
5258
5259 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5260 }
5261
5262 case GCB_Maybe_Emoji_NonBreak:
5263
5264 {
5265
5266 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5267 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5268 */
5269 U8 * temp_pos = (U8 *) curpos;
5270 GCB_enum prev;
5271
5272 do {
5273 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5274 }
5275 while (prev == GCB_Extend);
5276
5277 return prev != GCB_ExtPict_XX;
5278 }
5279
5280 default:
5281 break;
5282 }
5283
5284 #ifdef DEBUGGING
5285 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5286 before, after, GCB_table[before][after]);
5287 assert(0);
5288 #endif
5289 return TRUE;
5290 }
5291
5292 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5293 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5294 {
5295 GCB_enum gcb;
5296
5297 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5298
5299 if (*curpos < strbeg) {
5300 return GCB_EDGE;
5301 }
5302
5303 if (utf8_target) {
5304 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5305 U8 * prev_prev_char_pos;
5306
5307 if (! prev_char_pos) {
5308 return GCB_EDGE;
5309 }
5310
5311 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5312 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5313 *curpos = prev_char_pos;
5314 prev_char_pos = prev_prev_char_pos;
5315 }
5316 else {
5317 *curpos = (U8 *) strbeg;
5318 return GCB_EDGE;
5319 }
5320 }
5321 else {
5322 if (*curpos - 2 < strbeg) {
5323 *curpos = (U8 *) strbeg;
5324 return GCB_EDGE;
5325 }
5326 (*curpos)--;
5327 gcb = getGCB_VAL_CP(*(*curpos - 1));
5328 }
5329
5330 return gcb;
5331 }
5332
5333 /* Combining marks attach to most classes that precede them, but this defines
5334 * the exceptions (from TR14) */
5335 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5336 || prev == LB_Mandatory_Break \
5337 || prev == LB_Carriage_Return \
5338 || prev == LB_Line_Feed \
5339 || prev == LB_Next_Line \
5340 || prev == LB_Space \
5341 || prev == LB_ZWSpace))
5342
5343 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5344 S_isLB(pTHX_ LB_enum before,
5345 LB_enum after,
5346 const U8 * const strbeg,
5347 const U8 * const curpos,
5348 const U8 * const strend,
5349 const bool utf8_target)
5350 {
5351 U8 * temp_pos = (U8 *) curpos;
5352 LB_enum prev = before;
5353
5354 /* Is the boundary between 'before' and 'after' line-breakable?
5355 * Most of this is just a table lookup of a generated table from Unicode
5356 * rules. But some rules require context to decide, and so have to be
5357 * implemented in code */
5358
5359 PERL_ARGS_ASSERT_ISLB;
5360
5361 /* Rule numbers in the comments below are as of Unicode 9.0 */
5362
5363 redo:
5364 before = prev;
5365 switch (LB_table[before][after]) {
5366 case LB_BREAKABLE:
5367 return TRUE;
5368
5369 case LB_NOBREAK:
5370 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5371 return FALSE;
5372
5373 case LB_SP_foo + LB_BREAKABLE:
5374 case LB_SP_foo + LB_NOBREAK:
5375 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5376
5377 /* When we have something following a SP, we have to look at the
5378 * context in order to know what to do.
5379 *
5380 * SP SP should not reach here because LB7: Do not break before
5381 * spaces. (For two spaces in a row there is nothing that
5382 * overrides that) */
5383 assert(after != LB_Space);
5384
5385 /* Here we have a space followed by a non-space. Mostly this is a
5386 * case of LB18: "Break after spaces". But there are complications
5387 * as the handling of spaces is somewhat tricky. They are in a
5388 * number of rules, which have to be applied in priority order, but
5389 * something earlier in the string can cause a rule to be skipped
5390 * and a lower priority rule invoked. A prime example is LB7 which
5391 * says don't break before a space. But rule LB8 (lower priority)
5392 * says that the first break opportunity after a ZW is after any
5393 * span of spaces immediately after it. If a ZW comes before a SP
5394 * in the input, rule LB8 applies, and not LB7. Other such rules
5395 * involve combining marks which are rules 9 and 10, but they may
5396 * override higher priority rules if they come earlier in the
5397 * string. Since we're doing random access into the middle of the
5398 * string, we have to look for rules that should get applied based
5399 * on both string position and priority. Combining marks do not
5400 * attach to either ZW nor SP, so we don't have to consider them
5401 * until later.
5402 *
5403 * To check for LB8, we have to find the first non-space character
5404 * before this span of spaces */
5405 do {
5406 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5407 }
5408 while (prev == LB_Space);
5409
5410 /* LB8 Break before any character following a zero-width space,
5411 * even if one or more spaces intervene.
5412 * ZW SP* ÷
5413 * So if we have a ZW just before this span, and to get here this
5414 * is the final space in the span. */
5415 if (prev == LB_ZWSpace) {
5416 return TRUE;
5417 }
5418
5419 /* Here, not ZW SP+. There are several rules that have higher
5420 * priority than LB18 and can be resolved now, as they don't depend
5421 * on anything earlier in the string (except ZW, which we have
5422 * already handled). One of these rules is LB11 Do not break
5423 * before Word joiner, but we have specially encoded that in the
5424 * lookup table so it is caught by the single test below which
5425 * catches the other ones. */
5426 if (LB_table[LB_Space][after] - LB_SP_foo
5427 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5428 {
5429 return FALSE;
5430 }
5431
5432 /* If we get here, we have to XXX consider combining marks. */
5433 if (prev == LB_Combining_Mark) {
5434
5435 /* What happens with these depends on the character they
5436 * follow. */
5437 do {
5438 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5439 }
5440 while (prev == LB_Combining_Mark);
5441
5442 /* Most times these attach to and inherit the characteristics
5443 * of that character, but not always, and when not, they are to
5444 * be treated as AL by rule LB10. */
5445 if (! LB_CM_ATTACHES_TO(prev)) {
5446 prev = LB_Alphabetic;
5447 }
5448 }
5449
5450 /* Here, we have the character preceding the span of spaces all set
5451 * up. We follow LB18: "Break after spaces" unless the table shows
5452 * that is overridden */
5453 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5454
5455 case LB_CM_ZWJ_foo:
5456
5457 /* We don't know how to treat the CM except by looking at the first
5458 * non-CM character preceding it. ZWJ is treated as CM */
5459 do {
5460 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5461 }
5462 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5463
5464 /* Here, 'prev' is that first earlier non-CM character. If the CM
5465 * attaches to it, then it inherits the behavior of 'prev'. If it
5466 * doesn't attach, it is to be treated as an AL */
5467 if (! LB_CM_ATTACHES_TO(prev)) {
5468 prev = LB_Alphabetic;
5469 }
5470
5471 goto redo;
5472
5473 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5474 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5475
5476 /* LB21a Don't break after Hebrew + Hyphen.
5477 * HL (HY | BA) × */
5478
5479 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5480 == LB_Hebrew_Letter)
5481 {
5482 return FALSE;
5483 }
5484
5485 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5486
5487 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5488 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5489
5490 /* LB25a (PR | PO) × ( OP | HY )? NU */
5491 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5492 return FALSE;
5493 }
5494
5495 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5496 == LB_BREAKABLE;
5497
5498 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5499 case LB_SY_or_IS_then_various + LB_NOBREAK:
5500 {
5501 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5502
5503 LB_enum temp = prev;
5504 do {
5505 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5506 }
5507 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5508 if (temp == LB_Numeric) {
5509 return FALSE;
5510 }
5511
5512 return LB_table[prev][after] - LB_SY_or_IS_then_various
5513 == LB_BREAKABLE;
5514 }
5515
5516 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5517 case LB_various_then_PO_or_PR + LB_NOBREAK:
5518 {
5519 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5520
5521 LB_enum temp = prev;
5522 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5523 {
5524 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5525 }
5526 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5527 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5528 }
5529 if (temp == LB_Numeric) {
5530 return FALSE;
5531 }
5532 return LB_various_then_PO_or_PR;
5533 }
5534
5535 case LB_RI_then_RI + LB_NOBREAK:
5536 case LB_RI_then_RI + LB_BREAKABLE:
5537 {
5538 int RI_count = 1;
5539
5540 /* LB30a Break between two regional indicator symbols if and
5541 * only if there are an even number of regional indicators
5542 * preceding the position of the break.
5543 *
5544 * sot (RI RI)* RI × RI
5545 * [^RI] (RI RI)* RI × RI */
5546
5547 while (backup_one_LB(strbeg,
5548 &temp_pos,
5549 utf8_target) == LB_Regional_Indicator)
5550 {
5551 RI_count++;
5552 }
5553
5554 return RI_count % 2 == 0;
5555 }
5556
5557 default:
5558 break;
5559 }
5560
5561 #ifdef DEBUGGING
5562 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5563 before, after, LB_table[before][after]);
5564 assert(0);
5565 #endif
5566 return TRUE;
5567 }
5568
5569 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5570 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5571 {
5572
5573 LB_enum lb;
5574
5575 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5576
5577 if (*curpos >= strend) {
5578 return LB_EDGE;
5579 }
5580
5581 if (utf8_target) {
5582 *curpos += UTF8SKIP(*curpos);
5583 if (*curpos >= strend) {
5584 return LB_EDGE;
5585 }
5586 lb = getLB_VAL_UTF8(*curpos, strend);
5587 }
5588 else {
5589 (*curpos)++;
5590 if (*curpos >= strend) {
5591 return LB_EDGE;
5592 }
5593 lb = getLB_VAL_CP(**curpos);
5594 }
5595
5596 return lb;
5597 }
5598
5599 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5600 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5601 {
5602 LB_enum lb;
5603
5604 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5605
5606 if (*curpos < strbeg) {
5607 return LB_EDGE;
5608 }
5609
5610 if (utf8_target) {
5611 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5612 U8 * prev_prev_char_pos;
5613
5614 if (! prev_char_pos) {
5615 return LB_EDGE;
5616 }
5617
5618 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5619 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5620 *curpos = prev_char_pos;
5621 prev_char_pos = prev_prev_char_pos;
5622 }
5623 else {
5624 *curpos = (U8 *) strbeg;
5625 return LB_EDGE;
5626 }
5627 }
5628 else {
5629 if (*curpos - 2 < strbeg) {
5630 *curpos = (U8 *) strbeg;
5631 return LB_EDGE;
5632 }
5633 (*curpos)--;
5634 lb = getLB_VAL_CP(*(*curpos - 1));
5635 }
5636
5637 return lb;
5638 }
5639
5640 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5641 S_isSB(pTHX_ SB_enum before,
5642 SB_enum after,
5643 const U8 * const strbeg,
5644 const U8 * const curpos,
5645 const U8 * const strend,
5646 const bool utf8_target)
5647 {
5648 /* returns a boolean indicating if there is a Sentence Boundary Break
5649 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5650
5651 U8 * lpos = (U8 *) curpos;
5652 bool has_para_sep = FALSE;
5653 bool has_sp = FALSE;
5654
5655 PERL_ARGS_ASSERT_ISSB;
5656
5657 /* Break at the start and end of text.
5658 SB1. sot ÷
5659 SB2. ÷ eot
5660 But unstated in Unicode is don't break if the text is empty */
5661 if (before == SB_EDGE || after == SB_EDGE) {
5662 return before != after;
5663 }
5664
5665 /* SB 3: Do not break within CRLF. */
5666 if (before == SB_CR && after == SB_LF) {
5667 return FALSE;
5668 }
5669
5670 /* Break after paragraph separators. CR and LF are considered
5671 * so because Unicode views text as like word processing text where there
5672 * are no newlines except between paragraphs, and the word processor takes
5673 * care of wrapping without there being hard line-breaks in the text *./
5674 SB4. Sep | CR | LF ÷ */
5675 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5676 return TRUE;
5677 }
5678
5679 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5680 * (See Section 6.2, Replacing Ignore Rules.)
5681 SB5. X (Extend | Format)* → X */
5682 if (after == SB_Extend || after == SB_Format) {
5683
5684 /* Implied is that these characters attach to everything
5685 * immediately prior to them except for those separator-type
5686 * characters. And the rules earlier have already handled the case
5687 * when one of those immediately precedes the extend char */
5688 return FALSE;
5689 }
5690
5691 if (before == SB_Extend || before == SB_Format) {
5692 U8 * temp_pos = lpos;
5693 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5694 if ( backup != SB_EDGE
5695 && backup != SB_Sep
5696 && backup != SB_CR
5697 && backup != SB_LF)
5698 {
5699 before = backup;
5700 lpos = temp_pos;
5701 }
5702
5703 /* Here, both 'before' and 'backup' are these types; implied is that we
5704 * don't break between them */
5705 if (backup == SB_Extend || backup == SB_Format) {
5706 return FALSE;
5707 }
5708 }
5709
5710 /* Do not break after ambiguous terminators like period, if they are
5711 * immediately followed by a number or lowercase letter, if they are
5712 * between uppercase letters, if the first following letter (optionally
5713 * after certain punctuation) is lowercase, or if they are followed by
5714 * "continuation" punctuation such as comma, colon, or semicolon. For
5715 * example, a period may be an abbreviation or numeric period, and thus may
5716 * not mark the end of a sentence.
5717
5718 * SB6. ATerm × Numeric */
5719 if (before == SB_ATerm && after == SB_Numeric) {
5720 return FALSE;
5721 }
5722
5723 /* SB7. (Upper | Lower) ATerm × Upper */
5724 if (before == SB_ATerm && after == SB_Upper) {
5725 U8 * temp_pos = lpos;
5726 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5727 if (backup == SB_Upper || backup == SB_Lower) {
5728 return FALSE;
5729 }
5730 }
5731
5732 /* The remaining rules that aren't the final one, all require an STerm or
5733 * an ATerm after having backed up over some Close* Sp*, and in one case an
5734 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5735 * So do that backup now, setting flags if either Sp or a paragraph
5736 * separator are found */
5737
5738 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5739 has_para_sep = TRUE;
5740 before = backup_one_SB(strbeg, &lpos, utf8_target);
5741 }
5742
5743 if (before == SB_Sp) {
5744 has_sp = TRUE;
5745 do {
5746 before = backup_one_SB(strbeg, &lpos, utf8_target);
5747 }
5748 while (before == SB_Sp);
5749 }
5750
5751 while (before == SB_Close) {
5752 before = backup_one_SB(strbeg, &lpos, utf8_target);
5753 }
5754
5755 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5756 * most cases an STerm */
5757 if (before == SB_STerm || before == SB_ATerm) {
5758
5759 /* So, here the lhs matches
5760 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5761 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5762 * The rules that apply here are:
5763 *
5764 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5765 | LF | STerm | ATerm) )* Lower
5766 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5767 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5768 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5769 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5770 */
5771
5772 /* And all but SB11 forbid having seen a paragraph separator */
5773 if (! has_para_sep) {
5774 if (before == SB_ATerm) { /* SB8 */
5775 U8 * rpos = (U8 *) curpos;
5776 SB_enum later = after;
5777
5778 while ( later != SB_OLetter
5779 && later != SB_Upper
5780 && later != SB_Lower
5781 && later != SB_Sep
5782 && later != SB_CR
5783 && later != SB_LF
5784 && later != SB_STerm
5785 && later != SB_ATerm
5786 && later != SB_EDGE)
5787 {
5788 later = advance_one_SB(&rpos, strend, utf8_target);
5789 }
5790 if (later == SB_Lower) {
5791 return FALSE;
5792 }
5793 }
5794
5795 if ( after == SB_SContinue /* SB8a */
5796 || after == SB_STerm
5797 || after == SB_ATerm)
5798 {
5799 return FALSE;
5800 }
5801
5802 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5803 if ( after == SB_Close
5804 || after == SB_Sp
5805 || after == SB_Sep
5806 || after == SB_CR
5807 || after == SB_LF)
5808 {
5809 return FALSE;
5810 }
5811 }
5812
5813 /* SB10. This and SB9 could probably be combined some way, but khw
5814 * has decided to follow the Unicode rule book precisely for
5815 * simplified maintenance */
5816 if ( after == SB_Sp
5817 || after == SB_Sep
5818 || after == SB_CR
5819 || after == SB_LF)
5820 {
5821 return FALSE;
5822 }
5823 }
5824
5825 /* SB11. */
5826 return TRUE;
5827 }
5828
5829 /* Otherwise, do not break.
5830 SB12. Any × Any */
5831
5832 return FALSE;
5833 }
5834
5835 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5836 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5837 {
5838 SB_enum sb;
5839
5840 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5841
5842 if (*curpos >= strend) {
5843 return SB_EDGE;
5844 }
5845
5846 if (utf8_target) {
5847 do {
5848 *curpos += UTF8SKIP(*curpos);
5849 if (*curpos >= strend) {
5850 return SB_EDGE;
5851 }
5852 sb = getSB_VAL_UTF8(*curpos, strend);
5853 } while (sb == SB_Extend || sb == SB_Format);
5854 }
5855 else {
5856 do {
5857 (*curpos)++;
5858 if (*curpos >= strend) {
5859 return SB_EDGE;
5860 }
5861 sb = getSB_VAL_CP(**curpos);
5862 } while (sb == SB_Extend || sb == SB_Format);
5863 }
5864
5865 return sb;
5866 }
5867
5868 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5869 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5870 {
5871 SB_enum sb;
5872
5873 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5874
5875 if (*curpos < strbeg) {
5876 return SB_EDGE;
5877 }
5878
5879 if (utf8_target) {
5880 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5881 if (! prev_char_pos) {
5882 return SB_EDGE;
5883 }
5884
5885 /* Back up over Extend and Format. curpos is always just to the right
5886 * of the character whose value we are getting */
5887 do {
5888 U8 * prev_prev_char_pos;
5889 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5890 strbeg)))
5891 {
5892 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5893 *curpos = prev_char_pos;
5894 prev_char_pos = prev_prev_char_pos;
5895 }
5896 else {
5897 *curpos = (U8 *) strbeg;
5898 return SB_EDGE;
5899 }
5900 } while (sb == SB_Extend || sb == SB_Format);
5901 }
5902 else {
5903 do {
5904 if (*curpos - 2 < strbeg) {
5905 *curpos = (U8 *) strbeg;
5906 return SB_EDGE;
5907 }
5908 (*curpos)--;
5909 sb = getSB_VAL_CP(*(*curpos - 1));
5910 } while (sb == SB_Extend || sb == SB_Format);
5911 }
5912
5913 return sb;
5914 }
5915
5916 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5917 S_isWB(pTHX_ WB_enum previous,
5918 WB_enum before,
5919 WB_enum after,
5920 const U8 * const strbeg,
5921 const U8 * const curpos,
5922 const U8 * const strend,
5923 const bool utf8_target)
5924 {
5925 /* Return a boolean as to if the boundary between 'before' and 'after' is
5926 * a Unicode word break, using their published algorithm, but tailored for
5927 * Perl by treating spans of white space as one unit. Context may be
5928 * needed to make this determination. If the value for the character
5929 * before 'before' is known, it is passed as 'previous'; otherwise that
5930 * should be set to WB_UNKNOWN. The other input parameters give the
5931 * boundaries and current position in the matching of the string. That
5932 * is, 'curpos' marks the position where the character whose wb value is
5933 * 'after' begins. See https://www.unicode.org/reports/tr29/ */
5934
5935 U8 * before_pos = (U8 *) curpos;
5936 U8 * after_pos = (U8 *) curpos;
5937 WB_enum prev = before;
5938 WB_enum next;
5939
5940 PERL_ARGS_ASSERT_ISWB;
5941
5942 /* Rule numbers in the comments below are as of Unicode 9.0 */
5943
5944 redo:
5945 before = prev;
5946 switch (WB_table[before][after]) {
5947 case WB_BREAKABLE:
5948 return TRUE;
5949
5950 case WB_NOBREAK:
5951 return FALSE;
5952
5953 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5954 next = advance_one_WB(&after_pos, strend, utf8_target,
5955 FALSE /* Don't skip Extend nor Format */ );
5956 /* A space immediately preceding an Extend or Format is attached
5957 * to by them, and hence gets separated from previous spaces.
5958 * Otherwise don't break between horizontal white space */
5959 return next == WB_Extend || next == WB_Format;
5960
5961 /* WB4 Ignore Format and Extend characters, except when they appear at
5962 * the beginning of a region of text. This code currently isn't
5963 * general purpose, but it works as the rules are currently and likely
5964 * to be laid out. The reason it works is that when 'they appear at
5965 * the beginning of a region of text', the rule is to break before
5966 * them, just like any other character. Therefore, the default rule
5967 * applies and we don't have to look in more depth. Should this ever
5968 * change, we would have to have 2 'case' statements, like in the rules
5969 * below, and backup a single character (not spacing over the extend
5970 * ones) and then see if that is one of the region-end characters and
5971 * go from there */
5972 case WB_Ex_or_FO_or_ZWJ_then_foo:
5973 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5974 goto redo;
5975
5976 case WB_DQ_then_HL + WB_BREAKABLE:
5977 case WB_DQ_then_HL + WB_NOBREAK:
5978
5979 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5980
5981 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5982 == WB_Hebrew_Letter)
5983 {
5984 return FALSE;
5985 }
5986
5987 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5988
5989 case WB_HL_then_DQ + WB_BREAKABLE:
5990 case WB_HL_then_DQ + WB_NOBREAK:
5991
5992 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5993
5994 if (advance_one_WB(&after_pos, strend, utf8_target,
5995 TRUE /* Do skip Extend and Format */ )
5996 == WB_Hebrew_Letter)
5997 {
5998 return FALSE;
5999 }
6000
6001 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
6002
6003 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
6004 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
6005
6006 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
6007 * | Single_Quote) (ALetter | Hebrew_Letter) */
6008
6009 next = advance_one_WB(&after_pos, strend, utf8_target,
6010 TRUE /* Do skip Extend and Format */ );
6011
6012 if (next == WB_ALetter || next == WB_Hebrew_Letter)
6013 {
6014 return FALSE;
6015 }
6016
6017 return WB_table[before][after]
6018 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
6019
6020 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
6021 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
6022
6023 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
6024 * | Single_Quote) × (ALetter | Hebrew_Letter) */
6025
6026 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
6027 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
6028 {
6029 return FALSE;
6030 }
6031
6032 return WB_table[before][after]
6033 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
6034
6035 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
6036 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
6037
6038 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
6039 * */
6040
6041 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
6042 == WB_Numeric)
6043 {
6044 return FALSE;
6045 }
6046
6047 return WB_table[before][after]
6048 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
6049
6050 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
6051 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
6052
6053 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
6054
6055 if (advance_one_WB(&after_pos, strend, utf8_target,
6056 TRUE /* Do skip Extend and Format */ )
6057 == WB_Numeric)
6058 {
6059 return FALSE;
6060 }
6061
6062 return WB_table[before][after]
6063 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
6064
6065 case WB_RI_then_RI + WB_NOBREAK:
6066 case WB_RI_then_RI + WB_BREAKABLE:
6067 {
6068 int RI_count = 1;
6069
6070 /* Do not break within emoji flag sequences. That is, do not
6071 * break between regional indicator (RI) symbols if there is an
6072 * odd number of RI characters before the potential break
6073 * point.
6074 *
6075 * WB15 sot (RI RI)* RI × RI
6076 * WB16 [^RI] (RI RI)* RI × RI */
6077
6078 while (backup_one_WB(&previous,
6079 strbeg,
6080 &before_pos,
6081 utf8_target) == WB_Regional_Indicator)
6082 {
6083 RI_count++;
6084 }
6085
6086 return RI_count % 2 != 1;
6087 }
6088
6089 default:
6090 break;
6091 }
6092
6093 #ifdef DEBUGGING
6094 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
6095 before, after, WB_table[before][after]);
6096 assert(0);
6097 #endif
6098 return TRUE;
6099 }
6100
6101 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)6102 S_advance_one_WB(pTHX_ U8 ** curpos,
6103 const U8 * const strend,
6104 const bool utf8_target,
6105 const bool skip_Extend_Format)
6106 {
6107 WB_enum wb;
6108
6109 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6110
6111 if (*curpos >= strend) {
6112 return WB_EDGE;
6113 }
6114
6115 if (utf8_target) {
6116
6117 /* Advance over Extend and Format */
6118 do {
6119 *curpos += UTF8SKIP(*curpos);
6120 if (*curpos >= strend) {
6121 return WB_EDGE;
6122 }
6123 wb = getWB_VAL_UTF8(*curpos, strend);
6124 } while ( skip_Extend_Format
6125 && (wb == WB_Extend || wb == WB_Format));
6126 }
6127 else {
6128 do {
6129 (*curpos)++;
6130 if (*curpos >= strend) {
6131 return WB_EDGE;
6132 }
6133 wb = getWB_VAL_CP(**curpos);
6134 } while ( skip_Extend_Format
6135 && (wb == WB_Extend || wb == WB_Format));
6136 }
6137
6138 return wb;
6139 }
6140
6141 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)6142 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6143 {
6144 WB_enum wb;
6145
6146 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6147
6148 /* If we know what the previous character's break value is, don't have
6149 * to look it up */
6150 if (*previous != WB_UNKNOWN) {
6151 wb = *previous;
6152
6153 /* But we need to move backwards by one */
6154 if (utf8_target) {
6155 *curpos = reghopmaybe3(*curpos, -1, strbeg);
6156 if (! *curpos) {
6157 *previous = WB_EDGE;
6158 *curpos = (U8 *) strbeg;
6159 }
6160 else {
6161 *previous = WB_UNKNOWN;
6162 }
6163 }
6164 else {
6165 (*curpos)--;
6166 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6167 }
6168
6169 /* And we always back up over these three types */
6170 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6171 return wb;
6172 }
6173 }
6174
6175 if (*curpos < strbeg) {
6176 return WB_EDGE;
6177 }
6178
6179 if (utf8_target) {
6180 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6181 if (! prev_char_pos) {
6182 return WB_EDGE;
6183 }
6184
6185 /* Back up over Extend and Format. curpos is always just to the right
6186 * of the character whose value we are getting */
6187 do {
6188 U8 * prev_prev_char_pos;
6189 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6190 -1,
6191 strbeg)))
6192 {
6193 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6194 *curpos = prev_char_pos;
6195 prev_char_pos = prev_prev_char_pos;
6196 }
6197 else {
6198 *curpos = (U8 *) strbeg;
6199 return WB_EDGE;
6200 }
6201 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6202 }
6203 else {
6204 do {
6205 if (*curpos - 2 < strbeg) {
6206 *curpos = (U8 *) strbeg;
6207 return WB_EDGE;
6208 }
6209 (*curpos)--;
6210 wb = getWB_VAL_CP(*(*curpos - 1));
6211 } while (wb == WB_Extend || wb == WB_Format);
6212 }
6213
6214 return wb;
6215 }
6216
6217 /* Macros for regmatch(), using its internal variables */
6218 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6219 #define NEXTCHR_IS_EOS (nextbyte < 0)
6220
6221 #define SET_nextchr \
6222 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6223
6224 #define SET_locinput(p) \
6225 locinput = (p); \
6226 SET_nextchr
6227
6228 #define sayYES goto yes
6229 #define sayNO goto no
6230 #define sayNO_SILENT goto no_silent
6231
6232 /* we don't use STMT_START/END here because it leads to
6233 "unreachable code" warnings, which are bogus, but distracting. */
6234 #define CACHEsayNO \
6235 if (ST.cache_mask) \
6236 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6237 sayNO
6238
6239 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6240 ( \
6241 ( ( st ) ) && \
6242 ( ( st )->u.eval.close_paren ) && \
6243 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6244 )
6245
6246 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6247 ( \
6248 ( ( st ) ) && \
6249 ( ( st )->u.eval.close_paren ) && \
6250 ( ( expr ) ) && \
6251 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6252 )
6253
6254
6255 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6256 (st)->u.eval.close_paren = ( (expr) + 1 )
6257
6258 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6259 (st)->u.eval.close_paren = 0
6260
6261 /* push a new state then goto it */
6262
6263 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6264 pushinput = input; \
6265 pusheol = eol; \
6266 pushsr0 = sr0; \
6267 scan = node; \
6268 st->resume_state = state; \
6269 goto push_state;
6270
6271 /* push a new state with success backtracking, then goto it */
6272
6273 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6274 pushinput = input; \
6275 pusheol = eol; \
6276 pushsr0 = sr0; \
6277 scan = node; \
6278 st->resume_state = state; \
6279 goto push_yes_state;
6280
6281 #define DEBUG_STATE_pp(pp) \
6282 DEBUG_STATE_r({ \
6283 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6284 Perl_re_printf( aTHX_ \
6285 "%*s" pp " %s%s%s%s%s\n", \
6286 INDENT_CHARS(depth), "", \
6287 REGNODE_NAME(st->resume_state), \
6288 ((st==yes_state||st==mark_state) ? "[" : ""), \
6289 ((st==yes_state) ? "Y" : ""), \
6290 ((st==mark_state) ? "M" : ""), \
6291 ((st==yes_state||st==mark_state) ? "]" : "") \
6292 ); \
6293 });
6294
6295 /*
6296
6297 regmatch() - main matching routine
6298
6299 This is basically one big switch statement in a loop. We execute an op,
6300 set 'next' to point the next op, and continue. If we come to a point which
6301 we may need to backtrack to on failure such as (A|B|C), we push a
6302 backtrack state onto the backtrack stack. On failure, we pop the top
6303 state, and re-enter the loop at the state indicated. If there are no more
6304 states to pop, we return failure.
6305
6306 Sometimes we also need to backtrack on success; for example /A+/, where
6307 after successfully matching one A, we need to go back and try to
6308 match another one; similarly for lookahead assertions: if the assertion
6309 completes successfully, we backtrack to the state just before the assertion
6310 and then carry on. In these cases, the pushed state is marked as
6311 'backtrack on success too'. This marking is in fact done by a chain of
6312 pointers, each pointing to the previous 'yes' state. On success, we pop to
6313 the nearest yes state, discarding any intermediate failure-only states.
6314 Sometimes a yes state is pushed just to force some cleanup code to be
6315 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6316 it to free the inner regex.
6317
6318 Note that failure backtracking rewinds the cursor position, while
6319 success backtracking leaves it alone.
6320
6321 A pattern is complete when the END op is executed, while a subpattern
6322 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6323 ops trigger the "pop to last yes state if any, otherwise return true"
6324 behaviour.
6325
6326 A common convention in this function is to use A and B to refer to the two
6327 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6328 the subpattern to be matched possibly multiple times, while B is the entire
6329 rest of the pattern. Variable and state names reflect this convention.
6330
6331 The states in the main switch are the union of ops and failure/success of
6332 substates associated with that op. For example, IFMATCH is the op
6333 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6334 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6335 successfully matched A and IFMATCH_A_fail is a state saying that we have
6336 just failed to match A. Resume states always come in pairs. The backtrack
6337 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6338 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6339 on success or failure.
6340
6341 The struct that holds a backtracking state is actually a big union, with
6342 one variant for each major type of op. The variable st points to the
6343 top-most backtrack struct. To make the code clearer, within each
6344 block of code we #define ST to alias the relevant union.
6345
6346 Here's a concrete example of a (vastly oversimplified) IFMATCH
6347 implementation:
6348
6349 switch (state) {
6350 ....
6351
6352 #define ST st->u.ifmatch
6353
6354 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6355 ST.foo = ...; // some state we wish to save
6356 ...
6357 // push a yes backtrack state with a resume value of
6358 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6359 // first node of A:
6360 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6361 // NOTREACHED
6362
6363 case IFMATCH_A: // we have successfully executed A; now continue with B
6364 next = B;
6365 bar = ST.foo; // do something with the preserved value
6366 break;
6367
6368 case IFMATCH_A_fail: // A failed, so the assertion failed
6369 ...; // do some housekeeping, then ...
6370 sayNO; // propagate the failure
6371
6372 #undef ST
6373
6374 ...
6375 }
6376
6377 For any old-timers reading this who are familiar with the old recursive
6378 approach, the code above is equivalent to:
6379
6380 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6381 {
6382 int foo = ...
6383 ...
6384 if (regmatch(A)) {
6385 next = B;
6386 bar = foo;
6387 break;
6388 }
6389 ...; // do some housekeeping, then ...
6390 sayNO; // propagate the failure
6391 }
6392
6393 The topmost backtrack state, pointed to by st, is usually free. If you
6394 want to claim it, populate any ST.foo fields in it with values you wish to
6395 save, then do one of
6396
6397 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6398 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6399
6400 which sets that backtrack state's resume value to 'resume_state', pushes a
6401 new free entry to the top of the backtrack stack, then goes to 'node'.
6402 On backtracking, the free slot is popped, and the saved state becomes the
6403 new free state. An ST.foo field in this new top state can be temporarily
6404 accessed to retrieve values, but once the main loop is re-entered, it
6405 becomes available for reuse.
6406
6407 Note that the depth of the backtrack stack constantly increases during the
6408 left-to-right execution of the pattern, rather than going up and down with
6409 the pattern nesting. For example the stack is at its maximum at Z at the
6410 end of the pattern, rather than at X in the following:
6411
6412 /(((X)+)+)+....(Y)+....Z/
6413
6414 The only exceptions to this are lookahead/behind assertions and the cut,
6415 (?>A), which pop all the backtrack states associated with A before
6416 continuing.
6417
6418 Backtrack state structs are allocated in slabs of about 4K in size.
6419 PL_regmatch_state and st always point to the currently active state,
6420 and PL_regmatch_slab points to the slab currently containing
6421 PL_regmatch_state. The first time regmatch() is called, the first slab is
6422 allocated, and is never freed until interpreter destruction. When the slab
6423 is full, a new one is allocated and chained to the end. At exit from
6424 regmatch(), slabs allocated since entry are freed.
6425
6426 In order to work with variable length lookbehinds, an upper limit is placed on
6427 lookbehinds which is set to where the match position is at the end of where the
6428 lookbehind would get to. Nothing in the lookbehind should match above that,
6429 except we should be able to look beyond if for things like \b, which need the
6430 next character in the string to be able to determine if this is a boundary or
6431 not. We also can't match the end of string/line unless we are also at the end
6432 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6433 that match a width, we have to add a condition that they are within the legal
6434 bounds of our window into the string.
6435
6436 */
6437
6438 /* returns -1 on failure, $+[0] on success */
6439 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6440 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6441 {
6442 const bool utf8_target = reginfo->is_utf8_target;
6443 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6444 REGEXP *rex_sv = reginfo->prog;
6445 regexp *rex = ReANY(rex_sv);
6446 RXi_GET_DECL(rex,rexi);
6447 /* the current state. This is a cached copy of PL_regmatch_state */
6448 regmatch_state *st;
6449 /* cache heavy used fields of st in registers */
6450 regnode *scan;
6451 regnode *next;
6452 U32 n = 0; /* general value; init to avoid compiler warning */
6453 U32 utmp = 0; /* tmp variable - valid for at most one opcode */
6454 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6455 SSize_t endref = 0; /* offset of end of backref when ln is start */
6456 char *locinput = startpos;
6457 char *loceol = reginfo->strend;
6458 char *pushinput; /* where to continue after a PUSH */
6459 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6460 U8 *pushsr0; /* save starting pos of script run */
6461 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6462 at EOS */
6463
6464 bool result = 0; /* return value of S_regmatch */
6465 U32 depth = 0; /* depth of backtrack stack */
6466 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6467 const U32 max_nochange_depth =
6468 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6469 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6470 regmatch_state *yes_state = NULL; /* state to pop to on success of
6471 subpattern */
6472 /* mark_state piggy backs on the yes_state logic so that when we unwind
6473 the stack on success we can update the mark_state as we go */
6474 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6475 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6476 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6477 U32 state_num;
6478 bool no_final = 0; /* prevent failure from backtracking? */
6479 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6480 char *startpoint = locinput;
6481 SV *popmark = NULL; /* are we looking for a mark? */
6482 SV *sv_commit = NULL; /* last mark name seen in failure */
6483 SV *sv_yes_mark = NULL; /* last mark name we have seen
6484 during a successful match */
6485 U32 lastopen = 0; /* last open we saw */
6486 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6487 SV* const oreplsv = GvSVn(PL_replgv);
6488 /* these three flags are set by various ops to signal information to
6489 * the very next op. They have a useful lifetime of exactly one loop
6490 * iteration, and are not preserved or restored by state pushes/pops
6491 */
6492 bool sw = 0; /* the condition value in (?(cond)a|b) */
6493 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6494 int logical = 0; /* the following EVAL is:
6495 0: (?{...})
6496 1: (?(?{...})X|Y)
6497 2: (??{...})
6498 or the following IFMATCH/UNLESSM is:
6499 false: plain (?=foo)
6500 true: used as a condition: (?(?=foo))
6501 */
6502 PAD* last_pad = NULL;
6503 dMULTICALL;
6504 U8 gimme = G_SCALAR;
6505 CV *caller_cv = NULL; /* who called us */
6506 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6507 U32 maxopenparen = 0; /* max '(' index seen so far */
6508 int to_complement; /* Invert the result? */
6509 char_class_number_ classnum;
6510 bool is_utf8_pat = reginfo->is_utf8_pat;
6511 bool match = FALSE;
6512 I32 orig_savestack_ix = PL_savestack_ix;
6513 U8 * script_run_begin = NULL;
6514 char *match_end= NULL; /* where a match MUST end to be considered successful */
6515 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6516 re_fold_t folder = NULL; /* used by various EXACTish regops */
6517 const U8 * fold_array = NULL; /* used by various EXACTish regops */
6518
6519 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6520 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6521 # define SOLARIS_BAD_OPTIMIZER
6522 const U32 *pl_charclass_dup = PL_charclass;
6523 # define PL_charclass pl_charclass_dup
6524 #endif
6525
6526 #ifdef DEBUGGING
6527 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6528 #endif
6529
6530 /* protect against undef(*^R) */
6531 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6532
6533 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6534 multicall_oldcatch = 0;
6535 PERL_UNUSED_VAR(multicall_cop);
6536
6537 PERL_ARGS_ASSERT_REGMATCH;
6538
6539 st = PL_regmatch_state;
6540
6541 /* Note that nextbyte is a byte even in UTF */
6542 SET_nextchr;
6543 scan = prog;
6544
6545 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6546 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6547 Perl_re_printf( aTHX_ "regmatch start\n" );
6548 }));
6549
6550 while (scan != NULL) {
6551 next = scan + NEXT_OFF(scan);
6552 if (next == scan)
6553 next = NULL;
6554 state_num = OP(scan);
6555
6556 reenter_switch:
6557 DEBUG_EXECUTE_r(
6558 if (state_num <= REGNODE_MAX) {
6559 SV * const prop = sv_newmortal();
6560 regnode *rnext = regnext(scan);
6561
6562 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6563 regprop(rex, prop, scan, reginfo, NULL);
6564 Perl_re_printf( aTHX_
6565 "%*s%" IVdf ":%s(%" IVdf ")\n",
6566 INDENT_CHARS(depth), "",
6567 (IV)(scan - rexi->program),
6568 SvPVX_const(prop),
6569 (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6570 0 : (IV)(rnext - rexi->program));
6571 }
6572 );
6573
6574 to_complement = 0;
6575
6576 SET_nextchr;
6577 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6578
6579 switch (state_num) {
6580 SV * anyofh_list;
6581
6582 case SBOL: /* /^../ and /\A../ */
6583 if (locinput == reginfo->strbeg)
6584 break;
6585 sayNO;
6586
6587 case MBOL: /* /^../m */
6588 if (locinput == reginfo->strbeg ||
6589 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6590 {
6591 break;
6592 }
6593 sayNO;
6594
6595 case GPOS: /* \G */
6596 if (locinput == reginfo->ganch)
6597 break;
6598 sayNO;
6599
6600 case KEEPS: /* \K */
6601 /* update the startpoint */
6602 st->u.keeper.val = RXp_OFFS_START(rex,0);
6603 RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg;
6604 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6605 script_run_begin);
6606 NOT_REACHED; /* NOTREACHED */
6607
6608 case KEEPS_next_fail:
6609 /* rollback the start point change */
6610 RXp_OFFSp(rex)[0].start = st->u.keeper.val;
6611 sayNO_SILENT;
6612 NOT_REACHED; /* NOTREACHED */
6613
6614 case MEOL: /* /..$/m */
6615 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6616 sayNO;
6617 break;
6618
6619 case SEOL: /* /..$/ */
6620 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6621 sayNO;
6622 if (reginfo->strend - locinput > 1)
6623 sayNO;
6624 break;
6625
6626 case EOS: /* \z */
6627 if (!NEXTCHR_IS_EOS)
6628 sayNO;
6629 break;
6630
6631 case SANY: /* /./s */
6632 if (NEXTCHR_IS_EOS || locinput >= loceol)
6633 sayNO;
6634 goto increment_locinput;
6635
6636 case REG_ANY: /* /./ */
6637 if ( NEXTCHR_IS_EOS
6638 || locinput >= loceol
6639 || nextbyte == '\n')
6640 {
6641 sayNO;
6642 }
6643 goto increment_locinput;
6644
6645
6646 #undef ST
6647 #define ST st->u.trie
6648 case TRIEC: /* (ab|cd) with known charclass */
6649 /* In this case the charclass data is available inline so
6650 we can fail fast without a lot of extra overhead.
6651 */
6652 if ( ! NEXTCHR_IS_EOS
6653 && locinput < loceol
6654 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6655 {
6656 DEBUG_EXECUTE_r(
6657 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6658 depth, PL_colors[4], PL_colors[5])
6659 );
6660 sayNO_SILENT;
6661 NOT_REACHED; /* NOTREACHED */
6662 }
6663 /* FALLTHROUGH */
6664 case TRIE: /* (ab|cd) */
6665 /* the basic plan of execution of the trie is:
6666 * At the beginning, run though all the states, and
6667 * find the longest-matching word. Also remember the position
6668 * of the shortest matching word. For example, this pattern:
6669 * 1 2 3 4 5
6670 * ab|a|x|abcd|abc
6671 * when matched against the string "abcde", will generate
6672 * accept states for all words except 3, with the longest
6673 * matching word being 4, and the shortest being 2 (with
6674 * the position being after char 1 of the string).
6675 *
6676 * Then for each matching word, in word order (i.e. 1,2,4,5),
6677 * we run the remainder of the pattern; on each try setting
6678 * the current position to the character following the word,
6679 * returning to try the next word on failure.
6680 *
6681 * We avoid having to build a list of words at runtime by
6682 * using a compile-time structure, wordinfo[].prev, which
6683 * gives, for each word, the previous accepting word (if any).
6684 * In the case above it would contain the mappings 1->2, 2->0,
6685 * 3->0, 4->5, 5->1. We can use this table to generate, from
6686 * the longest word (4 above), a list of all words, by
6687 * following the list of prev pointers; this gives us the
6688 * unordered list 4,5,1,2. Then given the current word we have
6689 * just tried, we can go through the list and find the
6690 * next-biggest word to try (so if we just failed on word 2,
6691 * the next in the list is 4).
6692 *
6693 * Since at runtime we don't record the matching position in
6694 * the string for each word, we have to work that out for
6695 * each word we're about to process. The wordinfo table holds
6696 * the character length of each word; given that we recorded
6697 * at the start: the position of the shortest word and its
6698 * length in chars, we just need to move the pointer the
6699 * difference between the two char lengths. Depending on
6700 * Unicode status and folding, that's cheap or expensive.
6701 *
6702 * This algorithm is optimised for the case where are only a
6703 * small number of accept states, i.e. 0,1, or maybe 2.
6704 * With lots of accepts states, and having to try all of them,
6705 * it becomes quadratic on number of accept states to find all
6706 * the next words.
6707 */
6708
6709 {
6710 /* what type of TRIE am I? (utf8 makes this contextual) */
6711 DECL_TRIE_TYPE(scan);
6712
6713 /* what trie are we using right now */
6714 reg_trie_data * const trie
6715 = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ];
6716 ST.before_paren = trie->before_paren;
6717 ST.after_paren = trie->after_paren;
6718 assert(ST.before_paren<=rex->nparens);
6719 assert(ST.after_paren<=rex->nparens);
6720
6721 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]);
6722 U32 state = trie->startstate;
6723
6724 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) {
6725 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6726 if (utf8_target
6727 && ! NEXTCHR_IS_EOS
6728 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6729 && FLAGS(scan) == EXACTL)
6730 {
6731 /* We only output for EXACTL, as we let the folder
6732 * output this message for EXACTFLU8 to avoid
6733 * duplication */
6734 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6735 reginfo->strend);
6736 }
6737 }
6738 if ( trie->bitmap
6739 && ( NEXTCHR_IS_EOS
6740 || locinput >= loceol
6741 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6742 {
6743 if (trie->states[ state ].wordnum) {
6744 DEBUG_EXECUTE_r(
6745 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6746 depth, PL_colors[4], PL_colors[5])
6747 );
6748 if (!trie->jump)
6749 break;
6750 } else {
6751 DEBUG_EXECUTE_r(
6752 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6753 depth, PL_colors[4], PL_colors[5])
6754 );
6755 sayNO_SILENT;
6756 }
6757 }
6758
6759 {
6760 U8 *uc = ( U8* )locinput;
6761
6762 STRLEN len = 0;
6763 STRLEN foldlen = 0;
6764 U8 *uscan = (U8*)NULL;
6765 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6766 U32 charcount = 0; /* how many input chars we have matched */
6767 U32 accepted = 0; /* have we seen any accepting states? */
6768
6769 ST.jump = trie->jump;
6770 ST.j_before_paren = trie->j_before_paren;
6771 ST.j_after_paren= trie->j_after_paren;
6772 ST.me = scan;
6773 ST.firstpos = NULL;
6774 ST.longfold = FALSE; /* char longer if folded => it's harder */
6775 ST.nextword = 0;
6776
6777 /* fully traverse the TRIE; note the position of the
6778 shortest accept state and the wordnum of the longest
6779 accept state */
6780
6781 while ( state && uc <= (U8*)(loceol) ) {
6782 U32 base = trie->states[ state ].trans.base;
6783 UV uvc = 0;
6784 U16 charid = 0;
6785 U16 wordnum;
6786 wordnum = trie->states[ state ].wordnum;
6787
6788 if (wordnum) { /* it's an accept state */
6789 if (!accepted) {
6790 accepted = 1;
6791 /* record first match position */
6792 if (ST.longfold) {
6793 ST.firstpos = (U8*)locinput;
6794 ST.firstchars = 0;
6795 }
6796 else {
6797 ST.firstpos = uc;
6798 ST.firstchars = charcount;
6799 }
6800 }
6801 if (!ST.nextword || wordnum < ST.nextword)
6802 ST.nextword = wordnum;
6803 ST.topword = wordnum;
6804 }
6805
6806 DEBUG_TRIE_EXECUTE_r({
6807 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6808 /* HERE */
6809 PerlIO_printf( Perl_debug_log,
6810 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6811 INDENT_CHARS(depth), "", PL_colors[4],
6812 (UV)state, (accepted ? 'Y' : 'N'));
6813 });
6814
6815 /* read a char and goto next state */
6816 if ( base && (foldlen || uc < (U8*)(loceol))) {
6817 I32 offset;
6818 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6819 (U8 *) loceol, uscan,
6820 len, uvc, charid, foldlen,
6821 foldbuf, uniflags);
6822 charcount++;
6823 if (foldlen>0)
6824 ST.longfold = TRUE;
6825 if (charid &&
6826 ( ((offset =
6827 base + charid - 1 - trie->uniquecharcount)) >= 0)
6828
6829 && ((U32)offset < trie->lasttrans)
6830 && trie->trans[offset].check == state)
6831 {
6832 state = trie->trans[offset].next;
6833 }
6834 else {
6835 state = 0;
6836 }
6837 uc += len;
6838
6839 }
6840 else {
6841 state = 0;
6842 }
6843 DEBUG_TRIE_EXECUTE_r(
6844 Perl_re_printf( aTHX_
6845 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6846 charid, uvc, (UV)state, PL_colors[5] );
6847 );
6848 }
6849 if (!accepted)
6850 sayNO;
6851
6852 /* calculate total number of accept states */
6853 {
6854 U16 w = ST.topword;
6855 accepted = 0;
6856 while (w) {
6857 w = trie->wordinfo[w].prev;
6858 accepted++;
6859 }
6860 ST.accepted = accepted;
6861 }
6862
6863 DEBUG_EXECUTE_r(
6864 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6865 depth,
6866 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6867 );
6868 goto trie_first_try; /* jump into the fail handler */
6869 }}
6870 NOT_REACHED; /* NOTREACHED */
6871
6872 case TRIE_next_fail: /* we failed - try next alternative */
6873 {
6874 U8 *uc;
6875 if (RE_PESSIMISTIC_PARENS) {
6876 REGCP_UNWIND(ST.lastcp);
6877 regcppop(rex,&maxopenparen);
6878 }
6879 if ( ST.jump ) {
6880 /* undo any captures done in the tail part of a branch,
6881 * e.g.
6882 * /(?:X(.)(.)|Y(.)).../
6883 * where the trie just matches X then calls out to do the
6884 * rest of the branch */
6885 REGCP_UNWIND(ST.cp);
6886 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6887 if (ST.after_paren) {
6888 assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
6889 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail");
6890 }
6891 }
6892 if (!--ST.accepted) {
6893 DEBUG_EXECUTE_r({
6894 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6895 depth,
6896 PL_colors[4],
6897 PL_colors[5] );
6898 });
6899 sayNO_SILENT;
6900 }
6901 {
6902 /* Find next-highest word to process. Note that this code
6903 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6904 U16 min = 0;
6905 U16 word;
6906 U16 const nextword = ST.nextword;
6907 reg_trie_wordinfo * const wordinfo
6908 = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo;
6909 for (word=ST.topword; word; word=wordinfo[word].prev) {
6910 if (word > nextword && (!min || word < min))
6911 min = word;
6912 }
6913 ST.nextword = min;
6914 }
6915
6916 trie_first_try:
6917 if (do_cutgroup) {
6918 do_cutgroup = 0;
6919 no_final = 0;
6920 }
6921
6922 if ( ST.jump ) {
6923 ST.lastparen = RXp_LASTPAREN(rex);
6924 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
6925 REGCP_SET(ST.cp);
6926 }
6927
6928 /* find start char of end of current word */
6929 {
6930 U32 chars; /* how many chars to skip */
6931 reg_trie_data * const trie
6932 = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
6933
6934 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6935 >= ST.firstchars);
6936 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6937 - ST.firstchars;
6938 uc = ST.firstpos;
6939
6940 if (ST.longfold) {
6941 /* the hard option - fold each char in turn and find
6942 * its folded length (which may be different */
6943 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6944 STRLEN foldlen;
6945 STRLEN len;
6946 UV uvc;
6947 U8 *uscan;
6948
6949 while (chars) {
6950 if (utf8_target) {
6951 /* XXX This assumes the length is well-formed, as
6952 * does the UTF8SKIP below */
6953 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6954 uniflags);
6955 uc += len;
6956 }
6957 else {
6958 uvc = *uc;
6959 uc++;
6960 }
6961 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6962 uscan = foldbuf;
6963 while (foldlen) {
6964 if (!--chars)
6965 break;
6966 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6967 uniflags);
6968 uscan += len;
6969 foldlen -= len;
6970 }
6971 }
6972 }
6973 else {
6974 if (utf8_target)
6975 uc = utf8_hop(uc, chars);
6976 else
6977 uc += chars;
6978 }
6979 }
6980 if (ST.jump && ST.jump[ST.nextword]) {
6981 scan = ST.me + ST.jump[ST.nextword];
6982 ST.before_paren = ST.j_before_paren[ST.nextword];
6983 assert(ST.before_paren <= rex->nparens);
6984 ST.after_paren = ST.j_after_paren[ST.nextword];
6985 assert(ST.after_paren <= rex->nparens);
6986 } else {
6987 scan = ST.me + NEXT_OFF(ST.me);
6988 }
6989
6990
6991 DEBUG_EXECUTE_r({
6992 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6993 depth,
6994 PL_colors[4],
6995 ST.nextword,
6996 PL_colors[5]
6997 );
6998 });
6999
7000 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
7001 if (RE_PESSIMISTIC_PARENS) {
7002 (void)regcppush(rex, 0, maxopenparen);
7003 REGCP_SET(ST.lastcp);
7004 }
7005 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
7006 script_run_begin);
7007 NOT_REACHED; /* NOTREACHED */
7008 }
7009 /* only one choice left - just continue */
7010 DEBUG_EXECUTE_r({
7011 AV *const trie_words
7012 = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]);
7013 SV ** const tmp = trie_words
7014 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
7015 SV *sv= tmp ? sv_newmortal() : NULL;
7016
7017 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
7018 depth, PL_colors[4],
7019 ST.nextword,
7020 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
7021 PL_colors[0], PL_colors[1],
7022 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
7023 )
7024 : "not compiled under -Dr",
7025 PL_colors[5] );
7026 });
7027
7028 locinput = (char*)uc;
7029 continue; /* execute rest of RE */
7030 /* NOTREACHED */
7031 }
7032 #undef ST
7033
7034 case LEXACT_REQ8:
7035 if (! utf8_target) {
7036 sayNO;
7037 }
7038 /* FALLTHROUGH */
7039
7040 case LEXACT:
7041 {
7042 char *s;
7043
7044 s = STRINGl(scan);
7045 ln = STR_LENl(scan);
7046 goto join_short_long_exact;
7047
7048 case EXACTL: /* /abc/l */
7049 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7050
7051 /* Complete checking would involve going through every character
7052 * matched by the string to see if any is above latin1. But the
7053 * comparison otherwise might very well be a fast assembly
7054 * language routine, and I (khw) don't think slowing things down
7055 * just to check for this warning is worth it. So this just checks
7056 * the first character */
7057 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
7058 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
7059 }
7060 goto do_exact;
7061 case EXACT_REQ8:
7062 if (! utf8_target) {
7063 sayNO;
7064 }
7065 /* FALLTHROUGH */
7066
7067 case EXACT: /* /abc/ */
7068 do_exact:
7069 s = STRINGs(scan);
7070 ln = STR_LENs(scan);
7071
7072 join_short_long_exact:
7073 if (utf8_target != is_utf8_pat) {
7074 /* The target and the pattern have differing utf8ness. */
7075 char *l = locinput;
7076 const char * const e = s + ln;
7077
7078 if (utf8_target) {
7079 /* The target is utf8, the pattern is not utf8.
7080 * Above-Latin1 code points can't match the pattern;
7081 * invariants match exactly, and the other Latin1 ones need
7082 * to be downgraded to a single byte in order to do the
7083 * comparison. (If we could be confident that the target
7084 * is not malformed, this could be refactored to have fewer
7085 * tests by just assuming that if the first bytes match, it
7086 * is an invariant, but there are tests in the test suite
7087 * dealing with (??{...}) which violate this) */
7088 while (s < e) {
7089 if ( l >= loceol
7090 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
7091 {
7092 sayNO;
7093 }
7094 if (UTF8_IS_INVARIANT(*(U8*)l)) {
7095 if (*l != *s) {
7096 sayNO;
7097 }
7098 l++;
7099 }
7100 else {
7101 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
7102 {
7103 sayNO;
7104 }
7105 l += 2;
7106 }
7107 s++;
7108 }
7109 }
7110 else {
7111 /* The target is not utf8, the pattern is utf8. */
7112 while (s < e) {
7113 if ( l >= loceol
7114 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
7115 {
7116 sayNO;
7117 }
7118 if (UTF8_IS_INVARIANT(*(U8*)s)) {
7119 if (*s != *l) {
7120 sayNO;
7121 }
7122 s++;
7123 }
7124 else {
7125 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7126 {
7127 sayNO;
7128 }
7129 s += 2;
7130 }
7131 l++;
7132 }
7133 }
7134 locinput = l;
7135 }
7136 else {
7137 /* The target and the pattern have the same utf8ness. */
7138 /* Inline the first character, for speed. */
7139 if ( loceol - locinput < ln
7140 || UCHARAT(s) != nextbyte
7141 || (ln > 1 && memNE(s, locinput, ln)))
7142 {
7143 sayNO;
7144 }
7145 locinput += ln;
7146 }
7147 break;
7148 }
7149
7150 case EXACTFL: /* /abc/il */
7151 {
7152 const char * s;
7153 U32 fold_utf8_flags;
7154
7155 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7156 folder = Perl_foldEQ_locale;
7157 fold_array = PL_fold_locale;
7158 fold_utf8_flags = FOLDEQ_LOCALE;
7159 goto do_exactf;
7160
7161 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
7162 is effectively /u; hence to match, target
7163 must be UTF-8. */
7164 if (! utf8_target) {
7165 sayNO;
7166 }
7167 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7168 | FOLDEQ_S2_FOLDS_SANE;
7169 folder = S_foldEQ_latin1_s2_folded;
7170 fold_array = PL_fold_latin1;
7171 goto do_exactf;
7172
7173 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
7174 if (! utf8_target) {
7175 sayNO;
7176 }
7177 assert(is_utf8_pat);
7178 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7179 #ifdef DEBUGGING
7180 /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7181 * In theory neither of these variables should be used in this mode. */
7182 folder = NULL;
7183 fold_array = NULL;
7184 #endif
7185 goto do_exactf;
7186
7187 case EXACTFUP: /* /foo/iu, and something is problematic in
7188 'foo' so can't take shortcuts. */
7189 assert(! is_utf8_pat);
7190 folder = Perl_foldEQ_latin1;
7191 fold_array = PL_fold_latin1;
7192 fold_utf8_flags = 0;
7193 goto do_exactf;
7194
7195 case EXACTFU: /* /abc/iu */
7196 folder = S_foldEQ_latin1_s2_folded;
7197 fold_array = PL_fold_latin1;
7198 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7199 goto do_exactf;
7200
7201 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7202 patterns */
7203 assert(! is_utf8_pat);
7204 /* FALLTHROUGH */
7205 case EXACTFAA: /* /abc/iaa */
7206 folder = S_foldEQ_latin1_s2_folded;
7207 fold_array = PL_fold_latin1;
7208 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7209 if (is_utf8_pat || ! utf8_target) {
7210
7211 /* The possible presence of a MICRO SIGN in the pattern forbids
7212 * us to view a non-UTF-8 pattern as folded when there is a
7213 * UTF-8 target */
7214 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7215 |FOLDEQ_S2_FOLDS_SANE;
7216 }
7217 goto do_exactf;
7218
7219
7220 case EXACTF: /* /abc/i This node only generated for
7221 non-utf8 patterns */
7222 assert(! is_utf8_pat);
7223 folder = Perl_foldEQ;
7224 fold_array = PL_fold;
7225 fold_utf8_flags = 0;
7226
7227 do_exactf:
7228 s = STRINGs(scan);
7229 ln = STR_LENs(scan);
7230
7231 if ( utf8_target
7232 || is_utf8_pat
7233 || state_num == EXACTFUP
7234 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7235 {
7236 /* Either target or the pattern are utf8, or has the issue where
7237 * the fold lengths may differ. */
7238 const char * const l = locinput;
7239 char *e = loceol;
7240
7241 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7242 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7243 {
7244 sayNO;
7245 }
7246 locinput = e;
7247 break;
7248 }
7249
7250 /* Neither the target nor the pattern are utf8 */
7251 assert(fold_array);
7252 if (UCHARAT(s) != nextbyte
7253 && !NEXTCHR_IS_EOS
7254 && UCHARAT(s) != fold_array[nextbyte])
7255 {
7256 sayNO;
7257 }
7258 if (loceol - locinput < ln)
7259 sayNO;
7260 assert(folder);
7261 if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7262 sayNO;
7263 locinput += ln;
7264 break;
7265 }
7266
7267 case NBOUNDL: /* /\B/l */
7268 to_complement = 1;
7269 /* FALLTHROUGH */
7270
7271 case BOUNDL: /* /\b/l */
7272 {
7273 bool b1, b2;
7274 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7275
7276 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7277 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7278 goto boundu;
7279 }
7280
7281 if (utf8_target) {
7282 if (locinput == reginfo->strbeg)
7283 b1 = isWORDCHAR_LC('\n');
7284 else {
7285 U8 *p = reghop3((U8*)locinput, -1,
7286 (U8*)(reginfo->strbeg));
7287 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7288 }
7289 b2 = (NEXTCHR_IS_EOS)
7290 ? isWORDCHAR_LC('\n')
7291 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7292 (U8*) reginfo->strend);
7293 }
7294 else { /* Here the string isn't utf8 */
7295 b1 = (locinput == reginfo->strbeg)
7296 ? isWORDCHAR_LC('\n')
7297 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7298 b2 = (NEXTCHR_IS_EOS)
7299 ? isWORDCHAR_LC('\n')
7300 : isWORDCHAR_LC(nextbyte);
7301 }
7302 if (to_complement ^ (b1 == b2)) {
7303 sayNO;
7304 }
7305 break;
7306 }
7307
7308 case NBOUND: /* /\B/ */
7309 to_complement = 1;
7310 /* FALLTHROUGH */
7311
7312 case BOUND: /* /\b/ */
7313 if (utf8_target) {
7314 goto bound_utf8;
7315 }
7316 goto bound_ascii_match_only;
7317
7318 case NBOUNDA: /* /\B/a */
7319 to_complement = 1;
7320 /* FALLTHROUGH */
7321
7322 case BOUNDA: /* /\b/a */
7323 {
7324 bool b1, b2;
7325
7326 bound_ascii_match_only:
7327 /* Here the string isn't utf8, or is utf8 and only ascii characters
7328 * are to match \w. In the latter case looking at the byte just
7329 * prior to the current one may be just the final byte of a
7330 * multi-byte character. This is ok. There are two cases:
7331 * 1) it is a single byte character, and then the test is doing
7332 * just what it's supposed to.
7333 * 2) it is a multi-byte character, in which case the final byte is
7334 * never mistakable for ASCII, and so the test will say it is
7335 * not a word character, which is the correct answer. */
7336 b1 = (locinput == reginfo->strbeg)
7337 ? isWORDCHAR_A('\n')
7338 : isWORDCHAR_A(UCHARAT(locinput - 1));
7339 b2 = (NEXTCHR_IS_EOS)
7340 ? isWORDCHAR_A('\n')
7341 : isWORDCHAR_A(nextbyte);
7342 if (to_complement ^ (b1 == b2)) {
7343 sayNO;
7344 }
7345 break;
7346 }
7347
7348 case NBOUNDU: /* /\B/u */
7349 to_complement = 1;
7350 /* FALLTHROUGH */
7351
7352 case BOUNDU: /* /\b/u */
7353
7354 boundu:
7355 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7356 match = FALSE;
7357 }
7358 else if (utf8_target) {
7359 bound_utf8:
7360 switch((bound_type) FLAGS(scan)) {
7361 case TRADITIONAL_BOUND:
7362 {
7363 bool b1, b2;
7364 if (locinput == reginfo->strbeg) {
7365 b1 = 0 /* isWORDCHAR_L1('\n') */;
7366 }
7367 else {
7368 U8 *p = reghop3((U8*)locinput, -1,
7369 (U8*)(reginfo->strbeg));
7370
7371 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7372 }
7373 b2 = (NEXTCHR_IS_EOS)
7374 ? 0 /* isWORDCHAR_L1('\n') */
7375 : isWORDCHAR_utf8_safe((U8*)locinput,
7376 (U8*) reginfo->strend);
7377 match = cBOOL(b1 != b2);
7378 break;
7379 }
7380 case GCB_BOUND:
7381 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7382 match = TRUE; /* GCB always matches at begin and
7383 end */
7384 }
7385 else {
7386 /* Find the gcb values of previous and current
7387 * chars, then see if is a break point */
7388 match = isGCB(getGCB_VAL_UTF8(
7389 reghop3((U8*)locinput,
7390 -1,
7391 (U8*)(reginfo->strbeg)),
7392 (U8*) reginfo->strend),
7393 getGCB_VAL_UTF8((U8*) locinput,
7394 (U8*) reginfo->strend),
7395 (U8*) reginfo->strbeg,
7396 (U8*) locinput,
7397 utf8_target);
7398 }
7399 break;
7400
7401 case LB_BOUND:
7402 if (locinput == reginfo->strbeg) {
7403 match = FALSE;
7404 }
7405 else if (NEXTCHR_IS_EOS) {
7406 match = TRUE;
7407 }
7408 else {
7409 match = isLB(getLB_VAL_UTF8(
7410 reghop3((U8*)locinput,
7411 -1,
7412 (U8*)(reginfo->strbeg)),
7413 (U8*) reginfo->strend),
7414 getLB_VAL_UTF8((U8*) locinput,
7415 (U8*) reginfo->strend),
7416 (U8*) reginfo->strbeg,
7417 (U8*) locinput,
7418 (U8*) reginfo->strend,
7419 utf8_target);
7420 }
7421 break;
7422
7423 case SB_BOUND: /* Always matches at begin and end */
7424 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7425 match = TRUE;
7426 }
7427 else {
7428 match = isSB(getSB_VAL_UTF8(
7429 reghop3((U8*)locinput,
7430 -1,
7431 (U8*)(reginfo->strbeg)),
7432 (U8*) reginfo->strend),
7433 getSB_VAL_UTF8((U8*) locinput,
7434 (U8*) reginfo->strend),
7435 (U8*) reginfo->strbeg,
7436 (U8*) locinput,
7437 (U8*) reginfo->strend,
7438 utf8_target);
7439 }
7440 break;
7441
7442 case WB_BOUND:
7443 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7444 match = TRUE;
7445 }
7446 else {
7447 match = isWB(WB_UNKNOWN,
7448 getWB_VAL_UTF8(
7449 reghop3((U8*)locinput,
7450 -1,
7451 (U8*)(reginfo->strbeg)),
7452 (U8*) reginfo->strend),
7453 getWB_VAL_UTF8((U8*) locinput,
7454 (U8*) reginfo->strend),
7455 (U8*) reginfo->strbeg,
7456 (U8*) locinput,
7457 (U8*) reginfo->strend,
7458 utf8_target);
7459 }
7460 break;
7461 }
7462 }
7463 else { /* Not utf8 target */
7464 switch((bound_type) FLAGS(scan)) {
7465 case TRADITIONAL_BOUND:
7466 {
7467 bool b1, b2;
7468 b1 = (locinput == reginfo->strbeg)
7469 ? 0 /* isWORDCHAR_L1('\n') */
7470 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7471 b2 = (NEXTCHR_IS_EOS)
7472 ? 0 /* isWORDCHAR_L1('\n') */
7473 : isWORDCHAR_L1(nextbyte);
7474 match = cBOOL(b1 != b2);
7475 break;
7476 }
7477
7478 case GCB_BOUND:
7479 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7480 match = TRUE; /* GCB always matches at begin and
7481 end */
7482 }
7483 else { /* Only CR-LF combo isn't a GCB in 0-255
7484 range */
7485 match = UCHARAT(locinput - 1) != '\r'
7486 || UCHARAT(locinput) != '\n';
7487 }
7488 break;
7489
7490 case LB_BOUND:
7491 if (locinput == reginfo->strbeg) {
7492 match = FALSE;
7493 }
7494 else if (NEXTCHR_IS_EOS) {
7495 match = TRUE;
7496 }
7497 else {
7498 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7499 getLB_VAL_CP(UCHARAT(locinput)),
7500 (U8*) reginfo->strbeg,
7501 (U8*) locinput,
7502 (U8*) reginfo->strend,
7503 utf8_target);
7504 }
7505 break;
7506
7507 case SB_BOUND: /* Always matches at begin and end */
7508 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7509 match = TRUE;
7510 }
7511 else {
7512 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7513 getSB_VAL_CP(UCHARAT(locinput)),
7514 (U8*) reginfo->strbeg,
7515 (U8*) locinput,
7516 (U8*) reginfo->strend,
7517 utf8_target);
7518 }
7519 break;
7520
7521 case WB_BOUND:
7522 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7523 match = TRUE;
7524 }
7525 else {
7526 match = isWB(WB_UNKNOWN,
7527 getWB_VAL_CP(UCHARAT(locinput -1)),
7528 getWB_VAL_CP(UCHARAT(locinput)),
7529 (U8*) reginfo->strbeg,
7530 (U8*) locinput,
7531 (U8*) reginfo->strend,
7532 utf8_target);
7533 }
7534 break;
7535 }
7536 }
7537
7538 if (to_complement ^ ! match) {
7539 sayNO;
7540 }
7541 break;
7542
7543 case ANYOFPOSIXL:
7544 case ANYOFL: /* /[abc]/l */
7545 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7546 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7547
7548 /* FALLTHROUGH */
7549 case ANYOFD: /* /[abc]/d */
7550 case ANYOF: /* /[abc]/ */
7551 if (NEXTCHR_IS_EOS || locinput >= loceol)
7552 sayNO;
7553 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7554 && ! ANYOF_FLAGS(scan)
7555 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7556 {
7557 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7558 sayNO;
7559 }
7560 locinput++;
7561 }
7562 else {
7563 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7564 utf8_target))
7565 {
7566 sayNO;
7567 }
7568 goto increment_locinput;
7569 }
7570 break;
7571
7572 case ANYOFM:
7573 if ( NEXTCHR_IS_EOS
7574 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan)
7575 || locinput >= loceol)
7576 {
7577 sayNO;
7578 }
7579 locinput++; /* ANYOFM is always single byte */
7580 break;
7581
7582 case NANYOFM:
7583 if ( NEXTCHR_IS_EOS
7584 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan)
7585 || locinput >= loceol)
7586 {
7587 sayNO;
7588 }
7589 goto increment_locinput;
7590 break;
7591
7592 case ANYOFH:
7593 if ( ! utf8_target
7594 || NEXTCHR_IS_EOS
7595 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7596 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7597 || ! _invlist_contains_cp(anyofh_list,
7598 utf8_to_uvchr_buf((U8 *) locinput,
7599 (U8 *) loceol,
7600 NULL)))
7601 {
7602 sayNO;
7603 }
7604 goto increment_locinput;
7605 break;
7606
7607 case ANYOFHb:
7608 if ( ! utf8_target
7609 || NEXTCHR_IS_EOS
7610 || ANYOF_FLAGS(scan) != (U8) *locinput
7611 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7612 || ! _invlist_contains_cp(anyofh_list,
7613 utf8_to_uvchr_buf((U8 *) locinput,
7614 (U8 *) loceol,
7615 NULL)))
7616 {
7617 sayNO;
7618 }
7619 goto increment_locinput;
7620 break;
7621
7622 case ANYOFHbbm:
7623 if ( ! utf8_target
7624 || NEXTCHR_IS_EOS
7625 || ANYOF_FLAGS(scan) != (U8) locinput[0]
7626 || locinput >= reginfo->strend
7627 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7628 (U8) locinput[1] & UTF_CONTINUATION_MASK))
7629 {
7630 sayNO;
7631 }
7632 goto increment_locinput;
7633 break;
7634
7635 case ANYOFHr:
7636 if ( ! utf8_target
7637 || NEXTCHR_IS_EOS
7638 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7639 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7640 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7641 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7642 || ! _invlist_contains_cp(anyofh_list,
7643 utf8_to_uvchr_buf((U8 *) locinput,
7644 (U8 *) loceol,
7645 NULL)))
7646 {
7647 sayNO;
7648 }
7649 goto increment_locinput;
7650 break;
7651
7652 case ANYOFHs:
7653 if ( ! utf8_target
7654 || NEXTCHR_IS_EOS
7655 || loceol - locinput < FLAGS(scan)
7656 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7657 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7658 || ! _invlist_contains_cp(anyofh_list,
7659 utf8_to_uvchr_buf((U8 *) locinput,
7660 (U8 *) loceol,
7661 NULL)))
7662 {
7663 sayNO;
7664 }
7665 goto increment_locinput;
7666 break;
7667
7668 case ANYOFR:
7669 if (NEXTCHR_IS_EOS) {
7670 sayNO;
7671 }
7672
7673 if (utf8_target) {
7674 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7675 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7676 (U8 *) reginfo->strend,
7677 NULL),
7678 ANYOFRbase(scan), ANYOFRdelta(scan)))
7679 {
7680 sayNO;
7681 }
7682 }
7683 else {
7684 if (! withinCOUNT((U8) *locinput,
7685 ANYOFRbase(scan), ANYOFRdelta(scan)))
7686 {
7687 sayNO;
7688 }
7689 }
7690 goto increment_locinput;
7691 break;
7692
7693 case ANYOFRb:
7694 if (NEXTCHR_IS_EOS) {
7695 sayNO;
7696 }
7697
7698 if (utf8_target) {
7699 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7700 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7701 (U8 *) reginfo->strend,
7702 NULL),
7703 ANYOFRbase(scan), ANYOFRdelta(scan)))
7704 {
7705 sayNO;
7706 }
7707 }
7708 else {
7709 if (! withinCOUNT((U8) *locinput,
7710 ANYOFRbase(scan), ANYOFRdelta(scan)))
7711 {
7712 sayNO;
7713 }
7714 }
7715 goto increment_locinput;
7716 break;
7717
7718 /* The argument (FLAGS) to all the POSIX node types is the class number
7719 * */
7720
7721 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7722 to_complement = 1;
7723 /* FALLTHROUGH */
7724
7725 case POSIXL: /* \w or [:punct:] etc. under /l */
7726 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7727 if (NEXTCHR_IS_EOS || locinput >= loceol)
7728 sayNO;
7729
7730 /* Use isFOO_lc() for characters within Latin1. (Note that
7731 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7732 * wouldn't be invariant) */
7733 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7734 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7735 sayNO;
7736 }
7737
7738 locinput++;
7739 break;
7740 }
7741
7742 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7743 /* An above Latin-1 code point, or malformed */
7744 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7745 reginfo->strend);
7746 goto utf8_posix_above_latin1;
7747 }
7748
7749 /* Here is a UTF-8 variant code point below 256 and the target is
7750 * UTF-8 */
7751 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7752 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7753 *(locinput + 1))))))
7754 {
7755 sayNO;
7756 }
7757
7758 goto increment_locinput;
7759
7760 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7761 to_complement = 1;
7762 /* FALLTHROUGH */
7763
7764 case POSIXD: /* \w or [:punct:] etc. under /d */
7765 if (utf8_target) {
7766 goto utf8_posix;
7767 }
7768 goto posixa;
7769
7770 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7771
7772 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7773 sayNO;
7774 }
7775
7776 /* All UTF-8 variants match */
7777 if (! UTF8_IS_INVARIANT(nextbyte)) {
7778 goto increment_locinput;
7779 }
7780
7781 to_complement = 1;
7782 goto join_nposixa;
7783
7784 case POSIXA: /* \w or [:punct:] etc. under /a */
7785
7786 posixa:
7787 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7788 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7789 * character is a single byte */
7790
7791 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7792 sayNO;
7793 }
7794
7795 join_nposixa:
7796
7797 if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7798 FLAGS(scan)))))
7799 {
7800 sayNO;
7801 }
7802
7803 /* Here we are either not in utf8, or we matched a utf8-invariant,
7804 * so the next char is the next byte */
7805 locinput++;
7806 break;
7807
7808 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7809 to_complement = 1;
7810 /* FALLTHROUGH */
7811
7812 case POSIXU: /* \w or [:punct:] etc. under /u */
7813 utf8_posix:
7814 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7815 sayNO;
7816 }
7817
7818 /* Use generic_isCC_() for characters within Latin1. (Note that
7819 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7820 * wouldn't be invariant) */
7821 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7822 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7823 FLAGS(scan)))))
7824 {
7825 sayNO;
7826 }
7827 locinput++;
7828 }
7829 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7830 if (! (to_complement
7831 ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7832 *(locinput + 1)),
7833 FLAGS(scan)))))
7834 {
7835 sayNO;
7836 }
7837 locinput += 2;
7838 }
7839 else { /* Handle above Latin-1 code points */
7840 utf8_posix_above_latin1:
7841 classnum = (char_class_number_) FLAGS(scan);
7842 switch (classnum) {
7843 default:
7844 if (! (to_complement
7845 ^ cBOOL(_invlist_contains_cp(
7846 PL_XPosix_ptrs[classnum],
7847 utf8_to_uvchr_buf((U8 *) locinput,
7848 (U8 *) reginfo->strend,
7849 NULL)))))
7850 {
7851 sayNO;
7852 }
7853 break;
7854 case CC_ENUM_SPACE_:
7855 if (! (to_complement
7856 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7857 {
7858 sayNO;
7859 }
7860 break;
7861 case CC_ENUM_BLANK_:
7862 if (! (to_complement
7863 ^ cBOOL(is_HORIZWS_high(locinput))))
7864 {
7865 sayNO;
7866 }
7867 break;
7868 case CC_ENUM_XDIGIT_:
7869 if (! (to_complement
7870 ^ cBOOL(is_XDIGIT_high(locinput))))
7871 {
7872 sayNO;
7873 }
7874 break;
7875 case CC_ENUM_VERTSPACE_:
7876 if (! (to_complement
7877 ^ cBOOL(is_VERTWS_high(locinput))))
7878 {
7879 sayNO;
7880 }
7881 break;
7882 case CC_ENUM_CNTRL_: /* These can't match above Latin1 */
7883 case CC_ENUM_ASCII_:
7884 if (! to_complement) {
7885 sayNO;
7886 }
7887 break;
7888 }
7889 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7890 }
7891 break;
7892
7893 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7894 a Unicode extended Grapheme Cluster */
7895 if (NEXTCHR_IS_EOS || locinput >= loceol)
7896 sayNO;
7897 if (! utf8_target) {
7898
7899 /* Match either CR LF or '.', as all the other possibilities
7900 * require utf8 */
7901 locinput++; /* Match the . or CR */
7902 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7903 match the LF */
7904 && locinput < loceol
7905 && UCHARAT(locinput) == '\n')
7906 {
7907 locinput++;
7908 }
7909 }
7910 else {
7911
7912 /* Get the gcb type for the current character */
7913 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7914 (U8*) reginfo->strend);
7915
7916 /* Then scan through the input until we get to the first
7917 * character whose type is supposed to be a gcb with the
7918 * current character. (There is always a break at the
7919 * end-of-input) */
7920 locinput += UTF8SKIP(locinput);
7921 while (locinput < loceol) {
7922 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7923 (U8*) reginfo->strend);
7924 if (isGCB(prev_gcb, cur_gcb,
7925 (U8*) reginfo->strbeg, (U8*) locinput,
7926 utf8_target))
7927 {
7928 break;
7929 }
7930
7931 prev_gcb = cur_gcb;
7932 locinput += UTF8SKIP(locinput);
7933 }
7934
7935
7936 }
7937 break;
7938
7939 case REFFLN: /* /\g{name}/il */
7940 { /* The capture buffer cases. The ones beginning with N for the
7941 named buffers just convert to the equivalent numbered and
7942 pretend they were called as the corresponding numbered buffer
7943 op. */
7944 /* don't initialize these in the declaration, it makes C++
7945 unhappy */
7946 const char *s;
7947 char type;
7948 re_fold_t folder;
7949 const U8 *fold_array;
7950 UV utf8_fold_flags;
7951
7952 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7953 folder = Perl_foldEQ_locale;
7954 fold_array = PL_fold_locale;
7955 type = REFFL;
7956 utf8_fold_flags = FOLDEQ_LOCALE;
7957 goto do_nref;
7958
7959 case REFFAN: /* /\g{name}/iaa */
7960 folder = Perl_foldEQ_latin1;
7961 fold_array = PL_fold_latin1;
7962 type = REFFA;
7963 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7964 goto do_nref;
7965
7966 case REFFUN: /* /\g{name}/iu */
7967 folder = Perl_foldEQ_latin1;
7968 fold_array = PL_fold_latin1;
7969 type = REFFU;
7970 utf8_fold_flags = 0;
7971 goto do_nref;
7972
7973 case REFFN: /* /\g{name}/i */
7974 folder = Perl_foldEQ;
7975 fold_array = PL_fold;
7976 type = REFF;
7977 utf8_fold_flags = 0;
7978 goto do_nref;
7979
7980 case REFN: /* /\g{name}/ */
7981 type = REF;
7982 folder = NULL;
7983 fold_array = NULL;
7984 utf8_fold_flags = 0;
7985 do_nref:
7986
7987 /* For the named back references, find the corresponding buffer
7988 * number */
7989 n = reg_check_named_buff_matched(rex,scan);
7990
7991 if ( ! n ) {
7992 sayNO;
7993 }
7994 goto do_nref_ref_common;
7995
7996 case REFFL: /* /\1/il */
7997 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7998 folder = Perl_foldEQ_locale;
7999 fold_array = PL_fold_locale;
8000 utf8_fold_flags = FOLDEQ_LOCALE;
8001 goto do_ref;
8002
8003 case REFFA: /* /\1/iaa */
8004 folder = Perl_foldEQ_latin1;
8005 fold_array = PL_fold_latin1;
8006 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8007 goto do_ref;
8008
8009 case REFFU: /* /\1/iu */
8010 folder = Perl_foldEQ_latin1;
8011 fold_array = PL_fold_latin1;
8012 utf8_fold_flags = 0;
8013 goto do_ref;
8014
8015 case REFF: /* /\1/i */
8016 folder = Perl_foldEQ;
8017 fold_array = PL_fold;
8018 utf8_fold_flags = 0;
8019 goto do_ref;
8020
8021 #undef ST
8022 #define ST st->u.backref
8023 case REF: /* /\1/ */
8024 folder = NULL;
8025 fold_array = NULL;
8026 utf8_fold_flags = 0;
8027
8028 do_ref:
8029 type = OP(scan);
8030 n = ARG1u(scan); /* which paren pair */
8031 if (rex->logical_to_parno) {
8032 n = rex->logical_to_parno[n];
8033 do {
8034 if ( RXp_LASTPAREN(rex) < n ||
8035 RXp_OFFS_START(rex,n) == -1 ||
8036 RXp_OFFS_END(rex,n) == -1
8037 ) {
8038 n = rex->parno_to_logical_next[n];
8039 }
8040 else {
8041 break;
8042 }
8043 } while(n);
8044
8045 if (!n) /* this means there is nothing that matched */
8046 sayNO;
8047 }
8048
8049 do_nref_ref_common:
8050 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8051 if (RXp_LASTPAREN(rex) < n)
8052 sayNO;
8053
8054 ln = RXp_OFFSp(rex)[n].start;
8055 endref = RXp_OFFSp(rex)[n].end;
8056 if (ln == -1 || endref == -1)
8057 sayNO; /* Do not match unless seen CLOSEn. */
8058
8059 if (ln == endref)
8060 goto ref_yes;
8061
8062 s = reginfo->strbeg + ln;
8063 if (type != REF /* REF can do byte comparison */
8064 && (utf8_target || type == REFFU || type == REFFL))
8065 {
8066 char * limit = loceol;
8067
8068 /* This call case insensitively compares the entire buffer
8069 * at s, with the current input starting at locinput, but
8070 * not going off the end given by loceol, and
8071 * returns in <limit> upon success, how much of the
8072 * current input was matched */
8073 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
8074 locinput, &limit, 0, utf8_target, utf8_fold_flags))
8075 {
8076 sayNO;
8077 }
8078 locinput = limit;
8079 goto ref_yes;
8080 }
8081
8082 /* Not utf8: Inline the first character, for speed. */
8083 if ( ! NEXTCHR_IS_EOS
8084 && locinput < loceol
8085 && UCHARAT(s) != nextbyte
8086 && ( type == REF
8087 || UCHARAT(s) != fold_array[nextbyte]))
8088 {
8089 sayNO;
8090 }
8091 ln = endref - ln;
8092 if (locinput + ln > loceol)
8093 sayNO;
8094 if (ln > 1 && (type == REF
8095 ? memNE(s, locinput, ln)
8096 : ! folder(aTHX_ locinput, s, ln)))
8097 sayNO;
8098 locinput += ln;
8099 }
8100 ref_yes:
8101 if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */
8102 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen);
8103 REGCP_SET(ST.lastcp);
8104 PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
8105 script_run_begin);
8106 }
8107 break;
8108 NOT_REACHED; /* NOTREACHED */
8109
8110 case REF_next:
8111 sayYES;
8112 break;
8113
8114 case REF_next_fail:
8115 REGCP_UNWIND(ST.lastcp);
8116 regcppop(rex, &maxopenparen);
8117 sayNO;
8118 break;
8119
8120 case NOTHING: /* null op; e.g. the 'nothing' following
8121 * the '*' in m{(a+|b)*}' */
8122 break;
8123 case TAIL: /* placeholder while compiling (A|B|C) */
8124 break;
8125
8126 #undef ST
8127 #define ST st->u.eval
8128 #define CUR_EVAL cur_eval->u.eval
8129
8130 {
8131 SV *ret;
8132 REGEXP *re_sv;
8133 regexp *re;
8134 regexp_internal *rei;
8135 regnode *startpoint;
8136 U32 arg;
8137
8138 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
8139 arg = ARG1u(scan);
8140 if (cur_eval && cur_eval->locinput == locinput) {
8141 if ( ++nochange_depth > max_nochange_depth )
8142 Perl_croak(aTHX_
8143 "Pattern subroutine nesting without pos change"
8144 " exceeded limit in regex");
8145 } else {
8146 nochange_depth = 0;
8147 }
8148 re_sv = rex_sv;
8149 re = rex;
8150 rei = rexi;
8151 startpoint = scan + ARG2i(scan);
8152 EVAL_CLOSE_PAREN_SET( st, arg );
8153 /* Detect infinite recursion
8154 *
8155 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
8156 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
8157 * So we track the position in the string we are at each time
8158 * we recurse and if we try to enter the same routine twice from
8159 * the same position we throw an error.
8160 */
8161 if ( rex->recurse_locinput[arg] == locinput ) {
8162 /* FIXME: we should show the regop that is failing as part
8163 * of the error message. */
8164 Perl_croak(aTHX_ "Infinite recursion in regex");
8165 } else {
8166 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8167 rex->recurse_locinput[arg]= locinput;
8168
8169 DEBUG_r({
8170 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8171 DEBUG_STACK_r({
8172 Perl_re_exec_indentf( aTHX_
8173 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8174 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8175 );
8176 });
8177 });
8178 }
8179
8180 /* Save all the positions seen so far. */
8181 ST.cp = regcppush(rex, 0, maxopenparen);
8182 REGCP_SET(ST.lastcp);
8183
8184 /* and then jump to the code we share with EVAL */
8185 goto eval_recurse_doit;
8186 /* NOTREACHED */
8187
8188 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
8189 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8190 if ( ++nochange_depth > max_nochange_depth )
8191 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8192 } else {
8193 nochange_depth = 0;
8194 }
8195 {
8196 /* execute the code in the {...} */
8197
8198 dSP;
8199 IV before;
8200 OP * const oop = PL_op;
8201 COP * const ocurcop = PL_curcop;
8202 OP *nop;
8203 CV *newcv;
8204
8205 /* save *all* paren positions */
8206 ST.cp = regcppush(rex, 0, maxopenparen);
8207 REGCP_SET(ST.lastcp);
8208
8209 if (!caller_cv)
8210 caller_cv = find_runcv(NULL);
8211
8212 n = ARG1u(scan);
8213
8214 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8215 newcv = (ReANY(
8216 (REGEXP*)(rexi->data->data[n])
8217 ))->qr_anoncv;
8218 nop = (OP*)rexi->data->data[n+1];
8219 }
8220 else if (rexi->data->what[n] == 'l') { /* literal code */
8221 newcv = caller_cv;
8222 nop = (OP*)rexi->data->data[n];
8223 assert(CvDEPTH(newcv));
8224 }
8225 else {
8226 /* literal with own CV */
8227 assert(rexi->data->what[n] == 'L');
8228 newcv = rex->qr_anoncv;
8229 nop = (OP*)rexi->data->data[n];
8230 }
8231
8232 /* Some notes about MULTICALL and the context and save stacks.
8233 *
8234 * In something like
8235 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8236 * since codeblocks don't introduce a new scope (so that
8237 * local() etc accumulate), at the end of a successful
8238 * match there will be a SAVEt_CLEARSV on the savestack
8239 * for each of $x, $y, $z. If the three code blocks above
8240 * happen to have come from different CVs (e.g. via
8241 * embedded qr//s), then we must ensure that during any
8242 * savestack unwinding, PL_comppad always points to the
8243 * right pad at each moment. We achieve this by
8244 * interleaving SAVEt_COMPPAD's on the savestack whenever
8245 * there is a change of pad.
8246 * In theory whenever we call a code block, we should
8247 * push a CXt_SUB context, then pop it on return from
8248 * that code block. This causes a bit of an issue in that
8249 * normally popping a context also clears the savestack
8250 * back to cx->blk_oldsaveix, but here we specifically
8251 * don't want to clear the save stack on exit from the
8252 * code block.
8253 * Also for efficiency we don't want to keep pushing and
8254 * popping the single SUB context as we backtrack etc.
8255 * So instead, we push a single context the first time
8256 * we need, it, then hang onto it until the end of this
8257 * function. Whenever we encounter a new code block, we
8258 * update the CV etc if that's changed. During the times
8259 * in this function where we're not executing a code
8260 * block, having the SUB context still there is a bit
8261 * naughty - but we hope that no-one notices.
8262 * When the SUB context is initially pushed, we fake up
8263 * cx->blk_oldsaveix to be as if we'd pushed this context
8264 * on first entry to S_regmatch rather than at some random
8265 * point during the regexe execution. That way if we
8266 * croak, popping the context stack will ensure that
8267 * *everything* SAVEd by this function is undone and then
8268 * the context popped, rather than e.g., popping the
8269 * context (and restoring the original PL_comppad) then
8270 * popping more of the savestack and restoring a bad
8271 * PL_comppad.
8272 */
8273
8274 /* If this is the first EVAL, push a MULTICALL. On
8275 * subsequent calls, if we're executing a different CV, or
8276 * if PL_comppad has got messed up from backtracking
8277 * through SAVECOMPPADs, then refresh the context.
8278 */
8279 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8280 {
8281 U8 flags = (CXp_SUB_RE |
8282 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8283 SAVECOMPPAD();
8284 if (last_pushed_cv) {
8285 CHANGE_MULTICALL_FLAGS(newcv, flags);
8286 }
8287 else {
8288 PUSH_MULTICALL_FLAGS(newcv, flags);
8289 }
8290 /* see notes above */
8291 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8292
8293 last_pushed_cv = newcv;
8294 }
8295 else {
8296 /* these assignments are just to silence compiler
8297 * warnings */
8298 multicall_cop = NULL;
8299 }
8300 last_pad = PL_comppad;
8301
8302 /* the initial nextstate you would normally execute
8303 * at the start of an eval (which would cause error
8304 * messages to come from the eval), may be optimised
8305 * away from the execution path in the regex code blocks;
8306 * so manually set PL_curcop to it initially */
8307 {
8308 OP *o = cUNOPx(nop)->op_first;
8309 assert(o->op_type == OP_NULL);
8310 if (o->op_targ == OP_SCOPE) {
8311 o = cUNOPo->op_first;
8312 }
8313 else {
8314 assert(o->op_targ == OP_LEAVE);
8315 o = cUNOPo->op_first;
8316 assert(o->op_type == OP_ENTER);
8317 o = OpSIBLING(o);
8318 }
8319
8320 if (o->op_type != OP_STUB) {
8321 assert( o->op_type == OP_NEXTSTATE
8322 || o->op_type == OP_DBSTATE
8323 || (o->op_type == OP_NULL
8324 && ( o->op_targ == OP_NEXTSTATE
8325 || o->op_targ == OP_DBSTATE
8326 )
8327 )
8328 );
8329 PL_curcop = (COP*)o;
8330 }
8331 }
8332 nop = nop->op_next;
8333
8334 DEBUG_STATE_r( Perl_re_printf( aTHX_
8335 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8336
8337 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg;
8338 if (reginfo->info_aux_eval->pos_magic)
8339 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8340 reginfo->sv, reginfo->strbeg,
8341 locinput - reginfo->strbeg);
8342
8343 if (sv_yes_mark) {
8344 SV *sv_mrk = get_sv("REGMARK", 1);
8345 sv_setsv(sv_mrk, sv_yes_mark);
8346 }
8347
8348 /* we don't use MULTICALL here as we want to call the
8349 * first op of the block of interest, rather than the
8350 * first op of the sub. Also, we don't want to free
8351 * the savestack frame */
8352 before = (IV)(SP-PL_stack_base);
8353 PL_op = nop;
8354 CALLRUNOPS(aTHX); /* Scalar context. */
8355 SPAGAIN;
8356 if ((IV)(SP-PL_stack_base) == before)
8357 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8358 else {
8359 ret = POPs;
8360 PUTBACK;
8361 }
8362
8363 /* before restoring everything, evaluate the returned
8364 * value, so that 'uninit' warnings don't use the wrong
8365 * PL_op or pad. Also need to process any magic vars
8366 * (e.g. $1) *before* parentheses are restored */
8367
8368 PL_op = NULL;
8369
8370 re_sv = NULL;
8371 if (logical == 0) { /* /(?{ ... })/ and /(*{ ... })/ */
8372 SV *replsv = save_scalar(PL_replgv);
8373 sv_setsv(replsv, ret); /* $^R */
8374 SvSETMAGIC(replsv);
8375 }
8376 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8377 sw = cBOOL(SvTRUE_NN(ret));
8378 logical = 0;
8379 }
8380 else { /* /(??{ ... }) */
8381 /* if its overloaded, let the regex compiler handle
8382 * it; otherwise extract regex, or stringify */
8383 if (SvGMAGICAL(ret))
8384 ret = sv_mortalcopy(ret);
8385 if (!SvAMAGIC(ret)) {
8386 SV *sv = ret;
8387 if (SvROK(sv))
8388 sv = SvRV(sv);
8389 if (SvTYPE(sv) == SVt_REGEXP)
8390 re_sv = (REGEXP*) sv;
8391 else if (SvSMAGICAL(ret)) {
8392 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8393 if (mg)
8394 re_sv = (REGEXP *) mg->mg_obj;
8395 }
8396
8397 /* force any undef warnings here */
8398 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8399 ret = sv_mortalcopy(ret);
8400 (void) SvPV_force_nolen(ret);
8401 }
8402 }
8403
8404 }
8405
8406 /* *** Note that at this point we don't restore
8407 * PL_comppad, (or pop the CxSUB) on the assumption it may
8408 * be used again soon. This is safe as long as nothing
8409 * in the regexp code uses the pad ! */
8410 PL_op = oop;
8411 PL_curcop = ocurcop;
8412 regcp_restore(rex, ST.lastcp, &maxopenparen);
8413 PL_curpm_under = PL_curpm;
8414 PL_curpm = PL_reg_curpm;
8415
8416 if (logical != 2) {
8417 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8418 script_run_begin);
8419 /* NOTREACHED */
8420 }
8421 }
8422
8423 /* only /(??{ ... })/ from now on */
8424 logical = 0;
8425 {
8426 /* extract RE object from returned value; compiling if
8427 * necessary */
8428
8429 if (re_sv) {
8430 re_sv = reg_temp_copy(NULL, re_sv);
8431 }
8432 else {
8433 U32 pm_flags = 0;
8434
8435 if (SvUTF8(ret) && IN_BYTES) {
8436 /* In use 'bytes': make a copy of the octet
8437 * sequence, but without the flag on */
8438 STRLEN len;
8439 const char *const p = SvPV(ret, len);
8440 ret = newSVpvn_flags(p, len, SVs_TEMP);
8441 }
8442 if (rex->intflags & PREGf_USE_RE_EVAL)
8443 pm_flags |= PMf_USE_RE_EVAL;
8444
8445 /* if we got here, it should be an engine which
8446 * supports compiling code blocks and stuff */
8447 assert(rex->engine && rex->engine->op_comp);
8448 assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME));
8449 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8450 rex->engine, NULL, NULL,
8451 /* copy /msixn etc to inner pattern */
8452 ARG2i(scan),
8453 pm_flags);
8454
8455 if (!(SvFLAGS(ret)
8456 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8457 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8458 /* This isn't a first class regexp. Instead, it's
8459 caching a regexp onto an existing, Perl visible
8460 scalar. */
8461 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8462 }
8463 }
8464 SAVEFREESV(re_sv);
8465 re = ReANY(re_sv);
8466 }
8467 RXp_MATCH_COPIED_off(re);
8468 RXp_SUBBEG(re) = RXp_SUBBEG(rex);
8469 RXp_SUBLEN(re) = RXp_SUBLEN(rex);
8470 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex);
8471 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex);
8472 RXp_LASTPAREN(re) = 0;
8473 RXp_LASTCLOSEPAREN(re) = 0;
8474 rei = RXi_GET(re);
8475 DEBUG_EXECUTE_r(
8476 debug_start_match(re_sv, utf8_target, locinput,
8477 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8478 );
8479 startpoint = rei->program + 1;
8480 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8481 * close_paren only for GOSUB */
8482 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8483
8484 /* note we saved the paren state earlier:
8485 ST.cp = regcppush(rex, 0, maxopenparen);
8486 REGCP_SET(ST.lastcp);
8487 */
8488 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8489 maxopenparen = 0;
8490 /* run the pattern returned from (??{...}) */
8491
8492 eval_recurse_doit: /* Share code with GOSUB below this line
8493 * At this point we expect the stack context to be
8494 * set up correctly */
8495
8496 /* invalidate the S-L poscache. We're now executing a
8497 * different set of WHILEM ops (and their associated
8498 * indexes) against the same string, so the bits in the
8499 * cache are meaningless. Setting maxiter to zero forces
8500 * the cache to be invalidated and zeroed before reuse.
8501 * XXX This is too dramatic a measure. Ideally we should
8502 * save the old cache and restore when running the outer
8503 * pattern again */
8504 reginfo->poscache_maxiter = 0;
8505
8506 /* the new regexp might have a different is_utf8_pat than we do */
8507 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8508
8509 ST.prev_rex = rex_sv;
8510 ST.prev_curlyx = cur_curlyx;
8511 rex_sv = re_sv;
8512 SET_reg_curpm(rex_sv);
8513 rex = re;
8514 rexi = rei;
8515 cur_curlyx = NULL;
8516 ST.B = next;
8517 ST.prev_eval = cur_eval;
8518 cur_eval = st;
8519 /* now continue from first node in postoned RE */
8520 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8521 loceol, script_run_begin);
8522 NOT_REACHED; /* NOTREACHED */
8523 }
8524
8525 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8526 /* note: this is called twice; first after popping B, then A */
8527 DEBUG_STACK_r({
8528 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8529 depth, cur_eval, ST.prev_eval);
8530 });
8531
8532 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8533 if ( cur_eval && CUR_EVAL.close_paren ) {\
8534 DEBUG_STACK_r({ \
8535 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8536 depth, \
8537 CUR_EVAL.close_paren - 1,\
8538 cur_eval, \
8539 VAL); \
8540 }); \
8541 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8542 }
8543
8544 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8545
8546 rex_sv = ST.prev_rex;
8547 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8548 SET_reg_curpm(rex_sv);
8549 rex = ReANY(rex_sv);
8550 rexi = RXi_GET(rex);
8551 {
8552 /* preserve $^R across LEAVE's. See Bug 121070. */
8553 SV *save_sv= GvSV(PL_replgv);
8554 SV *replsv;
8555 SvREFCNT_inc(save_sv);
8556 regcpblow(ST.cp); /* LEAVE in disguise */
8557 /* don't move this initialization up */
8558 replsv = GvSV(PL_replgv);
8559 sv_setsv(replsv, save_sv);
8560 SvSETMAGIC(replsv);
8561 SvREFCNT_dec(save_sv);
8562 }
8563 cur_eval = ST.prev_eval;
8564 cur_curlyx = ST.prev_curlyx;
8565
8566 /* Invalidate cache. See "invalidate" comment above. */
8567 reginfo->poscache_maxiter = 0;
8568 if ( nochange_depth )
8569 nochange_depth--;
8570
8571 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8572 sayYES;
8573
8574
8575 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8576 REGCP_UNWIND(ST.lastcp);
8577 regcppop(rex, &maxopenparen);
8578 sayNO;
8579
8580 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8581 /* note: this is called twice; first after popping B, then A */
8582 DEBUG_STACK_r({
8583 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8584 depth, cur_eval, ST.prev_eval);
8585 });
8586
8587 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8588
8589 rex_sv = ST.prev_rex;
8590 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8591 SET_reg_curpm(rex_sv);
8592 rex = ReANY(rex_sv);
8593 rexi = RXi_GET(rex);
8594
8595 REGCP_UNWIND(ST.lastcp);
8596 regcppop(rex, &maxopenparen);
8597 cur_eval = ST.prev_eval;
8598 cur_curlyx = ST.prev_curlyx;
8599
8600 /* Invalidate cache. See "invalidate" comment above. */
8601 reginfo->poscache_maxiter = 0;
8602 if ( nochange_depth )
8603 nochange_depth--;
8604
8605 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8606 sayNO_SILENT;
8607 #undef ST
8608
8609 case OPEN: /* ( */
8610 n = PARNO(scan); /* which paren pair */
8611 RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg;
8612 if (n > maxopenparen)
8613 maxopenparen = n;
8614 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8615 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8616 depth,
8617 PTR2UV(rex),
8618 PTR2UV(RXp_OFFSp(rex)),
8619 (UV)n,
8620 (IV)RXp_OFFSp(rex)[n].start_tmp,
8621 (UV)maxopenparen
8622 ));
8623 lastopen = n;
8624 break;
8625
8626 case SROPEN: /* (*SCRIPT_RUN: */
8627 script_run_begin = (U8 *) locinput;
8628 break;
8629
8630
8631 case CLOSE: /* ) */
8632 n = PARNO(scan); /* which paren pair */
8633 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8634 locinput - reginfo->strbeg);
8635 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8636 goto fake_end;
8637
8638 break;
8639
8640 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8641
8642 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8643 {
8644 sayNO;
8645 }
8646
8647 break;
8648
8649
8650 case ACCEPT: /* (*ACCEPT) */
8651 is_accepted = true;
8652 if (FLAGS(scan))
8653 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
8654 utmp = ARG2u(scan);
8655
8656 if ( utmp ) {
8657 regnode *cursor;
8658 for (
8659 cursor = scan;
8660 cursor && ( OP(cursor) != END );
8661 cursor = (
8662 REGNODE_TYPE( OP(cursor) ) == END
8663 || REGNODE_TYPE( OP(cursor) ) == WHILEM
8664 )
8665 ? REGNODE_AFTER(cursor)
8666 : regnext(cursor)
8667 ){
8668 if ( OP(cursor) != CLOSE )
8669 continue;
8670
8671 n = PARNO(cursor);
8672
8673 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8674 continue; /* so skip this one */
8675
8676 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8677 locinput - reginfo->strbeg);
8678
8679 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8680 break;
8681 }
8682 }
8683 goto fake_end;
8684 /* NOTREACHED */
8685
8686 case GROUPP: /* (?(1)) */
8687 n = ARG1u(scan); /* which paren pair */
8688 sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1);
8689 break;
8690
8691 case GROUPPN: /* (?(<name>)) */
8692 /* reg_check_named_buff_matched returns 0 for no match */
8693 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8694 break;
8695
8696 case INSUBP: /* (?(R)) */
8697 n = ARG1u(scan);
8698 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8699 * of SCAN is already set up as matches a eval.close_paren */
8700 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8701 break;
8702
8703 case DEFINEP: /* (?(DEFINE)) */
8704 sw = 0;
8705 break;
8706
8707 case IFTHEN: /* (?(cond)A|B) */
8708 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8709 if (sw)
8710 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8711 else {
8712 next = scan + ARG1u(scan);
8713 if (OP(next) == IFTHEN) /* Fake one. */
8714 next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8715 }
8716 break;
8717
8718 case LOGICAL: /* modifier for EVAL and IFMATCH */
8719 logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */
8720 break;
8721
8722 /*******************************************************************
8723
8724 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8725 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8726 STAR/PLUS/CURLY/CURLYN are used instead.)
8727
8728 A*B is compiled as <CURLYX><A><WHILEM><B>
8729
8730 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8731 state, which contains the current count, initialised to -1. It also sets
8732 cur_curlyx to point to this state, with any previous value saved in the
8733 state block.
8734
8735 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8736 since the pattern may possibly match zero times (i.e. it's a while {} loop
8737 rather than a do {} while loop).
8738
8739 Each entry to WHILEM represents a successful match of A. The count in the
8740 CURLYX block is incremented, another WHILEM state is pushed, and execution
8741 passes to A or B depending on greediness and the current count.
8742
8743 For example, if matching against the string a1a2a3b (where the aN are
8744 substrings that match /A/), then the match progresses as follows: (the
8745 pushed states are interspersed with the bits of strings matched so far):
8746
8747 <CURLYX cnt=-1>
8748 <CURLYX cnt=0><WHILEM>
8749 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8750 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8751 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8752 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8753
8754 (Contrast this with something like CURLYM, which maintains only a single
8755 backtrack state:
8756
8757 <CURLYM cnt=0> a1
8758 a1 <CURLYM cnt=1> a2
8759 a1 a2 <CURLYM cnt=2> a3
8760 a1 a2 a3 <CURLYM cnt=3> b
8761 )
8762
8763 Each WHILEM state block marks a point to backtrack to upon partial failure
8764 of A or B, and also contains some minor state data related to that
8765 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8766 overall state, such as the count, and pointers to the A and B ops.
8767
8768 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8769 must always point to the *current* CURLYX block, the rules are:
8770
8771 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8772 and set cur_curlyx to point the new block.
8773
8774 When popping the CURLYX block after a successful or unsuccessful match,
8775 restore the previous cur_curlyx.
8776
8777 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8778 to the outer one saved in the CURLYX block.
8779
8780 When popping the WHILEM block after a successful or unsuccessful B match,
8781 restore the previous cur_curlyx.
8782
8783 Here's an example for the pattern (AI* BI)*BO
8784 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8785
8786 cur_
8787 curlyx backtrack stack
8788 ------ ---------------
8789 NULL
8790 CO <CO prev=NULL> <WO>
8791 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8792 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8793 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8794
8795 At this point the pattern succeeds, and we work back down the stack to
8796 clean up, restoring as we go:
8797
8798 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8799 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8800 CO <CO prev=NULL> <WO>
8801 NULL
8802
8803 *******************************************************************/
8804
8805 #define ST st->u.curlyx
8806
8807 case CURLYX: /* start of /A*B/ (for complex A) */
8808 {
8809 /* No need to save/restore up to this paren */
8810 I32 parenfloor = FLAGS(scan);
8811
8812 assert(next); /* keep Coverity happy */
8813 if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8814 next += ARG1u(next);
8815
8816 /* XXXX Probably it is better to teach regpush to support
8817 parenfloor > maxopenparen ... */
8818 if (parenfloor > (I32)RXp_LASTPAREN(rex))
8819 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */
8820
8821 ST.prev_curlyx= cur_curlyx;
8822 cur_curlyx = st;
8823 ST.cp = PL_savestack_ix;
8824
8825 /* these fields contain the state of the current curly.
8826 * they are accessed by subsequent WHILEMs */
8827 ST.parenfloor = parenfloor;
8828 ST.me = scan;
8829 ST.B = next;
8830 ST.minmod = minmod;
8831 minmod = 0;
8832 ST.count = -1; /* this will be updated by WHILEM */
8833 ST.lastloc = NULL; /* this will be updated by WHILEM */
8834
8835 PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8836 script_run_begin);
8837 NOT_REACHED; /* NOTREACHED */
8838 }
8839
8840 case CURLYX_end: /* just finished matching all of A*B */
8841 cur_curlyx = ST.prev_curlyx;
8842 sayYES;
8843 NOT_REACHED; /* NOTREACHED */
8844
8845 case CURLYX_end_fail: /* just failed to match all of A*B */
8846 regcpblow(ST.cp);
8847 cur_curlyx = ST.prev_curlyx;
8848 sayNO;
8849 NOT_REACHED; /* NOTREACHED */
8850
8851
8852 #undef ST
8853 #define ST st->u.whilem
8854
8855 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8856 {
8857 /* see the discussion above about CURLYX/WHILEM */
8858 I32 n;
8859 int min, max;
8860 /* U16 first_paren, last_paren; */
8861 regnode *A;
8862
8863 assert(cur_curlyx); /* keep Coverity happy */
8864
8865 min = ARG1i(cur_curlyx->u.curlyx.me);
8866 max = ARG2i(cur_curlyx->u.curlyx.me);
8867 /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */
8868 /* last_paren = ARG3b(cur_curlyx->u.curlyx.me); */
8869 A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8870 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8871 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8872 ST.cache_offset = 0;
8873 ST.cache_mask = 0;
8874
8875 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8876 depth, (long)n, min, max)
8877 );
8878
8879 /* First just match a string of min A's. */
8880 if (n < min) {
8881 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8882 cur_curlyx->u.curlyx.lastloc = locinput;
8883 REGCP_SET(ST.lastcp);
8884
8885 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8886 script_run_begin);
8887 NOT_REACHED; /* NOTREACHED */
8888 }
8889
8890 /* If degenerate A matches "", assume A done. */
8891
8892 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8893 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8894 depth)
8895 );
8896 goto do_whilem_B_max;
8897 }
8898
8899 /* super-linear cache processing.
8900 *
8901 * The idea here is that for certain types of CURLYX/WHILEM -
8902 * principally those whose upper bound is infinity (and
8903 * excluding regexes that have things like \1 and other very
8904 * non-regular expressiony things), then if a pattern like
8905 * /....A*.../ fails and we backtrack to the WHILEM, then we
8906 * make a note that this particular WHILEM op was at string
8907 * position 47 (say) when the rest of pattern failed. Then, if
8908 * we ever find ourselves back at that WHILEM, and at string
8909 * position 47 again, we can just fail immediately rather than
8910 * running the rest of the pattern again.
8911 *
8912 * This is very handy when patterns start to go
8913 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8914 * with a combinatorial explosion of backtracking.
8915 *
8916 * The cache is implemented as a bit array, with one bit per
8917 * string byte position per WHILEM op (up to 16) - so its
8918 * between 0.25 and 2x the string size.
8919 *
8920 * To avoid allocating a poscache buffer every time, we do an
8921 * initially countdown; only after we have executed a WHILEM
8922 * op (string-length x #WHILEMs) times do we allocate the
8923 * cache.
8924 *
8925 * The top 4 bits of FLAGS(scan) byte say how many different
8926 * relevant CURLLYX/WHILEM op pairs there are, while the
8927 * bottom 4-bits is the identifying index number of this
8928 * WHILEM.
8929 */
8930
8931 if (FLAGS(scan)) {
8932
8933 if (!reginfo->poscache_maxiter) {
8934 /* start the countdown: Postpone detection until we
8935 * know the match is not *that* much linear. */
8936 reginfo->poscache_maxiter
8937 = (reginfo->strend - reginfo->strbeg + 1)
8938 * (FLAGS(scan)>>4);
8939 /* possible overflow for long strings and many CURLYX's */
8940 if (reginfo->poscache_maxiter < 0)
8941 reginfo->poscache_maxiter = I32_MAX;
8942 reginfo->poscache_iter = reginfo->poscache_maxiter;
8943 }
8944
8945 if (reginfo->poscache_iter-- == 0) {
8946 /* initialise cache */
8947 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8948 regmatch_info_aux *const aux = reginfo->info_aux;
8949 if (aux->poscache) {
8950 if ((SSize_t)reginfo->poscache_size < size) {
8951 Renew(aux->poscache, size, char);
8952 reginfo->poscache_size = size;
8953 }
8954 Zero(aux->poscache, size, char);
8955 }
8956 else {
8957 reginfo->poscache_size = size;
8958 Newxz(aux->poscache, size, char);
8959 }
8960 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8961 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8962 PL_colors[4], PL_colors[5])
8963 );
8964 }
8965
8966 if (reginfo->poscache_iter < 0) {
8967 /* have we already failed at this position? */
8968 SSize_t offset, mask;
8969
8970 reginfo->poscache_iter = -1; /* stop eventual underflow */
8971 offset = (FLAGS(scan) & 0xf) - 1
8972 + (locinput - reginfo->strbeg)
8973 * (FLAGS(scan)>>4);
8974 mask = 1 << (offset % 8);
8975 offset /= 8;
8976 if (reginfo->info_aux->poscache[offset] & mask) {
8977 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8978 depth)
8979 );
8980 cur_curlyx->u.curlyx.count--;
8981 sayNO; /* cache records failure */
8982 }
8983 ST.cache_offset = offset;
8984 ST.cache_mask = mask;
8985 }
8986 }
8987
8988 /* Prefer B over A for minimal matching. */
8989
8990 if (cur_curlyx->u.curlyx.minmod) {
8991 ST.save_curlyx = cur_curlyx;
8992 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8993 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8994 locinput, loceol, script_run_begin);
8995 NOT_REACHED; /* NOTREACHED */
8996 }
8997
8998 /* Prefer A over B for maximal matching. */
8999
9000 if (n < max) { /* More greed allowed? */
9001 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
9002 maxopenparen);
9003 cur_curlyx->u.curlyx.lastloc = locinput;
9004 REGCP_SET(ST.lastcp);
9005 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
9006 script_run_begin);
9007 NOT_REACHED; /* NOTREACHED */
9008 }
9009 goto do_whilem_B_max;
9010 }
9011 NOT_REACHED; /* NOTREACHED */
9012
9013 case WHILEM_B_min: /* just matched B in a minimal match */
9014 case WHILEM_B_max: /* just matched B in a maximal match */
9015 cur_curlyx = ST.save_curlyx;
9016 sayYES;
9017 NOT_REACHED; /* NOTREACHED */
9018
9019 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
9020 cur_curlyx = ST.save_curlyx;
9021 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9022 cur_curlyx->u.curlyx.count--;
9023 CACHEsayNO;
9024 NOT_REACHED; /* NOTREACHED */
9025
9026 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
9027 /* FALLTHROUGH */
9028 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
9029 REGCP_UNWIND(ST.lastcp);
9030 regcppop(rex, &maxopenparen);
9031 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9032 cur_curlyx->u.curlyx.count--;
9033 CACHEsayNO;
9034 NOT_REACHED; /* NOTREACHED */
9035
9036 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
9037 REGCP_UNWIND(ST.lastcp);
9038 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9039 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
9040 depth)
9041 );
9042
9043 do_whilem_B_max:
9044 /* now try B */
9045 ST.save_curlyx = cur_curlyx;
9046 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
9047 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
9048 locinput, loceol, script_run_begin);
9049 NOT_REACHED; /* NOTREACHED */
9050
9051 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
9052 cur_curlyx = ST.save_curlyx;
9053
9054 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) {
9055 /* Maximum greed exceeded */
9056 cur_curlyx->u.curlyx.count--;
9057 CACHEsayNO;
9058 }
9059
9060 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
9061 );
9062 /* Try grabbing another A and see if it helps. */
9063 cur_curlyx->u.curlyx.lastloc = locinput;
9064 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
9065 REGCP_SET(ST.lastcp);
9066 PUSH_STATE_GOTO(WHILEM_A_min,
9067 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
9068 locinput, loceol, script_run_begin);
9069 NOT_REACHED; /* NOTREACHED */
9070
9071 #undef ST
9072 #define ST st->u.branch
9073
9074 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
9075 next = scan + ARG1u(scan);
9076 if (next == scan)
9077 next = NULL;
9078 ST.before_paren = ARG2a(scan);
9079 ST.after_paren = ARG2b(scan);
9080 goto branch_logic;
9081 NOT_REACHED; /* NOTREACHED */
9082
9083 case BRANCH: /* /(...|A|...)/ */
9084 ST.before_paren = ARG1a(scan);
9085 ST.after_paren = ARG1b(scan);
9086 branch_logic:
9087 scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
9088 assert(scan);
9089 ST.lastparen = RXp_LASTPAREN(rex);
9090 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9091 ST.next_branch = next;
9092 REGCP_SET(ST.cp);
9093 if (RE_PESSIMISTIC_PARENS) {
9094 regcppush(rex, 0, maxopenparen);
9095 REGCP_SET(ST.lastcp);
9096 }
9097
9098 /* Now go into the branch */
9099 if (has_cutgroup) {
9100 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9101 script_run_begin);
9102 } else {
9103 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9104 script_run_begin);
9105 }
9106 NOT_REACHED; /* NOTREACHED */
9107
9108 case CUTGROUP: /* /(*THEN)/ */
9109 sv_yes_mark = st->u.mark.mark_name = FLAGS(scan)
9110 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ])
9111 : NULL;
9112 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
9113 script_run_begin);
9114 NOT_REACHED; /* NOTREACHED */
9115
9116 case CUTGROUP_next_fail:
9117 do_cutgroup = 1;
9118 no_final = 1;
9119 if (st->u.mark.mark_name)
9120 sv_commit = st->u.mark.mark_name;
9121 sayNO;
9122 NOT_REACHED; /* NOTREACHED */
9123
9124 case BRANCH_next:
9125 sayYES;
9126 NOT_REACHED; /* NOTREACHED */
9127
9128 case BRANCH_next_fail: /* that branch failed; try the next, if any */
9129 if (do_cutgroup) {
9130 do_cutgroup = 0;
9131 no_final = 0;
9132 }
9133 if (RE_PESSIMISTIC_PARENS) {
9134 REGCP_UNWIND(ST.lastcp);
9135 regcppop(rex,&maxopenparen);
9136 }
9137 REGCP_UNWIND(ST.cp);
9138 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9139 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail");
9140 scan = ST.next_branch;
9141 /* no more branches? */
9142 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
9143 DEBUG_EXECUTE_r({
9144 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
9145 depth,
9146 PL_colors[4],
9147 PL_colors[5] );
9148 });
9149 sayNO_SILENT;
9150 }
9151 continue; /* execute next BRANCH[J] op */
9152 /* NOTREACHED */
9153
9154 case MINMOD: /* next op will be non-greedy, e.g. A*? */
9155 minmod = 1;
9156 break;
9157
9158 #undef ST
9159 #define ST st->u.curlym
9160
9161 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
9162
9163 /* This is an optimisation of CURLYX that enables us to push
9164 * only a single backtracking state, no matter how many matches
9165 * there are in {m,n}. It relies on the pattern being constant
9166 * length, with no parens to influence future backrefs
9167 */
9168
9169 ST.me = scan;
9170 scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
9171
9172 ST.lastparen = RXp_LASTPAREN(rex);
9173 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9174
9175 /* if paren positive, emulate an OPEN/CLOSE around A */
9176 if (FLAGS(ST.me)) {
9177 U32 paren = FLAGS(ST.me);
9178 lastopen = paren;
9179 if (paren > maxopenparen)
9180 maxopenparen = paren;
9181 scan += NEXT_OFF(scan); /* Skip former OPEN. */
9182 }
9183 ST.A = scan;
9184 ST.B = next;
9185 ST.alen = 0;
9186 ST.count = 0;
9187 ST.minmod = minmod;
9188 minmod = 0;
9189 ST.Binfo.count = -1;
9190 REGCP_SET(ST.cp);
9191
9192 if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */
9193 goto curlym_do_B;
9194
9195 curlym_do_A: /* execute the A in /A{m,n}B/ */
9196 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9197 script_run_begin);
9198 NOT_REACHED; /* NOTREACHED */
9199
9200 case CURLYM_A: /* we've just matched an A */
9201 ST.count++;
9202 /* after first match, determine A's length: u.curlym.alen */
9203 if (ST.count == 1) {
9204 if (reginfo->is_utf8_target) {
9205 char *s = st->locinput;
9206 while (s < locinput) {
9207 ST.alen++;
9208 s += UTF8SKIP(s);
9209 }
9210 }
9211 else {
9212 ST.alen = locinput - st->locinput;
9213 }
9214 if (ST.alen == 0)
9215 ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me);
9216 }
9217 DEBUG_EXECUTE_r(
9218 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9219 depth, (IV) ST.count, (IV)ST.alen)
9220 );
9221
9222 if (FLAGS(ST.me)) {
9223 /* emulate CLOSE: mark current A as captured */
9224 U32 paren = (U32)FLAGS(ST.me);
9225 CLOSE_CAPTURE(rex, paren,
9226 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9227 locinput - reginfo->strbeg);
9228 }
9229
9230 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9231 goto fake_end;
9232
9233
9234 if (!is_accepted) {
9235 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me));
9236 if ( max == REG_INFTY || ST.count < max )
9237 goto curlym_do_A; /* try to match another A */
9238 }
9239 goto curlym_do_B; /* try to match B */
9240
9241 case CURLYM_A_fail: /* just failed to match an A */
9242 REGCP_UNWIND(ST.cp);
9243
9244
9245 if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/
9246 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9247 sayNO;
9248
9249 curlym_do_B: /* execute the B in /A{m,n}B/ */
9250 if (is_accepted)
9251 goto curlym_close_B;
9252
9253 if (ST.Binfo.count < 0) {
9254 /* calculate possible match of 1st char following curly */
9255 assert(ST.B);
9256 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9257 regnode *text_node = ST.B;
9258 if (! HAS_TEXT(text_node))
9259 FIND_NEXT_IMPT(text_node);
9260 if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9261 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9262 &ST.Binfo, reginfo))
9263 {
9264 sayNO;
9265 }
9266 }
9267 }
9268 }
9269
9270 DEBUG_EXECUTE_r(
9271 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
9272 depth, (IV)ST.count)
9273 );
9274 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9275 assert(ST.Binfo.count > 0);
9276
9277 /* Do a quick test to hopefully rule out most non-matches */
9278 if ( locinput + ST.Binfo.min_length > loceol
9279 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9280 {
9281 DEBUG_OPTIMISE_r(
9282 Perl_re_exec_indentf( aTHX_
9283 "CURLYM Fast bail next target=0x%X anded==0x%X"
9284 " mask=0x%X\n",
9285 depth,
9286 (int) nextbyte, ST.Binfo.first_byte_anded,
9287 ST.Binfo.first_byte_mask)
9288 );
9289 state_num = CURLYM_B_fail;
9290 goto reenter_switch;
9291 }
9292 }
9293
9294 curlym_close_B:
9295 if (FLAGS(ST.me)) {
9296 /* emulate CLOSE: mark current A as captured */
9297 U32 paren = (U32)FLAGS(ST.me);
9298 if (ST.count || is_accepted) {
9299 CLOSE_CAPTURE(rex, paren,
9300 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9301 locinput - reginfo->strbeg);
9302 }
9303 else
9304 RXp_OFFSp(rex)[paren].end = -1;
9305
9306 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9307 {
9308 if (ST.count || is_accepted)
9309 goto fake_end;
9310 else
9311 sayNO;
9312 }
9313 }
9314
9315 if (is_accepted)
9316 goto fake_end;
9317
9318 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9319 script_run_begin);
9320 NOT_REACHED; /* NOTREACHED */
9321
9322 case CURLYM_B_fail: /* just failed to match a B */
9323 REGCP_UNWIND(ST.cp);
9324 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9325 if (ST.minmod) {
9326 I32 max = ARG2i(ST.me);
9327 if (max != REG_INFTY && ST.count == max)
9328 sayNO;
9329 goto curlym_do_A; /* try to match a further A */
9330 }
9331 /* backtrack one A */
9332 if (ST.count == ARG1i(ST.me) /* min */)
9333 sayNO;
9334 ST.count--;
9335 SET_locinput(HOPc(locinput, -ST.alen));
9336 goto curlym_do_B; /* try to match B */
9337
9338 #undef ST
9339 #define ST st->u.curly
9340
9341 #define CURLY_SETPAREN(paren, success) \
9342 if (paren) { \
9343 if (success) { \
9344 CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \
9345 locinput - reginfo->strbeg); \
9346 } \
9347 else { \
9348 RXp_OFFSp(rex)[paren].end = -1; \
9349 RXp_LASTPAREN(rex) = ST.lastparen; \
9350 RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen; \
9351 } \
9352 }
9353
9354 case STAR: /* /A*B/ where A is width 1 char */
9355 ST.paren = 0;
9356 ST.min = 0;
9357 ST.max = REG_INFTY;
9358 scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9359 goto repeat;
9360
9361 case PLUS: /* /A+B/ where A is width 1 char */
9362 ST.paren = 0;
9363 ST.min = 1;
9364 ST.max = REG_INFTY;
9365 scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9366 goto repeat;
9367
9368 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9369 ST.paren = FLAGS(scan); /* Which paren to set */
9370 ST.lastparen = RXp_LASTPAREN(rex);
9371 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9372 if (ST.paren > maxopenparen)
9373 maxopenparen = ST.paren;
9374 ST.min = ARG1i(scan); /* min to match */
9375 ST.max = ARG2i(scan); /* max to match */
9376 scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9377
9378 /* handle the single-char capture called as a GOSUB etc */
9379 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9380 {
9381 char *li = locinput;
9382 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9383 sayNO;
9384 SET_locinput(li);
9385 goto fake_end;
9386 }
9387
9388 goto repeat;
9389
9390 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9391 ST.paren = 0;
9392 ST.min = ARG1i(scan); /* min to match */
9393 ST.max = ARG2i(scan); /* max to match */
9394 scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9395 repeat:
9396 /*
9397 * Lookahead to avoid useless match attempts
9398 * when we know what character comes next.
9399 *
9400 * Used to only do .*x and .*?x, but now it allows
9401 * for )'s, ('s and (?{ ... })'s to be in the way
9402 * of the quantifier and the EXACT-like node. -- japhy
9403 */
9404
9405 assert(ST.min <= ST.max);
9406 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9407 ST.Binfo.count = 0;
9408 }
9409 else {
9410 regnode *text_node = next;
9411
9412 if (! HAS_TEXT(text_node))
9413 FIND_NEXT_IMPT(text_node);
9414
9415 if (! HAS_TEXT(text_node))
9416 ST.Binfo.count = 0;
9417 else {
9418 if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9419 ST.Binfo.count = 0;
9420 }
9421 else {
9422 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9423 &ST.Binfo, reginfo))
9424 {
9425 sayNO;
9426 }
9427 }
9428 }
9429 }
9430
9431 ST.A = scan;
9432 ST.B = next;
9433 if (minmod) {
9434 char *li = locinput;
9435 minmod = 0;
9436 if (ST.min &&
9437 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9438 < ST.min)
9439 sayNO;
9440 SET_locinput(li);
9441 ST.count = ST.min;
9442 REGCP_SET(ST.cp);
9443
9444 if (ST.Binfo.count <= 0)
9445 goto curly_try_B_min;
9446
9447 ST.oldloc = locinput;
9448
9449 /* set ST.maxpos to the furthest point along the
9450 * string that could possibly match, i.e., that a match could
9451 * start at. */
9452 if (ST.max == REG_INFTY) {
9453 ST.maxpos = loceol - 1;
9454 if (utf8_target)
9455 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9456 ST.maxpos--;
9457 }
9458 else if (utf8_target) {
9459 int m = ST.max - ST.min;
9460 for (ST.maxpos = locinput;
9461 m >0 && ST.maxpos < loceol; m--)
9462 ST.maxpos += UTF8SKIP(ST.maxpos);
9463 }
9464 else {
9465 ST.maxpos = locinput + ST.max - ST.min;
9466 if (ST.maxpos >= loceol)
9467 ST.maxpos = loceol - 1;
9468 }
9469 goto curly_try_B_min_known;
9470
9471 }
9472 else {
9473 /* avoid taking address of locinput, so it can remain
9474 * a register var */
9475 char *li = locinput;
9476 if (ST.max)
9477 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9478 else
9479 ST.count = 0;
9480 if (ST.count < ST.min)
9481 sayNO;
9482 SET_locinput(li);
9483 if ((ST.count > ST.min)
9484 && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9485 {
9486 /* A{m,n} must come at the end of the string, there's
9487 * no point in backing off ... */
9488 ST.min = ST.count;
9489 /* ...except that $ and \Z can match before *and* after
9490 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9491 We may back off by one in this case. */
9492 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9493 ST.min--;
9494 }
9495 REGCP_SET(ST.cp);
9496 goto curly_try_B_max;
9497 }
9498 NOT_REACHED; /* NOTREACHED */
9499
9500 case CURLY_B_min_fail:
9501 /* failed to find B in a non-greedy match. */
9502 if (RE_PESSIMISTIC_PARENS) {
9503 REGCP_UNWIND(ST.lastcp);
9504 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9505 }
9506 REGCP_UNWIND(ST.cp);
9507 if (ST.paren) {
9508 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9509 }
9510
9511 if (ST.Binfo.count == 0) {
9512 /* failed -- move forward one */
9513 char *li = locinput;
9514 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9515 sayNO;
9516 }
9517 locinput = li;
9518 ST.count++;
9519 if (!( ST.count <= ST.max
9520 /* count overflow ? */
9521 || (ST.max == REG_INFTY && ST.count > 0))
9522 )
9523 sayNO;
9524 }
9525 else {
9526 int n;
9527 /* Couldn't or didn't -- move forward. */
9528 ST.oldloc = locinput;
9529 if (utf8_target)
9530 locinput += UTF8SKIP(locinput);
9531 else
9532 locinput++;
9533 ST.count++;
9534
9535 curly_try_B_min_known:
9536 /* find the next place where 'B' could work, then call B */
9537 if (locinput + ST.Binfo.initial_exact < loceol) {
9538 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9539
9540 /* Here, the mask is all 1's for the entire length of
9541 * any possible match. (That actually means that there
9542 * is only one possible match.) Look for the next
9543 * occurrence */
9544 locinput = ninstr(locinput, loceol,
9545 (char *) ST.Binfo.matches,
9546 (char *) ST.Binfo.matches
9547 + ST.Binfo.initial_exact);
9548 if (locinput == NULL) {
9549 sayNO;
9550 }
9551 }
9552 else do {
9553 /* If the first byte(s) of the mask are all ones, it
9554 * means those bytes must match identically, so can use
9555 * ninstr() to find the next possible matchpoint */
9556 if (ST.Binfo.initial_exact > 0) {
9557 locinput = ninstr(locinput, loceol,
9558 (char *) ST.Binfo.matches,
9559 (char *) ST.Binfo.matches
9560 + ST.Binfo.initial_exact);
9561 }
9562 else { /* Otherwise find the next byte that matches,
9563 masked */
9564 locinput = (char *) find_next_masked(
9565 (U8 *) locinput, (U8 *) loceol,
9566 ST.Binfo.first_byte_anded,
9567 ST.Binfo.first_byte_mask);
9568 /* Advance to the end of a multi-byte character */
9569 if (utf8_target) {
9570 while ( locinput < loceol
9571 && UTF8_IS_CONTINUATION(*locinput))
9572 {
9573 locinput++;
9574 }
9575 }
9576 }
9577 if ( locinput == NULL
9578 || locinput + ST.Binfo.min_length > loceol)
9579 {
9580 sayNO;
9581 }
9582
9583 /* Here, we have found a possible match point; if can't
9584 * rule it out, quit the loop so can check fully */
9585 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9586 break;
9587 }
9588
9589 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9590
9591 } while (locinput <= ST.maxpos);
9592 }
9593
9594 if (locinput > ST.maxpos)
9595 sayNO;
9596
9597 n = (utf8_target)
9598 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9599 : (STRLEN) (locinput - ST.oldloc);
9600
9601
9602 /* Here is at the beginning of a character that meets the mask
9603 * criteria. Need to make sure that some real possibility */
9604
9605 if (n) {
9606 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9607 * at what may be the beginning of b; check that everything
9608 * between oldloc and locinput matches */
9609 char *li = ST.oldloc;
9610 ST.count += n;
9611 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9612 sayNO;
9613 assert(n == REG_INFTY || locinput == li);
9614 }
9615 }
9616
9617 curly_try_B_min:
9618 if (RE_PESSIMISTIC_PARENS) {
9619 (void)regcppush(rex, 0, maxopenparen);
9620 REGCP_SET(ST.lastcp);
9621 }
9622 CURLY_SETPAREN(ST.paren, ST.count);
9623 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9624 script_run_begin);
9625 NOT_REACHED; /* NOTREACHED */
9626
9627
9628 curly_try_B_max:
9629 /* a successful greedy match: now try to match B */
9630 if ( ST.Binfo.count <= 0
9631 || ( ST.Binfo.count > 0
9632 && locinput + ST.Binfo.min_length <= loceol
9633 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9634 {
9635 if (RE_PESSIMISTIC_PARENS) {
9636 (void)regcppush(rex, 0, maxopenparen);
9637 REGCP_SET(ST.lastcp);
9638 }
9639 CURLY_SETPAREN(ST.paren, ST.count);
9640 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9641 script_run_begin);
9642 NOT_REACHED; /* NOTREACHED */
9643 }
9644 goto CURLY_B_all_failed;
9645 NOT_REACHED; /* NOTREACHED */
9646
9647 case CURLY_B_max_fail:
9648 /* failed to find B in a greedy match */
9649
9650 if (RE_PESSIMISTIC_PARENS) {
9651 REGCP_UNWIND(ST.lastcp);
9652 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9653 }
9654 CURLY_B_all_failed:
9655 REGCP_UNWIND(ST.cp);
9656 if (ST.paren) {
9657 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9658 }
9659 /* back up. */
9660 if (--ST.count < ST.min)
9661 sayNO;
9662 locinput = HOPc(locinput, -1);
9663 goto curly_try_B_max;
9664
9665 #undef ST
9666
9667 case END: /* last op of main pattern */
9668 fake_end:
9669 if (cur_eval) {
9670 /* we've just finished A in /(??{A})B/; now continue with B */
9671 is_accepted= false;
9672 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9673 st->u.eval.prev_rex = rex_sv; /* inner */
9674
9675 /* Save *all* the positions. */
9676 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9677 rex_sv = CUR_EVAL.prev_rex;
9678 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9679 SET_reg_curpm(rex_sv);
9680 rex = ReANY(rex_sv);
9681 rexi = RXi_GET(rex);
9682
9683 st->u.eval.prev_curlyx = cur_curlyx;
9684 cur_curlyx = CUR_EVAL.prev_curlyx;
9685
9686 REGCP_SET(st->u.eval.lastcp);
9687
9688 /* Restore parens of the outer rex without popping the
9689 * savestack */
9690 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9691
9692 st->u.eval.prev_eval = cur_eval;
9693 cur_eval = CUR_EVAL.prev_eval;
9694 DEBUG_EXECUTE_r(
9695 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9696 depth, cur_eval););
9697 if ( nochange_depth )
9698 nochange_depth--;
9699
9700 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9701
9702 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9703 st->u.eval.prev_eval->u.eval.B,
9704 locinput, loceol, script_run_begin);
9705 }
9706
9707 if (locinput < reginfo->till) {
9708 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9709 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9710 PL_colors[4],
9711 (long)(locinput - startpos),
9712 (long)(reginfo->till - startpos),
9713 PL_colors[5]));
9714
9715 sayNO_SILENT; /* Cannot match: too short. */
9716 }
9717 sayYES; /* Success! */
9718
9719 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9720 matches end at the right spot, required for
9721 variable length matches. */
9722 if (match_end && locinput != match_end)
9723 {
9724 DEBUG_EXECUTE_r(
9725 Perl_re_exec_indentf( aTHX_
9726 "%sLOOKBEHIND_END: subpattern failed...%s\n",
9727 depth, PL_colors[4], PL_colors[5]));
9728 sayNO; /* Variable length match didn't line up */
9729 }
9730 /* FALLTHROUGH */
9731
9732 case SUCCEED: /* successful SUSPEND/CURLYM and
9733 *lookahead* IFMATCH/UNLESSM*/
9734 DEBUG_EXECUTE_r(
9735 Perl_re_exec_indentf( aTHX_
9736 "%sSUCCEED: subpattern success...%s\n",
9737 depth, PL_colors[4], PL_colors[5]));
9738 sayYES; /* Success! */
9739
9740 #undef ST
9741 #define ST st->u.ifmatch
9742
9743 case SUSPEND: /* (?>A) */
9744 ST.wanted = 1;
9745 ST.start = locinput;
9746 ST.end = loceol;
9747 ST.count = 1;
9748 goto do_ifmatch;
9749
9750 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9751 ST.wanted = 0;
9752 goto ifmatch_trivial_fail_test;
9753
9754 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9755 ST.wanted = 1;
9756 ifmatch_trivial_fail_test:
9757 ST.prev_match_end= match_end;
9758 ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be
9759 lookbehind count, requires
9760 non-zero flags */
9761 if (! FLAGS(scan)) { /* 'flags' zero means lookahed */
9762
9763 /* Lookahead starts here and ends at the normal place */
9764 ST.start = locinput;
9765 ST.end = loceol;
9766 match_end = NULL;
9767 }
9768 else {
9769 PERL_UINT_FAST8_T back_count = FLAGS(scan);
9770 char * s;
9771 match_end = locinput;
9772
9773 /* Lookbehind can look beyond the current position */
9774 ST.end = loceol;
9775
9776 /* ... and starts at the first place in the input that is in
9777 * the range of the possible start positions */
9778 for (; ST.count > 0; ST.count--, back_count--) {
9779 s = HOPBACKc(locinput, back_count);
9780 if (s) {
9781 ST.start = s;
9782 goto do_ifmatch;
9783 }
9784 }
9785
9786 /* If the lookbehind doesn't start in the actual string, is a
9787 * trivial match failure */
9788 match_end = ST.prev_match_end;
9789 if (logical) {
9790 logical = 0;
9791 sw = 1 - cBOOL(ST.wanted);
9792 }
9793 else if (ST.wanted)
9794 sayNO;
9795
9796 /* Here, we didn't want it to match, so is actually success */
9797 next = scan + ARG1u(scan);
9798 if (next == scan)
9799 next = NULL;
9800 break;
9801 }
9802
9803 do_ifmatch:
9804 ST.me = scan;
9805 ST.logical = logical;
9806 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9807
9808 /* execute body of (?...A) */
9809 PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9810 ST.end, script_run_begin);
9811 NOT_REACHED; /* NOTREACHED */
9812
9813 {
9814 bool matched;
9815
9816 case IFMATCH_A_fail: /* body of (?...A) failed */
9817 if (! ST.logical && ST.count > 1) {
9818
9819 /* It isn't a real failure until we've tried all starting
9820 * positions. Move to the next starting position and retry */
9821 ST.count--;
9822 ST.start = HOPc(ST.start, 1);
9823 scan = ST.me;
9824 logical = ST.logical;
9825 goto do_ifmatch;
9826 }
9827
9828 /* Here, all starting positions have been tried. */
9829 matched = FALSE;
9830 goto ifmatch_done;
9831
9832 case IFMATCH_A: /* body of (?...A) succeeded */
9833 matched = TRUE;
9834 ifmatch_done:
9835 sw = matched == ST.wanted;
9836 match_end = ST.prev_match_end;
9837 if (! ST.logical && !sw) {
9838 sayNO;
9839 }
9840
9841 if (OP(ST.me) != SUSPEND) {
9842 /* restore old position except for (?>...) */
9843 locinput = st->locinput;
9844 loceol = st->loceol;
9845 script_run_begin = st->sr0;
9846 }
9847 scan = ST.me + ARG1u(ST.me);
9848 if (scan == ST.me)
9849 scan = NULL;
9850 continue; /* execute B */
9851 }
9852
9853 #undef ST
9854
9855 case LONGJMP: /* alternative with many branches compiles to
9856 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9857 next = scan + ARG1u(scan);
9858 if (next == scan)
9859 next = NULL;
9860 break;
9861
9862 case COMMIT: /* (*COMMIT) */
9863 reginfo->cutpoint = loceol;
9864 /* FALLTHROUGH */
9865
9866 case PRUNE: /* (*PRUNE) */
9867 if (FLAGS(scan))
9868 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9869 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9870 script_run_begin);
9871 NOT_REACHED; /* NOTREACHED */
9872
9873 case COMMIT_next_fail:
9874 no_final = 1;
9875 /* FALLTHROUGH */
9876 sayNO;
9877 NOT_REACHED; /* NOTREACHED */
9878
9879 case OPFAIL: /* (*FAIL) */
9880 if (FLAGS(scan))
9881 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9882 if (logical) {
9883 /* deal with (?(?!)X|Y) properly,
9884 * make sure we trigger the no branch
9885 * of the trailing IFTHEN structure*/
9886 sw= 0;
9887 break;
9888 } else {
9889 sayNO;
9890 }
9891 NOT_REACHED; /* NOTREACHED */
9892
9893 #define ST st->u.mark
9894 case MARKPOINT: /* (*MARK:foo) */
9895 ST.prev_mark = mark_state;
9896 ST.mark_name = sv_commit = sv_yes_mark
9897 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9898 mark_state = st;
9899 ST.mark_loc = locinput;
9900 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9901 script_run_begin);
9902 NOT_REACHED; /* NOTREACHED */
9903
9904 case MARKPOINT_next:
9905 mark_state = ST.prev_mark;
9906 sayYES;
9907 NOT_REACHED; /* NOTREACHED */
9908
9909 case MARKPOINT_next_fail:
9910 if (popmark && sv_eq(ST.mark_name,popmark))
9911 {
9912 if (ST.mark_loc > startpoint)
9913 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9914 popmark = NULL; /* we found our mark */
9915 sv_commit = ST.mark_name;
9916
9917 DEBUG_EXECUTE_r({
9918 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9919 depth,
9920 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9921 });
9922 }
9923 mark_state = ST.prev_mark;
9924 sv_yes_mark = mark_state ?
9925 mark_state->u.mark.mark_name : NULL;
9926 sayNO;
9927 NOT_REACHED; /* NOTREACHED */
9928
9929 case SKIP: /* (*SKIP) */
9930 if (!FLAGS(scan)) {
9931 /* (*SKIP) : if we fail we cut here*/
9932 ST.mark_name = NULL;
9933 ST.mark_loc = locinput;
9934 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9935 script_run_begin);
9936 } else {
9937 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9938 otherwise do nothing. Meaning we need to scan
9939 */
9940 regmatch_state *cur = mark_state;
9941 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9942
9943 while (cur) {
9944 if ( sv_eq( cur->u.mark.mark_name,
9945 find ) )
9946 {
9947 ST.mark_name = find;
9948 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9949 script_run_begin);
9950 }
9951 cur = cur->u.mark.prev_mark;
9952 }
9953 }
9954 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9955 break;
9956
9957 case SKIP_next_fail:
9958 if (ST.mark_name) {
9959 /* (*CUT:NAME) - Set up to search for the name as we
9960 collapse the stack*/
9961 popmark = ST.mark_name;
9962 } else {
9963 /* (*CUT) - No name, we cut here.*/
9964 if (ST.mark_loc > startpoint)
9965 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9966 /* but we set sv_commit to latest mark_name if there
9967 is one so they can test to see how things lead to this
9968 cut */
9969 if (mark_state)
9970 sv_commit=mark_state->u.mark.mark_name;
9971 }
9972 no_final = 1;
9973 sayNO;
9974 NOT_REACHED; /* NOTREACHED */
9975 #undef ST
9976
9977 case LNBREAK: /* \R */
9978 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9979 locinput += n;
9980 } else
9981 sayNO;
9982 break;
9983
9984 default:
9985 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9986 PTR2UV(scan), OP(scan));
9987 Perl_croak(aTHX_ "regexp memory corruption");
9988
9989 /* this is a point to jump to in order to increment
9990 * locinput by one character */
9991 increment_locinput:
9992 assert(!NEXTCHR_IS_EOS);
9993 if (utf8_target) {
9994 locinput += PL_utf8skip[nextbyte];
9995 /* locinput is allowed to go 1 char off the end (signifying
9996 * EOS), but not 2+ */
9997 if (locinput > loceol)
9998 sayNO;
9999 }
10000 else
10001 locinput++;
10002 break;
10003
10004 } /* end switch */
10005
10006 /* switch break jumps here */
10007 scan = next; /* prepare to execute the next op and ... */
10008 continue; /* ... jump back to the top, reusing st */
10009 /* NOTREACHED */
10010
10011 push_yes_state:
10012 /* push a state that backtracks on success */
10013 st->u.yes.prev_yes_state = yes_state;
10014 yes_state = st;
10015 /* FALLTHROUGH */
10016 push_state:
10017 /* push a new regex state, then continue at scan */
10018 {
10019 regmatch_state *newst;
10020 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10021
10022 DEBUG_r( /* DEBUG_STACK_r */
10023 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
10024 regmatch_state *cur = st;
10025 regmatch_state *curyes = yes_state;
10026 U32 i;
10027 regmatch_slab *slab = PL_regmatch_slab;
10028 for (i = 0; i < 3 && i <= depth; cur--,i++) {
10029 if (cur < SLAB_FIRST(slab)) {
10030 slab = slab->prev;
10031 cur = SLAB_LAST(slab);
10032 }
10033 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
10034 depth,
10035 i ? " " : "push",
10036 depth - i, REGNODE_NAME(cur->resume_state),
10037 (curyes == cur) ? "yes" : ""
10038 );
10039 if (curyes == cur)
10040 curyes = cur->u.yes.prev_yes_state;
10041 }
10042 } else {
10043 DEBUG_STATE_pp("push")
10044 });
10045 depth++;
10046 st->locinput = locinput;
10047 st->loceol = loceol;
10048 st->sr0 = script_run_begin;
10049 newst = st+1;
10050 if (newst > SLAB_LAST(PL_regmatch_slab))
10051 newst = S_push_slab(aTHX);
10052 PL_regmatch_state = newst;
10053
10054 locinput = pushinput;
10055 loceol = pusheol;
10056 script_run_begin = pushsr0;
10057 st = newst;
10058 continue;
10059 /* NOTREACHED */
10060 }
10061 }
10062 #ifdef SOLARIS_BAD_OPTIMIZER
10063 # undef PL_charclass
10064 #endif
10065
10066 /*
10067 * We get here only if there's trouble -- normally "case END" is
10068 * the terminating point.
10069 */
10070 Perl_croak(aTHX_ "corrupted regexp pointers");
10071 NOT_REACHED; /* NOTREACHED */
10072
10073 yes:
10074 if (yes_state) {
10075 /* we have successfully completed a subexpression, but we must now
10076 * pop to the state marked by yes_state and continue from there */
10077 assert(st != yes_state);
10078 #ifdef DEBUGGING
10079 while (st != yes_state) {
10080 st--;
10081 if (st < SLAB_FIRST(PL_regmatch_slab)) {
10082 PL_regmatch_slab = PL_regmatch_slab->prev;
10083 st = SLAB_LAST(PL_regmatch_slab);
10084 }
10085 DEBUG_STATE_r({
10086 if (no_final) {
10087 DEBUG_STATE_pp("pop (no final)");
10088 } else {
10089 DEBUG_STATE_pp("pop (yes)");
10090 }
10091 });
10092 depth--;
10093 }
10094 #else
10095 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
10096 || yes_state > SLAB_LAST(PL_regmatch_slab))
10097 {
10098 /* not in this slab, pop slab */
10099 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
10100 PL_regmatch_slab = PL_regmatch_slab->prev;
10101 st = SLAB_LAST(PL_regmatch_slab);
10102 }
10103 depth -= (st - yes_state);
10104 #endif
10105 st = yes_state;
10106 yes_state = st->u.yes.prev_yes_state;
10107 PL_regmatch_state = st;
10108
10109 if (no_final) {
10110 locinput= st->locinput;
10111 loceol= st->loceol;
10112 script_run_begin = st->sr0;
10113 }
10114 state_num = st->resume_state + no_final;
10115 goto reenter_switch;
10116 }
10117
10118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
10119 PL_colors[4], PL_colors[5]));
10120
10121 if (reginfo->info_aux_eval) {
10122 /* each successfully executed (?{...}) block does the equivalent of
10123 * local $^R = do {...}
10124 * When popping the save stack, all these locals would be undone;
10125 * bypass this by setting the outermost saved $^R to the latest
10126 * value */
10127 /* I don't know if this is needed or works properly now.
10128 * see code related to PL_replgv elsewhere in this file.
10129 * Yves
10130 */
10131 if (oreplsv != GvSV(PL_replgv)) {
10132 sv_setsv(oreplsv, GvSV(PL_replgv));
10133 SvSETMAGIC(oreplsv);
10134 }
10135 }
10136 result = 1;
10137 goto final_exit;
10138
10139 no:
10140 DEBUG_EXECUTE_r(
10141 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
10142 depth,
10143 PL_colors[4], PL_colors[5])
10144 );
10145
10146 no_silent:
10147 if (no_final) {
10148 if (yes_state) {
10149 goto yes;
10150 } else {
10151 goto final_exit;
10152 }
10153 }
10154 if (depth) {
10155 /* there's a previous state to backtrack to */
10156 st--;
10157 if (st < SLAB_FIRST(PL_regmatch_slab)) {
10158 PL_regmatch_slab = PL_regmatch_slab->prev;
10159 st = SLAB_LAST(PL_regmatch_slab);
10160 }
10161 PL_regmatch_state = st;
10162 locinput= st->locinput;
10163 loceol= st->loceol;
10164 script_run_begin = st->sr0;
10165
10166 DEBUG_STATE_pp("pop");
10167 depth--;
10168 if (yes_state == st)
10169 yes_state = st->u.yes.prev_yes_state;
10170
10171 state_num = st->resume_state + 1; /* failure = success + 1 */
10172 PERL_ASYNC_CHECK();
10173 goto reenter_switch;
10174 }
10175 result = 0;
10176
10177 final_exit:
10178 if (rex->intflags & PREGf_VERBARG_SEEN) {
10179 SV *sv_err = get_sv("REGERROR", 1);
10180 SV *sv_mrk = get_sv("REGMARK", 1);
10181 if (result) {
10182 sv_commit = &PL_sv_no;
10183 if (!sv_yes_mark)
10184 sv_yes_mark = &PL_sv_yes;
10185 } else {
10186 if (!sv_commit)
10187 sv_commit = &PL_sv_yes;
10188 sv_yes_mark = &PL_sv_no;
10189 }
10190 assert(sv_err);
10191 assert(sv_mrk);
10192 sv_setsv(sv_err, sv_commit);
10193 sv_setsv(sv_mrk, sv_yes_mark);
10194 }
10195
10196
10197 if (last_pushed_cv) {
10198 dSP;
10199 /* see "Some notes about MULTICALL" above */
10200 POP_MULTICALL;
10201 PERL_UNUSED_VAR(SP);
10202 }
10203 else
10204 LEAVE_SCOPE(orig_savestack_ix);
10205
10206 assert(!result || locinput - reginfo->strbeg >= 0);
10207 return result ? locinput - reginfo->strbeg : -1;
10208 }
10209
10210 /*
10211 - regrepeat - repeatedly match something simple, report how many
10212 *
10213 * What 'simple' means is a node which can be the operand of a quantifier like
10214 * '+', or {1,3}
10215 *
10216 * startposp - pointer to a pointer to the start position. This is updated
10217 * to point to the byte following the highest successful
10218 * match.
10219 * p - the regnode to be repeatedly matched against.
10220 * loceol - pointer to the end position beyond which we aren't supposed to
10221 * look.
10222 * reginfo - struct holding match state, such as utf8_target
10223 * max - maximum number of things to match.
10224 * depth - (for debugging) backtracking depth.
10225 */
10226 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max comma_pDEPTH)10227 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10228 char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH)
10229 {
10230 char *scan; /* Pointer to current position in target string */
10231 I32 c;
10232 char *this_eol = loceol; /* potentially adjusted version. */
10233 I32 hardcount = 0; /* How many matches so far */
10234 bool utf8_target = reginfo->is_utf8_target;
10235 unsigned int to_complement = 0; /* Invert the result? */
10236 char_class_number_ classnum;
10237
10238 PERL_ARGS_ASSERT_REGREPEAT;
10239
10240 /* This routine is structured so that we switch on the input OP. Each OP
10241 * case: statement contains a loop to repeatedly apply the OP, advancing
10242 * the input until it fails, or reaches the end of the input, or until it
10243 * reaches the upper limit of matches. */
10244
10245 scan = *startposp;
10246 if (max == REG_INFTY) /* This is a special marker to go to the platform's
10247 max */
10248 max = I32_MAX;
10249 else if (! utf8_target && this_eol - scan > max)
10250 this_eol = scan + max;
10251
10252 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10253 * down to the maximum of how far we should go in it (but leaving it set to
10254 * the real end if the maximum permissible would take us beyond that).
10255 * This allows us to make the loop exit condition that we haven't gone past
10256 * <this_eol> to also mean that we haven't exceeded the max permissible
10257 * count, saving a test each time through the loop. But it assumes that
10258 * the OP matches a single byte, which is true for most of the OPs below
10259 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
10260 * have this characteristic have to compensate.
10261 *
10262 * There is no such adjustment for UTF-8 targets, since the number of bytes
10263 * per character can vary. OPs will have to test both that the count is
10264 * less than the max permissible (using <hardcount> to keep track), and
10265 * that we are still within the bounds of the string (using <this_eol>. A
10266 * few OPs match a single byte no matter what the encoding. They can omit
10267 * the max test if, for the UTF-8 case, they do the adjustment that was
10268 * skipped above.
10269 *
10270 * Thus, the code above sets things up for the common case; and exceptional
10271 * cases need extra work; the common case is to make sure <scan> doesn't go
10272 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10273 * count doesn't exceed the maximum permissible */
10274
10275 switch (with_t_UTF8ness(OP(p), utf8_target)) {
10276 SV * anyofh_list;
10277
10278 case REG_ANY_t8:
10279 while (scan < this_eol && hardcount < max && *scan != '\n') {
10280 scan += UTF8SKIP(scan);
10281 hardcount++;
10282 }
10283 break;
10284
10285 case REG_ANY_tb:
10286 scan = (char *) memchr(scan, '\n', this_eol - scan);
10287 if (! scan) {
10288 scan = this_eol;
10289 }
10290 break;
10291
10292 case SANY_t8:
10293 while (scan < this_eol && hardcount < max) {
10294 scan += UTF8SKIP(scan);
10295 hardcount++;
10296 }
10297 break;
10298
10299 case SANY_tb:
10300 scan = this_eol;
10301 break;
10302
10303 case EXACT_REQ8_tb:
10304 case LEXACT_REQ8_tb:
10305 case EXACTFU_REQ8_tb:
10306 break;
10307
10308 case EXACTL_t8:
10309 if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10310 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10311 }
10312 /* FALLTHROUGH */
10313
10314 case EXACTL_tb:
10315 case EXACTFL_t8:
10316 case EXACTFL_tb:
10317 case EXACTFLU8_t8:
10318 case EXACTFLU8_tb:
10319 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10320 /* FALLTHROUGH */
10321
10322 case EXACT_REQ8_t8:
10323 case LEXACT_REQ8_t8:
10324 case EXACTFU_REQ8_t8:
10325 case LEXACT_t8:
10326 case LEXACT_tb:
10327 case EXACT_t8:
10328 case EXACT_tb:
10329 case EXACTF_t8:
10330 case EXACTF_tb:
10331 case EXACTFAA_NO_TRIE_t8:
10332 case EXACTFAA_NO_TRIE_tb:
10333 case EXACTFAA_t8:
10334 case EXACTFAA_tb:
10335 case EXACTFU_t8:
10336 case EXACTFU_tb:
10337 case EXACTFUP_t8:
10338 case EXACTFUP_tb:
10339
10340 {
10341 struct next_matchable_info Binfo;
10342 PERL_UINT_FAST8_T definitive_len;
10343
10344 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10345
10346 /* Set up termination info, and quit if we can rule out that we've
10347 * gotten a match of the termination criteria */
10348 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10349 || scan + Binfo.min_length > this_eol
10350 || ! S_test_EXACTISH_ST(scan, Binfo))
10351 {
10352 break;
10353 }
10354
10355 definitive_len = Binfo.initial_definitive;
10356
10357 /* Here there are potential matches, and the first byte(s) matched our
10358 * filter
10359 *
10360 * If we got a definitive match of some initial bytes, there is no
10361 * possibility of false positives as far as it got */
10362 if (definitive_len > 0) {
10363
10364 /* If as far as it got is the maximum possible, there were no false
10365 * positives at all. Since we have everything set up, see how many
10366 * repeats there are. */
10367 if (definitive_len >= Binfo.max_length) {
10368
10369 /* We've already found one match */
10370 scan += definitive_len;
10371 hardcount++;
10372
10373 /* If want more than the one match, and there is room for more,
10374 * see if there are any */
10375 if (hardcount < max && scan + definitive_len <= this_eol) {
10376
10377 /* If the character is only a single byte long, just span
10378 * all such bytes. */
10379 if (definitive_len == 1) {
10380 const char * orig_scan = scan;
10381
10382 if (this_eol - (scan - hardcount) > max) {
10383 this_eol = scan - hardcount + max;
10384 }
10385
10386 /* Use different routines depending on whether it's an
10387 * exact match or matches with a mask */
10388 if (Binfo.initial_exact == 1) {
10389 scan = (char *) find_span_end((U8 *) scan,
10390 (U8 *) this_eol,
10391 Binfo.matches[0]);
10392 }
10393 else {
10394 scan = (char *) find_span_end_mask(
10395 (U8 *) scan,
10396 (U8 *) this_eol,
10397 Binfo.first_byte_anded,
10398 Binfo.first_byte_mask);
10399 }
10400
10401 hardcount += scan - orig_scan;
10402 }
10403 else { /* Here, the full character definitive match is more
10404 than one byte */
10405 while ( hardcount < max
10406 && scan + definitive_len <= this_eol
10407 && S_test_EXACTISH_ST(scan, Binfo))
10408 {
10409 scan += definitive_len;
10410 hardcount++;
10411 }
10412 }
10413 }
10414
10415 break;
10416 } /* End of a full character is definitively matched */
10417
10418 /* Here, an initial portion of the character matched definitively,
10419 * and the rest matched as well, but could have false positives */
10420
10421 do {
10422 int i;
10423 U8 * matches = Binfo.matches;
10424
10425 /* The first bytes were definitive. Look at the remaining */
10426 for (i = 0; i < Binfo.count; i++) {
10427 if (memEQ(scan + definitive_len,
10428 matches + definitive_len,
10429 Binfo.lengths[i] - definitive_len))
10430 {
10431 goto found_a_completion;
10432 }
10433
10434 matches += Binfo.lengths[i];
10435 }
10436
10437 /* Didn't find anything to complete our initial match. Stop
10438 * here */
10439 break;
10440
10441 found_a_completion:
10442
10443 /* Here, matched a full character, Include it in the result,
10444 * and then look to see if the next char matches */
10445 hardcount++;
10446 scan += Binfo.lengths[i];
10447
10448 } while ( hardcount < max
10449 && scan + definitive_len < this_eol
10450 && S_test_EXACTISH_ST(scan, Binfo));
10451
10452 /* Here, have advanced as far as possible */
10453 break;
10454 } /* End of found some initial bytes that definitively matched */
10455
10456 /* Here, we can't rule out that we have found the beginning of 'B', but
10457 * there were no initial bytes that could rule out anything
10458 * definitively. Use brute force to examine all the possibilities */
10459 while (scan < this_eol && hardcount < max) {
10460 int i;
10461 U8 * matches = Binfo.matches;
10462
10463 for (i = 0; i < Binfo.count; i++) {
10464 if (memEQ(scan, matches, Binfo.lengths[i])) {
10465 goto found1;
10466 }
10467
10468 matches += Binfo.lengths[i];
10469 }
10470
10471 break;
10472
10473 found1:
10474 hardcount++;
10475 scan += Binfo.lengths[i];
10476 }
10477
10478 break;
10479 }
10480
10481 case ANYOFPOSIXL_t8:
10482 case ANYOFL_t8:
10483 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10484 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10485
10486 /* FALLTHROUGH */
10487 case ANYOFD_t8:
10488 case ANYOF_t8:
10489 while ( hardcount < max
10490 && scan < this_eol
10491 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10492 {
10493 scan += UTF8SKIP(scan);
10494 hardcount++;
10495 }
10496 break;
10497
10498 case ANYOFPOSIXL_tb:
10499 case ANYOFL_tb:
10500 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10501 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10502 /* FALLTHROUGH */
10503
10504 case ANYOFD_tb:
10505 case ANYOF_tb:
10506 if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10507 while ( scan < this_eol
10508 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10509 scan++;
10510 }
10511 else {
10512 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10513 scan++;
10514 }
10515 break;
10516
10517 case ANYOFM_t8:
10518 if (this_eol - scan > max) {
10519
10520 /* We didn't adjust <this_eol> at the beginning of this routine
10521 * because is UTF-8, but it is actually ok to do so, since here, to
10522 * match, 1 char == 1 byte. */
10523 this_eol = scan + max;
10524 }
10525 /* FALLTHROUGH */
10526
10527 case ANYOFM_tb:
10528 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10529 (U8) ARG1u(p), FLAGS(p));
10530 break;
10531
10532 case NANYOFM_t8:
10533 while ( hardcount < max
10534 && scan < this_eol
10535 && (*scan & FLAGS(p)) != ARG1u(p))
10536 {
10537 scan += UTF8SKIP(scan);
10538 hardcount++;
10539 }
10540 break;
10541
10542 case NANYOFM_tb:
10543 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10544 (U8) ARG1u(p), FLAGS(p));
10545 break;
10546
10547 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10548 case ANYOFHb_tb:
10549 case ANYOFHbbm_tb:
10550 case ANYOFHr_tb:
10551 case ANYOFHs_tb:
10552 break;
10553
10554 case ANYOFH_t8:
10555 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10556 while ( hardcount < max
10557 && scan < this_eol
10558 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10559 && _invlist_contains_cp(anyofh_list,
10560 utf8_to_uvchr_buf((U8 *) scan,
10561 (U8 *) this_eol,
10562 NULL)))
10563 {
10564 scan += UTF8SKIP(scan);
10565 hardcount++;
10566 }
10567 break;
10568
10569 case ANYOFHb_t8:
10570 /* we know the first byte must be the FLAGS field */
10571 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10572 while ( hardcount < max
10573 && scan < this_eol
10574 && (U8) *scan == ANYOF_FLAGS(p)
10575 && _invlist_contains_cp(anyofh_list,
10576 utf8_to_uvchr_buf((U8 *) scan,
10577 (U8 *) this_eol,
10578 NULL)))
10579 {
10580 scan += UTF8SKIP(scan);
10581 hardcount++;
10582 }
10583 break;
10584
10585 case ANYOFHbbm_t8:
10586 while ( hardcount < max
10587 && scan + 1 < this_eol
10588 && (U8) *scan == ANYOF_FLAGS(p)
10589 && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10590 (U8) scan[1] & UTF_CONTINUATION_MASK))
10591 {
10592 scan += 2; /* This node only matces 2-byte UTF-8 */
10593 hardcount++;
10594 }
10595 break;
10596
10597 case ANYOFHr_t8:
10598 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10599 while ( hardcount < max
10600 && scan < this_eol
10601 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10602 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10603 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10604 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10605 && _invlist_contains_cp(anyofh_list,
10606 utf8_to_uvchr_buf((U8 *) scan,
10607 (U8 *) this_eol,
10608 NULL)))
10609 {
10610 scan += UTF8SKIP(scan);
10611 hardcount++;
10612 }
10613 break;
10614
10615 case ANYOFHs_t8:
10616 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10617 while ( hardcount < max
10618 && scan + FLAGS(p) < this_eol
10619 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10620 && _invlist_contains_cp(anyofh_list,
10621 utf8_to_uvchr_buf((U8 *) scan,
10622 (U8 *) this_eol,
10623 NULL)))
10624 {
10625 scan += UTF8SKIP(scan);
10626 hardcount++;
10627 }
10628 break;
10629
10630 case ANYOFR_t8:
10631 while ( hardcount < max
10632 && scan < this_eol
10633 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10634 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10635 (U8 *) this_eol,
10636 NULL),
10637 ANYOFRbase(p), ANYOFRdelta(p)))
10638 {
10639 scan += UTF8SKIP(scan);
10640 hardcount++;
10641 }
10642 break;
10643
10644 case ANYOFR_tb:
10645 while ( hardcount < max
10646 && scan < this_eol
10647 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10648 {
10649 scan++;
10650 hardcount++;
10651 }
10652 break;
10653
10654 case ANYOFRb_t8:
10655 while ( hardcount < max
10656 && scan < this_eol
10657 && (U8) *scan == ANYOF_FLAGS(p)
10658 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10659 (U8 *) this_eol,
10660 NULL),
10661 ANYOFRbase(p), ANYOFRdelta(p)))
10662 {
10663 scan += UTF8SKIP(scan);
10664 hardcount++;
10665 }
10666 break;
10667
10668 case ANYOFRb_tb:
10669 while ( hardcount < max
10670 && scan < this_eol
10671 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10672 {
10673 scan++;
10674 hardcount++;
10675 }
10676 break;
10677
10678 /* The argument (FLAGS) to all the POSIX node types is the class number */
10679
10680 case NPOSIXL_tb:
10681 to_complement = 1;
10682 /* FALLTHROUGH */
10683
10684 case POSIXL_tb:
10685 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10686 while ( scan < this_eol
10687 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10688 {
10689 scan++;
10690 }
10691 break;
10692
10693 case NPOSIXL_t8:
10694 to_complement = 1;
10695 /* FALLTHROUGH */
10696
10697 case POSIXL_t8:
10698 while ( hardcount < max && scan < this_eol
10699 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10700 (U8 *) scan,
10701 (U8 *) this_eol)))
10702 {
10703 scan += UTF8SKIP(scan);
10704 hardcount++;
10705 }
10706 break;
10707
10708 case POSIXD_tb:
10709 /* FALLTHROUGH */
10710
10711 case POSIXA_t8:
10712 if (this_eol - scan > max) {
10713
10714 /* We didn't adjust <this_eol> at the beginning of this routine
10715 * because is UTF-8, but it is actually ok to do so, since here, to
10716 * match, 1 char == 1 byte. */
10717 this_eol = scan + max;
10718 }
10719 /* FALLTHROUGH */
10720
10721 case POSIXA_tb:
10722 while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10723 scan++;
10724 }
10725 break;
10726
10727 case NPOSIXD_tb:
10728 /* FALLTHROUGH */
10729
10730 case NPOSIXA_tb:
10731 while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10732 scan++;
10733 }
10734 break;
10735
10736 case NPOSIXA_t8:
10737
10738 /* The complement of something that matches only ASCII matches all
10739 * non-ASCII, plus everything in ASCII that isn't in the class. */
10740 while ( hardcount < max && scan < this_eol
10741 && ( ! isASCII_utf8_safe(scan, loceol)
10742 || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10743 {
10744 scan += UTF8SKIP(scan);
10745 hardcount++;
10746 }
10747 break;
10748
10749 case NPOSIXU_tb:
10750 to_complement = 1;
10751 /* FALLTHROUGH */
10752
10753 case POSIXU_tb:
10754 while ( scan < this_eol
10755 && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10756 {
10757 scan++;
10758 }
10759 break;
10760
10761 case NPOSIXU_t8:
10762 case NPOSIXD_t8:
10763 to_complement = 1;
10764 /* FALLTHROUGH */
10765
10766 case POSIXD_t8:
10767 case POSIXU_t8:
10768 classnum = (char_class_number_) FLAGS(p);
10769 switch (classnum) {
10770 default:
10771 while ( hardcount < max && scan < this_eol
10772 && to_complement
10773 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10774 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10775 {
10776 scan += UTF8SKIP(scan);
10777 hardcount++;
10778 }
10779 break;
10780
10781 /* For the classes below, the knowledge of how to handle every code
10782 * point is compiled into Perl via a macro. This code is written
10783 * for making the loops as tight as possible. It could be
10784 * refactored to save space instead. */
10785
10786 case CC_ENUM_SPACE_:
10787 while ( hardcount < max
10788 && scan < this_eol
10789 && (to_complement
10790 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10791 {
10792 scan += UTF8SKIP(scan);
10793 hardcount++;
10794 }
10795 break;
10796 case CC_ENUM_BLANK_:
10797 while ( hardcount < max
10798 && scan < this_eol
10799 && (to_complement
10800 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10801 {
10802 scan += UTF8SKIP(scan);
10803 hardcount++;
10804 }
10805 break;
10806 case CC_ENUM_XDIGIT_:
10807 while ( hardcount < max
10808 && scan < this_eol
10809 && (to_complement
10810 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10811 {
10812 scan += UTF8SKIP(scan);
10813 hardcount++;
10814 }
10815 break;
10816 case CC_ENUM_VERTSPACE_:
10817 while ( hardcount < max
10818 && scan < this_eol
10819 && (to_complement
10820 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10821 {
10822 scan += UTF8SKIP(scan);
10823 hardcount++;
10824 }
10825 break;
10826 case CC_ENUM_CNTRL_:
10827 while ( hardcount < max
10828 && scan < this_eol
10829 && (to_complement
10830 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10831 {
10832 scan += UTF8SKIP(scan);
10833 hardcount++;
10834 }
10835 break;
10836 }
10837 break;
10838
10839 case LNBREAK_t8:
10840 while ( hardcount < max && scan < this_eol
10841 && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10842 {
10843 scan += c;
10844 hardcount++;
10845 }
10846 break;
10847
10848 case LNBREAK_tb:
10849 /* LNBREAK can match one or two latin chars, which is ok, but we have
10850 * to use hardcount in this situation, and throw away the adjustment to
10851 * <this_eol> done before the switch statement */
10852 while (
10853 hardcount < max && scan < loceol
10854 && (c = is_LNBREAK_latin1_safe(scan, loceol))
10855 ) {
10856 scan += c;
10857 hardcount++;
10858 }
10859 break;
10860
10861 default:
10862 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10863 " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10864 NOT_REACHED; /* NOTREACHED */
10865
10866 }
10867
10868 if (hardcount)
10869 c = hardcount;
10870 else
10871 c = scan - *startposp;
10872 *startposp = scan;
10873
10874 DEBUG_r({
10875 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10876 DEBUG_EXECUTE_r({
10877 SV * const prop = sv_newmortal();
10878 regprop(prog, prop, p, reginfo, NULL);
10879 Perl_re_exec_indentf( aTHX_
10880 "%s can match %" IVdf " times out of %" IVdf "...\n",
10881 depth, SvPVX_const(prop),(IV)c,(IV)max);
10882 });
10883 });
10884
10885 return(c);
10886 }
10887
10888 /*
10889 - reginclass - determine if a character falls into a character class
10890
10891 n is the ANYOF-type regnode
10892 p is the target string
10893 p_end points to one byte beyond the end of the target string
10894 utf8_target tells whether p is in UTF-8.
10895
10896 Returns true if matched; false otherwise.
10897
10898 Note that this can be a synthetic start class, a combination of various
10899 nodes, so things you think might be mutually exclusive, such as locale,
10900 aren't. It can match both locale and non-locale
10901
10902 */
10903
10904 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10905 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10906 {
10907 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10908 ? 0
10909 : ANYOF_FLAGS(n);
10910 bool match = FALSE;
10911 UV c = *p;
10912
10913 PERL_ARGS_ASSERT_REGINCLASS;
10914
10915 /* If c is not already the code point, get it. Note that
10916 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10917 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10918 STRLEN c_len = 0;
10919 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10920 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10921 if (c_len == (STRLEN)-1) {
10922 _force_out_malformed_utf8_message(p, p_end,
10923 utf8n_flags,
10924 1 /* 1 means die */ );
10925 NOT_REACHED; /* NOTREACHED */
10926 }
10927 if ( c > 255
10928 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10929 && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10930 {
10931 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10932 }
10933 }
10934
10935 /* If this character is potentially in the bitmap, check it */
10936 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10937 if (ANYOF_BITMAP_TEST(n, c))
10938 match = TRUE;
10939 else if ( (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10940 && OP(n) == ANYOFD
10941 && ! utf8_target
10942 && ! isASCII(c))
10943 {
10944 match = TRUE;
10945 }
10946 else if (flags & ANYOF_LOCALE_FLAGS) {
10947 if ( (flags & ANYOFL_FOLD)
10948 && c < 256
10949 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10950 {
10951 match = TRUE;
10952 }
10953 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10954 && c <= U8_MAX /* param to isFOO_lc() */
10955 ) {
10956 /* The data structure is arranged so bits 0, 2, 4, ... are set
10957 * if the class includes the Posix character class given by
10958 * bit/2; and 1, 3, 5, ... are set if the class includes the
10959 * complemented Posix class given by int(bit/2), so the
10960 * remainder modulo 2 tells us if to complement or not.
10961 *
10962 * Note that this code assumes that all the classes are closed
10963 * under folding. For example, if a character matches \w, then
10964 * its fold does too; and vice versa. This should be true for
10965 * any well-behaved locale for all the currently defined Posix
10966 * classes, except for :lower: and :upper:, which are handled
10967 * by the pseudo-class :cased: which matches if either of the
10968 * other two does. To get rid of this assumption, an outer
10969 * loop could be used below to iterate over both the source
10970 * character, and its fold (if different) */
10971
10972 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10973
10974 do {
10975 /* Find the next set bit indicating a class to try matching
10976 * against */
10977 U8 bit_pos = lsbit_pos32(posixl_bits);
10978
10979 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10980 match = TRUE;
10981 break;
10982 }
10983
10984 /* Remove this class from consideration; repeat */
10985 POSIXL_CLEAR(posixl_bits, bit_pos);
10986 } while(posixl_bits != 0);
10987 }
10988 }
10989 }
10990
10991 /* If the bitmap didn't (or couldn't) match, and something outside the
10992 * bitmap could match, try that. */
10993 if (!match) {
10994 if ( c >= NUM_ANYOF_CODE_POINTS
10995 && ANYOF_ONLY_HAS_BITMAP(n)
10996 && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10997 {
10998 /* In this case, the ARG is set up so that the final bit indicates
10999 * whether it matches or not */
11000 match = ARG1u(n) & 1;
11001 }
11002 else
11003 /* Here, the main way it could match is if the code point is
11004 * outside the bitmap and an inversion list exists to handle such
11005 * things. The other ways all occur when a flag is set to indicate
11006 * we need special handling. That handling doesn't come in to
11007 * effect for ANYOFD nodes unless the target string is UTF-8 and
11008 * that matters to code point being matched. */
11009 if ( c >= NUM_ANYOF_CODE_POINTS
11010 || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
11011 && ( UNLIKELY(OP(n) != ANYOFD)
11012 || (utf8_target && ! isASCII_uvchr(c)
11013 # if NUM_ANYOF_CODE_POINTS > 256
11014 && c < 256
11015 # endif
11016 ))))
11017 {
11018 /* Here, we have an inversion list for outside-the-bitmap code
11019 * points and/or the flag is set indicating special handling is
11020 * needed. The flag is set when there is auxiliary data beyond the
11021 * normal inversion list, or if there is the possibility of a
11022 * special Turkic locale match, even without auxiliary data.
11023 *
11024 * First check if there is an inversion list and/or auxiliary data.
11025 * */
11026 if (ANYOF_HAS_AUX(n)) {
11027 SV* only_utf8_locale = NULL;
11028
11029 /* This call will return in 'definition' the union of the base
11030 * non-bitmap inversion list, if any, plus the deferred
11031 * definitions of user-defined properties, if any. It croaks
11032 * if there is such a property but which still has no definition
11033 * available */
11034 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
11035 &only_utf8_locale, NULL);
11036 if (definition) {
11037 /* Most likely is the outside-the-bitmap inversion list. */
11038 if (_invlist_contains_cp(definition, c)) {
11039 match = TRUE;
11040 }
11041 else /* Failing that, hardcode the two tests for a Turkic
11042 locale */
11043 if ( UNLIKELY(IN_UTF8_TURKIC_LOCALE)
11044 && isALPHA_FOLD_EQ(*p, 'i'))
11045 {
11046 /* Turkish locales have these hard-coded rules
11047 * overriding normal ones */
11048 if (*p == 'i') {
11049 if (_invlist_contains_cp(definition,
11050 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11051 {
11052 match = TRUE;
11053 }
11054 }
11055 else if (_invlist_contains_cp(definition,
11056 LATIN_SMALL_LETTER_DOTLESS_I))
11057 {
11058 match = TRUE;
11059 }
11060 }
11061 }
11062
11063 if ( UNLIKELY(only_utf8_locale)
11064 && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
11065 && ! match)
11066 {
11067 match = _invlist_contains_cp(only_utf8_locale, c);
11068 }
11069 }
11070
11071 /* In a Turkic locale under folding, hard-code the I i case pair
11072 * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
11073 * flag set unless [Ii] were match possibilities */
11074 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) {
11075 if (utf8_target) {
11076 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11077 if (ANYOF_BITMAP_TEST(n, 'i')) {
11078 match = TRUE;
11079 }
11080 }
11081 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
11082 if (ANYOF_BITMAP_TEST(n, 'I')) {
11083 match = TRUE;
11084 }
11085 }
11086 }
11087
11088 #if NUM_ANYOF_CODE_POINTS > 256
11089 /* Larger bitmap means these special cases aren't handled
11090 * outside the bitmap above. */
11091 if (*p == 'i') {
11092 if (ANYOF_BITMAP_TEST(n,
11093 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11094 {
11095 match = TRUE;
11096 }
11097 }
11098 else if (*p == 'I') {
11099 if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
11100 match = TRUE;
11101 }
11102 }
11103 #endif
11104 }
11105 }
11106
11107 if ( UNICODE_IS_SUPER(c)
11108 && (flags & ANYOF_WARN_SUPER__shared)
11109 && OP(n) != ANYOFD
11110 && ckWARN_d(WARN_NON_UNICODE))
11111 {
11112 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
11113 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
11114 }
11115 }
11116
11117 #if ANYOF_INVERT != 1
11118 /* Depending on compiler optimization cBOOL takes time, so if don't have to
11119 * use it, don't */
11120 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
11121 #endif
11122
11123 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
11124 return (flags & ANYOF_INVERT) ^ match;
11125 }
11126
11127 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)11128 S_reghop3(U8 *s, SSize_t off, const U8* lim)
11129 {
11130 /* return the position 'off' UTF-8 characters away from 's', forward if
11131 * 'off' >= 0, backwards if negative. But don't go outside of position
11132 * 'lim', which better be < s if off < 0 */
11133
11134 PERL_ARGS_ASSERT_REGHOP3;
11135
11136 if (off >= 0) {
11137 while (off-- && s < lim) {
11138 /* XXX could check well-formedness here */
11139 U8 *new_s = s + UTF8SKIP(s);
11140 if (new_s > lim) /* lim may be in the middle of a long character */
11141 return s;
11142 s = new_s;
11143 }
11144 }
11145 else {
11146 while (off++ && s > lim) {
11147 s--;
11148 if (UTF8_IS_CONTINUED(*s)) {
11149 while (s > lim && UTF8_IS_CONTINUATION(*s))
11150 s--;
11151 if (! UTF8_IS_START(*s)) {
11152 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11153 }
11154 }
11155 /* XXX could check well-formedness here */
11156 }
11157 }
11158 return s;
11159 }
11160
11161 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)11162 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
11163 {
11164 PERL_ARGS_ASSERT_REGHOP4;
11165
11166 return utf8_hop_safe(s, off, llim, rlim);
11167 }
11168
11169 /* like reghop3, but returns NULL on overrun, rather than returning last
11170 * char pos */
11171
11172 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)11173 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
11174 {
11175 PERL_ARGS_ASSERT_REGHOPMAYBE3;
11176
11177 if (off >= 0) {
11178 while (off-- && s < lim) {
11179 /* XXX could check well-formedness here */
11180 s += UTF8SKIP(s);
11181 }
11182 if (off >= 0)
11183 return NULL;
11184 }
11185 else {
11186 while (off++ && s > lim) {
11187 s--;
11188 if (UTF8_IS_CONTINUED(*s)) {
11189 while (s > lim && UTF8_IS_CONTINUATION(*s))
11190 s--;
11191 if (! UTF8_IS_START(*s)) {
11192 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11193 }
11194 }
11195 /* XXX could check well-formedness here */
11196 }
11197 if (off <= 0)
11198 return NULL;
11199 }
11200 return s;
11201 }
11202
11203
11204 /* when executing a regex that may have (?{}), extra stuff needs setting
11205 up that will be visible to the called code, even before the current
11206 match has finished. In particular:
11207
11208 * $_ is localised to the SV currently being matched;
11209 * pos($_) is created if necessary, ready to be updated on each call-out
11210 to code;
11211 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11212 isn't set until the current pattern is successfully finished), so that
11213 $1 etc of the match-so-far can be seen;
11214 * save the old values of subbeg etc of the current regex, and set then
11215 to the current string (again, this is normally only done at the end
11216 of execution)
11217 */
11218
11219 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)11220 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11221 {
11222 MAGIC *mg;
11223 regexp *const rex = ReANY(reginfo->prog);
11224 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11225
11226 eval_state->rex = rex;
11227 eval_state->sv = reginfo->sv;
11228
11229 if (reginfo->sv) {
11230 /* Make $_ available to executed code. */
11231 if (reginfo->sv != DEFSV) {
11232 SAVE_DEFSV;
11233 DEFSV_set(reginfo->sv);
11234 }
11235 /* will be dec'd by S_cleanup_regmatch_info_aux */
11236 SvREFCNT_inc_NN(reginfo->sv);
11237
11238 if (!(mg = mg_find_mglob(reginfo->sv))) {
11239 /* prepare for quick setting of pos */
11240 mg = sv_magicext_mglob(reginfo->sv);
11241 mg->mg_len = -1;
11242 }
11243 eval_state->pos_magic = mg;
11244 eval_state->pos = mg->mg_len;
11245 eval_state->pos_flags = mg->mg_flags;
11246 }
11247 else
11248 eval_state->pos_magic = NULL;
11249
11250 if (!PL_reg_curpm) {
11251 /* PL_reg_curpm is a fake PMOP that we can attach the current
11252 * regex to and point PL_curpm at, so that $1 et al are visible
11253 * within a /(?{})/. It's just allocated once per interpreter the
11254 * first time its needed */
11255 Newxz(PL_reg_curpm, 1, PMOP);
11256 #ifdef USE_ITHREADS
11257 {
11258 SV* const repointer = &PL_sv_undef;
11259 /* this regexp is also owned by the new PL_reg_curpm, which
11260 will try to free it. */
11261 av_push(PL_regex_padav, repointer);
11262 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11263 PL_regex_pad = AvARRAY(PL_regex_padav);
11264 }
11265 #endif
11266 }
11267 SET_reg_curpm(reginfo->prog);
11268 eval_state->curpm = PL_curpm;
11269 PL_curpm_under = PL_curpm;
11270 PL_curpm = PL_reg_curpm;
11271 if (RXp_MATCH_COPIED(rex)) {
11272 /* Here is a serious problem: we cannot rewrite subbeg,
11273 since it may be needed if this match fails. Thus
11274 $` inside (?{}) could fail... */
11275 eval_state->subbeg = RXp_SUBBEG(rex);
11276 eval_state->sublen = RXp_SUBLEN(rex);
11277 eval_state->suboffset = RXp_SUBOFFSET(rex);
11278 eval_state->subcoffset = RXp_SUBCOFFSET(rex);
11279 #ifdef PERL_ANY_COW
11280 eval_state->saved_copy = RXp_SAVED_COPY(rex);
11281 #endif
11282 RXp_MATCH_COPIED_off(rex);
11283 }
11284 else
11285 eval_state->subbeg = NULL;
11286 RXp_SUBBEG(rex) = (char *)reginfo->strbeg;
11287 RXp_SUBOFFSET(rex) = 0;
11288 RXp_SUBCOFFSET(rex) = 0;
11289 RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg;
11290 }
11291
11292
11293 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11294
11295 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)11296 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11297 {
11298 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11299 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
11300 regmatch_slab *s;
11301
11302 Safefree(aux->poscache);
11303
11304 if (eval_state) {
11305
11306 /* undo the effects of S_setup_eval_state() */
11307
11308 if (eval_state->subbeg) {
11309 regexp * const rex = eval_state->rex;
11310 RXp_SUBBEG(rex) = eval_state->subbeg;
11311 RXp_SUBLEN(rex) = eval_state->sublen;
11312 RXp_SUBOFFSET(rex) = eval_state->suboffset;
11313 RXp_SUBCOFFSET(rex) = eval_state->subcoffset;
11314 #ifdef PERL_ANY_COW
11315 RXp_SAVED_COPY(rex) = eval_state->saved_copy;
11316 #endif
11317 RXp_MATCH_COPIED_on(rex);
11318 }
11319 if (eval_state->pos_magic)
11320 {
11321 eval_state->pos_magic->mg_len = eval_state->pos;
11322 eval_state->pos_magic->mg_flags =
11323 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11324 | (eval_state->pos_flags & MGf_BYTES);
11325 }
11326
11327 PL_curpm = eval_state->curpm;
11328 SvREFCNT_dec(eval_state->sv);
11329 }
11330
11331 PL_regmatch_state = aux->old_regmatch_state;
11332 PL_regmatch_slab = aux->old_regmatch_slab;
11333
11334 /* free all slabs above current one - this must be the last action
11335 * of this function, as aux and eval_state are allocated within
11336 * slabs and may be freed here */
11337
11338 s = PL_regmatch_slab->next;
11339 if (s) {
11340 PL_regmatch_slab->next = NULL;
11341 while (s) {
11342 regmatch_slab * const osl = s;
11343 s = s->next;
11344 Safefree(osl);
11345 }
11346 }
11347 }
11348
11349
11350 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)11351 S_to_utf8_substr(pTHX_ regexp *prog)
11352 {
11353 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11354 * on the converted value */
11355
11356 int i = 1;
11357
11358 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11359
11360 do {
11361 if (prog->substrs->data[i].substr
11362 && !prog->substrs->data[i].utf8_substr) {
11363 SV* const sv = newSVsv(prog->substrs->data[i].substr);
11364 prog->substrs->data[i].utf8_substr = sv;
11365 sv_utf8_upgrade(sv);
11366 if (SvVALID(prog->substrs->data[i].substr)) {
11367 if (SvTAIL(prog->substrs->data[i].substr)) {
11368 /* Trim the trailing \n that fbm_compile added last
11369 time. */
11370 SvCUR_set(sv, SvCUR(sv) - 1);
11371 /* Whilst this makes the SV technically "invalid" (as its
11372 buffer is no longer followed by "\0") when fbm_compile()
11373 adds the "\n" back, a "\0" is restored. */
11374 fbm_compile(sv, FBMcf_TAIL);
11375 } else
11376 fbm_compile(sv, 0);
11377 }
11378 if (prog->substrs->data[i].substr == prog->check_substr)
11379 prog->check_utf8 = sv;
11380 }
11381 } while (i--);
11382 }
11383
11384 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11385 S_to_byte_substr(pTHX_ regexp *prog)
11386 {
11387 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11388 * on the converted value; returns FALSE if can't be converted. */
11389
11390 int i = 1;
11391
11392 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11393
11394 do {
11395 if (prog->substrs->data[i].utf8_substr
11396 && !prog->substrs->data[i].substr) {
11397 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11398 if (! sv_utf8_downgrade(sv, TRUE)) {
11399 SvREFCNT_dec_NN(sv);
11400 return FALSE;
11401 }
11402 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11403 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11404 /* Trim the trailing \n that fbm_compile added last
11405 time. */
11406 SvCUR_set(sv, SvCUR(sv) - 1);
11407 fbm_compile(sv, FBMcf_TAIL);
11408 } else
11409 fbm_compile(sv, 0);
11410 }
11411 prog->substrs->data[i].substr = sv;
11412 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11413 prog->check_substr = sv;
11414 }
11415 } while (i--);
11416
11417 return TRUE;
11418 }
11419
11420 #ifndef PERL_IN_XSUB_RE
11421
11422 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11423 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11424 {
11425 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11426 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11427 * the larger string bounded by 'strbeg' and 'strend'.
11428 *
11429 * 'cp' needs to be assigned (if not, a future version of the Unicode
11430 * Standard could make it something that combines with adjacent characters,
11431 * so code using it would then break), and there has to be a GCB break
11432 * before and after the character. */
11433
11434
11435 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11436 const U8 * prev_cp_start;
11437
11438 PERL_ARGS_ASSERT_IS_GRAPHEME;
11439
11440 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11441 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11442 {
11443 /* These are considered graphemes */
11444 return TRUE;
11445 }
11446
11447 /* Otherwise, unassigned code points are forbidden */
11448 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11449 _invlist_search(PL_Assigned_invlist, cp))))
11450 {
11451 return FALSE;
11452 }
11453
11454 cp_gcb_val = getGCB_VAL_CP(cp);
11455
11456 /* Find the GCB value of the previous code point in the input */
11457 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11458 if (UNLIKELY(prev_cp_start == s)) {
11459 prev_cp_gcb_val = GCB_EDGE;
11460 }
11461 else {
11462 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11463 }
11464
11465 /* And check that is a grapheme boundary */
11466 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11467 TRUE /* is UTF-8 encoded */ ))
11468 {
11469 return FALSE;
11470 }
11471
11472 /* Similarly verify there is a break between the current character and the
11473 * following one */
11474 s += UTF8SKIP(s);
11475 if (s >= strend) {
11476 next_cp_gcb_val = GCB_EDGE;
11477 }
11478 else {
11479 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11480 }
11481
11482 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11483 }
11484
11485 /*
11486 =for apidoc_section $unicode
11487
11488 =for apidoc isSCRIPT_RUN
11489
11490 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11491 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11492 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11493 two degenerate cases given below, this function returns TRUE iff all code
11494 points in it come from any combination of three "scripts" given by the Unicode
11495 "Script Extensions" property: Common, Inherited, and possibly one other.
11496 Additionally all decimal digits must come from the same consecutive sequence of
11497 10.
11498
11499 For example, if all the characters in the sequence are Greek, or Common, or
11500 Inherited, this function will return TRUE, provided any decimal digits in it
11501 are from the same block of digits in Common. (These are the ASCII digits
11502 "0".."9" and additionally a block for full width forms of these, and several
11503 others used in mathematical notation.) For scripts (unlike Greek) that have
11504 their own digits defined this will accept either digits from that set or from
11505 one of the Common digit sets, but not a combination of the two. Some scripts,
11506 such as Arabic, have more than one set of digits. All digits must come from
11507 the same set for this function to return TRUE.
11508
11509 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11510 contain the script found, using the C<SCX_enum> typedef. Its value will be
11511 C<SCX_INVALID> if the function returns FALSE.
11512
11513 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11514 will be C<SCX_INVALID>.
11515
11516 If the sequence contains a single code point which is unassigned to a character
11517 in the version of Unicode being used, the function will return TRUE, and the
11518 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11519 in the input sequence will result in the function treating the input as not
11520 being a script run.
11521
11522 The returned script will be C<SCX_Inherited> iff all the code points in it are
11523 from the Inherited script.
11524
11525 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11526 it are from the Inherited or Common scripts.
11527
11528 =cut
11529
11530 */
11531
11532 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11533 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11534 {
11535 /* Basically, it looks at each character in the sequence to see if the
11536 * above conditions are met; if not it fails. It uses an inversion map to
11537 * find the enum corresponding to the script of each character. But this
11538 * is complicated by the fact that a few code points can be in any of
11539 * several scripts. The data has been constructed so that there are
11540 * additional enum values (all negative) for these situations. The
11541 * absolute value of those is an index into another table which contains
11542 * pointers to auxiliary tables for each such situation. Each aux array
11543 * lists all the scripts for the given situation. There is another,
11544 * parallel, table that gives the number of entries in each aux table.
11545 * These are all defined in charclass_invlists.h */
11546
11547 /* XXX Here are the additional things UTS 39 says could be done:
11548 *
11549 * Forbid sequences of the same nonspacing mark
11550 *
11551 * Check to see that all the characters are in the sets of exemplar
11552 * characters for at least one language in the Unicode Common Locale Data
11553 * Repository [CLDR]. */
11554
11555
11556 /* Things that match /\d/u */
11557 SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11558 UV * decimals_array = invlist_array(decimals_invlist);
11559
11560 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11561 * not currently known) */
11562 UV zero_of_run = 0;
11563
11564 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11565 SCX_enum script_of_char = SCX_INVALID;
11566
11567 /* If the script remains not fully determined from iteration to iteration,
11568 * this is the current intersection of the possiblities. */
11569 SCX_enum * intersection = NULL;
11570 PERL_UINT_FAST8_T intersection_len = 0;
11571
11572 bool retval = TRUE;
11573 SCX_enum * ret_script = NULL;
11574
11575 assert(send >= s);
11576
11577 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11578
11579 /* All code points in 0..255 are either Common or Latin, so must be a
11580 * script run. We can return immediately unless we need to know which
11581 * script it is. */
11582 if (! utf8_target && LIKELY(send > s)) {
11583 if (ret_script == NULL) {
11584 return TRUE;
11585 }
11586
11587 /* If any character is Latin, the run is Latin */
11588 while (s < send) {
11589 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11590 *ret_script = SCX_Latin;
11591 return TRUE;
11592 }
11593 }
11594
11595 /* Here, all are Common */
11596 *ret_script = SCX_Common;
11597 return TRUE;
11598 }
11599
11600 /* Look at each character in the sequence */
11601 while (s < send) {
11602 /* If the current character being examined is a digit, this is the code
11603 * point of the zero for its sequence of 10 */
11604 UV zero_of_char;
11605
11606 UV cp;
11607
11608 /* The code allows all scripts to use the ASCII digits. This is
11609 * because they are in the Common script. Hence any ASCII ones found
11610 * are ok, unless and until a digit from another set has already been
11611 * encountered. digit ranges in Common are not similarly blessed) */
11612 if (UNLIKELY(isDIGIT(*s))) {
11613 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11614 retval = FALSE;
11615 break;
11616 }
11617 if (zero_of_run) {
11618 if (zero_of_run != '0') {
11619 retval = FALSE;
11620 break;
11621 }
11622 }
11623 else {
11624 zero_of_run = '0';
11625 }
11626 s++;
11627 continue;
11628 }
11629
11630 /* Here, isn't an ASCII digit. Find the code point of the character */
11631 if (! UTF8_IS_INVARIANT(*s)) {
11632 Size_t len;
11633 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11634 s += len;
11635 }
11636 else {
11637 cp = *(s++);
11638 }
11639
11640 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11641 * digit in that script. We can skip the rest of this code for this
11642 * character. */
11643 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11644 continue;
11645 }
11646
11647 /* Find the character's script. The correct values are hard-coded here
11648 * for small-enough code points. */
11649 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11650 unlikely to change */
11651 if ( cp > 255
11652 || ( isALPHA_L1(cp)
11653 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11654 {
11655 script_of_char = SCX_Latin;
11656 }
11657 else {
11658 script_of_char = SCX_Common;
11659 }
11660 }
11661 else {
11662 script_of_char = _Perl_SCX_invmap[
11663 _invlist_search(PL_SCX_invlist, cp)];
11664 }
11665
11666 /* We arbitrarily accept a single unassigned character, but not in
11667 * combination with anything else, and not a run of them. */
11668 if ( UNLIKELY(script_of_run == SCX_Unknown)
11669 || UNLIKELY( script_of_run != SCX_INVALID
11670 && script_of_char == SCX_Unknown))
11671 {
11672 retval = FALSE;
11673 break;
11674 }
11675
11676 /* For the first character, or the run is inherited, the run's script
11677 * is set to the char's */
11678 if ( UNLIKELY(script_of_run == SCX_INVALID)
11679 || UNLIKELY(script_of_run == SCX_Inherited))
11680 {
11681 script_of_run = script_of_char;
11682 }
11683
11684 /* For the character's script to be Unknown, it must be the first
11685 * character in the sequence (for otherwise a test above would have
11686 * prevented us from reaching here), and we have set the run's script
11687 * to it. Nothing further to be done for this character */
11688 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11689 continue;
11690 }
11691
11692 /* We accept 'inherited' script characters currently even at the
11693 * beginning. (We know that no characters in Inherited are digits, or
11694 * we'd have to check for that) */
11695 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11696 continue;
11697 }
11698
11699 /* If the run so far is Common, and the new character isn't, change the
11700 * run's script to that of this character */
11701 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11702 script_of_run = script_of_char;
11703 }
11704
11705 /* Now we can see if the script of the new character is the same as
11706 * that of the run */
11707 if (LIKELY(script_of_char == script_of_run)) {
11708 /* By far the most common case */
11709 goto scripts_match;
11710 }
11711
11712 /* Here, the script of the run isn't Common. But characters in Common
11713 * match any script */
11714 if (script_of_char == SCX_Common) {
11715 goto scripts_match;
11716 }
11717
11718 #ifndef HAS_SCX_AUX_TABLES
11719
11720 /* Too early a Unicode version to have a code point belonging to more
11721 * than one script, so, if the scripts don't exactly match, fail */
11722 PERL_UNUSED_VAR(intersection_len);
11723 retval = FALSE;
11724 break;
11725
11726 #else
11727
11728 /* Here there is no exact match between the character's script and the
11729 * run's. And we've handled the special cases of scripts Unknown,
11730 * Inherited, and Common.
11731 *
11732 * Negative script numbers signify that the value may be any of several
11733 * scripts, and we need to look at auxiliary information to make our
11734 * determination. But if both are non-negative, we can fail now */
11735 if (LIKELY(script_of_char >= 0)) {
11736 const SCX_enum * search_in;
11737 PERL_UINT_FAST8_T search_in_len;
11738 PERL_UINT_FAST8_T i;
11739
11740 if (LIKELY(script_of_run >= 0)) {
11741 retval = FALSE;
11742 break;
11743 }
11744
11745 /* Use the previously constructed set of possible scripts, if any.
11746 * */
11747 if (intersection) {
11748 search_in = intersection;
11749 search_in_len = intersection_len;
11750 }
11751 else {
11752 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11753 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11754 }
11755
11756 for (i = 0; i < search_in_len; i++) {
11757 if (search_in[i] == script_of_char) {
11758 script_of_run = script_of_char;
11759 goto scripts_match;
11760 }
11761 }
11762
11763 retval = FALSE;
11764 break;
11765 }
11766 else if (LIKELY(script_of_run >= 0)) {
11767 /* script of character could be one of several, but run is a single
11768 * script */
11769 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11770 const PERL_UINT_FAST8_T search_in_len
11771 = SCX_AUX_TABLE_lengths[-script_of_char];
11772 PERL_UINT_FAST8_T i;
11773
11774 for (i = 0; i < search_in_len; i++) {
11775 if (search_in[i] == script_of_run) {
11776 script_of_char = script_of_run;
11777 goto scripts_match;
11778 }
11779 }
11780
11781 retval = FALSE;
11782 break;
11783 }
11784 else {
11785 /* Both run and char could be in one of several scripts. If the
11786 * intersection is empty, then this character isn't in this script
11787 * run. Otherwise, we need to calculate the intersection to use
11788 * for future iterations of the loop, unless we are already at the
11789 * final character */
11790 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11791 const PERL_UINT_FAST8_T char_len
11792 = SCX_AUX_TABLE_lengths[-script_of_char];
11793 const SCX_enum * search_run;
11794 PERL_UINT_FAST8_T run_len;
11795
11796 SCX_enum * new_overlap = NULL;
11797 PERL_UINT_FAST8_T i, j;
11798
11799 if (intersection) {
11800 search_run = intersection;
11801 run_len = intersection_len;
11802 }
11803 else {
11804 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11805 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11806 }
11807
11808 intersection_len = 0;
11809
11810 for (i = 0; i < run_len; i++) {
11811 for (j = 0; j < char_len; j++) {
11812 if (search_run[i] == search_char[j]) {
11813
11814 /* Here, the script at i,j matches. That means this
11815 * character is in the run. But continue on to find
11816 * the complete intersection, for the next loop
11817 * iteration, and for the digit check after it.
11818 *
11819 * On the first found common script, we malloc space
11820 * for the intersection list for the worst case of the
11821 * intersection, which is the minimum of the number of
11822 * scripts remaining in each set. */
11823 if (intersection_len == 0) {
11824 Newx(new_overlap,
11825 MIN(run_len - i, char_len - j),
11826 SCX_enum);
11827 }
11828 new_overlap[intersection_len++] = search_run[i];
11829 }
11830 }
11831 }
11832
11833 /* Here we've looked through everything. If they have no scripts
11834 * in common, not a run */
11835 if (intersection_len == 0) {
11836 retval = FALSE;
11837 break;
11838 }
11839
11840 /* If there is only a single script in common, set to that.
11841 * Otherwise, use the intersection going forward */
11842 Safefree(intersection);
11843 intersection = NULL;
11844 if (intersection_len == 1) {
11845 script_of_run = script_of_char = new_overlap[0];
11846 Safefree(new_overlap);
11847 new_overlap = NULL;
11848 }
11849 else {
11850 intersection = new_overlap;
11851 }
11852 }
11853
11854 #endif
11855
11856 scripts_match:
11857
11858 /* Here, the script of the character is compatible with that of the
11859 * run. That means that in most cases, it continues the script run.
11860 * Either it and the run match exactly, or one or both can be in any of
11861 * several scripts, and the intersection is not empty. However, if the
11862 * character is a decimal digit, it could still mean failure if it is
11863 * from the wrong sequence of 10. So, we need to look at if it's a
11864 * digit. We've already handled the 10 digits [0-9], and the next
11865 * lowest one is this one: */
11866 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11867 continue; /* Not a digit; this character is part of the run */
11868 }
11869
11870 /* If we have a definitive '0' for the script of this character, we
11871 * know that for this to be a digit, it must be in the range of +0..+9
11872 * of that zero. */
11873 if ( script_of_char >= 0
11874 && (zero_of_char = script_zeros[script_of_char]))
11875 {
11876 if (! withinCOUNT(cp, zero_of_char, 9)) {
11877 continue; /* Not a digit; this character is part of the run
11878 */
11879 }
11880
11881 }
11882 else { /* Need to look up if this character is a digit or not */
11883 SSize_t index_of_zero_of_char;
11884 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11885 if ( UNLIKELY(index_of_zero_of_char < 0)
11886 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11887 {
11888 continue; /* Not a digit; this character is part of the run.
11889 */
11890 }
11891
11892 zero_of_char = decimals_array[index_of_zero_of_char];
11893 }
11894
11895 /* Here, the character is a decimal digit, and the zero of its sequence
11896 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11897 * they better be the same. */
11898 if (zero_of_run) {
11899 if (zero_of_run != zero_of_char) {
11900 retval = FALSE;
11901 break;
11902 }
11903 }
11904 else { /* Otherwise we now have a zero for this run */
11905 zero_of_run = zero_of_char;
11906 }
11907 } /* end of looping through CLOSESR text */
11908
11909 Safefree(intersection);
11910
11911 if (ret_script != NULL) {
11912 if (retval) {
11913 *ret_script = script_of_run;
11914 }
11915 else {
11916 *ret_script = SCX_INVALID;
11917 }
11918 }
11919
11920 return retval;
11921 }
11922 #endif /* ifndef PERL_IN_XSUB_RE */
11923
11924 /* Buffer logic. */
11925 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)11926 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
11927 const U32 flags)
11928 {
11929 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
11930
11931 PERL_UNUSED_ARG(value);
11932
11933 if (flags & RXapif_FETCH) {
11934 return reg_named_buff_fetch(rx, key, flags);
11935 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
11936 Perl_croak_no_modify();
11937 return NULL;
11938 } else if (flags & RXapif_EXISTS) {
11939 return reg_named_buff_exists(rx, key, flags)
11940 ? &PL_sv_yes
11941 : &PL_sv_no;
11942 } else if (flags & RXapif_REGNAMES) {
11943 return reg_named_buff_all(rx, flags);
11944 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
11945 return reg_named_buff_scalar(rx, flags);
11946 } else {
11947 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
11948 return NULL;
11949 }
11950 }
11951
11952 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)11953 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
11954 const U32 flags)
11955 {
11956 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
11957 PERL_UNUSED_ARG(lastkey);
11958
11959 if (flags & RXapif_FIRSTKEY)
11960 return reg_named_buff_firstkey(rx, flags);
11961 else if (flags & RXapif_NEXTKEY)
11962 return reg_named_buff_nextkey(rx, flags);
11963 else {
11964 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
11965 (int)flags);
11966 return NULL;
11967 }
11968 }
11969
11970 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)11971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
11972 const U32 flags)
11973 {
11974 SV *ret;
11975 struct regexp *const rx = ReANY(r);
11976
11977 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
11978
11979 if (rx && RXp_PAREN_NAMES(rx)) {
11980 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
11981 if (he_str) {
11982 IV i;
11983 SV* sv_dat=HeVAL(he_str);
11984 I32 *nums=(I32*)SvPVX(sv_dat);
11985 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
11986 for ( i=0; i<SvIVX(sv_dat); i++ ) {
11987 if ((I32)(rx->nparens) >= nums[i]
11988 && RXp_OFFS_VALID(rx,nums[i]))
11989 {
11990 ret = newSVpvs("");
11991 Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE);
11992 if (!retarray)
11993 return ret;
11994 } else {
11995 if (retarray)
11996 ret = newSV_type(SVt_NULL);
11997 }
11998 if (retarray)
11999 av_push_simple(retarray, ret);
12000 }
12001 if (retarray)
12002 return newRV_noinc(MUTABLE_SV(retarray));
12003 }
12004 }
12005 return NULL;
12006 }
12007
12008 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)12009 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
12010 const U32 flags)
12011 {
12012 struct regexp *const rx = ReANY(r);
12013
12014 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
12015
12016 if (rx && RXp_PAREN_NAMES(rx)) {
12017 if (flags & RXapif_ALL) {
12018 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
12019 } else {
12020 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
12021 if (sv) {
12022 SvREFCNT_dec_NN(sv);
12023 return TRUE;
12024 } else {
12025 return FALSE;
12026 }
12027 }
12028 } else {
12029 return FALSE;
12030 }
12031 }
12032
12033 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)12034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
12035 {
12036 struct regexp *const rx = ReANY(r);
12037
12038 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
12039
12040 if ( rx && RXp_PAREN_NAMES(rx) ) {
12041 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
12042
12043 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
12044 } else {
12045 return FALSE;
12046 }
12047 }
12048
12049 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)12050 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
12051 {
12052 struct regexp *const rx = ReANY(r);
12053 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12054
12055 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
12056
12057 if (rx && RXp_PAREN_NAMES(rx)) {
12058 HV *hv = RXp_PAREN_NAMES(rx);
12059 HE *temphe;
12060 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12061 IV i;
12062 IV parno = 0;
12063 SV* sv_dat = HeVAL(temphe);
12064 I32 *nums = (I32*)SvPVX(sv_dat);
12065 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12066 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12067 RXp_OFFS_VALID(rx,nums[i]))
12068 {
12069 parno = nums[i];
12070 break;
12071 }
12072 }
12073 if (parno || flags & RXapif_ALL) {
12074 return newSVhek(HeKEY_hek(temphe));
12075 }
12076 }
12077 }
12078 return NULL;
12079 }
12080
12081 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)12082 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
12083 {
12084 SV *ret;
12085 AV *av;
12086 SSize_t length;
12087 struct regexp *const rx = ReANY(r);
12088
12089 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
12090
12091 if (rx && RXp_PAREN_NAMES(rx)) {
12092 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
12093 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
12094 } else if (flags & RXapif_ONE) {
12095 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
12096 av = MUTABLE_AV(SvRV(ret));
12097 length = av_count(av);
12098 SvREFCNT_dec_NN(ret);
12099 return newSViv(length);
12100 } else {
12101 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
12102 (int)flags);
12103 return NULL;
12104 }
12105 }
12106 return &PL_sv_undef;
12107 }
12108
12109 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)12110 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
12111 {
12112 struct regexp *const rx = ReANY(r);
12113 AV *av = newAV();
12114
12115 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
12116
12117 if (rx && RXp_PAREN_NAMES(rx)) {
12118 HV *hv= RXp_PAREN_NAMES(rx);
12119 HE *temphe;
12120 (void)hv_iterinit(hv);
12121 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12122 IV i;
12123 IV parno = 0;
12124 SV* sv_dat = HeVAL(temphe);
12125 I32 *nums = (I32*)SvPVX(sv_dat);
12126 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12127 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12128 RXp_OFFS_VALID(rx,nums[i]))
12129 {
12130 parno = nums[i];
12131 break;
12132 }
12133 }
12134 if (parno || flags & RXapif_ALL) {
12135 av_push_simple(av, newSVhek(HeKEY_hek(temphe)));
12136 }
12137 }
12138 }
12139
12140 return newRV_noinc(MUTABLE_SV(av));
12141 }
12142
12143 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re,const I32 paren,SV * const sv)12144 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren,
12145 SV * const sv)
12146 {
12147 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
12148 Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0);
12149 }
12150
12151 #ifndef PERL_IN_XSUB_RE
12152
12153 void
Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re,const I32 paren,SV * const sv,U32 flags)12154 Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren,
12155 SV * const sv, U32 flags)
12156 {
12157 struct regexp *const rx = ReANY(re);
12158 char *s = NULL;
12159 SSize_t i,t = 0;
12160 SSize_t s1, t1;
12161 I32 n = paren;
12162 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12163
12164 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS;
12165
12166 if ( n == RX_BUFF_IDX_CARET_PREMATCH
12167 || n == RX_BUFF_IDX_CARET_FULLMATCH
12168 || n == RX_BUFF_IDX_CARET_POSTMATCH
12169 )
12170 {
12171 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12172 if (!keepcopy) {
12173 /* on something like
12174 * $r = qr/.../;
12175 * /$qr/p;
12176 * the KEEPCOPY is set on the PMOP rather than the regex */
12177 if (PL_curpm && re == PM_GETRE(PL_curpm))
12178 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12179 }
12180 if (!keepcopy)
12181 goto ret_undef;
12182 }
12183
12184 if (!RXp_SUBBEG(rx))
12185 goto ret_undef;
12186
12187 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
12188 /* no need to distinguish between them any more */
12189 n = RX_BUFF_IDX_FULLMATCH;
12190
12191 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
12192 && (i = RXp_OFFS_START(rx,0)) != -1)
12193 {
12194 /* $`, ${^PREMATCH} */
12195 s = RXp_SUBBEG(rx);
12196 }
12197 else
12198 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
12199 && (t = RXp_OFFS_END(rx,0)) != -1)
12200 {
12201 /* $', ${^POSTMATCH} */
12202 s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t;
12203 i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t;
12204 }
12205 else /* when flags is true we do an absolute lookup, and compare against rx->nparens */
12206 if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) {
12207 I32 *map = (!flags && n) ? rx->logical_to_parno : NULL;
12208 I32 true_parno = map ? map[n] : n;
12209 do {
12210 if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1) &&
12211 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1))
12212 {
12213 /* $&, ${^MATCH}, $1 ... */
12214 i = t1 - s1;
12215 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx);
12216 goto found_it;
12217 }
12218 else if (map) {
12219 true_parno = rx->parno_to_logical_next[true_parno];
12220 }
12221 else {
12222 break;
12223 }
12224 } while (true_parno);
12225 goto ret_undef;
12226 } else {
12227 goto ret_undef;
12228 }
12229
12230 found_it:
12231 assert(s >= RXp_SUBBEG(rx));
12232 assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) );
12233 if (i >= 0) {
12234 #ifdef NO_TAINT_SUPPORT
12235 sv_setpvn(sv, s, i);
12236 #else
12237 const int oldtainted = TAINT_get;
12238 TAINT_NOT;
12239 sv_setpvn(sv, s, i);
12240 TAINT_set(oldtainted);
12241 #endif
12242 if (RXp_MATCH_UTF8(rx))
12243 SvUTF8_on(sv);
12244 else
12245 SvUTF8_off(sv);
12246 if (TAINTING_get) {
12247 if (RXp_MATCH_TAINTED(rx)) {
12248 if (SvTYPE(sv) >= SVt_PVMG) {
12249 MAGIC* const mg = SvMAGIC(sv);
12250 MAGIC* mgt;
12251 TAINT;
12252 SvMAGIC_set(sv, mg->mg_moremagic);
12253 SvTAINT(sv);
12254 if ((mgt = SvMAGIC(sv))) {
12255 mg->mg_moremagic = mgt;
12256 SvMAGIC_set(sv, mg);
12257 }
12258 } else {
12259 TAINT;
12260 SvTAINT(sv);
12261 }
12262 } else
12263 SvTAINTED_off(sv);
12264 }
12265 } else {
12266 ret_undef:
12267 sv_set_undef(sv);
12268 return;
12269 }
12270 }
12271
12272 #endif
12273
12274 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)12275 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
12276 SV const * const value)
12277 {
12278 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
12279
12280 PERL_UNUSED_ARG(rx);
12281 PERL_UNUSED_ARG(paren);
12282 PERL_UNUSED_ARG(value);
12283
12284 if (!PL_localizing)
12285 Perl_croak_no_modify();
12286 }
12287
12288 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)12289 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
12290 const I32 paren)
12291 {
12292 struct regexp *const rx = ReANY(r);
12293 I32 i,j;
12294 I32 s1, t1;
12295 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12296
12297 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
12298
12299 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
12300 || paren == RX_BUFF_IDX_CARET_FULLMATCH
12301 || paren == RX_BUFF_IDX_CARET_POSTMATCH
12302 )
12303 {
12304 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12305 if (!keepcopy) {
12306 /* on something like
12307 * $r = qr/.../;
12308 * /$qr/p;
12309 * the KEEPCOPY is set on the PMOP rather than the regex */
12310 if (PL_curpm && r == PM_GETRE(PL_curpm))
12311 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12312 }
12313 if (!keepcopy)
12314 goto warn_undef;
12315 }
12316
12317 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
12318 switch (paren) {
12319 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
12320 case RX_BUFF_IDX_PREMATCH: /* $` */
12321 if ( (i = RXp_OFFS_START(rx,0)) != -1) {
12322 if (i > 0) {
12323 s1 = 0;
12324 t1 = i;
12325 goto getlen;
12326 }
12327 }
12328 return 0;
12329
12330 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
12331 case RX_BUFF_IDX_POSTMATCH: /* $' */
12332 if ( (j = RXp_OFFS_END(rx,0)) != -1 ) {
12333 i = RXp_SUBLEN(rx) - j;
12334 if (i > 0) {
12335 s1 = j;
12336 t1 = RXp_SUBLEN(rx);
12337 goto getlen;
12338 }
12339 }
12340 return 0;
12341
12342 default: /* $& / ${^MATCH}, $1, $2, ... */
12343 if (paren <= logical_nparens) {
12344 I32 true_paren = rx->logical_to_parno
12345 ? rx->logical_to_parno[paren]
12346 : paren;
12347 do {
12348 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) &&
12349 ((t1 = RXp_OFFS_END(rx,true_paren)) != -1))
12350 {
12351 i = t1 - s1;
12352 goto getlen;
12353 } else if (rx->parno_to_logical_next) {
12354 true_paren = rx->parno_to_logical_next[true_paren];
12355 } else {
12356 break;
12357 }
12358 } while(true_paren);
12359 }
12360 warn_undef:
12361 if (ckWARN(WARN_UNINITIALIZED))
12362 report_uninit((const SV *)sv);
12363 return 0;
12364 }
12365 getlen:
12366 if (i > 0 && RXp_MATCH_UTF8(rx)) {
12367 const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1;
12368 const U8 *ep;
12369 STRLEN el;
12370
12371 i = t1 - s1;
12372 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
12373 i = el;
12374 }
12375 return i;
12376 }
12377
12378 /*
12379 * ex: set ts=8 sts=4 sw=4 et:
12380 */
12381