1 /* regexec.c
2 */
3
4 /*
5 * One Ring to rule them all, One Ring to find them
6 *
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34 */
35
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39
40 /*
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
65 ****
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEX_ENGINE
75 #define PERL_IN_REGEXEC_C
76 #include "perl.h"
77
78 #ifdef PERL_IN_XSUB_RE
79 # include "re_comp.h"
80 #else
81 # include "regcomp.h"
82 #endif
83
84 #include "invlist_inline.h"
85 #include "unicode_constants.h"
86
87 static const char b_utf8_locale_required[] =
88 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
89 " Assuming a UTF-8 locale";
90
91 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 STMT_START { \
93 if (! IN_UTF8_CTYPE_LOCALE) { \
94 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
95 b_utf8_locale_required); \
96 } \
97 } STMT_END
98
99 static const char sets_utf8_locale_required[] =
100 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101
102 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 STMT_START { \
104 if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
105 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
106 sets_utf8_locale_required); \
107 } \
108 } STMT_END
109
110 #ifdef DEBUGGING
111 /* At least one required character in the target string is expressible only in
112 * UTF-8. */
113 static const char non_utf8_target_but_utf8_required[]
114 = "Can't match, because target string needs to be in UTF-8\n";
115 #endif
116
117 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
119 goto target; \
120 } STMT_END
121
122 #ifndef STATIC
123 #define STATIC static
124 #endif
125
126 /*
127 * Forwards.
128 */
129
130 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131
132 #define HOPc(pos,off) \
133 (char *)(reginfo->is_utf8_target \
134 ? reghop3((U8*)pos, off, \
135 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
136 : (U8*)(pos + off))
137
138 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
139 #define HOPBACK3(pos, off, lim) \
140 (reginfo->is_utf8_target \
141 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
142 : (pos - off >= lim) \
143 ? (U8*)pos - off \
144 : NULL)
145
146 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147
148 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
149 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150
151 /* lim must be +ve. Returns NULL on overshoot */
152 #define HOPMAYBE3(pos,off,lim) \
153 (reginfo->is_utf8_target \
154 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
155 : ((U8*)pos + off <= lim) \
156 ? (U8*)pos + off \
157 : NULL)
158
159 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
160 * off must be >=0; args should be vars rather than expressions */
161 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
162 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
163 : (U8*)((pos + off) > lim ? lim : (pos + off)))
164 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165
166 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
167 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 : (U8*)(pos + off))
169 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170
171 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
172 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173
174 /* for use after a quantifier and before an EXACT-like node -- japhy */
175 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 *
177 * NOTE that *nothing* that affects backtracking should be in here, specifically
178 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
179 * node that is in between two EXACT like nodes when ascertaining what the required
180 * "follow" character is. This should probably be moved to regex compile time
181 * although it may be done at run time because of the REF possibility - more
182 * investigation required. -- demerphq
183 */
184 #define JUMPABLE(rn) ( \
185 OP(rn) == OPEN || \
186 (OP(rn) == CLOSE && \
187 !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) || \
188 OP(rn) == EVAL || \
189 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
190 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 OP(rn) == KEEPS || \
192 (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0) \
193 )
194 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195
196 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
197
198 /*
199 Search for mandatory following text node; for lookahead, the text must
200 follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step.
201 */
202 #define FIND_NEXT_IMPT(rn) STMT_START { \
203 while (JUMPABLE(rn)) { \
204 const OPCODE type = OP(rn); \
205 if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
206 rn = REGNODE_AFTER_opcode(rn,type); \
207 else if (type == PLUS) \
208 rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
209 else if (type == IFMATCH) \
210 rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \
211 else rn += NEXT_OFF(rn); \
212 } \
213 } STMT_END
214
215 #define SLAB_FIRST(s) (&(s)->states[0])
216 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217
218 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
219 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
220 static regmatch_state * S_push_slab(pTHX);
221
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
226
227 STATIC CHECKPOINT
S_regcppush(pTHX_ const regexp * rex,I32 parenfloor,U32 maxopenparen comma_pDEPTH)228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH)
229 {
230 const int retval = PL_savestack_ix;
231 /* Number of bytes about to be stored in the stack */
232 const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor);
233 /* Number of savestack[] entries to be filled by the paren data */
234 /* Rounding is performed in case we are few elements short */
235 const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack);
236 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
237 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
238
239 DECLARE_AND_GET_RE_DEBUG_FLAGS;
240
241 PERL_ARGS_ASSERT_REGCPPUSH;
242
243 if (paren_elems_to_push < 0)
244 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i",
245 (int)paren_elems_to_push, (int)maxopenparen,
246 (int)parenfloor);
247
248 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
249 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
250 " out of range (%lu-%ld)",
251 total_elems,
252 (unsigned long)maxopenparen,
253 (long)parenfloor);
254
255 DEBUG_BUFFERS_r(
256 if ((int)maxopenparen > (int)parenfloor)
257 Perl_re_exec_indentf( aTHX_
258 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
259 depth,
260 PTR2UV(rex),
261 PTR2UV(RXp_OFFSp(rex))
262 );
263 );
264
265 SSGROW(total_elems + REGCP_FRAME_ELEMS);
266 assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS));
267
268 /* memcpy the offs inside the stack - it's faster than for loop */
269 memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push);
270 PL_savestack_ix += paren_elems_to_push;
271
272 DEBUG_BUFFERS_r({
273 I32 p;
274 for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) {
275 Perl_re_exec_indentf(aTHX_
276 " \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n",
277 depth,
278 (UV)p,
279 (IV)RXp_OFFSp(rex)[p].start,
280 (IV)RXp_OFFSp(rex)[p].start_tmp,
281 (IV)RXp_OFFSp(rex)[p].end
282 );
283 }
284 });
285
286 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
287 SSPUSHINT(maxopenparen);
288 SSPUSHINT(RXp_LASTPAREN(rex));
289 SSPUSHINT(RXp_LASTCLOSEPAREN(rex));
290 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
291
292
293 DEBUG_BUFFERS_r({
294 Perl_re_exec_indentf(aTHX_
295 "finished regcppush returning %" IVdf " cur: %" IVdf "\n",
296 depth, retval, PL_savestack_ix);
297 });
298
299 return retval;
300 }
301
302 /* These are needed since we do not localize EVAL nodes: */
303 #define REGCP_SET(cp) \
304 DEBUG_STATE_r( \
305 Perl_re_exec_indentf( aTHX_ \
306 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
307 depth, (IV)PL_savestack_ix \
308 ) \
309 ); \
310 cp = PL_savestack_ix
311
312 #define REGCP_UNWIND(cp) \
313 DEBUG_STATE_r( \
314 if (cp != PL_savestack_ix) \
315 Perl_re_exec_indentf( aTHX_ \
316 "Clearing an EVAL scope, savestack=%" \
317 IVdf "..%" IVdf "\n", \
318 depth, (IV)(cp), (IV)PL_savestack_ix \
319 ) \
320 ); \
321 regcpblow(cp)
322
323 /* set the start and end positions of capture ix */
324 #define CLOSE_ANY_CAPTURE(rex, ix, s, e) \
325 RXp_OFFSp(rex)[(ix)].start = (s); \
326 RXp_OFFSp(rex)[(ix)].end = (e)
327
328 #define CLOSE_CAPTURE(rex, ix, s, e) \
329 CLOSE_ANY_CAPTURE(rex, ix, s, e); \
330 if (ix > RXp_LASTPAREN(rex)) \
331 RXp_LASTPAREN(rex) = (ix); \
332 RXp_LASTCLOSEPAREN(rex) = (ix); \
333 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
334 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \
335 depth, \
336 PTR2UV(rex), \
337 PTR2UV(RXp_OFFSp(rex)), \
338 (UV)(ix), \
339 (IV)RXp_OFFSp(rex)[ix].start, \
340 (IV)RXp_OFFSp(rex)[ix].end, \
341 (UV)RXp_LASTPAREN(rex) \
342 ))
343
344 /* the lp and lcp args match the relevant members of the
345 * regexp structure, but in practice they should all be U16
346 * instead as we have a hard limit of U16_MAX parens. See
347 * line 4003 or so of regcomp.c where we parse OPEN parens
348 * of various types. */
349 PERL_STATIC_INLINE void
S_unwind_paren(pTHX_ regexp * rex,U32 lp,U32 lcp comma_pDEPTH)350 S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) {
351 PERL_ARGS_ASSERT_UNWIND_PAREN;
352 U32 n;
353 DECLARE_AND_GET_RE_DEBUG_FLAGS;
354 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
355 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf
356 ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n",
357 depth,
358 PTR2UV(rex),
359 PTR2UV(RXp_OFFSp(rex)),
360 (UV)(lp),
361 (UV)(RXp_LASTPAREN(rex)),
362 (UV)(lcp)
363 ));
364 for (n = RXp_LASTPAREN(rex); n > lp; n--) {
365 RXp_OFFSp(rex)[n].end = -1;
366 }
367 RXp_LASTPAREN(rex) = n;
368 RXp_LASTCLOSEPAREN(rex) = lcp;
369 }
370 #define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp)
371
372 PERL_STATIC_INLINE void
S_capture_clear(pTHX_ regexp * rex,U16 from_ix,U16 to_ix,const char * str comma_pDEPTH)373 S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) {
374 PERL_ARGS_ASSERT_CAPTURE_CLEAR;
375 PERL_UNUSED_ARG(str); /* only used for debugging */
376 U16 my_ix;
377 DECLARE_AND_GET_RE_DEBUG_FLAGS;
378 for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) {
379 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
380 "CAPTURE_CLEAR %s \\%" IVdf ": "
381 "%" IVdf "(%" IVdf ") .. %" IVdf
382 " => "
383 "%" IVdf "(%" IVdf ") .. %" IVdf
384 "\n",
385 depth, str, (IV)my_ix,
386 (IV)RXp_OFFSp(rex)[my_ix].start,
387 (IV)RXp_OFFSp(rex)[my_ix].start_tmp,
388 (IV)RXp_OFFSp(rex)[my_ix].end,
389 (IV)-1, (IV)-1, (IV)-1));
390 RXp_OFFSp(rex)[my_ix].start = -1;
391 RXp_OFFSp(rex)[my_ix].start_tmp = -1;
392 RXp_OFFSp(rex)[my_ix].end = -1;
393 }
394 }
395
396 #define CAPTURE_CLEAR(from_ix, to_ix, str) \
397 if (from_ix) capture_clear(rex,from_ix, to_ix, str)
398
399 STATIC void
S_regcppop(pTHX_ regexp * rex,U32 * maxopenparen_p comma_pDEPTH)400 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH)
401 {
402 UV i;
403 U32 paren;
404 DECLARE_AND_GET_RE_DEBUG_FLAGS;
405
406 PERL_ARGS_ASSERT_REGCPPOP;
407
408
409 DEBUG_BUFFERS_r({
410 Perl_re_exec_indentf(aTHX_
411 "starting regcppop at %" IVdf "\n",
412 depth, PL_savestack_ix);
413 });
414
415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
416 i = SSPOPUV;
417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419 RXp_LASTCLOSEPAREN(rex) = SSPOPINT;
420 RXp_LASTPAREN(rex) = SSPOPINT;
421 *maxopenparen_p = SSPOPINT;
422
423 i -= REGCP_OTHER_ELEMS;
424 /* Now restore the parentheses context. */
425 DEBUG_BUFFERS_r(
426 if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens)
427 Perl_re_exec_indentf( aTHX_
428 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
429 depth,
430 PTR2UV(rex),
431 PTR2UV(RXp_OFFSp(rex))
432 );
433 );
434 /* substract remaining elements from the stack */
435 PL_savestack_ix -= i;
436
437 /* static assert that offs struc size is not less than stack elem size */
438 STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack));
439
440 /* calculate actual number of offs/capture groups stored */
441 /* by doing integer division (leaving potential alignment aside) */
442 i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex));
443
444 /* calculate paren starting point */
445 /* i is our number of entries which we are subtracting from *maxopenparen_p */
446 /* and we are storing + 1 this to get the beginning */
447 paren = *maxopenparen_p - i + 1;
448
449 /* restore them */
450 memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex)));
451
452 DEBUG_BUFFERS_r(
453 for (; paren <= *maxopenparen_p; ++paren) {
454 Perl_re_exec_indentf(aTHX_
455 " \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n",
456 depth,
457 (UV)paren,
458 (IV)RXp_OFFSp(rex)[paren].start,
459 (IV)RXp_OFFSp(rex)[paren].start_tmp,
460 (IV)RXp_OFFSp(rex)[paren].end,
461 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : ""));
462 }
463 );
464 #if 1
465 /* It would seem that the similar code in regtry()
466 * already takes care of this, and in fact it is in
467 * a better location to since this code can #if 0-ed out
468 * but the code in regtry() is needed or otherwise tests
469 * requiring null fields (pat.t#187 and split.t#{13,14}
470 * (as of patchlevel 7877) will fail. Then again,
471 * this code seems to be necessary or otherwise
472 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473 * --jhi updated by dapm */
474 for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) {
475 if (i > *maxopenparen_p) {
476 RXp_OFFSp(rex)[i].start = -1;
477 }
478 RXp_OFFSp(rex)[i].end = -1;
479 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
480 " \\%" UVuf ": %s ..-1 undeffing (regcppop)\n",
481 depth,
482 (UV)i,
483 (i > *maxopenparen_p) ? "-1" : " "
484 ));
485 }
486 #endif
487 DEBUG_BUFFERS_r({
488 Perl_re_exec_indentf(aTHX_
489 "finished regcppop at %" IVdf "\n",
490 depth, PL_savestack_ix);
491 });
492 }
493
494 /* restore the parens and associated vars at savestack position ix,
495 * but without popping the stack */
496
497 STATIC void
S_regcp_restore(pTHX_ regexp * rex,I32 ix,U32 * maxopenparen_p comma_pDEPTH)498 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH)
499 {
500 I32 tmpix = PL_savestack_ix;
501 PERL_ARGS_ASSERT_REGCP_RESTORE;
502
503 PL_savestack_ix = ix;
504 regcppop(rex, maxopenparen_p);
505 PL_savestack_ix = tmpix;
506 }
507
508 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
509
510 STATIC bool
S_isFOO_lc(pTHX_ const U8 classnum,const U8 character)511 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
512 {
513 /* Returns a boolean as to whether or not 'character' is a member of the
514 * Posix character class given by 'classnum' that should be equivalent to a
515 * value in the typedef 'char_class_number_'.
516 *
517 * Ideally this could be replaced by a just an array of function pointers
518 * to the C library functions that implement the macros this calls.
519 * However, to compile, the precise function signatures are required, and
520 * these may vary from platform to platform. To avoid having to figure
521 * out what those all are on each platform, I (khw) am using this method,
522 * which adds an extra layer of function call overhead (unless the C
523 * optimizer strips it away). But we don't particularly care about
524 * performance with locales anyway. */
525
526 if (IN_UTF8_CTYPE_LOCALE) {
527 return cBOOL(generic_isCC_(character, classnum));
528 }
529
530 switch ((char_class_number_) classnum) {
531 case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
532 case CC_ENUM_ALPHA_: return isU8_ALPHA_LC(character);
533 case CC_ENUM_ASCII_: return isU8_ASCII_LC(character);
534 case CC_ENUM_BLANK_: return isU8_BLANK_LC(character);
535 case CC_ENUM_CASED_: return isU8_CASED_LC(character);
536 case CC_ENUM_CNTRL_: return isU8_CNTRL_LC(character);
537 case CC_ENUM_DIGIT_: return isU8_DIGIT_LC(character);
538 case CC_ENUM_GRAPH_: return isU8_GRAPH_LC(character);
539 case CC_ENUM_LOWER_: return isU8_LOWER_LC(character);
540 case CC_ENUM_PRINT_: return isU8_PRINT_LC(character);
541 case CC_ENUM_PUNCT_: return isU8_PUNCT_LC(character);
542 case CC_ENUM_SPACE_: return isU8_SPACE_LC(character);
543 case CC_ENUM_UPPER_: return isU8_UPPER_LC(character);
544 case CC_ENUM_WORDCHAR_: return isU8_WORDCHAR_LC(character);
545 case CC_ENUM_XDIGIT_: return isU8_XDIGIT_LC(character);
546 default: /* VERTSPACE should never occur in locales */
547 break;
548 }
549
550 Perl_croak(aTHX_
551 "panic: isFOO_lc() has an unexpected character class '%d'",
552 classnum);
553
554 NOT_REACHED; /* NOTREACHED */
555 return FALSE;
556 }
557
558 PERL_STATIC_INLINE I32
S_foldEQ_latin1_s2_folded(pTHX_ const char * s1,const char * s2,I32 len)559 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
560 {
561 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
562 * folded. Works on all folds representable without UTF-8, except for
563 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
564 * check that the strings each have at least 'len' characters.
565 *
566 * There is almost an identical API function where s2 need not be folded:
567 * Perl_foldEQ_latin1() */
568
569 const U8 *a = (const U8 *)s1;
570 const U8 *b = (const U8 *)s2;
571
572 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
573
574 assert(len >= 0);
575
576 while (len--) {
577 assert(! isUPPER_L1(*b));
578 if (toLOWER_L1(*a) != *b) {
579 return 0;
580 }
581 a++, b++;
582 }
583 return 1;
584 }
585
586 STATIC bool
S_isFOO_utf8_lc(pTHX_ const U8 classnum,const U8 * character,const U8 * e)587 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
588 {
589 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
590 * 'character' is a member of the Posix character class given by 'classnum'
591 * that should be equivalent to a value in the typedef
592 * 'char_class_number_'.
593 *
594 * This just calls isFOO_lc on the code point for the character if it is in
595 * the range 0-255. Outside that range, all characters use Unicode
596 * rules, ignoring any locale. So use the Unicode function if this class
597 * requires an inversion list, and use the Unicode macro otherwise. */
598
599
600 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
601
602 if (UTF8_IS_INVARIANT(*character)) {
603 return isFOO_lc(classnum, *character);
604 }
605 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
606 return isFOO_lc(classnum,
607 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
608 }
609
610 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
611
612 switch ((char_class_number_) classnum) {
613 case CC_ENUM_SPACE_: return is_XPERLSPACE_high(character);
614 case CC_ENUM_BLANK_: return is_HORIZWS_high(character);
615 case CC_ENUM_XDIGIT_: return is_XDIGIT_high(character);
616 case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
617 default:
618 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
619 utf8_to_uvchr_buf(character, e, NULL));
620 }
621 NOT_REACHED; /* NOTREACHED */
622 }
623
624 STATIC U8 *
S_find_span_end(U8 * s,const U8 * send,const U8 span_byte)625 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
626 {
627 /* Returns the position of the first byte in the sequence between 's' and
628 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
629 * */
630
631 PERL_ARGS_ASSERT_FIND_SPAN_END;
632
633 assert(send >= s);
634
635 if ((STRLEN) (send - s) >= PERL_WORDSIZE
636 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638 {
639 PERL_UINTMAX_T span_word;
640
641 /* Process per-byte until reach word boundary. XXX This loop could be
642 * eliminated if we knew that this platform had fast unaligned reads */
643 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
644 if (*s != span_byte) {
645 return s;
646 }
647 s++;
648 }
649
650 /* Create a word filled with the bytes we are spanning */
651 span_word = PERL_COUNT_MULTIPLIER * span_byte;
652
653 /* Process per-word as long as we have at least a full word left */
654 do {
655
656 /* Keep going if the whole word is composed of 'span_byte's */
657 if ((* (PERL_UINTMAX_T *) s) == span_word) {
658 s += PERL_WORDSIZE;
659 continue;
660 }
661
662 /* Here, at least one byte in the word isn't 'span_byte'. */
663
664 #ifdef EBCDIC
665
666 break;
667
668 #else
669
670 /* This xor leaves 1 bits only in those non-matching bytes */
671 span_word ^= * (PERL_UINTMAX_T *) s;
672
673 /* Make sure the upper bit of each non-matching byte is set. This
674 * makes each such byte look like an ASCII platform variant byte */
675 span_word |= span_word << 1;
676 span_word |= span_word << 2;
677 span_word |= span_word << 4;
678
679 /* That reduces the problem to what this function solves */
680 return s + variant_byte_number(span_word);
681
682 #endif
683
684 } while (s + PERL_WORDSIZE <= send);
685 }
686
687 /* Process the straggler bytes beyond the final word boundary */
688 while (s < send) {
689 if (*s != span_byte) {
690 return s;
691 }
692 s++;
693 }
694
695 return s;
696 }
697
698 STATIC U8 *
S_find_next_masked(U8 * s,const U8 * send,const U8 byte,const U8 mask)699 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
700 {
701 /* Returns the position of the first byte in the sequence between 's'
702 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
703 * returns 'send' if none found. It uses word-level operations instead of
704 * byte to speed up the process */
705
706 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
707
708 assert(send >= s);
709 assert((byte & mask) == byte);
710
711 #ifndef EBCDIC
712
713 if ((STRLEN) (send - s) >= PERL_WORDSIZE
714 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
715 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
716 {
717 PERL_UINTMAX_T word, mask_word;
718
719 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
720 if (((*s) & mask) == byte) {
721 return s;
722 }
723 s++;
724 }
725
726 word = PERL_COUNT_MULTIPLIER * byte;
727 mask_word = PERL_COUNT_MULTIPLIER * mask;
728
729 do {
730 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
731
732 /* If 'masked' contains bytes with the bit pattern of 'byte' within
733 * it, xoring with 'word' will leave each of the 8 bits in such
734 * bytes be 0, and no byte containing any other bit pattern will be
735 * 0. */
736 masked ^= word;
737
738 /* This causes the most significant bit to be set to 1 for any
739 * bytes in the word that aren't completely 0 */
740 masked |= masked << 1;
741 masked |= masked << 2;
742 masked |= masked << 4;
743
744 /* The msbits are the same as what marks a byte as variant, so we
745 * can use this mask. If all msbits are 1, the word doesn't
746 * contain 'byte' */
747 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
748 s += PERL_WORDSIZE;
749 continue;
750 }
751
752 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
753 * and any that are, are 0. Complement and re-AND to swap that */
754 masked = ~ masked;
755 masked &= PERL_VARIANTS_WORD_MASK;
756
757 /* This reduces the problem to that solved by this function */
758 s += variant_byte_number(masked);
759 return s;
760
761 } while (s + PERL_WORDSIZE <= send);
762 }
763
764 #endif
765
766 while (s < send) {
767 if (((*s) & mask) == byte) {
768 return s;
769 }
770 s++;
771 }
772
773 return s;
774 }
775
776 STATIC U8 *
S_find_span_end_mask(U8 * s,const U8 * send,const U8 span_byte,const U8 mask)777 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
778 {
779 /* Returns the position of the first byte in the sequence between 's' and
780 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
781 * 'span_byte' should have been ANDed with 'mask' in the call of this
782 * function. Returns 'send' if none found. Works like find_span_end(),
783 * except for the AND */
784
785 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
786
787 assert(send >= s);
788 assert((span_byte & mask) == span_byte);
789
790 if ((STRLEN) (send - s) >= PERL_WORDSIZE
791 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
792 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
793 {
794 PERL_UINTMAX_T span_word, mask_word;
795
796 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
797 if (((*s) & mask) != span_byte) {
798 return s;
799 }
800 s++;
801 }
802
803 span_word = PERL_COUNT_MULTIPLIER * span_byte;
804 mask_word = PERL_COUNT_MULTIPLIER * mask;
805
806 do {
807 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
808
809 if (masked == span_word) {
810 s += PERL_WORDSIZE;
811 continue;
812 }
813
814 #ifdef EBCDIC
815
816 break;
817
818 #else
819
820 masked ^= span_word;
821 masked |= masked << 1;
822 masked |= masked << 2;
823 masked |= masked << 4;
824 return s + variant_byte_number(masked);
825
826 #endif
827
828 } while (s + PERL_WORDSIZE <= send);
829 }
830
831 while (s < send) {
832 if (((*s) & mask) != span_byte) {
833 return s;
834 }
835 s++;
836 }
837
838 return s;
839 }
840
841 /*
842 * pregexec and friends
843 */
844
845 #ifndef PERL_IN_XSUB_RE
846 /*
847 - pregexec - match a regexp against a string
848 */
849 I32
Perl_pregexec(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)850 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
851 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
852 /* stringarg: the point in the string at which to begin matching */
853 /* strend: pointer to null at end of string */
854 /* strbeg: real beginning of string */
855 /* minend: end of match must be >= minend bytes after stringarg. */
856 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
857 * itself is accessed via the pointers above */
858 /* nosave: For optimizations. */
859 {
860 PERL_ARGS_ASSERT_PREGEXEC;
861
862 return
863 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
864 nosave ? 0 : REXEC_COPY_STR);
865 }
866 #endif
867
868
869
870 /* re_intuit_start():
871 *
872 * Based on some optimiser hints, try to find the earliest position in the
873 * string where the regex could match.
874 *
875 * rx: the regex to match against
876 * sv: the SV being matched: only used for utf8 flag; the string
877 * itself is accessed via the pointers below. Note that on
878 * something like an overloaded SV, SvPOK(sv) may be false
879 * and the string pointers may point to something unrelated to
880 * the SV itself.
881 * strbeg: real beginning of string
882 * strpos: the point in the string at which to begin matching
883 * strend: pointer to the byte following the last char of the string
884 * flags currently unused; set to 0
885 * data: currently unused; set to NULL
886 *
887 * The basic idea of re_intuit_start() is to use some known information
888 * about the pattern, namely:
889 *
890 * a) the longest known anchored substring (i.e. one that's at a
891 * constant offset from the beginning of the pattern; but not
892 * necessarily at a fixed offset from the beginning of the
893 * string);
894 * b) the longest floating substring (i.e. one that's not at a constant
895 * offset from the beginning of the pattern);
896 * c) Whether the pattern is anchored to the string; either
897 * an absolute anchor: /^../, or anchored to \n: /^.../m,
898 * or anchored to pos(): /\G/;
899 * d) A start class: a real or synthetic character class which
900 * represents which characters are legal at the start of the pattern;
901 *
902 * to either quickly reject the match, or to find the earliest position
903 * within the string at which the pattern might match, thus avoiding
904 * running the full NFA engine at those earlier locations, only to
905 * eventually fail and retry further along.
906 *
907 * Returns NULL if the pattern can't match, or returns the address within
908 * the string which is the earliest place the match could occur.
909 *
910 * The longest of the anchored and floating substrings is called 'check'
911 * and is checked first. The other is called 'other' and is checked
912 * second. The 'other' substring may not be present. For example,
913 *
914 * /(abc|xyz)ABC\d{0,3}DEFG/
915 *
916 * will have
917 *
918 * check substr (float) = "DEFG", offset 6..9 chars
919 * other substr (anchored) = "ABC", offset 3..3 chars
920 * stclass = [ax]
921 *
922 * Be aware that during the course of this function, sometimes 'anchored'
923 * refers to a substring being anchored relative to the start of the
924 * pattern, and sometimes to the pattern itself being anchored relative to
925 * the string. For example:
926 *
927 * /\dabc/: "abc" is anchored to the pattern;
928 * /^\dabc/: "abc" is anchored to the pattern and the string;
929 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
930 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
931 * but the pattern is anchored to the string.
932 */
933
934 char *
Perl_re_intuit_start(pTHX_ REGEXP * const rx,SV * sv,const char * const strbeg,char * strpos,char * strend,const U32 flags,re_scream_pos_data * data)935 Perl_re_intuit_start(pTHX_
936 REGEXP * const rx,
937 SV *sv,
938 const char * const strbeg,
939 char *strpos,
940 char *strend,
941 const U32 flags,
942 re_scream_pos_data *data)
943 {
944 struct regexp *const prog = ReANY(rx);
945 SSize_t start_shift = prog->check_offset_min;
946 /* Should be nonnegative! */
947 SSize_t end_shift = 0;
948 /* current lowest pos in string where the regex can start matching */
949 char *rx_origin = strpos;
950 SV *check;
951 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
952 U8 other_ix = 1 - prog->substrs->check_ix;
953 bool ml_anch = 0;
954 char *other_last = strpos;/* latest pos 'other' substr already checked to */
955 char *check_at = NULL; /* check substr found at this pos */
956 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
957 RXi_GET_DECL(prog,progi);
958 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
959 regmatch_info *const reginfo = ®info_buf;
960 DECLARE_AND_GET_RE_DEBUG_FLAGS;
961
962 PERL_ARGS_ASSERT_RE_INTUIT_START;
963 PERL_UNUSED_ARG(flags);
964 PERL_UNUSED_ARG(data);
965
966 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
967 "Intuit: trying to determine minimum start position...\n"));
968
969 /* for now, assume that all substr offsets are positive. If at some point
970 * in the future someone wants to do clever things with lookbehind and
971 * -ve offsets, they'll need to fix up any code in this function
972 * which uses these offsets. See the thread beginning
973 * <20140113145929.GF27210@iabyn.com>
974 */
975 assert(prog->substrs->data[0].min_offset >= 0);
976 assert(prog->substrs->data[0].max_offset >= 0);
977 assert(prog->substrs->data[1].min_offset >= 0);
978 assert(prog->substrs->data[1].max_offset >= 0);
979 assert(prog->substrs->data[2].min_offset >= 0);
980 assert(prog->substrs->data[2].max_offset >= 0);
981
982 /* for now, assume that if both present, that the floating substring
983 * doesn't start before the anchored substring.
984 * If you break this assumption (e.g. doing better optimisations
985 * with lookahead/behind), then you'll need to audit the code in this
986 * function carefully first
987 */
988 assert(
989 ! ( (prog->anchored_utf8 || prog->anchored_substr)
990 && (prog->float_utf8 || prog->float_substr))
991 || (prog->float_min_offset >= prog->anchored_offset));
992
993 /* byte rather than char calculation for efficiency. It fails
994 * to quickly reject some cases that can't match, but will reject
995 * them later after doing full char arithmetic */
996 if (prog->minlen > strend - strpos) {
997 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998 " String too short...\n"));
999 goto fail;
1000 }
1001
1002 RXp_MATCH_UTF8_set(prog, utf8_target);
1003 reginfo->is_utf8_target = cBOOL(utf8_target);
1004 reginfo->info_aux = NULL;
1005 reginfo->strbeg = strbeg;
1006 reginfo->strend = strend;
1007 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1008 reginfo->intuit = 1;
1009 /* not actually used within intuit, but zero for safety anyway */
1010 reginfo->poscache_maxiter = 0;
1011
1012 if (utf8_target) {
1013 if ((!prog->anchored_utf8 && prog->anchored_substr)
1014 || (!prog->float_utf8 && prog->float_substr))
1015 to_utf8_substr(prog);
1016 check = prog->check_utf8;
1017 } else {
1018 if (!prog->check_substr && prog->check_utf8) {
1019 if (! to_byte_substr(prog)) {
1020 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1021 }
1022 }
1023 check = prog->check_substr;
1024 }
1025
1026 /* dump the various substring data */
1027 DEBUG_OPTIMISE_MORE_r({
1028 int i;
1029 for (i=0; i<=2; i++) {
1030 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1031 : prog->substrs->data[i].substr);
1032 if (!sv)
1033 continue;
1034
1035 Perl_re_printf( aTHX_
1036 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1037 " useful=%" IVdf " utf8=%d [%s]\n",
1038 i,
1039 (IV)prog->substrs->data[i].min_offset,
1040 (IV)prog->substrs->data[i].max_offset,
1041 (IV)prog->substrs->data[i].end_shift,
1042 BmUSEFUL(sv),
1043 utf8_target ? 1 : 0,
1044 SvPEEK(sv));
1045 }
1046 });
1047
1048 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1049
1050 /* ml_anch: check after \n?
1051 *
1052 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1053 * with /.*.../, these flags will have been added by the
1054 * compiler:
1055 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
1056 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
1057 */
1058 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
1059 && !(prog->intflags & PREGf_IMPLICIT);
1060
1061 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1062 /* we are only allowed to match at BOS or \G */
1063
1064 /* trivially reject if there's a BOS anchor and we're not at BOS.
1065 *
1066 * Note that we don't try to do a similar quick reject for
1067 * \G, since generally the caller will have calculated strpos
1068 * based on pos() and gofs, so the string is already correctly
1069 * anchored by definition; and handling the exceptions would
1070 * be too fiddly (e.g. REXEC_IGNOREPOS).
1071 */
1072 if ( strpos != strbeg
1073 && (prog->intflags & PREGf_ANCH_SBOL))
1074 {
1075 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1076 " Not at start...\n"));
1077 goto fail;
1078 }
1079
1080 /* in the presence of an anchor, the anchored (relative to the
1081 * start of the regex) substr must also be anchored relative
1082 * to strpos. So quickly reject if substr isn't found there.
1083 * This works for \G too, because the caller will already have
1084 * subtracted gofs from pos, and gofs is the offset from the
1085 * \G to the start of the regex. For example, in /.abc\Gdef/,
1086 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1087 * caller will have set strpos=pos()-4; we look for the substr
1088 * at position pos()-4+1, which lines up with the "a" */
1089
1090 if (prog->check_offset_min == prog->check_offset_max) {
1091 /* Substring at constant offset from beg-of-str... */
1092 SSize_t slen = SvCUR(check);
1093 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1094
1095 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096 " Looking for check substr at fixed offset %" IVdf "...\n",
1097 (IV)prog->check_offset_min));
1098
1099 if (SvTAIL(check)) {
1100 /* In this case, the regex is anchored at the end too.
1101 * Unless it's a multiline match, the lengths must match
1102 * exactly, give or take a \n. NB: slen >= 1 since
1103 * the last char of check is \n */
1104 if (!multiline
1105 && ( strend - s > slen
1106 || strend - s < slen - 1
1107 || (strend - s == slen && strend[-1] != '\n')))
1108 {
1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1110 " String too long...\n"));
1111 goto fail_finish;
1112 }
1113 /* Now should match s[0..slen-2] */
1114 slen--;
1115 }
1116 if (slen && (strend - s < slen
1117 || *SvPVX_const(check) != *s
1118 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1119 {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 " String not equal...\n"));
1122 goto fail_finish;
1123 }
1124
1125 check_at = s;
1126 goto success_at_start;
1127 }
1128 }
1129 }
1130
1131 end_shift = prog->check_end_shift;
1132
1133 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1134 if (end_shift < 0)
1135 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1136 (IV)end_shift, RX_PRECOMP(rx));
1137 #endif
1138
1139 restart:
1140
1141 /* This is the (re)entry point of the main loop in this function.
1142 * The goal of this loop is to:
1143 * 1) find the "check" substring in the region rx_origin..strend
1144 * (adjusted by start_shift / end_shift). If not found, reject
1145 * immediately.
1146 * 2) If it exists, look for the "other" substr too if defined; for
1147 * example, if the check substr maps to the anchored substr, then
1148 * check the floating substr, and vice-versa. If not found, go
1149 * back to (1) with rx_origin suitably incremented.
1150 * 3) If we find an rx_origin position that doesn't contradict
1151 * either of the substrings, then check the possible additional
1152 * constraints on rx_origin of /^.../m or a known start class.
1153 * If these fail, then depending on which constraints fail, jump
1154 * back to here, or to various other re-entry points further along
1155 * that skip some of the first steps.
1156 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1157 * substring. If the start position was determined to be at the
1158 * beginning of the string - so, not rejected, but not optimised,
1159 * since we have to run regmatch from position 0 - decrement the
1160 * BmUSEFUL() count. Otherwise increment it.
1161 */
1162
1163
1164 /* first, look for the 'check' substring */
1165
1166 {
1167 U8* start_point;
1168 U8* end_point;
1169
1170 DEBUG_OPTIMISE_MORE_r({
1171 Perl_re_printf( aTHX_
1172 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1173 " Start shift: %" IVdf " End shift %" IVdf
1174 " Real end Shift: %" IVdf "\n",
1175 (IV)(rx_origin - strbeg),
1176 (IV)prog->check_offset_min,
1177 (IV)start_shift,
1178 (IV)end_shift,
1179 (IV)prog->check_end_shift);
1180 });
1181
1182 end_point = HOPBACK3(strend, end_shift, rx_origin);
1183 if (!end_point)
1184 goto fail_finish;
1185 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1186 if (!start_point)
1187 goto fail_finish;
1188
1189
1190 /* If the regex is absolutely anchored to either the start of the
1191 * string (SBOL) or to pos() (ANCH_GPOS), then
1192 * check_offset_max represents an upper bound on the string where
1193 * the substr could start. For the ANCH_GPOS case, we assume that
1194 * the caller of intuit will have already set strpos to
1195 * pos()-gofs, so in this case strpos + offset_max will still be
1196 * an upper bound on the substr.
1197 */
1198 if (!ml_anch
1199 && prog->intflags & PREGf_ANCH
1200 && prog->check_offset_max != SSize_t_MAX)
1201 {
1202 SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1203 const char * const anchor =
1204 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1205 SSize_t targ_len = (char*)end_point - anchor;
1206
1207 if (check_len > targ_len) {
1208 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209 "Target string too short to match required substring...\n"));
1210 goto fail_finish;
1211 }
1212
1213 /* do a bytes rather than chars comparison. It's conservative;
1214 * so it skips doing the HOP if the result can't possibly end
1215 * up earlier than the old value of end_point.
1216 */
1217 assert(anchor + check_len <= (char *)end_point);
1218 if (prog->check_offset_max + check_len < targ_len) {
1219 end_point = HOP3lim((U8*)anchor,
1220 prog->check_offset_max,
1221 end_point - check_len
1222 )
1223 + check_len;
1224 if (end_point < start_point)
1225 goto fail_finish;
1226 }
1227 }
1228
1229 check_at = fbm_instr( start_point, end_point,
1230 check, multiline ? FBMrf_MULTILINE : 0);
1231
1232 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1234 (IV)((char*)start_point - strbeg),
1235 (IV)((char*)end_point - strbeg),
1236 (IV)(check_at ? check_at - strbeg : -1)
1237 ));
1238
1239 /* Update the count-of-usability, remove useless subpatterns,
1240 unshift s. */
1241
1242 DEBUG_EXECUTE_r({
1243 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1244 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1245 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1246 (check_at ? "Found" : "Did not find"),
1247 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1248 ? "anchored" : "floating"),
1249 quoted,
1250 RE_SV_TAIL(check),
1251 (check_at ? " at offset " : "...\n") );
1252 });
1253
1254 if (!check_at)
1255 goto fail_finish;
1256 /* set rx_origin to the minimum position where the regex could start
1257 * matching, given the constraint of the just-matched check substring.
1258 * But don't set it lower than previously.
1259 */
1260
1261 if (check_at - rx_origin > prog->check_offset_max)
1262 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1263 /* Finish the diagnostic message */
1264 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265 "%ld (rx_origin now %" IVdf ")...\n",
1266 (long)(check_at - strbeg),
1267 (IV)(rx_origin - strbeg)
1268 ));
1269 }
1270
1271
1272 /* now look for the 'other' substring if defined */
1273
1274 if (prog->substrs->data[other_ix].utf8_substr
1275 || prog->substrs->data[other_ix].substr)
1276 {
1277 /* Take into account the "other" substring. */
1278 char *last, *last1;
1279 char *s;
1280 SV* must;
1281 struct reg_substr_datum *other;
1282
1283 do_other_substr:
1284 other = &prog->substrs->data[other_ix];
1285 if (!utf8_target && !other->substr) {
1286 if (!to_byte_substr(prog)) {
1287 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1288 }
1289 }
1290
1291 /* if "other" is anchored:
1292 * we've previously found a floating substr starting at check_at.
1293 * This means that the regex origin must lie somewhere
1294 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1295 * and max: HOP3(check_at, -check_offset_min)
1296 * (except that min will be >= strpos)
1297 * So the fixed substr must lie somewhere between
1298 * HOP3(min, anchored_offset)
1299 * HOP3(max, anchored_offset) + SvCUR(substr)
1300 */
1301
1302 /* if "other" is floating
1303 * Calculate last1, the absolute latest point where the
1304 * floating substr could start in the string, ignoring any
1305 * constraints from the earlier fixed match. It is calculated
1306 * as follows:
1307 *
1308 * strend - prog->minlen (in chars) is the absolute latest
1309 * position within the string where the origin of the regex
1310 * could appear. The latest start point for the floating
1311 * substr is float_min_offset(*) on from the start of the
1312 * regex. last1 simply combines thee two offsets.
1313 *
1314 * (*) You might think the latest start point should be
1315 * float_max_offset from the regex origin, and technically
1316 * you'd be correct. However, consider
1317 * /a\d{2,4}bcd\w/
1318 * Here, float min, max are 3,5 and minlen is 7.
1319 * This can match either
1320 * /a\d\dbcd\w/
1321 * /a\d\d\dbcd\w/
1322 * /a\d\d\d\dbcd\w/
1323 * In the first case, the regex matches minlen chars; in the
1324 * second, minlen+1, in the third, minlen+2.
1325 * In the first case, the floating offset is 3 (which equals
1326 * float_min), in the second, 4, and in the third, 5 (which
1327 * equals float_max). In all cases, the floating string bcd
1328 * can never start more than 4 chars from the end of the
1329 * string, which equals minlen - float_min. As the substring
1330 * starts to match more than float_min from the start of the
1331 * regex, it makes the regex match more than minlen chars,
1332 * and the two cancel each other out. So we can always use
1333 * float_min - minlen, rather than float_max - minlen for the
1334 * latest position in the string.
1335 *
1336 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1337 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1338 */
1339
1340 assert(prog->minlen >= other->min_offset);
1341 last1 = HOP3c(strend,
1342 other->min_offset - prog->minlen, strbeg);
1343
1344 if (other_ix) {/* i.e. if (other-is-float) */
1345 /* last is the latest point where the floating substr could
1346 * start, *given* any constraints from the earlier fixed
1347 * match. This constraint is that the floating string starts
1348 * <= float_max_offset chars from the regex origin (rx_origin).
1349 * If this value is less than last1, use it instead.
1350 */
1351 assert(rx_origin <= last1);
1352 last =
1353 /* this condition handles the offset==infinity case, and
1354 * is a short-cut otherwise. Although it's comparing a
1355 * byte offset to a char length, it does so in a safe way,
1356 * since 1 char always occupies 1 or more bytes,
1357 * so if a string range is (last1 - rx_origin) bytes,
1358 * it will be less than or equal to (last1 - rx_origin)
1359 * chars; meaning it errs towards doing the accurate HOP3
1360 * rather than just using last1 as a short-cut */
1361 (last1 - rx_origin) < other->max_offset
1362 ? last1
1363 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1364 }
1365 else {
1366 assert(strpos + start_shift <= check_at);
1367 last = HOP4c(check_at, other->min_offset - start_shift,
1368 strbeg, strend);
1369 }
1370
1371 s = HOP3c(rx_origin, other->min_offset, strend);
1372 if (s < other_last) /* These positions already checked */
1373 s = other_last;
1374
1375 must = utf8_target ? other->utf8_substr : other->substr;
1376 assert(SvPOK(must));
1377 {
1378 char *from = s;
1379 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1380
1381 if (to > strend)
1382 to = strend;
1383 if (from > to) {
1384 s = NULL;
1385 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1386 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1387 (IV)(from - strbeg),
1388 (IV)(to - strbeg)
1389 ));
1390 }
1391 else {
1392 s = fbm_instr(
1393 (unsigned char*)from,
1394 (unsigned char*)to,
1395 must,
1396 multiline ? FBMrf_MULTILINE : 0
1397 );
1398 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1399 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1400 (IV)(from - strbeg),
1401 (IV)(to - strbeg),
1402 (IV)(s ? s - strbeg : -1)
1403 ));
1404 }
1405 }
1406
1407 DEBUG_EXECUTE_r({
1408 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1409 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1410 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1411 s ? "Found" : "Contradicts",
1412 other_ix ? "floating" : "anchored",
1413 quoted, RE_SV_TAIL(must));
1414 });
1415
1416
1417 if (!s) {
1418 /* last1 is latest possible substr location. If we didn't
1419 * find it before there, we never will */
1420 if (last >= last1) {
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 "; giving up...\n"));
1423 goto fail_finish;
1424 }
1425
1426 /* try to find the check substr again at a later
1427 * position. Maybe next time we'll find the "other" substr
1428 * in range too */
1429 other_last = HOP3c(last, 1, strend) /* highest failure */;
1430 rx_origin =
1431 other_ix /* i.e. if other-is-float */
1432 ? HOP3c(rx_origin, 1, strend)
1433 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1434 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1435 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1436 (other_ix ? "floating" : "anchored"),
1437 (long)(HOP3c(check_at, 1, strend) - strbeg),
1438 (IV)(rx_origin - strbeg)
1439 ));
1440 goto restart;
1441 }
1442 else {
1443 if (other_ix) { /* if (other-is-float) */
1444 /* other_last is set to s, not s+1, since its possible for
1445 * a floating substr to fail first time, then succeed
1446 * second time at the same floating position; e.g.:
1447 * "-AB--AABZ" =~ /\wAB\d*Z/
1448 * The first time round, anchored and float match at
1449 * "-(AB)--AAB(Z)" then fail on the initial \w character
1450 * class. Second time round, they match at "-AB--A(AB)(Z)".
1451 */
1452 other_last = s;
1453 }
1454 else {
1455 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1456 other_last = HOP3c(s, 1, strend);
1457 }
1458 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1459 " at offset %ld (rx_origin now %" IVdf ")...\n",
1460 (long)(s - strbeg),
1461 (IV)(rx_origin - strbeg)
1462 ));
1463
1464 }
1465 }
1466 else {
1467 DEBUG_OPTIMISE_MORE_r(
1468 Perl_re_printf( aTHX_
1469 " Check-only match: offset min:%" IVdf " max:%" IVdf
1470 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1471 " strend:%" IVdf "\n",
1472 (IV)prog->check_offset_min,
1473 (IV)prog->check_offset_max,
1474 (IV)(check_at-strbeg),
1475 (IV)(rx_origin-strbeg),
1476 (IV)(rx_origin-check_at),
1477 (IV)(strend-strbeg)
1478 )
1479 );
1480 }
1481
1482 postprocess_substr_matches:
1483
1484 /* handle the extra constraint of /^.../m if present */
1485
1486 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1487 char *s;
1488
1489 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1490 " looking for /^/m anchor"));
1491
1492 /* we have failed the constraint of a \n before rx_origin.
1493 * Find the next \n, if any, even if it's beyond the current
1494 * anchored and/or floating substrings. Whether we should be
1495 * scanning ahead for the next \n or the next substr is debatable.
1496 * On the one hand you'd expect rare substrings to appear less
1497 * often than \n's. On the other hand, searching for \n means
1498 * we're effectively flipping between check_substr and "\n" on each
1499 * iteration as the current "rarest" candidate string, which
1500 * means for example that we'll quickly reject the whole string if
1501 * hasn't got a \n, rather than trying every substr position
1502 * first
1503 */
1504
1505 s = HOP3c(strend, - prog->minlen, strpos);
1506 if (s <= rx_origin ||
1507 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1508 {
1509 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1510 " Did not find /%s^%s/m...\n",
1511 PL_colors[0], PL_colors[1]));
1512 goto fail_finish;
1513 }
1514
1515 /* earliest possible origin is 1 char after the \n.
1516 * (since *rx_origin == '\n', it's safe to ++ here rather than
1517 * HOP(rx_origin, 1)) */
1518 rx_origin++;
1519
1520 if (prog->substrs->check_ix == 0 /* check is anchored */
1521 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1522 {
1523 /* Position contradicts check-string; either because
1524 * check was anchored (and thus has no wiggle room),
1525 * or check was float and rx_origin is above the float range */
1526 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1527 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1528 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1529 goto restart;
1530 }
1531
1532 /* if we get here, the check substr must have been float,
1533 * is in range, and we may or may not have had an anchored
1534 * "other" substr which still contradicts */
1535 assert(prog->substrs->check_ix); /* check is float */
1536
1537 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1538 /* whoops, the anchored "other" substr exists, so we still
1539 * contradict. On the other hand, the float "check" substr
1540 * didn't contradict, so just retry the anchored "other"
1541 * substr */
1542 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1543 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1544 PL_colors[0], PL_colors[1],
1545 (IV)(rx_origin - strbeg + prog->anchored_offset),
1546 (IV)(rx_origin - strbeg)
1547 ));
1548 goto do_other_substr;
1549 }
1550
1551 /* success: we don't contradict the found floating substring
1552 * (and there's no anchored substr). */
1553 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554 " Found /%s^%s/m with rx_origin %ld...\n",
1555 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556 }
1557 else {
1558 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1559 " (multiline anchor test skipped)\n"));
1560 }
1561
1562 success_at_start:
1563
1564
1565 /* if we have a starting character class, then test that extra constraint.
1566 * (trie stclasses are too expensive to use here, we are better off to
1567 * leave it to regmatch itself) */
1568
1569 if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1570 const U8* const str = (U8*)STRING(progi->regstclass);
1571
1572 /* XXX this value could be pre-computed */
1573 const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1574 ? (reginfo->is_utf8_pat
1575 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1576 : (SSize_t)STR_LEN(progi->regstclass))
1577 : 1);
1578 char * endpos;
1579 char *s;
1580 /* latest pos that a matching float substr constrains rx start to */
1581 char *rx_max_float = NULL;
1582
1583 /* if the current rx_origin is anchored, either by satisfying an
1584 * anchored substring constraint, or a /^.../m constraint, then we
1585 * can reject the current origin if the start class isn't found
1586 * at the current position. If we have a float-only match, then
1587 * rx_origin is constrained to a range; so look for the start class
1588 * in that range. if neither, then look for the start class in the
1589 * whole rest of the string */
1590
1591 /* XXX DAPM it's not clear what the minlen test is for, and why
1592 * it's not used in the floating case. Nothing in the test suite
1593 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1594 * Here are some old comments, which may or may not be correct:
1595 *
1596 * minlen == 0 is possible if regstclass is \b or \B,
1597 * and the fixed substr is ''$.
1598 * Since minlen is already taken into account, rx_origin+1 is
1599 * before strend; accidentally, minlen >= 1 guaranties no false
1600 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1601 * 0) below assumes that regstclass does not come from lookahead...
1602 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1603 * This leaves EXACTF-ish only, which are dealt with in
1604 * find_byclass().
1605 */
1606
1607 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1608 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1609 else if (prog->float_substr || prog->float_utf8) {
1610 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1611 endpos = HOP3clim(rx_max_float, cl_l, strend);
1612 }
1613 else
1614 endpos= strend;
1615
1616 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1617 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1618 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1619 (IV)start_shift, (IV)(check_at - strbeg),
1620 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1621
1622 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1623 reginfo);
1624 if (!s) {
1625 if (endpos == strend) {
1626 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1627 " Could not match STCLASS...\n") );
1628 goto fail;
1629 }
1630 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1631 " This position contradicts STCLASS...\n") );
1632 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1633 && !(prog->intflags & PREGf_IMPLICIT))
1634 goto fail;
1635
1636 /* Contradict one of substrings */
1637 if (prog->anchored_substr || prog->anchored_utf8) {
1638 if (prog->substrs->check_ix == 1) { /* check is float */
1639 /* Have both, check_string is floating */
1640 assert(rx_origin + start_shift <= check_at);
1641 if (rx_origin + start_shift != check_at) {
1642 /* not at latest position float substr could match:
1643 * Recheck anchored substring, but not floating.
1644 * The condition above is in bytes rather than
1645 * chars for efficiency. It's conservative, in
1646 * that it errs on the side of doing 'goto
1647 * do_other_substr'. In this case, at worst,
1648 * an extra anchored search may get done, but in
1649 * practice the extra fbm_instr() is likely to
1650 * get skipped anyway. */
1651 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1652 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1653 (long)(other_last - strbeg),
1654 (IV)(rx_origin - strbeg)
1655 ));
1656 goto do_other_substr;
1657 }
1658 }
1659 }
1660 else {
1661 /* float-only */
1662
1663 if (ml_anch) {
1664 /* In the presence of ml_anch, we might be able to
1665 * find another \n without breaking the current float
1666 * constraint. */
1667
1668 /* strictly speaking this should be HOP3c(..., 1, ...),
1669 * but since we goto a block of code that's going to
1670 * search for the next \n if any, its safe here */
1671 rx_origin++;
1672 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1673 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1674 PL_colors[0], PL_colors[1],
1675 (long)(rx_origin - strbeg)) );
1676 goto postprocess_substr_matches;
1677 }
1678
1679 /* strictly speaking this can never be true; but might
1680 * be if we ever allow intuit without substrings */
1681 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1682 goto fail;
1683
1684 rx_origin = rx_max_float;
1685 }
1686
1687 /* at this point, any matching substrings have been
1688 * contradicted. Start again... */
1689
1690 rx_origin = HOP3c(rx_origin, 1, strend);
1691
1692 /* uses bytes rather than char calculations for efficiency.
1693 * It's conservative: it errs on the side of doing 'goto restart',
1694 * where there is code that does a proper char-based test */
1695 if (rx_origin + start_shift + end_shift > strend) {
1696 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1697 " Could not match STCLASS...\n") );
1698 goto fail;
1699 }
1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1702 (prog->substrs->check_ix ? "floating" : "anchored"),
1703 (long)(rx_origin + start_shift - strbeg),
1704 (IV)(rx_origin - strbeg)
1705 ));
1706 goto restart;
1707 }
1708
1709 /* Success !!! */
1710
1711 if (rx_origin != s) {
1712 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1713 " By STCLASS: moving %ld --> %ld\n",
1714 (long)(rx_origin - strbeg), (long)(s - strbeg))
1715 );
1716 }
1717 else {
1718 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1719 " Does not contradict STCLASS...\n");
1720 );
1721 }
1722 }
1723
1724 /* Decide whether using the substrings helped */
1725
1726 if (rx_origin != strpos) {
1727 /* Fixed substring is found far enough so that the match
1728 cannot start at strpos. */
1729
1730 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1731 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1732 }
1733 else {
1734 /* The found rx_origin position does not prohibit matching at
1735 * strpos, so calling intuit didn't gain us anything. Decrement
1736 * the BmUSEFUL() count on the check substring, and if we reach
1737 * zero, free it. */
1738 if (!(prog->intflags & PREGf_NAUGHTY)
1739 && (utf8_target ? (
1740 prog->check_utf8 /* Could be deleted already */
1741 && --BmUSEFUL(prog->check_utf8) < 0
1742 && (prog->check_utf8 == prog->float_utf8)
1743 ) : (
1744 prog->check_substr /* Could be deleted already */
1745 && --BmUSEFUL(prog->check_substr) < 0
1746 && (prog->check_substr == prog->float_substr)
1747 )))
1748 {
1749 /* If flags & SOMETHING - do not do it many times on the same match */
1750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1751 /* XXX Does the destruction order has to change with utf8_target? */
1752 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1753 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1754 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1755 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1756 check = NULL; /* abort */
1757 /* XXXX This is a remnant of the old implementation. It
1758 looks wasteful, since now INTUIT can use many
1759 other heuristics. */
1760 prog->extflags &= ~RXf_USE_INTUIT;
1761 }
1762 }
1763
1764 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1765 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1766 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1767
1768 return rx_origin;
1769
1770 fail_finish: /* Substring not found */
1771 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1772 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1773 fail:
1774 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1775 PL_colors[4], PL_colors[5]));
1776 return NULL;
1777 }
1778
1779
1780 #define DECL_TRIE_TYPE(scan) \
1781 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1782 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1783 trie_utf8l, trie_flu8, trie_flu8_latin } \
1784 trie_type = ((FLAGS(scan) == EXACT) \
1785 ? (utf8_target ? trie_utf8 : trie_plain) \
1786 : (FLAGS(scan) == EXACTL) \
1787 ? (utf8_target ? trie_utf8l : trie_plain) \
1788 : (FLAGS(scan) == EXACTFAA) \
1789 ? (utf8_target \
1790 ? trie_utf8_exactfa_fold \
1791 : trie_latin_utf8_exactfa_fold) \
1792 : (FLAGS(scan) == EXACTFLU8 \
1793 ? (utf8_target \
1794 ? trie_flu8 \
1795 : trie_flu8_latin) \
1796 : (utf8_target \
1797 ? trie_utf8_fold \
1798 : trie_latin_utf8_fold)))
1799
1800 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1801 * 'foldbuf+sizeof(foldbuf)' */
1802 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1803 STMT_START { \
1804 STRLEN skiplen; \
1805 U8 flags = FOLD_FLAGS_FULL; \
1806 switch (trie_type) { \
1807 case trie_flu8: \
1808 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1809 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1810 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1811 } \
1812 goto do_trie_utf8_fold; \
1813 case trie_utf8_exactfa_fold: \
1814 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1815 /* FALLTHROUGH */ \
1816 case trie_utf8_fold: \
1817 do_trie_utf8_fold: \
1818 if ( foldlen>0 ) { \
1819 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1820 foldlen -= len; \
1821 uscan += len; \
1822 len=0; \
1823 } else { \
1824 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1825 flags); \
1826 len = UTF8_SAFE_SKIP(uc, uc_end); \
1827 skiplen = UVCHR_SKIP( uvc ); \
1828 foldlen -= skiplen; \
1829 uscan = foldbuf + skiplen; \
1830 } \
1831 break; \
1832 case trie_flu8_latin: \
1833 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1834 goto do_trie_latin_utf8_fold; \
1835 case trie_latin_utf8_exactfa_fold: \
1836 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1837 /* FALLTHROUGH */ \
1838 case trie_latin_utf8_fold: \
1839 do_trie_latin_utf8_fold: \
1840 if ( foldlen>0 ) { \
1841 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1842 foldlen -= len; \
1843 uscan += len; \
1844 len=0; \
1845 } else { \
1846 len = 1; \
1847 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1848 skiplen = UVCHR_SKIP( uvc ); \
1849 foldlen -= skiplen; \
1850 uscan = foldbuf + skiplen; \
1851 } \
1852 break; \
1853 case trie_utf8l: \
1854 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1855 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1856 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1857 } \
1858 /* FALLTHROUGH */ \
1859 case trie_utf8: \
1860 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1861 break; \
1862 case trie_plain: \
1863 uvc = (UV)*uc; \
1864 len = 1; \
1865 } \
1866 if (uvc < 256) { \
1867 charid = trie->charmap[ uvc ]; \
1868 } \
1869 else { \
1870 charid = 0; \
1871 if (widecharmap) { \
1872 SV** const svpp = hv_fetch(widecharmap, \
1873 (char*)&uvc, sizeof(UV), 0); \
1874 if (svpp) \
1875 charid = (U16)SvIV(*svpp); \
1876 } \
1877 } \
1878 } STMT_END
1879
1880 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1881 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1882 startpos, doutf8, depth)
1883
1884 #define GET_ANYOFH_INVLIST(prog, n) \
1885 GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1886
1887 #define REXEC_FBC_UTF8_SCAN(CODE) \
1888 STMT_START { \
1889 while (s < strend) { \
1890 CODE \
1891 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1892 } \
1893 } STMT_END
1894
1895 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1896 STMT_START { \
1897 while (s < strend) { \
1898 CODE \
1899 s++; \
1900 } \
1901 } STMT_END
1902
1903 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1904 STMT_START { \
1905 while (s < strend) { \
1906 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1907 } \
1908 } STMT_END
1909
1910 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1911 STMT_START { \
1912 while (s < strend) { \
1913 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1914 } \
1915 } STMT_END
1916
1917 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1918 if (COND) { \
1919 FBC_CHECK_AND_TRY \
1920 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1921 previous_occurrence_end = s; \
1922 } \
1923 else { \
1924 s += UTF8SKIP(s); \
1925 }
1926
1927 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1928 if (COND) { \
1929 FBC_CHECK_AND_TRY \
1930 s++; \
1931 previous_occurrence_end = s; \
1932 } \
1933 else { \
1934 s++; \
1935 }
1936
1937 /* We keep track of where the next character should start after an occurrence
1938 * of the one we're looking for. Knowing that, we can see right away if the
1939 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1940 * don't accept the 2nd and succeeding adjacent occurrences */
1941 #define FBC_CHECK_AND_TRY \
1942 if ( ( doevery \
1943 || s != previous_occurrence_end) \
1944 && ( reginfo->intuit \
1945 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1946 { \
1947 goto got_it; \
1948 }
1949
1950
1951 /* These differ from the above macros in that they call a function which
1952 * returns the next occurrence of the thing being looked for in 's'; and
1953 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c)
1954 * */
1955 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1956 while (s < strend) { \
1957 s = (char *) (f); \
1958 if (s >= strend) { \
1959 break; \
1960 } \
1961 \
1962 FBC_CHECK_AND_TRY \
1963 s += UTF8SKIP(s); \
1964 previous_occurrence_end = s; \
1965 }
1966
1967 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1968 while (s < strend) { \
1969 s = (char *) (f); \
1970 if (s >= strend) { \
1971 break; \
1972 } \
1973 \
1974 FBC_CHECK_AND_TRY \
1975 s++; \
1976 previous_occurrence_end = s; \
1977 }
1978
1979 /* This is like the above macro except the function returns NULL if there is no
1980 * occurrence, and there is a further condition that must be matched besides
1981 * the function */
1982 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \
1983 while (s < strend) { \
1984 s = (char *) (f); \
1985 if (s == NULL) { \
1986 s = (char *) strend; \
1987 break; \
1988 } \
1989 \
1990 if (COND) { \
1991 FBC_CHECK_AND_TRY \
1992 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1993 previous_occurrence_end = s; \
1994 } \
1995 else { \
1996 s += UTF8SKIP(s); \
1997 } \
1998 }
1999
2000 /* This differs from the above macros in that it is passed a single byte that
2001 * is known to begin the next occurrence of the thing being looked for in 's'.
2002 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
2003 * at that position. */
2004 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
2005 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \
2006 COND)
2007
2008 /* This is like the function above, but takes an entire string to look for
2009 * instead of a single byte */
2010 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \
2011 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \
2012 ninstr(s, strend, substr, substr_end), \
2013 COND)
2014
2015 /* The four macros below are slightly different versions of the same logic.
2016 *
2017 * The first is for /a and /aa when the target string is UTF-8. This can only
2018 * match ascii, but it must advance based on UTF-8. The other three handle
2019 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
2020 * looking for the boundary (or non-boundary) between a word and non-word
2021 * character. The utf8 and non-utf8 cases have the same logic, but the details
2022 * must be different. Find the "wordness" of the character just prior to this
2023 * one, and compare it with the wordness of this one. If they differ, we have
2024 * a boundary. At the beginning of the string, pretend that the previous
2025 * character was a new-line.
2026 *
2027 * All these macros uncleanly have side-effects with each other and outside
2028 * variables. So far it's been too much trouble to clean-up
2029 *
2030 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
2031 * a word character or not.
2032 * IF_SUCCESS is code to do if it finds that we are at a boundary between
2033 * word/non-word
2034 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
2035 *
2036 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
2037 * are looking for a boundary or for a non-boundary. If we are looking for a
2038 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
2039 * see if this tentative match actually works, and if so, to quit the loop
2040 * here. And vice-versa if we are looking for a non-boundary.
2041 *
2042 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
2043 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
2044 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
2045 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2046 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2047 * complement. But in that branch we complement tmp, meaning that at the
2048 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2049 * which means at the top of the loop in the next iteration, it is
2050 * TEST_NON_UTF8(s-1) */
2051 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2052 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2053 tmp = TEST_NON_UTF8(tmp); \
2054 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
2055 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2056 tmp = !tmp; \
2057 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
2058 } \
2059 else { \
2060 IF_FAIL; \
2061 } \
2062 ); \
2063
2064 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2065 * TEST_UTF8 is a macro that for the same input code points returns identically
2066 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
2067 * end pointer as well) */
2068 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
2069 if (s == reginfo->strbeg) { \
2070 tmp = '\n'; \
2071 } \
2072 else { /* Back-up to the start of the previous character */ \
2073 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
2074 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
2075 0, UTF8_ALLOW_DEFAULT); \
2076 } \
2077 tmp = TEST_UV(tmp); \
2078 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
2079 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
2080 tmp = !tmp; \
2081 IF_SUCCESS; \
2082 } \
2083 else { \
2084 IF_FAIL; \
2085 } \
2086 );
2087
2088 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
2089 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
2090 * set-up by the FBC_BOUND, etc macros below */
2091 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2092 UTF8_CODE; \
2093 /* Here, things have been set up by the previous code so that tmp is the \
2094 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
2095 * against the EOS, which we treat as a \n */ \
2096 if (tmp == ! TEST_NON_UTF8('\n')) { \
2097 IF_SUCCESS; \
2098 } \
2099 else { \
2100 IF_FAIL; \
2101 }
2102
2103 /* Same as the macro above, but the target isn't UTF-8 */
2104 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2105 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2106 tmp = TEST_NON_UTF8(tmp); \
2107 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2108 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2109 IF_SUCCESS; \
2110 tmp = !tmp; \
2111 } \
2112 else { \
2113 IF_FAIL; \
2114 } \
2115 ); \
2116 /* Here, things have been set up by the previous code so that tmp is \
2117 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2118 * matches against the EOS, which we treat as a \n */ \
2119 if (tmp == ! TEST_NON_UTF8('\n')) { \
2120 IF_SUCCESS; \
2121 } \
2122 else { \
2123 IF_FAIL; \
2124 }
2125
2126 /* This is the macro to use when we want to see if something that looks like it
2127 * could match, actually does, and if so exits the loop. It needs to be used
2128 * only for bounds checking macros, as it allows for matching beyond the end of
2129 * string (which should be zero length without having to look at the string
2130 * contents) */
2131 #define REXEC_FBC_TRYIT \
2132 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2133 goto got_it
2134
2135 /* The only difference between the BOUND and NBOUND cases is that
2136 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2137 * NBOUND. This is accomplished by passing it as either the if or else clause,
2138 * with the other one being empty (PLACEHOLDER is defined as empty).
2139 *
2140 * The TEST_FOO parameters are for operating on different forms of input, but
2141 * all should be ones that return identically for the same underlying code
2142 * points */
2143
2144 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2145 FBC_BOUND_COMMON_UTF8( \
2146 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2147 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2148
2149 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2150 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2151
2152 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2153 FBC_BOUND_COMMON_UTF8( \
2154 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2155 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2156
2157 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2158 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2159
2160 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2161 FBC_BOUND_COMMON_UTF8( \
2162 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2163 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2164
2165 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2166 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2167
2168 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2169 FBC_BOUND_COMMON_UTF8( \
2170 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2171 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2172
2173 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2174 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2175
2176 #ifdef DEBUGGING
2177 static IV
S_get_break_val_cp_checked(SV * const invlist,const UV cp_in)2178 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2179 IV cp_out = _invlist_search(invlist, cp_in);
2180 assert(cp_out >= 0);
2181 return cp_out;
2182 }
2183 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2184 invmap[S_get_break_val_cp_checked(invlist, cp)]
2185 #else
2186 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2187 invmap[_invlist_search(invlist, cp)]
2188 #endif
2189
2190 /* Takes a pointer to an inversion list, a pointer to its corresponding
2191 * inversion map, and a code point, and returns the code point's value
2192 * according to the two arrays. It assumes that all code points have a value.
2193 * This is used as the base macro for macros for particular properties */
2194 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2195 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2196
2197 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2198 * of a code point, returning the value for the first code point in the string.
2199 * And it takes the particular macro name that finds the desired value given a
2200 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2201 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2202 (__ASSERT_(pos < strend) \
2203 /* Note assumes is valid UTF-8 */ \
2204 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2205
2206 /* Returns the GCB value for the input code point */
2207 #define getGCB_VAL_CP(cp) \
2208 _generic_GET_BREAK_VAL_CP( \
2209 PL_GCB_invlist, \
2210 _Perl_GCB_invmap, \
2211 (cp))
2212
2213 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2214 * bounded by pos and strend */
2215 #define getGCB_VAL_UTF8(pos, strend) \
2216 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2217
2218 /* Returns the LB value for the input code point */
2219 #define getLB_VAL_CP(cp) \
2220 _generic_GET_BREAK_VAL_CP( \
2221 PL_LB_invlist, \
2222 _Perl_LB_invmap, \
2223 (cp))
2224
2225 /* Returns the LB value for the first code point in the UTF-8 encoded string
2226 * bounded by pos and strend */
2227 #define getLB_VAL_UTF8(pos, strend) \
2228 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2229
2230
2231 /* Returns the SB value for the input code point */
2232 #define getSB_VAL_CP(cp) \
2233 _generic_GET_BREAK_VAL_CP( \
2234 PL_SB_invlist, \
2235 _Perl_SB_invmap, \
2236 (cp))
2237
2238 /* Returns the SB value for the first code point in the UTF-8 encoded string
2239 * bounded by pos and strend */
2240 #define getSB_VAL_UTF8(pos, strend) \
2241 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2242
2243 /* Returns the WB value for the input code point */
2244 #define getWB_VAL_CP(cp) \
2245 _generic_GET_BREAK_VAL_CP( \
2246 PL_WB_invlist, \
2247 _Perl_WB_invmap, \
2248 (cp))
2249
2250 /* Returns the WB value for the first code point in the UTF-8 encoded string
2251 * bounded by pos and strend */
2252 #define getWB_VAL_UTF8(pos, strend) \
2253 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2254
2255 /* We know what class REx starts with. Try to find this position... */
2256 /* if reginfo->intuit, its a dryrun */
2257 /* annoyingly all the vars in this routine have different names from their counterparts
2258 in regmatch. /grrr */
2259 STATIC char *
S_find_byclass(pTHX_ regexp * prog,const regnode * c,char * s,const char * strend,regmatch_info * reginfo)2260 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2261 const char *strend, regmatch_info *reginfo)
2262 {
2263
2264 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2265 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2266
2267 char *pat_string; /* The pattern's exactish string */
2268 char *pat_end; /* ptr to end char of pat_string */
2269 re_fold_t folder; /* Function for computing non-utf8 folds */
2270 const U8 *fold_array; /* array for folding ords < 256 */
2271 STRLEN ln;
2272 STRLEN lnc;
2273 U8 c1;
2274 U8 c2;
2275 char *e = NULL;
2276
2277 /* In some cases we accept only the first occurrence of 'x' in a sequence of
2278 * them. This variable points to just beyond the end of the previous
2279 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2280 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2281 * hop back.) */
2282 char * previous_occurrence_end = 0;
2283
2284 I32 tmp; /* Scratch variable */
2285 const bool utf8_target = reginfo->is_utf8_target;
2286 UV utf8_fold_flags = 0;
2287 const bool is_utf8_pat = reginfo->is_utf8_pat;
2288 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2289 with a result inverts that result, as 0^1 =
2290 1 and 1^1 = 0 */
2291 char_class_number_ classnum;
2292
2293 RXi_GET_DECL(prog,progi);
2294
2295 PERL_ARGS_ASSERT_FIND_BYCLASS;
2296
2297 /* We know what class it must start with. The case statements below have
2298 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2299 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2300 * ('p8' and 'pb'. */
2301 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2302 SV * anyofh_list;
2303
2304 case ANYOFPOSIXL_t8_pb:
2305 case ANYOFPOSIXL_t8_p8:
2306 case ANYOFL_t8_pb:
2307 case ANYOFL_t8_p8:
2308 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2309 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2310
2311 /* FALLTHROUGH */
2312
2313 case ANYOFD_t8_pb:
2314 case ANYOFD_t8_p8:
2315 case ANYOF_t8_pb:
2316 case ANYOF_t8_p8:
2317 REXEC_FBC_UTF8_CLASS_SCAN(
2318 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2319 break;
2320
2321 case ANYOFPOSIXL_tb_pb:
2322 case ANYOFPOSIXL_tb_p8:
2323 case ANYOFL_tb_pb:
2324 case ANYOFL_tb_p8:
2325 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2326 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2327
2328 /* FALLTHROUGH */
2329
2330 case ANYOFD_tb_pb:
2331 case ANYOFD_tb_p8:
2332 case ANYOF_tb_pb:
2333 case ANYOF_tb_p8:
2334 if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2335 /* We know that s is in the bitmap range since the target isn't
2336 * UTF-8, so what happens for out-of-range values is not relevant,
2337 * so exclude that from the flags */
2338 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2339 }
2340 else {
2341 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2342 0));
2343 }
2344 break;
2345
2346 case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */
2347 case ANYOFM_tb_p8:
2348 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2349 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2350 break;
2351
2352 case ANYOFM_t8_pb:
2353 case ANYOFM_t8_p8:
2354 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2355 * we do anyway for performance reasons, as otherwise we would have to
2356 * examine all the continuation characters */
2357 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2358 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2359 break;
2360
2361 case NANYOFM_tb_pb:
2362 case NANYOFM_tb_p8:
2363 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2364 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c)));
2365 break;
2366
2367 case NANYOFM_t8_pb:
2368 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2369 variants. */
2370 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2371 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2372 (U8) ARG1u(c), FLAGS(c)));
2373 break;
2374
2375 /* These nodes all require at least one code point to be in UTF-8 to
2376 * match */
2377 case ANYOFH_tb_pb:
2378 case ANYOFH_tb_p8:
2379 case ANYOFHb_tb_pb:
2380 case ANYOFHb_tb_p8:
2381 case ANYOFHbbm_tb_pb:
2382 case ANYOFHbbm_tb_p8:
2383 case ANYOFHr_tb_pb:
2384 case ANYOFHr_tb_p8:
2385 case ANYOFHs_tb_pb:
2386 case ANYOFHs_tb_p8:
2387 case EXACTFLU8_tb_pb:
2388 case EXACTFLU8_tb_p8:
2389 case EXACTFU_REQ8_tb_pb:
2390 case EXACTFU_REQ8_tb_p8:
2391 break;
2392
2393 case ANYOFH_t8_pb:
2394 case ANYOFH_t8_p8:
2395 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2396 REXEC_FBC_UTF8_CLASS_SCAN(
2397 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2398 && _invlist_contains_cp(anyofh_list,
2399 utf8_to_uvchr_buf((U8 *) s,
2400 (U8 *) strend,
2401 NULL))));
2402 break;
2403
2404 case ANYOFHb_t8_pb:
2405 case ANYOFHb_t8_p8:
2406 {
2407 /* We know what the first byte of any matched string should be. */
2408 U8 first_byte = FLAGS(c);
2409
2410 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2411 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2412 _invlist_contains_cp(anyofh_list,
2413 utf8_to_uvchr_buf((U8 *) s,
2414 (U8 *) strend,
2415 NULL)));
2416 }
2417 break;
2418
2419 case ANYOFHbbm_t8_pb:
2420 case ANYOFHbbm_t8_p8:
2421 {
2422 /* We know what the first byte of any matched string should be. */
2423 U8 first_byte = FLAGS(c);
2424
2425 /* And a bitmap defines all the legal 2nd byte matches */
2426 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2427 ( s < strend
2428 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2429 (U8) s[1] & UTF_CONTINUATION_MASK)));
2430 }
2431 break;
2432
2433 case ANYOFHr_t8_pb:
2434 case ANYOFHr_t8_p8:
2435 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2436 REXEC_FBC_UTF8_CLASS_SCAN(
2437 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2438 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2439 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2440 && _invlist_contains_cp(anyofh_list,
2441 utf8_to_uvchr_buf((U8 *) s,
2442 (U8 *) strend,
2443 NULL))));
2444 break;
2445
2446 case ANYOFHs_t8_pb:
2447 case ANYOFHs_t8_p8:
2448 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2449 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2450 ((struct regnode_anyofhs *) c)->string,
2451 /* Note FLAGS is the string length in this regnode */
2452 ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2453 _invlist_contains_cp(anyofh_list,
2454 utf8_to_uvchr_buf((U8 *) s,
2455 (U8 *) strend,
2456 NULL)));
2457 break;
2458
2459 case ANYOFR_tb_pb:
2460 case ANYOFR_tb_p8:
2461 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2462 ANYOFRbase(c), ANYOFRdelta(c)));
2463 break;
2464
2465 case ANYOFR_t8_pb:
2466 case ANYOFR_t8_p8:
2467 REXEC_FBC_UTF8_CLASS_SCAN(
2468 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2469 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2470 (U8 *) strend,
2471 NULL),
2472 ANYOFRbase(c), ANYOFRdelta(c))));
2473 break;
2474
2475 case ANYOFRb_tb_pb:
2476 case ANYOFRb_tb_p8:
2477 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2478 ANYOFRbase(c), ANYOFRdelta(c)));
2479 break;
2480
2481 case ANYOFRb_t8_pb:
2482 case ANYOFRb_t8_p8:
2483 { /* We know what the first byte of any matched string should be */
2484 U8 first_byte = FLAGS(c);
2485
2486 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2487 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2488 (U8 *) strend,
2489 NULL),
2490 ANYOFRbase(c), ANYOFRdelta(c)));
2491 }
2492 break;
2493
2494 case EXACTFAA_tb_pb:
2495
2496 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2497 * which these functions don't handle anyway */
2498 fold_array = PL_fold_latin1;
2499 folder = S_foldEQ_latin1_s2_folded;
2500 goto do_exactf_non_utf8;
2501
2502 case EXACTF_tb_pb:
2503 fold_array = PL_fold;
2504 folder = Perl_foldEQ;
2505 goto do_exactf_non_utf8;
2506
2507 case EXACTFL_tb_pb:
2508 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2509
2510 if (IN_UTF8_CTYPE_LOCALE) {
2511 utf8_fold_flags = FOLDEQ_LOCALE;
2512 goto do_exactf_utf8;
2513 }
2514
2515 fold_array = PL_fold_locale;
2516 folder = Perl_foldEQ_locale;
2517 goto do_exactf_non_utf8;
2518
2519 case EXACTFU_tb_pb:
2520 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2521 * don't have to worry here about this single special case in the
2522 * Latin1 range */
2523 fold_array = PL_fold_latin1;
2524 folder = S_foldEQ_latin1_s2_folded;
2525
2526 /* FALLTHROUGH */
2527
2528 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2529 are no glitches with fold-length differences
2530 between the target string and pattern */
2531
2532 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2533 * character of the EXACTF* node and then, if necessary,
2534 * case-insensitively compare the full text of the node. c1 is the
2535 * first character. c2 is its fold. This logic will not work for
2536 * Unicode semantics and the german sharp ss, which hence should not be
2537 * compiled into a node that gets here. */
2538 pat_string = STRINGs(c);
2539 ln = STR_LENs(c); /* length to match in octets/bytes */
2540
2541 /* We know that we have to match at least 'ln' bytes (which is the same
2542 * as characters, since not utf8). If we have to match 3 characters,
2543 * and there are only 2 available, we know without trying that it will
2544 * fail; so don't start a match past the required minimum number from
2545 * the far end */
2546 e = HOP3c(strend, -((SSize_t)ln), s);
2547 if (e < s)
2548 break;
2549
2550 c1 = *pat_string;
2551 c2 = fold_array[c1];
2552 if (c1 == c2) { /* If char and fold are the same */
2553 while (s <= e) {
2554 s = (char *) memchr(s, c1, e + 1 - s);
2555 if (s == NULL) {
2556 break;
2557 }
2558
2559 /* Check that the rest of the node matches */
2560 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2561 && (reginfo->intuit || regtry(reginfo, &s)) )
2562 {
2563 goto got_it;
2564 }
2565 s++;
2566 }
2567 }
2568 else {
2569 U8 bits_differing = c1 ^ c2;
2570
2571 /* If the folds differ in one bit position only, we can mask to
2572 * match either of them, and can use this faster find method. Both
2573 * ASCII and EBCDIC tend to have their case folds differ in only
2574 * one position, so this is very likely */
2575 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2576 bits_differing = ~ bits_differing;
2577 while (s <= e) {
2578 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2579 (c1 & bits_differing), bits_differing);
2580 if (s > e) {
2581 break;
2582 }
2583
2584 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2585 && (reginfo->intuit || regtry(reginfo, &s)) )
2586 {
2587 goto got_it;
2588 }
2589 s++;
2590 }
2591 }
2592 else { /* Otherwise, stuck with looking byte-at-a-time. This
2593 should actually happen only in EXACTFL nodes */
2594 while (s <= e) {
2595 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2596 && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2597 && (reginfo->intuit || regtry(reginfo, &s)) )
2598 {
2599 goto got_it;
2600 }
2601 s++;
2602 }
2603 }
2604 }
2605 break;
2606
2607 case EXACTFAA_tb_p8:
2608 case EXACTFAA_t8_p8:
2609 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2610 |FOLDEQ_S2_ALREADY_FOLDED
2611 |FOLDEQ_S2_FOLDS_SANE;
2612 goto do_exactf_utf8;
2613
2614 case EXACTFAA_NO_TRIE_tb_pb:
2615 case EXACTFAA_NO_TRIE_t8_pb:
2616 case EXACTFAA_t8_pb:
2617
2618 /* Here, and elsewhere in this file, the reason we can't consider a
2619 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2620 * is because any MICRO SIGN in the pattern won't be folded. Since the
2621 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2622 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2623 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2624 goto do_exactf_utf8;
2625
2626 case EXACTFL_tb_p8:
2627 case EXACTFL_t8_pb:
2628 case EXACTFL_t8_p8:
2629 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2630 utf8_fold_flags = FOLDEQ_LOCALE;
2631 goto do_exactf_utf8;
2632
2633 case EXACTFLU8_t8_pb:
2634 case EXACTFLU8_t8_p8:
2635 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2636 | FOLDEQ_S2_FOLDS_SANE;
2637 goto do_exactf_utf8;
2638
2639 case EXACTFU_REQ8_t8_p8:
2640 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2641 goto do_exactf_utf8;
2642
2643 case EXACTFU_tb_p8:
2644 case EXACTFU_t8_pb:
2645 case EXACTFU_t8_p8:
2646 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2647 goto do_exactf_utf8;
2648
2649 /* The following are problematic even though pattern isn't UTF-8. Use
2650 * full functionality normally not done except for UTF-8. */
2651 case EXACTF_t8_pb:
2652 case EXACTFUP_tb_pb:
2653 case EXACTFUP_t8_pb:
2654
2655 do_exactf_utf8:
2656 {
2657 unsigned expansion;
2658
2659 /* If one of the operands is in utf8, we can't use the simpler
2660 * folding above, due to the fact that many different characters
2661 * can have the same fold, or portion of a fold, or different-
2662 * length fold */
2663 pat_string = STRINGs(c);
2664 ln = STR_LENs(c); /* length to match in octets/bytes */
2665 pat_end = pat_string + ln;
2666 lnc = is_utf8_pat /* length to match in characters */
2667 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2668 : ln;
2669
2670 /* We have 'lnc' characters to match in the pattern, but because of
2671 * multi-character folding, each character in the target can match
2672 * up to 3 characters (Unicode guarantees it will never exceed
2673 * this) if it is utf8-encoded; and up to 2 if not (based on the
2674 * fact that the Latin 1 folds are already determined, and the only
2675 * multi-char fold in that range is the sharp-s folding to 'ss'.
2676 * Thus, a pattern character can match as little as 1/3 of a string
2677 * character. Adjust lnc accordingly, rounding up, so that if we
2678 * need to match at least 4+1/3 chars, that really is 5. */
2679 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2680 lnc = (lnc + expansion - 1) / expansion;
2681
2682 /* As in the non-UTF8 case, if we have to match 3 characters, and
2683 * only 2 are left, it's guaranteed to fail, so don't start a match
2684 * that would require us to go beyond the end of the string */
2685 e = HOP3c(strend, -((SSize_t)lnc), s);
2686
2687 /* XXX Note that we could recalculate e to stop the loop earlier,
2688 * as the worst case expansion above will rarely be met, and as we
2689 * go along we would usually find that e moves further to the left.
2690 * This would happen only after we reached the point in the loop
2691 * where if there were no expansion we should fail. Unclear if
2692 * worth the expense */
2693
2694 while (s <= e) {
2695 char *my_strend= (char *)strend;
2696 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2697 pat_string, NULL, ln, is_utf8_pat,
2698 utf8_fold_flags)
2699 && (reginfo->intuit || regtry(reginfo, &s)) )
2700 {
2701 goto got_it;
2702 }
2703 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2704 }
2705 }
2706 break;
2707
2708 case BOUNDA_tb_pb:
2709 case BOUNDA_tb_p8:
2710 case BOUND_tb_pb: /* /d without utf8 target is /a */
2711 case BOUND_tb_p8:
2712 /* regcomp.c makes sure that these only have the traditional \b
2713 * meaning. */
2714 assert(FLAGS(c) == TRADITIONAL_BOUND);
2715
2716 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2717 break;
2718
2719 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2720 case BOUNDA_t8_p8:
2721 /* regcomp.c makes sure that these only have the traditional \b
2722 * meaning. */
2723 assert(FLAGS(c) == TRADITIONAL_BOUND);
2724
2725 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2726 break;
2727
2728 case NBOUNDA_tb_pb:
2729 case NBOUNDA_tb_p8:
2730 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2731 case NBOUND_tb_p8:
2732 /* regcomp.c makes sure that these only have the traditional \b
2733 * meaning. */
2734 assert(FLAGS(c) == TRADITIONAL_BOUND);
2735
2736 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2737 break;
2738
2739 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2740 case NBOUNDA_t8_p8:
2741 /* regcomp.c makes sure that these only have the traditional \b
2742 * meaning. */
2743 assert(FLAGS(c) == TRADITIONAL_BOUND);
2744
2745 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2746 break;
2747
2748 case NBOUNDU_tb_pb:
2749 case NBOUNDU_tb_p8:
2750 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2751 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2752 break;
2753 }
2754
2755 to_complement = 1;
2756 goto do_boundu_non_utf8;
2757
2758 case NBOUNDL_tb_pb:
2759 case NBOUNDL_tb_p8:
2760 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2761 if (FLAGS(c) == TRADITIONAL_BOUND) {
2762 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2763 break;
2764 }
2765
2766 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2767
2768 to_complement = 1;
2769 goto do_boundu_non_utf8;
2770
2771 case BOUNDL_tb_pb:
2772 case BOUNDL_tb_p8:
2773 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2774 if (FLAGS(c) == TRADITIONAL_BOUND) {
2775 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2776 break;
2777 }
2778
2779 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2780
2781 goto do_boundu_non_utf8;
2782
2783 case BOUNDU_tb_pb:
2784 case BOUNDU_tb_p8:
2785 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2786 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2787 break;
2788 }
2789
2790 do_boundu_non_utf8:
2791 if (s == reginfo->strbeg) {
2792 if (reginfo->intuit || regtry(reginfo, &s))
2793 {
2794 goto got_it;
2795 }
2796
2797 /* Didn't match. Try at the next position (if there is one) */
2798 s++;
2799 if (UNLIKELY(s >= reginfo->strend)) {
2800 break;
2801 }
2802 }
2803
2804 switch((bound_type) FLAGS(c)) {
2805 case TRADITIONAL_BOUND: /* Should have already been handled */
2806 assert(0);
2807 break;
2808
2809 case GCB_BOUND:
2810 /* Not utf8. Everything is a GCB except between CR and LF */
2811 while (s < strend) {
2812 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2813 || UCHARAT(s) != '\n'))
2814 && (reginfo->intuit || regtry(reginfo, &s)))
2815 {
2816 goto got_it;
2817 }
2818 s++;
2819 }
2820
2821 break;
2822
2823 case LB_BOUND:
2824 {
2825 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2826 while (s < strend) {
2827 LB_enum after = getLB_VAL_CP((U8) *s);
2828 if (to_complement ^ isLB(before,
2829 after,
2830 (U8*) reginfo->strbeg,
2831 (U8*) s,
2832 (U8*) reginfo->strend,
2833 0 /* target not utf8 */ )
2834 && (reginfo->intuit || regtry(reginfo, &s)))
2835 {
2836 goto got_it;
2837 }
2838 before = after;
2839 s++;
2840 }
2841 }
2842
2843 break;
2844
2845 case SB_BOUND:
2846 {
2847 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2848 while (s < strend) {
2849 SB_enum after = getSB_VAL_CP((U8) *s);
2850 if ((to_complement ^ isSB(before,
2851 after,
2852 (U8*) reginfo->strbeg,
2853 (U8*) s,
2854 (U8*) reginfo->strend,
2855 0 /* target not utf8 */ ))
2856 && (reginfo->intuit || regtry(reginfo, &s)))
2857 {
2858 goto got_it;
2859 }
2860 before = after;
2861 s++;
2862 }
2863 }
2864
2865 break;
2866
2867 case WB_BOUND:
2868 {
2869 WB_enum previous = WB_UNKNOWN;
2870 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2871 while (s < strend) {
2872 WB_enum after = getWB_VAL_CP((U8) *s);
2873 if ((to_complement ^ isWB(previous,
2874 before,
2875 after,
2876 (U8*) reginfo->strbeg,
2877 (U8*) s,
2878 (U8*) reginfo->strend,
2879 0 /* target not utf8 */ ))
2880 && (reginfo->intuit || regtry(reginfo, &s)))
2881 {
2882 goto got_it;
2883 }
2884 previous = before;
2885 before = after;
2886 s++;
2887 }
2888 }
2889 }
2890
2891 /* Here are at the final position in the target string, which is a
2892 * boundary by definition, so matches, depending on other constraints.
2893 * */
2894 if ( reginfo->intuit
2895 || (s <= reginfo->strend && regtry(reginfo, &s)))
2896 {
2897 goto got_it;
2898 }
2899
2900 break;
2901
2902 case BOUNDL_t8_pb:
2903 case BOUNDL_t8_p8:
2904 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2905 if (FLAGS(c) == TRADITIONAL_BOUND) {
2906 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2907 isWORDCHAR_LC_utf8_safe);
2908 break;
2909 }
2910
2911 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2912
2913 to_complement = 1;
2914 goto do_boundu_utf8;
2915
2916 case NBOUNDL_t8_pb:
2917 case NBOUNDL_t8_p8:
2918 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2919 if (FLAGS(c) == TRADITIONAL_BOUND) {
2920 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2921 isWORDCHAR_LC_utf8_safe);
2922 break;
2923 }
2924
2925 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2926
2927 to_complement = 1;
2928 goto do_boundu_utf8;
2929
2930 case NBOUND_t8_pb:
2931 case NBOUND_t8_p8:
2932 /* regcomp.c makes sure that these only have the traditional \b
2933 * meaning. */
2934 assert(FLAGS(c) == TRADITIONAL_BOUND);
2935
2936 /* FALLTHROUGH */
2937
2938 case NBOUNDU_t8_pb:
2939 case NBOUNDU_t8_p8:
2940 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2941 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2942 isWORDCHAR_utf8_safe);
2943 break;
2944 }
2945
2946 to_complement = 1;
2947 goto do_boundu_utf8;
2948
2949 case BOUND_t8_pb:
2950 case BOUND_t8_p8:
2951 /* regcomp.c makes sure that these only have the traditional \b
2952 * meaning. */
2953 assert(FLAGS(c) == TRADITIONAL_BOUND);
2954
2955 /* FALLTHROUGH */
2956
2957 case BOUNDU_t8_pb:
2958 case BOUNDU_t8_p8:
2959 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2960 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2961 break;
2962 }
2963
2964 do_boundu_utf8:
2965 if (s == reginfo->strbeg) {
2966 if (reginfo->intuit || regtry(reginfo, &s))
2967 {
2968 goto got_it;
2969 }
2970
2971 /* Didn't match. Try at the next position (if there is one) */
2972 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2973 if (UNLIKELY(s >= reginfo->strend)) {
2974 break;
2975 }
2976 }
2977
2978 switch((bound_type) FLAGS(c)) {
2979 case TRADITIONAL_BOUND: /* Should have already been handled */
2980 assert(0);
2981 break;
2982
2983 case GCB_BOUND:
2984 {
2985 GCB_enum before = getGCB_VAL_UTF8(
2986 reghop3((U8*)s, -1,
2987 (U8*)(reginfo->strbeg)),
2988 (U8*) reginfo->strend);
2989 while (s < strend) {
2990 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2991 (U8*) reginfo->strend);
2992 if ( (to_complement ^ isGCB(before,
2993 after,
2994 (U8*) reginfo->strbeg,
2995 (U8*) s,
2996 1 /* target is utf8 */ ))
2997 && (reginfo->intuit || regtry(reginfo, &s)))
2998 {
2999 goto got_it;
3000 }
3001 before = after;
3002 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3003 }
3004 }
3005 break;
3006
3007 case LB_BOUND:
3008 {
3009 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
3010 -1,
3011 (U8*)(reginfo->strbeg)),
3012 (U8*) reginfo->strend);
3013 while (s < strend) {
3014 LB_enum after = getLB_VAL_UTF8((U8*) s,
3015 (U8*) reginfo->strend);
3016 if (to_complement ^ isLB(before,
3017 after,
3018 (U8*) reginfo->strbeg,
3019 (U8*) s,
3020 (U8*) reginfo->strend,
3021 1 /* target is utf8 */ )
3022 && (reginfo->intuit || regtry(reginfo, &s)))
3023 {
3024 goto got_it;
3025 }
3026 before = after;
3027 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3028 }
3029 }
3030
3031 break;
3032
3033 case SB_BOUND:
3034 {
3035 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
3036 -1,
3037 (U8*)(reginfo->strbeg)),
3038 (U8*) reginfo->strend);
3039 while (s < strend) {
3040 SB_enum after = getSB_VAL_UTF8((U8*) s,
3041 (U8*) reginfo->strend);
3042 if ((to_complement ^ isSB(before,
3043 after,
3044 (U8*) reginfo->strbeg,
3045 (U8*) s,
3046 (U8*) reginfo->strend,
3047 1 /* target is utf8 */ ))
3048 && (reginfo->intuit || regtry(reginfo, &s)))
3049 {
3050 goto got_it;
3051 }
3052 before = after;
3053 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3054 }
3055 }
3056
3057 break;
3058
3059 case WB_BOUND:
3060 {
3061 /* We are at a boundary between char_sub_0 and char_sub_1.
3062 * We also keep track of the value for char_sub_-1 as we
3063 * loop through the line. Context may be needed to make a
3064 * determination, and if so, this can save having to
3065 * recalculate it */
3066 WB_enum previous = WB_UNKNOWN;
3067 WB_enum before = getWB_VAL_UTF8(
3068 reghop3((U8*)s,
3069 -1,
3070 (U8*)(reginfo->strbeg)),
3071 (U8*) reginfo->strend);
3072 while (s < strend) {
3073 WB_enum after = getWB_VAL_UTF8((U8*) s,
3074 (U8*) reginfo->strend);
3075 if ((to_complement ^ isWB(previous,
3076 before,
3077 after,
3078 (U8*) reginfo->strbeg,
3079 (U8*) s,
3080 (U8*) reginfo->strend,
3081 1 /* target is utf8 */ ))
3082 && (reginfo->intuit || regtry(reginfo, &s)))
3083 {
3084 goto got_it;
3085 }
3086 previous = before;
3087 before = after;
3088 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3089 }
3090 }
3091 }
3092
3093 /* Here are at the final position in the target string, which is a
3094 * boundary by definition, so matches, depending on other constraints.
3095 * */
3096
3097 if ( reginfo->intuit
3098 || (s <= reginfo->strend && regtry(reginfo, &s)))
3099 {
3100 goto got_it;
3101 }
3102 break;
3103
3104 case LNBREAK_t8_pb:
3105 case LNBREAK_t8_p8:
3106 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3107 break;
3108
3109 case LNBREAK_tb_pb:
3110 case LNBREAK_tb_p8:
3111 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3112 break;
3113
3114 /* The argument to all the POSIX node types is the class number to pass
3115 * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3116
3117 case NPOSIXL_t8_pb:
3118 case NPOSIXL_t8_p8:
3119 to_complement = 1;
3120 /* FALLTHROUGH */
3121
3122 case POSIXL_t8_pb:
3123 case POSIXL_t8_p8:
3124 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3125 REXEC_FBC_UTF8_CLASS_SCAN(
3126 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3127 (U8 *) strend)));
3128 break;
3129
3130 case NPOSIXL_tb_pb:
3131 case NPOSIXL_tb_p8:
3132 to_complement = 1;
3133 /* FALLTHROUGH */
3134
3135 case POSIXL_tb_pb:
3136 case POSIXL_tb_p8:
3137 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3138 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3139 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3140 break;
3141
3142 case NPOSIXA_t8_pb:
3143 case NPOSIXA_t8_p8:
3144 /* The complement of something that matches only ASCII matches all
3145 * non-ASCII, plus everything in ASCII that isn't in the class. */
3146 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3147 || ! generic_isCC_A_(*s, FLAGS(c)));
3148 break;
3149
3150 case POSIXA_t8_pb:
3151 case POSIXA_t8_p8:
3152 /* Don't need to worry about utf8, as it can match only a single
3153 * byte invariant character. But we do anyway for performance reasons,
3154 * as otherwise we would have to examine all the continuation
3155 * characters */
3156 REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3157 break;
3158
3159 case NPOSIXD_tb_pb:
3160 case NPOSIXD_tb_p8:
3161 case NPOSIXA_tb_pb:
3162 case NPOSIXA_tb_p8:
3163 to_complement = 1;
3164 /* FALLTHROUGH */
3165
3166 case POSIXD_tb_pb:
3167 case POSIXD_tb_p8:
3168 case POSIXA_tb_pb:
3169 case POSIXA_tb_p8:
3170 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3171 to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3172 break;
3173
3174 case NPOSIXU_tb_pb:
3175 case NPOSIXU_tb_p8:
3176 to_complement = 1;
3177 /* FALLTHROUGH */
3178
3179 case POSIXU_tb_pb:
3180 case POSIXU_tb_p8:
3181 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3182 to_complement ^ cBOOL(generic_isCC_(*s,
3183 FLAGS(c))));
3184 break;
3185
3186 case NPOSIXD_t8_pb:
3187 case NPOSIXD_t8_p8:
3188 case NPOSIXU_t8_pb:
3189 case NPOSIXU_t8_p8:
3190 to_complement = 1;
3191 /* FALLTHROUGH */
3192
3193 case POSIXD_t8_pb:
3194 case POSIXD_t8_p8:
3195 case POSIXU_t8_pb:
3196 case POSIXU_t8_p8:
3197 classnum = (char_class_number_) FLAGS(c);
3198 switch (classnum) {
3199 default:
3200 REXEC_FBC_UTF8_CLASS_SCAN(
3201 to_complement ^ cBOOL(_invlist_contains_cp(
3202 PL_XPosix_ptrs[classnum],
3203 utf8_to_uvchr_buf((U8 *) s,
3204 (U8 *) strend,
3205 NULL))));
3206 break;
3207
3208 case CC_ENUM_SPACE_:
3209 REXEC_FBC_UTF8_CLASS_SCAN(
3210 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3211 break;
3212
3213 case CC_ENUM_BLANK_:
3214 REXEC_FBC_UTF8_CLASS_SCAN(
3215 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3216 break;
3217
3218 case CC_ENUM_XDIGIT_:
3219 REXEC_FBC_UTF8_CLASS_SCAN(
3220 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3221 break;
3222
3223 case CC_ENUM_VERTSPACE_:
3224 REXEC_FBC_UTF8_CLASS_SCAN(
3225 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3226 break;
3227
3228 case CC_ENUM_CNTRL_:
3229 REXEC_FBC_UTF8_CLASS_SCAN(
3230 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3231 break;
3232 }
3233 break;
3234
3235 case AHOCORASICKC_tb_pb:
3236 case AHOCORASICKC_tb_p8:
3237 case AHOCORASICKC_t8_pb:
3238 case AHOCORASICKC_t8_p8:
3239 case AHOCORASICK_tb_pb:
3240 case AHOCORASICK_tb_p8:
3241 case AHOCORASICK_t8_pb:
3242 case AHOCORASICK_t8_p8:
3243 {
3244 DECL_TRIE_TYPE(c);
3245 /* what trie are we using right now */
3246 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ];
3247 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3248 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3249
3250 const char *last_start = strend - trie->minlen;
3251 #ifdef DEBUGGING
3252 const char *real_start = s;
3253 #endif
3254 STRLEN maxlen = trie->maxlen;
3255 SV *sv_points;
3256 U8 **points; /* map of where we were in the input string
3257 when reading a given char. For ASCII this
3258 is unnecessary overhead as the relationship
3259 is always 1:1, but for Unicode, especially
3260 case folded Unicode this is not true. */
3261 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3262 U8 *bitmap=NULL;
3263
3264
3265 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3266
3267 /* We can't just allocate points here. We need to wrap it in
3268 * an SV so it gets freed properly if there is a croak while
3269 * running the match */
3270 ENTER;
3271 SAVETMPS;
3272 sv_points=newSV(maxlen * sizeof(U8 *));
3273 SvCUR_set(sv_points,
3274 maxlen * sizeof(U8 *));
3275 SvPOK_on(sv_points);
3276 sv_2mortal(sv_points);
3277 points=(U8**)SvPV_nolen(sv_points );
3278 if ( trie_type != trie_utf8_fold
3279 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3280 {
3281 if (trie->bitmap)
3282 bitmap=(U8*)trie->bitmap;
3283 else
3284 bitmap=(U8*)ANYOF_BITMAP(c);
3285 }
3286 /* this is the Aho-Corasick algorithm modified a touch
3287 to include special handling for long "unknown char" sequences.
3288 The basic idea being that we use AC as long as we are dealing
3289 with a possible matching char, when we encounter an unknown char
3290 (and we have not encountered an accepting state) we scan forward
3291 until we find a legal starting char.
3292 AC matching is basically that of trie matching, except that when
3293 we encounter a failing transition, we fall back to the current
3294 states "fail state", and try the current char again, a process
3295 we repeat until we reach the root state, state 1, or a legal
3296 transition. If we fail on the root state then we can either
3297 terminate if we have reached an accepting state previously, or
3298 restart the entire process from the beginning if we have not.
3299
3300 */
3301 while (s <= last_start) {
3302 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3303 U8 *uc = (U8*)s;
3304 U16 charid = 0;
3305 U32 base = 1;
3306 U32 state = 1;
3307 UV uvc = 0;
3308 STRLEN len = 0;
3309 STRLEN foldlen = 0;
3310 U8 *uscan = (U8*)NULL;
3311 U8 *leftmost = NULL;
3312 #ifdef DEBUGGING
3313 U32 accepted_word= 0;
3314 #endif
3315 U32 pointpos = 0;
3316
3317 while ( state && uc <= (U8*)strend ) {
3318 int failed=0;
3319 U32 word = aho->states[ state ].wordnum;
3320
3321 if( state==1 ) {
3322 if ( bitmap ) {
3323 DEBUG_TRIE_EXECUTE_r(
3324 if ( uc <= (U8*)last_start
3325 && !BITMAP_TEST(bitmap,*uc) )
3326 {
3327 dump_exec_pos( (char *)uc, c, strend,
3328 real_start,
3329 (char *)uc, utf8_target, 0 );
3330 Perl_re_printf( aTHX_
3331 " Scanning for legal start char...\n");
3332 }
3333 );
3334 if (utf8_target) {
3335 while ( uc <= (U8*)last_start
3336 && !BITMAP_TEST(bitmap,*uc) )
3337 {
3338 uc += UTF8SKIP(uc);
3339 }
3340 } else {
3341 while ( uc <= (U8*)last_start
3342 && ! BITMAP_TEST(bitmap,*uc) )
3343 {
3344 uc++;
3345 }
3346 }
3347 s= (char *)uc;
3348 }
3349 if (uc >(U8*)last_start) break;
3350 }
3351
3352 if ( word ) {
3353 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3354 % maxlen ];
3355 if (!leftmost || lpos < leftmost) {
3356 DEBUG_r(accepted_word=word);
3357 leftmost= lpos;
3358 }
3359 if (base==0) break;
3360
3361 }
3362 points[pointpos++ % maxlen]= uc;
3363 if (foldlen || uc < (U8*)strend) {
3364 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3365 (U8 *) strend, uscan, len, uvc,
3366 charid, foldlen, foldbuf,
3367 uniflags);
3368 DEBUG_TRIE_EXECUTE_r({
3369 dump_exec_pos( (char *)uc, c, strend,
3370 real_start, s, utf8_target, 0);
3371 Perl_re_printf( aTHX_
3372 " Charid:%3u CP:%4" UVxf " ",
3373 charid, uvc);
3374 });
3375 }
3376 else {
3377 len = 0;
3378 charid = 0;
3379 }
3380
3381
3382 do {
3383 #ifdef DEBUGGING
3384 word = aho->states[ state ].wordnum;
3385 #endif
3386 base = aho->states[ state ].trans.base;
3387
3388 DEBUG_TRIE_EXECUTE_r({
3389 if (failed)
3390 dump_exec_pos((char *)uc, c, strend, real_start,
3391 s, utf8_target, 0 );
3392 Perl_re_printf( aTHX_
3393 "%sState: %4" UVxf ", word=%" UVxf,
3394 failed ? " Fail transition to " : "",
3395 (UV)state, (UV)word);
3396 });
3397 if ( base ) {
3398 U32 tmp;
3399 I32 offset;
3400 if (charid &&
3401 ( ((offset = base + charid
3402 - 1 - trie->uniquecharcount)) >= 0)
3403 && ((U32)offset < trie->lasttrans)
3404 && trie->trans[offset].check == state
3405 && (tmp=trie->trans[offset].next))
3406 {
3407 DEBUG_TRIE_EXECUTE_r(
3408 Perl_re_printf( aTHX_ " - legal\n"));
3409 state = tmp;
3410 break;
3411 }
3412 else {
3413 DEBUG_TRIE_EXECUTE_r(
3414 Perl_re_printf( aTHX_ " - fail\n"));
3415 failed = 1;
3416 state = aho->fail[state];
3417 }
3418 }
3419 else {
3420 /* we must be accepting here */
3421 DEBUG_TRIE_EXECUTE_r(
3422 Perl_re_printf( aTHX_ " - accepting\n"));
3423 failed = 1;
3424 break;
3425 }
3426 } while(state);
3427 uc += len;
3428 if (failed) {
3429 if (leftmost)
3430 break;
3431 if (!state) state = 1;
3432 }
3433 }
3434 if ( aho->states[ state ].wordnum ) {
3435 U8 *lpos = points[ (pointpos
3436 - trie->wordinfo[aho->states[ state ]
3437 .wordnum].len) % maxlen ];
3438 if (!leftmost || lpos < leftmost) {
3439 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3440 leftmost = lpos;
3441 }
3442 }
3443 if (leftmost) {
3444 s = (char*)leftmost;
3445 DEBUG_TRIE_EXECUTE_r({
3446 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3447 " at position %" IVdf ". Trying full"
3448 " pattern...\n",
3449 (UV)accepted_word, (IV)(s - real_start)
3450 );
3451 });
3452 if (reginfo->intuit || regtry(reginfo, &s)) {
3453 FREETMPS;
3454 LEAVE;
3455 goto got_it;
3456 }
3457 if (s < reginfo->strend) {
3458 s = HOPc(s,1);
3459 }
3460 DEBUG_TRIE_EXECUTE_r({
3461 Perl_re_printf( aTHX_
3462 "Pattern failed. Looking for new start"
3463 " point...\n");
3464 });
3465 } else {
3466 DEBUG_TRIE_EXECUTE_r(
3467 Perl_re_printf( aTHX_ "No match.\n"));
3468 break;
3469 }
3470 }
3471 FREETMPS;
3472 LEAVE;
3473 }
3474 break;
3475
3476 case EXACTFU_REQ8_t8_pb:
3477 case EXACTFUP_tb_p8:
3478 case EXACTFUP_t8_p8:
3479 case EXACTF_tb_p8:
3480 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3481 case EXACTFAA_NO_TRIE_tb_p8:
3482 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3483 patterns */
3484 assert(0);
3485
3486 default:
3487 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3488 } /* End of switch on node type */
3489
3490 return 0;
3491
3492 got_it:
3493 return s;
3494 }
3495
3496 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3497 * flags have same meanings as with regexec_flags() */
3498
3499 static void
S_reg_set_capture_string(pTHX_ REGEXP * const rx,char * strbeg,char * strend,SV * sv,U32 flags,bool utf8_target)3500 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3501 char *strbeg,
3502 char *strend,
3503 SV *sv,
3504 U32 flags,
3505 bool utf8_target)
3506 {
3507 struct regexp *const prog = ReANY(rx);
3508
3509 if (flags & REXEC_COPY_STR) {
3510 #ifdef PERL_ANY_COW
3511 if (SvCANCOW(sv)) {
3512 DEBUG_C(Perl_re_printf( aTHX_
3513 "Copy on write: regexp capture, type %d\n",
3514 (int) SvTYPE(sv)));
3515 /* Create a new COW SV to share the match string and store
3516 * in saved_copy, unless the current COW SV in saved_copy
3517 * is valid and suitable for our purpose */
3518 if (( RXp_SAVED_COPY(prog)
3519 && SvIsCOW(RXp_SAVED_COPY(prog))
3520 && SvPOKp(RXp_SAVED_COPY(prog))
3521 && SvIsCOW(sv)
3522 && SvPOKp(sv)
3523 && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog))))
3524 {
3525 /* just reuse saved_copy SV */
3526 if (RXp_MATCH_COPIED(prog)) {
3527 Safefree(RXp_SUBBEG(prog));
3528 RXp_MATCH_COPIED_off(prog);
3529 }
3530 }
3531 else {
3532 /* create new COW SV to share string */
3533 RXp_MATCH_COPY_FREE(prog);
3534 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv);
3535 }
3536 RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog));
3537 assert (SvPOKp(RXp_SAVED_COPY(prog)));
3538 RXp_SUBLEN(prog) = strend - strbeg;
3539 RXp_SUBOFFSET(prog) = 0;
3540 RXp_SUBCOFFSET(prog) = 0;
3541 } else
3542 #endif
3543 {
3544 SSize_t min = 0;
3545 SSize_t max = strend - strbeg;
3546 SSize_t sublen;
3547
3548 if ( (flags & REXEC_COPY_SKIP_POST)
3549 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3550 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3551 ) { /* don't copy $' part of string */
3552 SSize_t offs_end;
3553 U32 n = 0;
3554 max = -1;
3555 /* calculate the right-most part of the string covered
3556 * by a capture. Due to lookahead, this may be to
3557 * the right of $&, so we have to scan all captures */
3558 while (n <= RXp_LASTPAREN(prog)) {
3559 if ((offs_end = RXp_OFFS_END(prog,n)) > max)
3560 max = offs_end;
3561 n++;
3562 }
3563 if (max == -1)
3564 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3565 ? RXp_OFFS_START(prog,0)
3566 : 0;
3567 assert(max >= 0 && max <= strend - strbeg);
3568 }
3569
3570 if ( (flags & REXEC_COPY_SKIP_PRE)
3571 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3572 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3573 ) { /* don't copy $` part of string */
3574 U32 n = 0;
3575 min = max;
3576 /* calculate the left-most part of the string covered
3577 * by a capture. Due to lookbehind, this may be to
3578 * the left of $&, so we have to scan all captures */
3579 while (min && n <= RXp_LASTPAREN(prog)) {
3580 I32 start = RXp_OFFS_START(prog,n);
3581 if ( start != -1
3582 && start < min)
3583 {
3584 min = start;
3585 }
3586 n++;
3587 }
3588 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3589 && min > RXp_OFFS_END(prog,0)
3590 )
3591 min = RXp_OFFS_END(prog,0);
3592
3593 }
3594
3595 assert(min >= 0 && min <= max && min <= strend - strbeg);
3596 sublen = max - min;
3597
3598 if (RXp_MATCH_COPIED(prog)) {
3599 if (sublen > RXp_SUBLEN(prog))
3600 RXp_SUBBEG(prog) =
3601 (char*)saferealloc(RXp_SUBBEG(prog), sublen+1);
3602 }
3603 else
3604 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1);
3605 Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char);
3606 RXp_SUBBEG(prog)[sublen] = '\0';
3607 RXp_SUBOFFSET(prog) = min;
3608 RXp_SUBLEN(prog) = sublen;
3609 RXp_MATCH_COPIED_on(prog);
3610 }
3611 RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog);
3612 if (RXp_SUBOFFSET(prog) && utf8_target) {
3613 /* Convert byte offset to chars.
3614 * XXX ideally should only compute this if @-/@+
3615 * has been seen, a la PL_sawampersand ??? */
3616
3617 /* If there's a direct correspondence between the
3618 * string which we're matching and the original SV,
3619 * then we can use the utf8 len cache associated with
3620 * the SV. In particular, it means that under //g,
3621 * sv_pos_b2u() will use the previously cached
3622 * position to speed up working out the new length of
3623 * subcoffset, rather than counting from the start of
3624 * the string each time. This stops
3625 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3626 * from going quadratic */
3627 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3628 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog),
3629 SV_GMAGIC|SV_CONST_RETURN);
3630 else
3631 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg,
3632 (U8*)(strbeg+RXp_SUBOFFSET(prog)));
3633 }
3634 }
3635 else {
3636 RXp_MATCH_COPY_FREE(prog);
3637 RXp_SUBBEG(prog) = strbeg;
3638 RXp_SUBOFFSET(prog) = 0;
3639 RXp_SUBCOFFSET(prog) = 0;
3640 RXp_SUBLEN(prog) = strend - strbeg;
3641 }
3642 }
3643
3644
3645
3646
3647 /*
3648 - regexec_flags - match a regexp against a string
3649 */
3650 I32
Perl_regexec_flags(pTHX_ REGEXP * const rx,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * sv,void * data,U32 flags)3651 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3652 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3653 /* stringarg: the point in the string at which to begin matching */
3654 /* strend: pointer to null at end of string */
3655 /* strbeg: real beginning of string */
3656 /* minend: end of match must be >= minend bytes after stringarg. */
3657 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3658 * itself is accessed via the pointers above */
3659 /* data: May be used for some additional optimizations.
3660 Currently unused. */
3661 /* flags: For optimizations. See REXEC_* in regexp.h */
3662
3663 {
3664 struct regexp *const prog = ReANY(rx);
3665 char *s;
3666 regnode *c;
3667 char *startpos;
3668 SSize_t minlen; /* must match at least this many chars */
3669 SSize_t dontbother = 0; /* how many characters not to try at end */
3670 const bool utf8_target = cBOOL(DO_UTF8(sv));
3671 I32 multiline;
3672 RXi_GET_DECL(prog,progi);
3673 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3674 regmatch_info *const reginfo = ®info_buf;
3675 regexp_paren_pair *swap = NULL;
3676 I32 oldsave;
3677 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3678
3679 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3680 PERL_UNUSED_ARG(data);
3681
3682 /* Be paranoid... */
3683 if (prog == NULL) {
3684 Perl_croak(aTHX_ "NULL regexp parameter");
3685 }
3686
3687 DEBUG_EXECUTE_r(
3688 debug_start_match(rx, utf8_target, stringarg, strend,
3689 "Matching");
3690 );
3691
3692 startpos = stringarg;
3693
3694 /* set these early as they may be used by the HOP macros below */
3695 reginfo->strbeg = strbeg;
3696 reginfo->strend = strend;
3697 reginfo->is_utf8_target = cBOOL(utf8_target);
3698
3699 if (prog->intflags & PREGf_GPOS_SEEN) {
3700 MAGIC *mg;
3701
3702 /* set reginfo->ganch, the position where \G can match */
3703
3704 reginfo->ganch =
3705 (flags & REXEC_IGNOREPOS)
3706 ? stringarg /* use start pos rather than pos() */
3707 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3708 /* Defined pos(): */
3709 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3710 : strbeg; /* pos() not defined; use start of string */
3711
3712 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3713 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3714
3715 /* in the presence of \G, we may need to start looking earlier in
3716 * the string than the suggested start point of stringarg:
3717 * if prog->gofs is set, then that's a known, fixed minimum
3718 * offset, such as
3719 * /..\G/: gofs = 2
3720 * /ab|c\G/: gofs = 1
3721 * or if the minimum offset isn't known, then we have to go back
3722 * to the start of the string, e.g. /w+\G/
3723 */
3724
3725 if (prog->intflags & PREGf_ANCH_GPOS) {
3726 if (prog->gofs) {
3727 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3728 if (!startpos ||
3729 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3730 {
3731 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3732 "fail: ganch-gofs before earliest possible start\n"));
3733 return 0;
3734 }
3735 }
3736 else
3737 startpos = reginfo->ganch;
3738 }
3739 else if (prog->gofs) {
3740 startpos = HOPBACKc(startpos, prog->gofs);
3741 if (!startpos)
3742 startpos = strbeg;
3743 }
3744 else if (prog->intflags & PREGf_GPOS_FLOAT)
3745 startpos = strbeg;
3746 }
3747
3748 minlen = prog->minlen;
3749 if ((startpos + minlen) > strend || startpos < strbeg) {
3750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3751 "Regex match can't succeed, so not even tried\n"));
3752 return 0;
3753 }
3754
3755 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3756 * which will call destuctors to reset PL_regmatch_state, free higher
3757 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3758 * regmatch_info_aux_eval */
3759
3760 oldsave = PL_savestack_ix;
3761
3762 s = startpos;
3763
3764 if ((prog->extflags & RXf_USE_INTUIT)
3765 && !(flags & REXEC_CHECKED))
3766 {
3767 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3768 flags, NULL);
3769 if (!s)
3770 return 0;
3771
3772 if (prog->extflags & RXf_CHECK_ALL) {
3773 /* we can match based purely on the result of INTUIT.
3774 * Set up captures etc just for $& and $-[0]
3775 * (an intuit-only match wont have $1,$2,..) */
3776 assert(!prog->nparens);
3777
3778 /* s/// doesn't like it if $& is earlier than where we asked it to
3779 * start searching (which can happen on something like /.\G/) */
3780 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3781 && (s < stringarg))
3782 {
3783 /* this should only be possible under \G */
3784 assert(prog->intflags & PREGf_GPOS_SEEN);
3785 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3786 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3787 goto phooey;
3788 }
3789
3790 /* match via INTUIT shouldn't have any captures.
3791 * Let @-, @+, $^N know */
3792 RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0;
3793 RXp_MATCH_UTF8_set(prog, utf8_target);
3794 SSize_t match_start = s - strbeg;
3795 SSize_t match_end = utf8_target
3796 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3797 : s - strbeg + prog->minlenret;
3798 CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end);
3799 if ( !(flags & REXEC_NOT_FIRST) )
3800 S_reg_set_capture_string(aTHX_ rx,
3801 strbeg, strend,
3802 sv, flags, utf8_target);
3803
3804 return 1;
3805 }
3806 }
3807
3808 multiline = prog->extflags & RXf_PMf_MULTILINE;
3809
3810 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3811 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3812 "String too short [regexec_flags]...\n"));
3813 goto phooey;
3814 }
3815
3816 /* Check validity of program. */
3817 if (UCHARAT(progi->program) != REG_MAGIC) {
3818 Perl_croak(aTHX_ "corrupted regexp program");
3819 }
3820
3821 RXp_MATCH_TAINTED_off(prog);
3822 RXp_MATCH_UTF8_set(prog, utf8_target);
3823
3824 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3825 reginfo->intuit = 0;
3826 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3827 reginfo->warned = FALSE;
3828 reginfo->sv = sv;
3829 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3830 /* see how far we have to get to not match where we matched before */
3831 reginfo->till = stringarg + minend;
3832
3833 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3834 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3835 S_cleanup_regmatch_info_aux has executed (registered by
3836 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3837 magic belonging to this SV.
3838 Not newSVsv, either, as it does not COW.
3839 */
3840 reginfo->sv = newSV_type(SVt_NULL);
3841 SvSetSV_nosteal(reginfo->sv, sv);
3842 SAVEFREESV(reginfo->sv);
3843 }
3844
3845 /* reserve next 2 or 3 slots in PL_regmatch_state:
3846 * slot N+0: may currently be in use: skip it
3847 * slot N+1: use for regmatch_info_aux struct
3848 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3849 * slot N+3: ready for use by regmatch()
3850 */
3851
3852 {
3853 regmatch_state *old_regmatch_state;
3854 regmatch_slab *old_regmatch_slab;
3855 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3856
3857 /* on first ever match, allocate first slab */
3858 if (!PL_regmatch_slab) {
3859 Newx(PL_regmatch_slab, 1, regmatch_slab);
3860 PL_regmatch_slab->prev = NULL;
3861 PL_regmatch_slab->next = NULL;
3862 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3863 }
3864
3865 old_regmatch_state = PL_regmatch_state;
3866 old_regmatch_slab = PL_regmatch_slab;
3867
3868 for (i=0; i <= max; i++) {
3869 if (i == 1)
3870 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3871 else if (i ==2)
3872 reginfo->info_aux_eval =
3873 reginfo->info_aux->info_aux_eval =
3874 &(PL_regmatch_state->u.info_aux_eval);
3875
3876 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3877 PL_regmatch_state = S_push_slab(aTHX);
3878 }
3879
3880 /* note initial PL_regmatch_state position; at end of match we'll
3881 * pop back to there and free any higher slabs */
3882
3883 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3884 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3885 reginfo->info_aux->poscache = NULL;
3886
3887 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3888
3889 if ((prog->extflags & RXf_EVAL_SEEN))
3890 S_setup_eval_state(aTHX_ reginfo);
3891 else
3892 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3893 }
3894
3895 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3896 /* We have to be careful. If the previous successful match
3897 was from this regex we don't want a subsequent partially
3898 successful match to clobber the old results.
3899 So when we detect this possibility we add a swap buffer
3900 to the re, and switch the buffer each match. If we fail,
3901 we switch it back; otherwise we leave it swapped.
3902 */
3903 swap = RXp_OFFSp(prog);
3904 /* avoid leak if we die, or clean up anyway if match completes */
3905 SAVEFREEPV(swap);
3906 Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair);
3907 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3908 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3909 0,
3910 PTR2UV(prog),
3911 PTR2UV(swap),
3912 PTR2UV(RXp_OFFSp(prog))
3913 ));
3914 }
3915
3916 if (prog->recurse_locinput)
3917 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3918
3919 /* Simplest case: anchored match (but not \G) need be tried only once,
3920 * or with MBOL, only at the beginning of each line.
3921 *
3922 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3923 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3924 * match at the start of the string then it won't match anywhere else
3925 * either; while with /.*.../, if it doesn't match at the beginning,
3926 * the earliest it could match is at the start of the next line */
3927
3928 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3929 char *end;
3930
3931 if (regtry(reginfo, &s))
3932 goto got_it;
3933
3934 if (!(prog->intflags & PREGf_ANCH_MBOL))
3935 goto phooey;
3936
3937 /* didn't match at start, try at other newline positions */
3938
3939 if (minlen)
3940 dontbother = minlen - 1;
3941 end = HOP3c(strend, -dontbother, strbeg) - 1;
3942
3943 /* skip to next newline */
3944
3945 while (s <= end) { /* note it could be possible to match at the end of the string */
3946 /* NB: newlines are the same in unicode as they are in latin */
3947 if (*s++ != '\n')
3948 continue;
3949 if (prog->check_substr || prog->check_utf8) {
3950 /* note that with PREGf_IMPLICIT, intuit can only fail
3951 * or return the start position, so it's of limited utility.
3952 * Nevertheless, I made the decision that the potential for
3953 * quick fail was still worth it - DAPM */
3954 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3955 if (!s)
3956 goto phooey;
3957 }
3958 if (regtry(reginfo, &s))
3959 goto got_it;
3960 }
3961 goto phooey;
3962 } /* end anchored search */
3963
3964 /* anchored \G match */
3965 if (prog->intflags & PREGf_ANCH_GPOS)
3966 {
3967 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3968 assert(prog->intflags & PREGf_GPOS_SEEN);
3969 /* For anchored \G, the only position it can match from is
3970 * (ganch-gofs); we already set startpos to this above; if intuit
3971 * moved us on from there, we can't possibly succeed */
3972 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3973 if (s == startpos && regtry(reginfo, &s))
3974 goto got_it;
3975 goto phooey;
3976 }
3977
3978 /* Messy cases: unanchored match. */
3979
3980 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3981 /* we have /x+whatever/ */
3982 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3983 char ch;
3984 #ifdef DEBUGGING
3985 int did_match = 0;
3986 #endif
3987 if (utf8_target) {
3988 if (! prog->anchored_utf8) {
3989 to_utf8_substr(prog);
3990 }
3991 ch = SvPVX_const(prog->anchored_utf8)[0];
3992 REXEC_FBC_UTF8_SCAN(
3993 if (*s == ch) {
3994 DEBUG_EXECUTE_r( did_match = 1 );
3995 if (regtry(reginfo, &s)) goto got_it;
3996 s += UTF8_SAFE_SKIP(s, strend);
3997 while (s < strend && *s == ch)
3998 s += UTF8SKIP(s);
3999 }
4000 );
4001
4002 }
4003 else {
4004 if (! prog->anchored_substr) {
4005 if (! to_byte_substr(prog)) {
4006 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4007 }
4008 }
4009 ch = SvPVX_const(prog->anchored_substr)[0];
4010 REXEC_FBC_NON_UTF8_SCAN(
4011 if (*s == ch) {
4012 DEBUG_EXECUTE_r( did_match = 1 );
4013 if (regtry(reginfo, &s)) goto got_it;
4014 s++;
4015 while (s < strend && *s == ch)
4016 s++;
4017 }
4018 );
4019 }
4020 DEBUG_EXECUTE_r(if (!did_match)
4021 Perl_re_printf( aTHX_
4022 "Did not find anchored character...\n")
4023 );
4024 }
4025 else if (prog->anchored_substr != NULL
4026 || prog->anchored_utf8 != NULL
4027 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
4028 && prog->float_max_offset < strend - s)) {
4029 SV *must;
4030 SSize_t back_max;
4031 SSize_t back_min;
4032 char *last;
4033 char *last1; /* Last position checked before */
4034 #ifdef DEBUGGING
4035 int did_match = 0;
4036 #endif
4037 if (prog->anchored_substr || prog->anchored_utf8) {
4038 if (utf8_target) {
4039 if (! prog->anchored_utf8) {
4040 to_utf8_substr(prog);
4041 }
4042 must = prog->anchored_utf8;
4043 }
4044 else {
4045 if (! prog->anchored_substr) {
4046 if (! to_byte_substr(prog)) {
4047 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4048 }
4049 }
4050 must = prog->anchored_substr;
4051 }
4052 back_max = back_min = prog->anchored_offset;
4053 } else {
4054 if (utf8_target) {
4055 if (! prog->float_utf8) {
4056 to_utf8_substr(prog);
4057 }
4058 must = prog->float_utf8;
4059 }
4060 else {
4061 if (! prog->float_substr) {
4062 if (! to_byte_substr(prog)) {
4063 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4064 }
4065 }
4066 must = prog->float_substr;
4067 }
4068 back_max = prog->float_max_offset;
4069 back_min = prog->float_min_offset;
4070 }
4071
4072 if (back_min<0) {
4073 last = strend;
4074 } else {
4075 last = HOP3c(strend, /* Cannot start after this */
4076 -(SSize_t)(CHR_SVLEN(must)
4077 - (SvTAIL(must) != 0) + back_min), strbeg);
4078 }
4079 if (s > reginfo->strbeg)
4080 last1 = HOPc(s, -1);
4081 else
4082 last1 = s - 1; /* bogus */
4083
4084 /* XXXX check_substr already used to find "s", can optimize if
4085 check_substr==must. */
4086 dontbother = 0;
4087 strend = HOPc(strend, -dontbother);
4088 while ( (s <= last) &&
4089 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
4090 (unsigned char*)strend, must,
4091 multiline ? FBMrf_MULTILINE : 0)) ) {
4092 DEBUG_EXECUTE_r( did_match = 1 );
4093 if (HOPc(s, -back_max) > last1) {
4094 last1 = HOPc(s, -back_min);
4095 s = HOPc(s, -back_max);
4096 }
4097 else {
4098 char * const t = (last1 >= reginfo->strbeg)
4099 ? HOPc(last1, 1) : last1 + 1;
4100
4101 last1 = HOPc(s, -back_min);
4102 s = t;
4103 }
4104 if (utf8_target) {
4105 while (s <= last1) {
4106 if (regtry(reginfo, &s))
4107 goto got_it;
4108 if (s >= last1) {
4109 s++; /* to break out of outer loop */
4110 break;
4111 }
4112 s += UTF8SKIP(s);
4113 }
4114 }
4115 else {
4116 while (s <= last1) {
4117 if (regtry(reginfo, &s))
4118 goto got_it;
4119 s++;
4120 }
4121 }
4122 }
4123 DEBUG_EXECUTE_r(if (!did_match) {
4124 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4125 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4126 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
4127 ((must == prog->anchored_substr || must == prog->anchored_utf8)
4128 ? "anchored" : "floating"),
4129 quoted, RE_SV_TAIL(must));
4130 });
4131 goto phooey;
4132 }
4133 else if ( (c = progi->regstclass) ) {
4134 if (minlen) {
4135 const OPCODE op = OP(progi->regstclass);
4136 /* don't bother with what can't match */
4137 if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4138 strend = HOPc(strend, -(minlen - 1));
4139 }
4140 DEBUG_EXECUTE_r({
4141 SV * const prop = sv_newmortal();
4142 regprop(prog, prop, c, reginfo, NULL);
4143 {
4144 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4145 s,strend-s,PL_dump_re_max_len);
4146 Perl_re_printf( aTHX_
4147 "Matching stclass %.*s against %s (%d bytes)\n",
4148 (int)SvCUR(prop), SvPVX_const(prop),
4149 quoted, (int)(strend - s));
4150 }
4151 });
4152 if (find_byclass(prog, c, s, strend, reginfo))
4153 goto got_it;
4154 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4155 }
4156 else {
4157 dontbother = 0;
4158 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4159 /* Trim the end. */
4160 char *last= NULL;
4161 SV* float_real;
4162 STRLEN len;
4163 const char *little;
4164
4165 if (utf8_target) {
4166 if (! prog->float_utf8) {
4167 to_utf8_substr(prog);
4168 }
4169 float_real = prog->float_utf8;
4170 }
4171 else {
4172 if (! prog->float_substr) {
4173 if (! to_byte_substr(prog)) {
4174 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4175 }
4176 }
4177 float_real = prog->float_substr;
4178 }
4179
4180 little = SvPV_const(float_real, len);
4181 if (SvTAIL(float_real)) {
4182 /* This means that float_real contains an artificial \n on
4183 * the end due to the presence of something like this:
4184 * /foo$/ where we can match both "foo" and "foo\n" at the
4185 * end of the string. So we have to compare the end of the
4186 * string first against the float_real without the \n and
4187 * then against the full float_real with the string. We
4188 * have to watch out for cases where the string might be
4189 * smaller than the float_real or the float_real without
4190 * the \n. */
4191 char *checkpos= strend - len;
4192 DEBUG_OPTIMISE_r(
4193 Perl_re_printf( aTHX_
4194 "%sChecking for float_real.%s\n",
4195 PL_colors[4], PL_colors[5]));
4196 if (checkpos + 1 < strbeg) {
4197 /* can't match, even if we remove the trailing \n
4198 * string is too short to match */
4199 DEBUG_EXECUTE_r(
4200 Perl_re_printf( aTHX_
4201 "%sString shorter than required trailing substring, cannot match.%s\n",
4202 PL_colors[4], PL_colors[5]));
4203 goto phooey;
4204 } else if (memEQ(checkpos + 1, little, len - 1)) {
4205 /* can match, the end of the string matches without the
4206 * "\n" */
4207 last = checkpos + 1;
4208 } else if (checkpos < strbeg) {
4209 /* cant match, string is too short when the "\n" is
4210 * included */
4211 DEBUG_EXECUTE_r(
4212 Perl_re_printf( aTHX_
4213 "%sString does not contain required trailing substring, cannot match.%s\n",
4214 PL_colors[4], PL_colors[5]));
4215 goto phooey;
4216 } else if (!multiline) {
4217 /* non multiline match, so compare with the "\n" at the
4218 * end of the string */
4219 if (memEQ(checkpos, little, len)) {
4220 last= checkpos;
4221 } else {
4222 DEBUG_EXECUTE_r(
4223 Perl_re_printf( aTHX_
4224 "%sString does not contain required trailing substring, cannot match.%s\n",
4225 PL_colors[4], PL_colors[5]));
4226 goto phooey;
4227 }
4228 } else {
4229 /* multiline match, so we have to search for a place
4230 * where the full string is located */
4231 goto find_last;
4232 }
4233 } else {
4234 find_last:
4235 if (len)
4236 last = rninstr(s, strend, little, little + len);
4237 else
4238 last = strend; /* matching "$" */
4239 }
4240 if (!last) {
4241 /* at one point this block contained a comment which was
4242 * probably incorrect, which said that this was a "should not
4243 * happen" case. Even if it was true when it was written I am
4244 * pretty sure it is not anymore, so I have removed the comment
4245 * and replaced it with this one. Yves */
4246 DEBUG_EXECUTE_r(
4247 Perl_re_printf( aTHX_
4248 "%sString does not contain required substring, cannot match.%s\n",
4249 PL_colors[4], PL_colors[5]
4250 ));
4251 goto phooey;
4252 }
4253 dontbother = strend - last + prog->float_min_offset;
4254 }
4255 if (minlen && (dontbother < minlen))
4256 dontbother = minlen - 1;
4257 strend -= dontbother; /* this one's always in bytes! */
4258 /* We don't know much -- general case. */
4259 if (utf8_target) {
4260 for (;;) {
4261 if (regtry(reginfo, &s))
4262 goto got_it;
4263 if (s >= strend)
4264 break;
4265 s += UTF8SKIP(s);
4266 };
4267 }
4268 else {
4269 do {
4270 if (regtry(reginfo, &s))
4271 goto got_it;
4272 } while (s++ < strend);
4273 }
4274 }
4275
4276 /* Failure. */
4277 goto phooey;
4278
4279 got_it:
4280 /* s/// doesn't like it if $& is earlier than where we asked it to
4281 * start searching (which can happen on something like /.\G/) */
4282 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4283 && (RXp_OFFS_START(prog,0) < stringarg - strbeg))
4284 {
4285 /* this should only be possible under \G */
4286 assert(prog->intflags & PREGf_GPOS_SEEN);
4287 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4288 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4289 goto phooey;
4290 }
4291
4292 /* clean up; this will trigger destructors that will free all slabs
4293 * above the current one, and cleanup the regmatch_info_aux
4294 * and regmatch_info_aux_eval sructs */
4295
4296 LEAVE_SCOPE(oldsave);
4297
4298 if (RXp_PAREN_NAMES(prog))
4299 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4300
4301 /* make sure $`, $&, $', and $digit will work later */
4302 if ( !(flags & REXEC_NOT_FIRST) )
4303 S_reg_set_capture_string(aTHX_ rx,
4304 strbeg, reginfo->strend,
4305 sv, flags, utf8_target);
4306
4307 return 1;
4308
4309 phooey:
4310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4311 PL_colors[4], PL_colors[5]));
4312
4313 if (swap) {
4314 /* we failed :-( roll it back.
4315 * Since the swap buffer will be freed on scope exit which follows
4316 * shortly, restore the old captures by copying 'swap's original
4317 * data to the new offs buffer
4318 */
4319 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4320 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4321 0,
4322 PTR2UV(prog),
4323 PTR2UV(RXp_OFFSp(prog)),
4324 PTR2UV(swap)
4325 ));
4326
4327 Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair);
4328 }
4329
4330 /* clean up; this will trigger destructors that will free all slabs
4331 * above the current one, and cleanup the regmatch_info_aux
4332 * and regmatch_info_aux_eval sructs */
4333
4334 LEAVE_SCOPE(oldsave);
4335
4336 return 0;
4337 }
4338
4339
4340 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4341 * Do inc before dec, in case old and new rex are the same */
4342 #define SET_reg_curpm(Re2) \
4343 if (reginfo->info_aux_eval) { \
4344 (void)ReREFCNT_inc(Re2); \
4345 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4346 PM_SETRE((PL_reg_curpm), (Re2)); \
4347 }
4348
4349
4350 /*
4351 - regtry - try match at specific point
4352 */
4353 STATIC bool /* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info * reginfo,char ** startposp)4354 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4355 {
4356 CHECKPOINT lastcp;
4357 REGEXP *const rx = reginfo->prog;
4358 regexp *const prog = ReANY(rx);
4359 SSize_t result;
4360 #ifdef DEBUGGING
4361 U32 depth = 0; /* used by REGCP_SET */
4362 #endif
4363 RXi_GET_DECL(prog,progi);
4364 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4365
4366 PERL_ARGS_ASSERT_REGTRY;
4367
4368 reginfo->cutpoint=NULL;
4369
4370 RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg;
4371 RXp_LASTPAREN(prog) = 0;
4372 RXp_LASTCLOSEPAREN(prog) = 0;
4373
4374 /* XXXX What this code is doing here?!!! There should be no need
4375 to do this again and again, RXp_LASTPAREN(prog) should take care of
4376 this! --ilya*/
4377
4378 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4379 * Actually, the code in regcppop() (which Ilya may be meaning by
4380 * RXp_LASTPAREN(prog)), is not needed at all by the test suite
4381 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4382 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4383 * Meanwhile, this code *is* needed for the
4384 * above-mentioned test suite tests to succeed. The common theme
4385 * on those tests seems to be returning null fields from matches.
4386 * --jhi updated by dapm */
4387
4388 /* After encountering a variant of the issue mentioned above I think
4389 * the point Ilya was making is that if we properly unwind whenever
4390 * we set lastparen to a smaller value then we should not need to do
4391 * this every time, only when needed. So if we have tests that fail if
4392 * we remove this, then it suggests somewhere else we are improperly
4393 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4394 * places it is called, and related regcp() routines. - Yves */
4395 #if 1
4396 if (prog->nparens) {
4397 regexp_paren_pair *pp = RXp_OFFSp(prog);
4398 I32 i;
4399 for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) {
4400 ++pp;
4401 pp->start = -1;
4402 pp->end = -1;
4403 }
4404 }
4405 #endif
4406 REGCP_SET(lastcp);
4407 result = regmatch(reginfo, *startposp, progi->program + 1);
4408 if (result != -1) {
4409 RXp_OFFSp(prog)[0].end = result;
4410 return 1;
4411 }
4412 if (reginfo->cutpoint)
4413 *startposp= reginfo->cutpoint;
4414 REGCP_UNWIND(lastcp);
4415 return 0;
4416 }
4417
4418 /* this is used to determine how far from the left messages like
4419 'failed...' are printed in regexec.c. It should be set such that
4420 messages are inline with the regop output that created them.
4421 */
4422 #define REPORT_CODE_OFF 29
4423 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4424 #ifdef DEBUGGING
4425 int
Perl_re_exec_indentf(pTHX_ const char * fmt,U32 depth,...)4426 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4427 {
4428 va_list ap;
4429 int result;
4430 PerlIO *f= Perl_debug_log;
4431 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4432 va_start(ap, depth);
4433 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4434 result = PerlIO_vprintf(f, fmt, ap);
4435 va_end(ap);
4436 return result;
4437 }
4438 #endif /* DEBUGGING */
4439
4440 /* grab a new slab and return the first slot in it */
4441
4442 STATIC regmatch_state *
S_push_slab(pTHX)4443 S_push_slab(pTHX)
4444 {
4445 regmatch_slab *s = PL_regmatch_slab->next;
4446 if (!s) {
4447 Newx(s, 1, regmatch_slab);
4448 s->prev = PL_regmatch_slab;
4449 s->next = NULL;
4450 PL_regmatch_slab->next = s;
4451 }
4452 PL_regmatch_slab = s;
4453 return SLAB_FIRST(s);
4454 }
4455
4456 #ifdef DEBUGGING
4457
4458 STATIC void
S_debug_start_match(pTHX_ const REGEXP * prog,const bool utf8_target,const char * start,const char * end,const char * blurb)4459 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4460 const char *start, const char *end, const char *blurb)
4461 {
4462 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4463
4464 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4465
4466 if (!PL_colorset)
4467 reginitcolors();
4468 {
4469 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4470 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4471
4472 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4473 start, end - start, PL_dump_re_max_len);
4474
4475 Perl_re_printf( aTHX_
4476 "%s%s REx%s %s against %s\n",
4477 PL_colors[4], blurb, PL_colors[5], s0, s1);
4478
4479 if (utf8_target||utf8_pat)
4480 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4481 utf8_pat ? "pattern" : "",
4482 utf8_pat && utf8_target ? " and " : "",
4483 utf8_target ? "string" : ""
4484 );
4485 }
4486 }
4487
4488 STATIC void
S_dump_exec_pos(pTHX_ const char * locinput,const regnode * scan,const char * loc_regeol,const char * loc_bostr,const char * loc_reg_starttry,const bool utf8_target,const U32 depth)4489 S_dump_exec_pos(pTHX_ const char *locinput,
4490 const regnode *scan,
4491 const char *loc_regeol,
4492 const char *loc_bostr,
4493 const char *loc_reg_starttry,
4494 const bool utf8_target,
4495 const U32 depth
4496 )
4497 {
4498 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4499 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4500 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4501 /* The part of the string before starttry has one color
4502 (pref0_len chars), between starttry and current
4503 position another one (pref_len - pref0_len chars),
4504 after the current position the third one.
4505 We assume that pref0_len <= pref_len, otherwise we
4506 decrease pref0_len. */
4507 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4508 ? (5 + taill) - l : locinput - loc_bostr;
4509 int pref0_len;
4510
4511 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4512
4513 if (utf8_target) {
4514 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4515 pref_len++;
4516 }
4517 }
4518 pref0_len = pref_len - (locinput - loc_reg_starttry);
4519 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4520 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4521 ? (5 + taill) - pref_len : loc_regeol - locinput);
4522 if (utf8_target) {
4523 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4524 l--;
4525 }
4526 }
4527 if (pref0_len < 0)
4528 pref0_len = 0;
4529 if (pref0_len > pref_len)
4530 pref0_len = pref_len;
4531 {
4532 const int is_uni = utf8_target ? 1 : 0;
4533
4534 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4535 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4536
4537 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4538 (locinput - pref_len + pref0_len),
4539 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4540
4541 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4542 locinput, loc_regeol - locinput, 10, 0, 1);
4543
4544 const STRLEN tlen=len0+len1+len2;
4545 Perl_re_printf( aTHX_
4546 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4547 (IV)(locinput - loc_bostr),
4548 len0, s0,
4549 len1, s1,
4550 (docolor ? "" : "> <"),
4551 len2, s2,
4552 (int)(tlen > 19 ? 0 : 19 - tlen),
4553 "",
4554 (UV)depth);
4555 }
4556 }
4557
4558 #endif
4559
4560 /* reg_check_named_buff_matched()
4561 * Checks to see if a named buffer has matched. The data array of
4562 * buffer numbers corresponding to the buffer is expected to reside
4563 * in the regexp->data->data array in the slot stored in the ARG1u() of
4564 * node involved. Note that this routine doesn't actually care about the
4565 * name, that information is not preserved from compilation to execution.
4566 * Returns the index of the leftmost defined buffer with the given name
4567 * or 0 if non of the buffers matched.
4568 */
4569 STATIC I32
S_reg_check_named_buff_matched(const regexp * rex,const regnode * scan)4570 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4571 {
4572 I32 n;
4573 RXi_GET_DECL(rex,rexi);
4574 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
4575 I32 *nums=(I32*)SvPVX(sv_dat);
4576
4577 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4578
4579 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4580 if ((I32)RXp_LASTPAREN(rex) >= nums[n] &&
4581 RXp_OFFS_END(rex,nums[n]) != -1)
4582 {
4583 return nums[n];
4584 }
4585 }
4586 return 0;
4587 }
4588
4589 static bool
S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,struct next_matchable_info * m,regmatch_info * reginfo)4590 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4591 struct next_matchable_info * m,
4592 regmatch_info *reginfo)
4593 {
4594 /* This function determines various characteristics about every possible
4595 * initial match of the passed-in EXACTish <text_node>, and stores them in
4596 * <*m>.
4597 *
4598 * That includes a match string and a parallel mask, such that if you AND
4599 * the target string with the mask and compare with the match string,
4600 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4601 * the target matches or not.
4602 *
4603 * The motivation behind this function is to allow the caller to set up
4604 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4605 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4606 * beginning oF B, which is the passed in <text_node> That's where this
4607 * function comes in. The values it returns can quickly be used to rule
4608 * out many, or all, cases of possible matches not actually being the
4609 * beginning of B, <text_node>. It is also used in regrepeat() where we
4610 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4611 * determine where the span of 'A's stop.
4612 *
4613 * If <text_node> is of type EXACT, there is only one possible character
4614 * that can match its first character, and so the situation is quite
4615 * simple. But things can get much more complicated if folding is
4616 * involved. It may be that the first character of an EXACTFish node
4617 * doesn't participate in any possible fold, e.g., punctuation, so it can
4618 * be matched only by itself. The vast majority of characters that are in
4619 * folds match just two things, their lower and upper-case equivalents.
4620 * But not all are like that; some have multiple possible matches, or match
4621 * sequences of more than one character. This function sorts all that out.
4622 *
4623 * It returns information about all possibilities of what the first
4624 * character(s) of <text_node> could look like. Again, if <text_node> is a
4625 * plain EXACT node, that's just the actual first bytes of the first
4626 * character; but otherwise it is the bytes, that when masked, match all
4627 * possible combinations of all the initial bytes of all the characters
4628 * that could match, folded. (Actually, this is a slight over promise. It
4629 * handles only up to the initial 5 bytes, which is enough for all Unicode
4630 * characters, but not for all non-Unicode ones.)
4631 *
4632 * Here's an example to clarify. Suppose the first character of
4633 * <text_node> is the letter 'C', and we are under /i matching. That means
4634 * 'c' also matches. The representations of these two characters differ in
4635 * just one bit, so the mask would be a zero in that position and ones in
4636 * the other 7. And the returned string would be the AND of these two
4637 * characters, and would be one byte long, since these characters are each
4638 * a single byte. ANDing the target <text_node> with this mask will yield
4639 * the returned string if and only if <text_node> begins with one of these
4640 * two characters. So, the function would also return that the definitive
4641 * length matched is 1 byte.
4642 *
4643 * Now, suppose instead of the letter 'C', <text_node> begins with the
4644 * letter 'F'. The situation is much more complicated because there are
4645 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4646 * begins with 'f', and hence could match. We add these into the returned
4647 * string and mask, but the result isn't definitive; the caller has to
4648 * check further if its AND and compare pass. But the failure of that
4649 * compare will quickly rule out most possible inputs.
4650 *
4651 * Much of this could be done in regcomp.c at compile time, except for
4652 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4653 * could be used for one or the other eventualities.
4654 *
4655 * If this function determines that no possible character in the target
4656 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4657 * situation occurs if the first character in <text_node> requires UTF-8 to
4658 * represent, and the target string isn't in UTF-8.)
4659 *
4660 * Some analysis is in GH #18414, located at the time of this writing at:
4661 * https://github.com/Perl/perl5/issues/18414
4662 */
4663
4664 const bool utf8_target = reginfo->is_utf8_target;
4665 bool utf8_pat = reginfo->is_utf8_pat;
4666
4667 PERL_UINT_FAST8_T i;
4668
4669 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4670 */
4671 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4672 U8 lengths[MAX_MATCHES] = { 0 };
4673
4674 U8 index_of_longest = 0;
4675
4676 U8 *pat = (U8*)STRING(text_node);
4677 Size_t pat_len = STR_LEN(text_node);
4678 U8 op = OP(text_node);
4679
4680 U8 byte_mask[5] = {0};
4681 U8 byte_anded[5] = {0};
4682
4683 /* There are some folds in Unicode to multiple characters. This will hold
4684 * such characters that could fold to the beginning of 'text_node' */
4685 UV multi_fold_from = 0;
4686
4687 /* We may have to create a modified copy of the pattern */
4688 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4689
4690 m->max_length = 0;
4691 m->min_length = 255;
4692 m->count = 0;
4693
4694 /* Even if the first character in the node can match something in Latin1,
4695 * if there is anything in the node that can't, the match must fail */
4696 if (! utf8_target && isEXACT_REQ8(op)) {
4697 return FALSE;
4698 }
4699
4700 /* Define a temporary op for use in this function, using an existing one that
4701 * should never be a real op during execution */
4702 #define TURKISH PSEUDO
4703
4704 /* What to do about these two nodes had to be deferred to runtime (which is
4705 * now). If the extra information we now have so indicates, turn them into
4706 * EXACTFU nodes */
4707 if ( (op == EXACTF && utf8_target)
4708 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4709 {
4710 if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) {
4711 op = TURKISH;
4712 }
4713 else {
4714 op = EXACTFU;
4715 }
4716
4717 /* And certain situations are better handled if we create a modified
4718 * version of the pattern */
4719 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4720 specific problematic characters */
4721 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4722
4723 /* The node could start with characters that are the first ones
4724 * of a multi-character fold. */
4725 multi_fold_from
4726 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4727 if (multi_fold_from) {
4728
4729 /* Here, they do form a sequence that matches the fold of a
4730 * single character. That single character then is a
4731 * possible match. Below we will look again at this, but
4732 * the code below is expecting every character in the
4733 * pattern to be folded, which the input isn't required to
4734 * be in this case. So, just fold the single character,
4735 * and the result will be in the expected form. */
4736 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4737 FOLD_FLAGS_FULL);
4738 pat = mod_pat;
4739 }
4740 /* Turkish has a couple extra possibilities. */
4741 else if ( UNLIKELY(op == TURKISH)
4742 && pat_len >= 3
4743 && isALPHA_FOLD_EQ(pat[0], 'f')
4744 && ( memBEGINs(pat + 1, pat_len - 1,
4745 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4746 || ( pat_len >= 4
4747 && isALPHA_FOLD_EQ(pat[1], 'f')
4748 && memBEGINs(pat + 2, pat_len - 2,
4749 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4750 ))) {
4751 /* The macros for finding a multi-char fold don't include
4752 * the Turkish possibilities, in which U+130 folds to 'i'.
4753 * Hard-code these. It's very unlikely that Unicode will
4754 * ever add any others. */
4755 if (pat[1] == 'f') {
4756 pat_len = 3;
4757 Copy("ffi", mod_pat, pat_len, U8);
4758 }
4759 else {
4760 pat_len = 2;
4761 Copy("fi", mod_pat, pat_len, U8);
4762 }
4763 pat = mod_pat;
4764 }
4765 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4766 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4767 && LIKELY(memNEs(pat, pat_len,
4768 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4769 && (LIKELY(op != TURKISH || *pat != 'I')))
4770 {
4771 /* For all cases of things between 0-255, except the ones
4772 * in the conditional above, the fold is just the lower
4773 * case, which is faster than the more general case. */
4774 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4775 pat[1]));
4776 pat_len = 1;
4777 pat = mod_pat;
4778 utf8_pat = FALSE;
4779 }
4780 else { /* Code point above 255, or needs special handling */
4781 _to_utf8_fold_flags(pat, pat + pat_len,
4782 mod_pat, &pat_len,
4783 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4784 pat = mod_pat;
4785 }
4786 }
4787 }
4788 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4789 set of problematic characters */
4790 ((multi_fold_from
4791 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4792 {
4793 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4794 * case */
4795 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4796 FOLD_FLAGS_FULL);
4797 pat = mod_pat;
4798 }
4799 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4800 mod_pat[0] = mod_pat[1] = 's';
4801 pat_len = 2;
4802 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4803 chars, and speeds copying */
4804 pat = mod_pat;
4805 }
4806 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4807 mod_pat[0] = toLOWER_L1(*pat);
4808 pat_len = 1;
4809 pat = mod_pat;
4810 }
4811 }
4812 else if /* Below isn't a node that we convert to UTF-8 */
4813 ( utf8_target
4814 && ! utf8_pat
4815 && op == EXACTFAA_NO_TRIE
4816 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4817 {
4818 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4819 * did this at compile time when the pattern was UTF-8 , but otherwise
4820 * we couldn't do it earlier, because it requires a UTF-8 target for
4821 * this match to be legal. */
4822 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4823 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4824 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4825 pat = mod_pat;
4826 utf8_pat = TRUE;
4827 }
4828
4829 /* Here, we have taken care of the initial work for a few very problematic
4830 * situations, possibly creating a modified pattern.
4831 *
4832 * Now ready for the general case. We build up all the possible things
4833 * that could match the first character of the pattern into the elements of
4834 * 'matches[]'
4835 *
4836 * Everything generally matches at least itself. But if there is a
4837 * UTF8ness mismatch, we have to convert to that of the target string. */
4838 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4839 matches[0][0] = pat[0];
4840 lengths[0] = 1;
4841 m->count++;
4842 }
4843 else if (utf8_target) {
4844 if (utf8_pat) {
4845 lengths[0] = UTF8SKIP(pat);
4846 Copy(pat, matches[0], lengths[0], U8);
4847 m->count++;
4848 }
4849 else { /* target is UTF-8, pattern isn't */
4850 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4851 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4852 lengths[0] = 2;
4853 m->count++;
4854 }
4855 }
4856 else if (! utf8_pat) { /* Neither is UTF-8 */
4857 matches[0][0] = pat[0];
4858 lengths[0] = 1;
4859 m->count++;
4860 }
4861 else /* target isn't UTF-8; pattern is. No match possible unless the
4862 pattern's first character can fit in a byte */
4863 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4864 {
4865 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4866 lengths[0] = 1;
4867 m->count++;
4868 }
4869
4870 /* Here we have taken care of any necessary node-type changes */
4871
4872 if (m->count) {
4873 m->max_length = lengths[0];
4874 m->min_length = lengths[0];
4875 }
4876
4877 /* For non-folding nodes, there are no other possible candidate matches,
4878 * but for foldable ones, we have to look further. */
4879 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4880 UV folded; /* The first character in the pattern, folded */
4881 U32 first_fold_from; /* A character that folds to it */
4882 const U32 * remaining_fold_froms; /* The remaining characters that
4883 fold to it, if any */
4884 Size_t folds_to_count; /* The total number of characters that fold to
4885 'folded' */
4886
4887 /* If the node begins with a sequence of more than one character that
4888 * together form the fold of a single character, it is called a
4889 * 'multi-character fold', and the normal functions don't handle this
4890 * case. We set 'multi_fold_from' to the single folded-from character,
4891 * which is handled in an extra iteration below */
4892 if (utf8_pat) {
4893 folded = valid_utf8_to_uvchr(pat, NULL);
4894 multi_fold_from
4895 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4896 }
4897 else {
4898 folded = *pat;
4899
4900 /* This may generate illegal combinations for things like EXACTF,
4901 * but rather than repeat the logic and exclude them here, all such
4902 * illegalities are checked for and skipped below in the loop */
4903 multi_fold_from
4904 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4905 }
4906
4907 /* Everything matches at least itself; initialize to that because the
4908 * only the branches below that set it are the ones where the number
4909 * isn't 1. */
4910 folds_to_count = 1;
4911
4912 /* There are a few special cases for locale-dependent nodes, where the
4913 * run-time context was needed before we could know what matched */
4914 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4915 first_fold_from = PL_fold_locale[folded];
4916 }
4917 else if ( op == EXACTFL && utf8_target && utf8_pat
4918 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4919 LATIN_SMALL_LETTER_LONG_S_UTF8))
4920 {
4921 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4922 }
4923 else if (UNLIKELY( op == TURKISH
4924 && ( isALPHA_FOLD_EQ(folded, 'i')
4925 || inRANGE(folded,
4926 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4927 LATIN_SMALL_LETTER_DOTLESS_I))))
4928 { /* Turkish folding requires special handling */
4929 if (folded == 'i')
4930 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4931 else if (folded == 'I')
4932 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4933 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4934 first_fold_from = 'i';
4935 else first_fold_from = 'I';
4936 }
4937 else {
4938 /* Here, isn't a special case: use the generic function to
4939 * calculate what folds to this */
4940 redo_multi:
4941 /* Look up what code points (besides itself) fold to 'folded';
4942 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4943 folds_to_count = _inverse_folds(folded, &first_fold_from,
4944 &remaining_fold_froms);
4945 }
4946
4947 /* Add each character that folds to 'folded' to the list of them,
4948 * subject to limitations based on the node type and target UTF8ness.
4949 * If there was a character that folded to multiple characters, do an
4950 * extra iteration for it. (Note the extra iteration if there is a
4951 * multi-character fold) */
4952 for (i = 0; i < folds_to_count
4953 + UNLIKELY(multi_fold_from != 0); i++)
4954 {
4955 UV fold_from = 0;
4956
4957 if (i >= folds_to_count) { /* Final iteration: handle the
4958 multi-char */
4959 fold_from = multi_fold_from;
4960 }
4961 else if (i == 0) {
4962 fold_from = first_fold_from;
4963 }
4964 else if (i < folds_to_count) {
4965 fold_from = remaining_fold_froms[i-1];
4966 }
4967
4968 if (folded == fold_from) { /* We already added the character
4969 itself */
4970 continue;
4971 }
4972
4973 /* EXACTF doesn't have any non-ascii folds */
4974 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4975 continue;
4976 }
4977
4978 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4979 * */
4980 if ( isASCII(folded) != isASCII(fold_from)
4981 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4982
4983 {
4984 continue;
4985 }
4986
4987 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4988 * locale, but those have been converted to EXACTFU above) */
4989 if ( op == EXACTFL
4990 && (folded < 256) != (fold_from < 256))
4991 {
4992 continue;
4993 }
4994
4995 /* If this triggers, it likely is because of the unlikely case
4996 * where a new Unicode standard has changed what MAX_MATCHES should
4997 * be set to */
4998 assert(m->count < MAX_MATCHES);
4999
5000 /* Add this character to the list of possible matches */
5001 if (utf8_target) {
5002 uvchr_to_utf8(matches[(U8) m->count], fold_from);
5003 lengths[m->count] = UVCHR_SKIP(fold_from);
5004 m->count++;
5005 }
5006 else { /* Non-UTF8 target: no code point above 255 can appear in it
5007 */
5008 if (fold_from > 255) {
5009 continue;
5010 }
5011
5012 matches[m->count][0] = fold_from;
5013 lengths[m->count] = 1;
5014 m->count++;
5015 }
5016
5017 /* Update min and mlengths */
5018 if (m->min_length > lengths[m->count-1]) {
5019 m->min_length = lengths[m->count-1];
5020 }
5021
5022 if (m->max_length < lengths[m->count-1]) {
5023 index_of_longest = m->count - 1;
5024 m->max_length = lengths[index_of_longest];
5025 }
5026 } /* looped through each potential fold */
5027
5028 /* If there is something that folded to an initial multi-character
5029 * fold, repeat, using it. This catches some edge cases. An example
5030 * of one is /ss/i when UTF-8 encoded. The function
5031 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
5032 * (LATIN SMALL SHARP S). If it returned a list of characters, this
5033 * code wouldn't be needed. But since it doesn't, we have to look what
5034 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
5035 * */
5036 if (multi_fold_from) {
5037 folded = multi_fold_from;
5038 multi_fold_from = 0;
5039 goto redo_multi;
5040 }
5041 } /* End of finding things that participate in this fold */
5042
5043 if (m->count == 0) { /* If nothing found, can't match */
5044 m->min_length = 0;
5045 return FALSE;
5046 }
5047
5048 /* Have calculated all possible matches. Now calculate the mask and AND
5049 * values */
5050 m->initial_exact = 0;
5051 m->initial_definitive = 0;
5052
5053 {
5054 unsigned int mask_ones = 0;
5055 unsigned int possible_ones = 0;
5056 U8 j;
5057
5058 /* For each byte that is in all possible matches ... */
5059 for (j = 0; j < MIN(m->min_length, 5); j++) {
5060
5061 /* Initialize the accumulator for this byte */
5062 byte_mask[j] = 0xFF;
5063 byte_anded[j] = matches[0][j];
5064
5065 /* Then the rest of the rows (folds). The mask is based on, like,
5066 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
5067 * where they differ. */
5068 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
5069 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
5070 byte_anded[j] &= matches[i][j];
5071 }
5072
5073 /* Keep track of the number of initial mask bytes that are all one
5074 * bits. The code calling this can use this number to know that
5075 * a string that matches this number of bytes in the pattern is an
5076 * exact match of that pattern for this number of bytes. But also
5077 * counted are the number of initial bytes that in total have a
5078 * single zero bit. If a string matches those, masked, it must be
5079 * one of two possibilites, both of which this function has
5080 * determined are legal. (But if that single 0 is one of the
5081 * initial bits for masking a UTF-8 start byte, that could
5082 * incorrectly lead to different length strings appearing to be
5083 * equivalent, so only do this optimization when the matchables are
5084 * all the same length. This was uncovered by testing
5085 * /\x{029E}/i.) */
5086 if (m->min_length == m->max_length) {
5087 mask_ones += PL_bitcount[byte_mask[j]];
5088 possible_ones += 8;
5089 if (mask_ones + 1 >= possible_ones) {
5090 m->initial_definitive++;
5091 if (mask_ones >= possible_ones) {
5092 m->initial_exact++;
5093 }
5094 }
5095 }
5096 }
5097 }
5098
5099 /* The first byte is separate for speed */
5100 m->first_byte_mask = byte_mask[0];
5101 m->first_byte_anded = byte_anded[0];
5102
5103 /* Then pack up to the next 4 bytes into a word */
5104 m->mask32 = m->anded32 = 0;
5105 for (i = 1; i < MIN(m->min_length, 5); i++) {
5106 U8 which = i;
5107 U8 shift = (which - 1) * 8;
5108 m->mask32 |= (U32) byte_mask[i] << shift;
5109 m->anded32 |= (U32) byte_anded[i] << shift;
5110 }
5111
5112 /* Finally, take the match strings and place them sequentially into a
5113 * one-dimensional array. (This is done to save significant space in the
5114 * structure.) Sort so the longest (presumably the least likely) is last.
5115 * XXX When this gets moved to regcomp, may want to fully sort shortest
5116 * first, but above we generally used the folded code point first, and
5117 * those tend to be no longer than their upper case values, so this is
5118 * already pretty well sorted by size.
5119 *
5120 * If the asserts fail, it's most likely because a new version of the
5121 * Unicode standard requires more space; simply increase the declaration
5122 * size. */
5123 {
5124 U8 cur_pos = 0;
5125 U8 output_index = 0;
5126
5127 if (m->count > 1) { /* No need to sort a single entry */
5128 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5129
5130 /* Keep the same order for all but the longest. (If the
5131 * asserts fail, it could be because m->matches is declared too
5132 * short, either because of a new Unicode release, or an
5133 * overlooked test case, or it could be a bug.) */
5134 if (i != index_of_longest) {
5135 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5136 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5137 cur_pos += lengths[i];
5138 m->lengths[output_index++] = lengths[i];
5139 }
5140 }
5141 }
5142
5143 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5144 Copy(matches[index_of_longest], m->matches + cur_pos,
5145 lengths[index_of_longest], U8);
5146
5147 /* Place the longest match last */
5148 m->lengths[output_index] = lengths[index_of_longest];
5149 }
5150
5151
5152 return TRUE;
5153 }
5154
5155 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5156 bool
S_test_EXACTISH_ST(const char * loc,struct next_matchable_info info)5157 S_test_EXACTISH_ST(const char * loc,
5158 struct next_matchable_info info)
5159 {
5160 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5161 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5162
5163 U32 input32 = 0;
5164
5165 /* Check the first byte */
5166 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5167 return FALSE;
5168
5169 /* Pack the next up-to-4 bytes into a 32 bit word */
5170 switch (info.min_length) {
5171 default:
5172 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5173 /* FALLTHROUGH */
5174 case 4:
5175 input32 |= (U8) loc[3] << 2 * 8;
5176 /* FALLTHROUGH */
5177 case 3:
5178 input32 |= (U8) loc[2] << 1 * 8;
5179 /* FALLTHROUGH */
5180 case 2:
5181 input32 |= (U8) loc[1];
5182 break;
5183 case 1:
5184 return TRUE; /* We already tested and passed the 0th byte */
5185 case 0:
5186 ASSUME(0);
5187 }
5188
5189 /* And AND that with the mask and compare that with the assembled ANDED
5190 * values */
5191 return (input32 & info.mask32) == info.anded32;
5192 }
5193
5194 STATIC bool
S_isGCB(pTHX_ const GCB_enum before,const GCB_enum after,const U8 * const strbeg,const U8 * const curpos,const bool utf8_target)5195 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5196 {
5197 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5198 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5199
5200 PERL_ARGS_ASSERT_ISGCB;
5201
5202 switch (GCB_table[before][after]) {
5203 case GCB_BREAKABLE:
5204 return TRUE;
5205
5206 case GCB_NOBREAK:
5207 return FALSE;
5208
5209 case GCB_RI_then_RI:
5210 {
5211 int RI_count = 1;
5212 U8 * temp_pos = (U8 *) curpos;
5213
5214 /* Do not break within emoji flag sequences. That is, do not
5215 * break between regional indicator (RI) symbols if there is an
5216 * odd number of RI characters before the break point.
5217 * GB12 sot (RI RI)* RI × RI
5218 * GB13 [^RI] (RI RI)* RI × RI */
5219
5220 while (backup_one_GCB(strbeg,
5221 &temp_pos,
5222 utf8_target) == GCB_Regional_Indicator)
5223 {
5224 RI_count++;
5225 }
5226
5227 return RI_count % 2 != 1;
5228 }
5229
5230 case GCB_EX_then_EM:
5231
5232 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5233 {
5234 U8 * temp_pos = (U8 *) curpos;
5235 GCB_enum prev;
5236
5237 do {
5238 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5239 }
5240 while (prev == GCB_Extend);
5241
5242 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5243 }
5244
5245 case GCB_Maybe_Emoji_NonBreak:
5246
5247 {
5248
5249 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5250 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5251 */
5252 U8 * temp_pos = (U8 *) curpos;
5253 GCB_enum prev;
5254
5255 do {
5256 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5257 }
5258 while (prev == GCB_Extend);
5259
5260 return prev != GCB_ExtPict_XX;
5261 }
5262
5263 default:
5264 break;
5265 }
5266
5267 #ifdef DEBUGGING
5268 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5269 before, after, GCB_table[before][after]);
5270 assert(0);
5271 #endif
5272 return TRUE;
5273 }
5274
5275 STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5276 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5277 {
5278 GCB_enum gcb;
5279
5280 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5281
5282 if (*curpos < strbeg) {
5283 return GCB_EDGE;
5284 }
5285
5286 if (utf8_target) {
5287 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5288 U8 * prev_prev_char_pos;
5289
5290 if (! prev_char_pos) {
5291 return GCB_EDGE;
5292 }
5293
5294 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5295 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5296 *curpos = prev_char_pos;
5297 prev_char_pos = prev_prev_char_pos;
5298 }
5299 else {
5300 *curpos = (U8 *) strbeg;
5301 return GCB_EDGE;
5302 }
5303 }
5304 else {
5305 if (*curpos - 2 < strbeg) {
5306 *curpos = (U8 *) strbeg;
5307 return GCB_EDGE;
5308 }
5309 (*curpos)--;
5310 gcb = getGCB_VAL_CP(*(*curpos - 1));
5311 }
5312
5313 return gcb;
5314 }
5315
5316 /* Combining marks attach to most classes that precede them, but this defines
5317 * the exceptions (from TR14) */
5318 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5319 || prev == LB_Mandatory_Break \
5320 || prev == LB_Carriage_Return \
5321 || prev == LB_Line_Feed \
5322 || prev == LB_Next_Line \
5323 || prev == LB_Space \
5324 || prev == LB_ZWSpace))
5325
5326 STATIC bool
S_isLB(pTHX_ LB_enum before,LB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5327 S_isLB(pTHX_ LB_enum before,
5328 LB_enum after,
5329 const U8 * const strbeg,
5330 const U8 * const curpos,
5331 const U8 * const strend,
5332 const bool utf8_target)
5333 {
5334 U8 * temp_pos = (U8 *) curpos;
5335 LB_enum prev = before;
5336
5337 /* Is the boundary between 'before' and 'after' line-breakable?
5338 * Most of this is just a table lookup of a generated table from Unicode
5339 * rules. But some rules require context to decide, and so have to be
5340 * implemented in code */
5341
5342 PERL_ARGS_ASSERT_ISLB;
5343
5344 /* Rule numbers in the comments below are as of Unicode 9.0 */
5345
5346 redo:
5347 before = prev;
5348 switch (LB_table[before][after]) {
5349 case LB_BREAKABLE:
5350 return TRUE;
5351
5352 case LB_NOBREAK:
5353 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5354 return FALSE;
5355
5356 case LB_SP_foo + LB_BREAKABLE:
5357 case LB_SP_foo + LB_NOBREAK:
5358 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5359
5360 /* When we have something following a SP, we have to look at the
5361 * context in order to know what to do.
5362 *
5363 * SP SP should not reach here because LB7: Do not break before
5364 * spaces. (For two spaces in a row there is nothing that
5365 * overrides that) */
5366 assert(after != LB_Space);
5367
5368 /* Here we have a space followed by a non-space. Mostly this is a
5369 * case of LB18: "Break after spaces". But there are complications
5370 * as the handling of spaces is somewhat tricky. They are in a
5371 * number of rules, which have to be applied in priority order, but
5372 * something earlier in the string can cause a rule to be skipped
5373 * and a lower priority rule invoked. A prime example is LB7 which
5374 * says don't break before a space. But rule LB8 (lower priority)
5375 * says that the first break opportunity after a ZW is after any
5376 * span of spaces immediately after it. If a ZW comes before a SP
5377 * in the input, rule LB8 applies, and not LB7. Other such rules
5378 * involve combining marks which are rules 9 and 10, but they may
5379 * override higher priority rules if they come earlier in the
5380 * string. Since we're doing random access into the middle of the
5381 * string, we have to look for rules that should get applied based
5382 * on both string position and priority. Combining marks do not
5383 * attach to either ZW nor SP, so we don't have to consider them
5384 * until later.
5385 *
5386 * To check for LB8, we have to find the first non-space character
5387 * before this span of spaces */
5388 do {
5389 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5390 }
5391 while (prev == LB_Space);
5392
5393 /* LB8 Break before any character following a zero-width space,
5394 * even if one or more spaces intervene.
5395 * ZW SP* ÷
5396 * So if we have a ZW just before this span, and to get here this
5397 * is the final space in the span. */
5398 if (prev == LB_ZWSpace) {
5399 return TRUE;
5400 }
5401
5402 /* Here, not ZW SP+. There are several rules that have higher
5403 * priority than LB18 and can be resolved now, as they don't depend
5404 * on anything earlier in the string (except ZW, which we have
5405 * already handled). One of these rules is LB11 Do not break
5406 * before Word joiner, but we have specially encoded that in the
5407 * lookup table so it is caught by the single test below which
5408 * catches the other ones. */
5409 if (LB_table[LB_Space][after] - LB_SP_foo
5410 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5411 {
5412 return FALSE;
5413 }
5414
5415 /* If we get here, we have to XXX consider combining marks. */
5416 if (prev == LB_Combining_Mark) {
5417
5418 /* What happens with these depends on the character they
5419 * follow. */
5420 do {
5421 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5422 }
5423 while (prev == LB_Combining_Mark);
5424
5425 /* Most times these attach to and inherit the characteristics
5426 * of that character, but not always, and when not, they are to
5427 * be treated as AL by rule LB10. */
5428 if (! LB_CM_ATTACHES_TO(prev)) {
5429 prev = LB_Alphabetic;
5430 }
5431 }
5432
5433 /* Here, we have the character preceding the span of spaces all set
5434 * up. We follow LB18: "Break after spaces" unless the table shows
5435 * that is overridden */
5436 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5437
5438 case LB_CM_ZWJ_foo:
5439
5440 /* We don't know how to treat the CM except by looking at the first
5441 * non-CM character preceding it. ZWJ is treated as CM */
5442 do {
5443 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5444 }
5445 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5446
5447 /* Here, 'prev' is that first earlier non-CM character. If the CM
5448 * attaches to it, then it inherits the behavior of 'prev'. If it
5449 * doesn't attach, it is to be treated as an AL */
5450 if (! LB_CM_ATTACHES_TO(prev)) {
5451 prev = LB_Alphabetic;
5452 }
5453
5454 goto redo;
5455
5456 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5457 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5458
5459 /* LB21a Don't break after Hebrew + Hyphen.
5460 * HL (HY | BA) × */
5461
5462 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5463 == LB_Hebrew_Letter)
5464 {
5465 return FALSE;
5466 }
5467
5468 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5469
5470 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5471 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5472
5473 /* LB25a (PR | PO) × ( OP | HY )? NU */
5474 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5475 return FALSE;
5476 }
5477
5478 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5479 == LB_BREAKABLE;
5480
5481 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5482 case LB_SY_or_IS_then_various + LB_NOBREAK:
5483 {
5484 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5485
5486 LB_enum temp = prev;
5487 do {
5488 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5489 }
5490 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5491 if (temp == LB_Numeric) {
5492 return FALSE;
5493 }
5494
5495 return LB_table[prev][after] - LB_SY_or_IS_then_various
5496 == LB_BREAKABLE;
5497 }
5498
5499 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5500 case LB_various_then_PO_or_PR + LB_NOBREAK:
5501 {
5502 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5503
5504 LB_enum temp = prev;
5505 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5506 {
5507 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5508 }
5509 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5510 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5511 }
5512 if (temp == LB_Numeric) {
5513 return FALSE;
5514 }
5515 return LB_various_then_PO_or_PR;
5516 }
5517
5518 case LB_RI_then_RI + LB_NOBREAK:
5519 case LB_RI_then_RI + LB_BREAKABLE:
5520 {
5521 int RI_count = 1;
5522
5523 /* LB30a Break between two regional indicator symbols if and
5524 * only if there are an even number of regional indicators
5525 * preceding the position of the break.
5526 *
5527 * sot (RI RI)* RI × RI
5528 * [^RI] (RI RI)* RI × RI */
5529
5530 while (backup_one_LB(strbeg,
5531 &temp_pos,
5532 utf8_target) == LB_Regional_Indicator)
5533 {
5534 RI_count++;
5535 }
5536
5537 return RI_count % 2 == 0;
5538 }
5539
5540 default:
5541 break;
5542 }
5543
5544 #ifdef DEBUGGING
5545 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5546 before, after, LB_table[before][after]);
5547 assert(0);
5548 #endif
5549 return TRUE;
5550 }
5551
5552 STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5553 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5554 {
5555
5556 LB_enum lb;
5557
5558 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5559
5560 if (*curpos >= strend) {
5561 return LB_EDGE;
5562 }
5563
5564 if (utf8_target) {
5565 *curpos += UTF8SKIP(*curpos);
5566 if (*curpos >= strend) {
5567 return LB_EDGE;
5568 }
5569 lb = getLB_VAL_UTF8(*curpos, strend);
5570 }
5571 else {
5572 (*curpos)++;
5573 if (*curpos >= strend) {
5574 return LB_EDGE;
5575 }
5576 lb = getLB_VAL_CP(**curpos);
5577 }
5578
5579 return lb;
5580 }
5581
5582 STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5583 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5584 {
5585 LB_enum lb;
5586
5587 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5588
5589 if (*curpos < strbeg) {
5590 return LB_EDGE;
5591 }
5592
5593 if (utf8_target) {
5594 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5595 U8 * prev_prev_char_pos;
5596
5597 if (! prev_char_pos) {
5598 return LB_EDGE;
5599 }
5600
5601 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5602 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5603 *curpos = prev_char_pos;
5604 prev_char_pos = prev_prev_char_pos;
5605 }
5606 else {
5607 *curpos = (U8 *) strbeg;
5608 return LB_EDGE;
5609 }
5610 }
5611 else {
5612 if (*curpos - 2 < strbeg) {
5613 *curpos = (U8 *) strbeg;
5614 return LB_EDGE;
5615 }
5616 (*curpos)--;
5617 lb = getLB_VAL_CP(*(*curpos - 1));
5618 }
5619
5620 return lb;
5621 }
5622
5623 STATIC bool
S_isSB(pTHX_ SB_enum before,SB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5624 S_isSB(pTHX_ SB_enum before,
5625 SB_enum after,
5626 const U8 * const strbeg,
5627 const U8 * const curpos,
5628 const U8 * const strend,
5629 const bool utf8_target)
5630 {
5631 /* returns a boolean indicating if there is a Sentence Boundary Break
5632 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5633
5634 U8 * lpos = (U8 *) curpos;
5635 bool has_para_sep = FALSE;
5636 bool has_sp = FALSE;
5637
5638 PERL_ARGS_ASSERT_ISSB;
5639
5640 /* Break at the start and end of text.
5641 SB1. sot ÷
5642 SB2. ÷ eot
5643 But unstated in Unicode is don't break if the text is empty */
5644 if (before == SB_EDGE || after == SB_EDGE) {
5645 return before != after;
5646 }
5647
5648 /* SB 3: Do not break within CRLF. */
5649 if (before == SB_CR && after == SB_LF) {
5650 return FALSE;
5651 }
5652
5653 /* Break after paragraph separators. CR and LF are considered
5654 * so because Unicode views text as like word processing text where there
5655 * are no newlines except between paragraphs, and the word processor takes
5656 * care of wrapping without there being hard line-breaks in the text *./
5657 SB4. Sep | CR | LF ÷ */
5658 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5659 return TRUE;
5660 }
5661
5662 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5663 * (See Section 6.2, Replacing Ignore Rules.)
5664 SB5. X (Extend | Format)* → X */
5665 if (after == SB_Extend || after == SB_Format) {
5666
5667 /* Implied is that the these characters attach to everything
5668 * immediately prior to them except for those separator-type
5669 * characters. And the rules earlier have already handled the case
5670 * when one of those immediately precedes the extend char */
5671 return FALSE;
5672 }
5673
5674 if (before == SB_Extend || before == SB_Format) {
5675 U8 * temp_pos = lpos;
5676 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5677 if ( backup != SB_EDGE
5678 && backup != SB_Sep
5679 && backup != SB_CR
5680 && backup != SB_LF)
5681 {
5682 before = backup;
5683 lpos = temp_pos;
5684 }
5685
5686 /* Here, both 'before' and 'backup' are these types; implied is that we
5687 * don't break between them */
5688 if (backup == SB_Extend || backup == SB_Format) {
5689 return FALSE;
5690 }
5691 }
5692
5693 /* Do not break after ambiguous terminators like period, if they are
5694 * immediately followed by a number or lowercase letter, if they are
5695 * between uppercase letters, if the first following letter (optionally
5696 * after certain punctuation) is lowercase, or if they are followed by
5697 * "continuation" punctuation such as comma, colon, or semicolon. For
5698 * example, a period may be an abbreviation or numeric period, and thus may
5699 * not mark the end of a sentence.
5700
5701 * SB6. ATerm × Numeric */
5702 if (before == SB_ATerm && after == SB_Numeric) {
5703 return FALSE;
5704 }
5705
5706 /* SB7. (Upper | Lower) ATerm × Upper */
5707 if (before == SB_ATerm && after == SB_Upper) {
5708 U8 * temp_pos = lpos;
5709 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5710 if (backup == SB_Upper || backup == SB_Lower) {
5711 return FALSE;
5712 }
5713 }
5714
5715 /* The remaining rules that aren't the final one, all require an STerm or
5716 * an ATerm after having backed up over some Close* Sp*, and in one case an
5717 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5718 * So do that backup now, setting flags if either Sp or a paragraph
5719 * separator are found */
5720
5721 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5722 has_para_sep = TRUE;
5723 before = backup_one_SB(strbeg, &lpos, utf8_target);
5724 }
5725
5726 if (before == SB_Sp) {
5727 has_sp = TRUE;
5728 do {
5729 before = backup_one_SB(strbeg, &lpos, utf8_target);
5730 }
5731 while (before == SB_Sp);
5732 }
5733
5734 while (before == SB_Close) {
5735 before = backup_one_SB(strbeg, &lpos, utf8_target);
5736 }
5737
5738 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5739 * most cases an STerm */
5740 if (before == SB_STerm || before == SB_ATerm) {
5741
5742 /* So, here the lhs matches
5743 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5744 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5745 * The rules that apply here are:
5746 *
5747 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5748 | LF | STerm | ATerm) )* Lower
5749 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5750 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5751 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5752 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5753 */
5754
5755 /* And all but SB11 forbid having seen a paragraph separator */
5756 if (! has_para_sep) {
5757 if (before == SB_ATerm) { /* SB8 */
5758 U8 * rpos = (U8 *) curpos;
5759 SB_enum later = after;
5760
5761 while ( later != SB_OLetter
5762 && later != SB_Upper
5763 && later != SB_Lower
5764 && later != SB_Sep
5765 && later != SB_CR
5766 && later != SB_LF
5767 && later != SB_STerm
5768 && later != SB_ATerm
5769 && later != SB_EDGE)
5770 {
5771 later = advance_one_SB(&rpos, strend, utf8_target);
5772 }
5773 if (later == SB_Lower) {
5774 return FALSE;
5775 }
5776 }
5777
5778 if ( after == SB_SContinue /* SB8a */
5779 || after == SB_STerm
5780 || after == SB_ATerm)
5781 {
5782 return FALSE;
5783 }
5784
5785 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5786 if ( after == SB_Close
5787 || after == SB_Sp
5788 || after == SB_Sep
5789 || after == SB_CR
5790 || after == SB_LF)
5791 {
5792 return FALSE;
5793 }
5794 }
5795
5796 /* SB10. This and SB9 could probably be combined some way, but khw
5797 * has decided to follow the Unicode rule book precisely for
5798 * simplified maintenance */
5799 if ( after == SB_Sp
5800 || after == SB_Sep
5801 || after == SB_CR
5802 || after == SB_LF)
5803 {
5804 return FALSE;
5805 }
5806 }
5807
5808 /* SB11. */
5809 return TRUE;
5810 }
5811
5812 /* Otherwise, do not break.
5813 SB12. Any × Any */
5814
5815 return FALSE;
5816 }
5817
5818 STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target)5819 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5820 {
5821 SB_enum sb;
5822
5823 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5824
5825 if (*curpos >= strend) {
5826 return SB_EDGE;
5827 }
5828
5829 if (utf8_target) {
5830 do {
5831 *curpos += UTF8SKIP(*curpos);
5832 if (*curpos >= strend) {
5833 return SB_EDGE;
5834 }
5835 sb = getSB_VAL_UTF8(*curpos, strend);
5836 } while (sb == SB_Extend || sb == SB_Format);
5837 }
5838 else {
5839 do {
5840 (*curpos)++;
5841 if (*curpos >= strend) {
5842 return SB_EDGE;
5843 }
5844 sb = getSB_VAL_CP(**curpos);
5845 } while (sb == SB_Extend || sb == SB_Format);
5846 }
5847
5848 return sb;
5849 }
5850
5851 STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg,U8 ** curpos,const bool utf8_target)5852 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5853 {
5854 SB_enum sb;
5855
5856 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5857
5858 if (*curpos < strbeg) {
5859 return SB_EDGE;
5860 }
5861
5862 if (utf8_target) {
5863 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5864 if (! prev_char_pos) {
5865 return SB_EDGE;
5866 }
5867
5868 /* Back up over Extend and Format. curpos is always just to the right
5869 * of the character whose value we are getting */
5870 do {
5871 U8 * prev_prev_char_pos;
5872 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5873 strbeg)))
5874 {
5875 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5876 *curpos = prev_char_pos;
5877 prev_char_pos = prev_prev_char_pos;
5878 }
5879 else {
5880 *curpos = (U8 *) strbeg;
5881 return SB_EDGE;
5882 }
5883 } while (sb == SB_Extend || sb == SB_Format);
5884 }
5885 else {
5886 do {
5887 if (*curpos - 2 < strbeg) {
5888 *curpos = (U8 *) strbeg;
5889 return SB_EDGE;
5890 }
5891 (*curpos)--;
5892 sb = getSB_VAL_CP(*(*curpos - 1));
5893 } while (sb == SB_Extend || sb == SB_Format);
5894 }
5895
5896 return sb;
5897 }
5898
5899 STATIC bool
S_isWB(pTHX_ WB_enum previous,WB_enum before,WB_enum after,const U8 * const strbeg,const U8 * const curpos,const U8 * const strend,const bool utf8_target)5900 S_isWB(pTHX_ WB_enum previous,
5901 WB_enum before,
5902 WB_enum after,
5903 const U8 * const strbeg,
5904 const U8 * const curpos,
5905 const U8 * const strend,
5906 const bool utf8_target)
5907 {
5908 /* Return a boolean as to if the boundary between 'before' and 'after' is
5909 * a Unicode word break, using their published algorithm, but tailored for
5910 * Perl by treating spans of white space as one unit. Context may be
5911 * needed to make this determination. If the value for the character
5912 * before 'before' is known, it is passed as 'previous'; otherwise that
5913 * should be set to WB_UNKNOWN. The other input parameters give the
5914 * boundaries and current position in the matching of the string. That
5915 * is, 'curpos' marks the position where the character whose wb value is
5916 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5917
5918 U8 * before_pos = (U8 *) curpos;
5919 U8 * after_pos = (U8 *) curpos;
5920 WB_enum prev = before;
5921 WB_enum next;
5922
5923 PERL_ARGS_ASSERT_ISWB;
5924
5925 /* Rule numbers in the comments below are as of Unicode 9.0 */
5926
5927 redo:
5928 before = prev;
5929 switch (WB_table[before][after]) {
5930 case WB_BREAKABLE:
5931 return TRUE;
5932
5933 case WB_NOBREAK:
5934 return FALSE;
5935
5936 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5937 next = advance_one_WB(&after_pos, strend, utf8_target,
5938 FALSE /* Don't skip Extend nor Format */ );
5939 /* A space immediately preceding an Extend or Format is attached
5940 * to by them, and hence gets separated from previous spaces.
5941 * Otherwise don't break between horizontal white space */
5942 return next == WB_Extend || next == WB_Format;
5943
5944 /* WB4 Ignore Format and Extend characters, except when they appear at
5945 * the beginning of a region of text. This code currently isn't
5946 * general purpose, but it works as the rules are currently and likely
5947 * to be laid out. The reason it works is that when 'they appear at
5948 * the beginning of a region of text', the rule is to break before
5949 * them, just like any other character. Therefore, the default rule
5950 * applies and we don't have to look in more depth. Should this ever
5951 * change, we would have to have 2 'case' statements, like in the rules
5952 * below, and backup a single character (not spacing over the extend
5953 * ones) and then see if that is one of the region-end characters and
5954 * go from there */
5955 case WB_Ex_or_FO_or_ZWJ_then_foo:
5956 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5957 goto redo;
5958
5959 case WB_DQ_then_HL + WB_BREAKABLE:
5960 case WB_DQ_then_HL + WB_NOBREAK:
5961
5962 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5963
5964 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5965 == WB_Hebrew_Letter)
5966 {
5967 return FALSE;
5968 }
5969
5970 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5971
5972 case WB_HL_then_DQ + WB_BREAKABLE:
5973 case WB_HL_then_DQ + WB_NOBREAK:
5974
5975 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5976
5977 if (advance_one_WB(&after_pos, strend, utf8_target,
5978 TRUE /* Do skip Extend and Format */ )
5979 == WB_Hebrew_Letter)
5980 {
5981 return FALSE;
5982 }
5983
5984 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5985
5986 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5987 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5988
5989 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5990 * | Single_Quote) (ALetter | Hebrew_Letter) */
5991
5992 next = advance_one_WB(&after_pos, strend, utf8_target,
5993 TRUE /* Do skip Extend and Format */ );
5994
5995 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5996 {
5997 return FALSE;
5998 }
5999
6000 return WB_table[before][after]
6001 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
6002
6003 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
6004 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
6005
6006 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
6007 * | Single_Quote) × (ALetter | Hebrew_Letter) */
6008
6009 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
6010 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
6011 {
6012 return FALSE;
6013 }
6014
6015 return WB_table[before][after]
6016 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
6017
6018 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
6019 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
6020
6021 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
6022 * */
6023
6024 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
6025 == WB_Numeric)
6026 {
6027 return FALSE;
6028 }
6029
6030 return WB_table[before][after]
6031 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
6032
6033 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
6034 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
6035
6036 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
6037
6038 if (advance_one_WB(&after_pos, strend, utf8_target,
6039 TRUE /* Do skip Extend and Format */ )
6040 == WB_Numeric)
6041 {
6042 return FALSE;
6043 }
6044
6045 return WB_table[before][after]
6046 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
6047
6048 case WB_RI_then_RI + WB_NOBREAK:
6049 case WB_RI_then_RI + WB_BREAKABLE:
6050 {
6051 int RI_count = 1;
6052
6053 /* Do not break within emoji flag sequences. That is, do not
6054 * break between regional indicator (RI) symbols if there is an
6055 * odd number of RI characters before the potential break
6056 * point.
6057 *
6058 * WB15 sot (RI RI)* RI × RI
6059 * WB16 [^RI] (RI RI)* RI × RI */
6060
6061 while (backup_one_WB(&previous,
6062 strbeg,
6063 &before_pos,
6064 utf8_target) == WB_Regional_Indicator)
6065 {
6066 RI_count++;
6067 }
6068
6069 return RI_count % 2 != 1;
6070 }
6071
6072 default:
6073 break;
6074 }
6075
6076 #ifdef DEBUGGING
6077 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
6078 before, after, WB_table[before][after]);
6079 assert(0);
6080 #endif
6081 return TRUE;
6082 }
6083
6084 STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos,const U8 * const strend,const bool utf8_target,const bool skip_Extend_Format)6085 S_advance_one_WB(pTHX_ U8 ** curpos,
6086 const U8 * const strend,
6087 const bool utf8_target,
6088 const bool skip_Extend_Format)
6089 {
6090 WB_enum wb;
6091
6092 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6093
6094 if (*curpos >= strend) {
6095 return WB_EDGE;
6096 }
6097
6098 if (utf8_target) {
6099
6100 /* Advance over Extend and Format */
6101 do {
6102 *curpos += UTF8SKIP(*curpos);
6103 if (*curpos >= strend) {
6104 return WB_EDGE;
6105 }
6106 wb = getWB_VAL_UTF8(*curpos, strend);
6107 } while ( skip_Extend_Format
6108 && (wb == WB_Extend || wb == WB_Format));
6109 }
6110 else {
6111 do {
6112 (*curpos)++;
6113 if (*curpos >= strend) {
6114 return WB_EDGE;
6115 }
6116 wb = getWB_VAL_CP(**curpos);
6117 } while ( skip_Extend_Format
6118 && (wb == WB_Extend || wb == WB_Format));
6119 }
6120
6121 return wb;
6122 }
6123
6124 STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous,const U8 * const strbeg,U8 ** curpos,const bool utf8_target)6125 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6126 {
6127 WB_enum wb;
6128
6129 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6130
6131 /* If we know what the previous character's break value is, don't have
6132 * to look it up */
6133 if (*previous != WB_UNKNOWN) {
6134 wb = *previous;
6135
6136 /* But we need to move backwards by one */
6137 if (utf8_target) {
6138 *curpos = reghopmaybe3(*curpos, -1, strbeg);
6139 if (! *curpos) {
6140 *previous = WB_EDGE;
6141 *curpos = (U8 *) strbeg;
6142 }
6143 else {
6144 *previous = WB_UNKNOWN;
6145 }
6146 }
6147 else {
6148 (*curpos)--;
6149 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6150 }
6151
6152 /* And we always back up over these three types */
6153 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6154 return wb;
6155 }
6156 }
6157
6158 if (*curpos < strbeg) {
6159 return WB_EDGE;
6160 }
6161
6162 if (utf8_target) {
6163 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6164 if (! prev_char_pos) {
6165 return WB_EDGE;
6166 }
6167
6168 /* Back up over Extend and Format. curpos is always just to the right
6169 * of the character whose value we are getting */
6170 do {
6171 U8 * prev_prev_char_pos;
6172 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6173 -1,
6174 strbeg)))
6175 {
6176 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6177 *curpos = prev_char_pos;
6178 prev_char_pos = prev_prev_char_pos;
6179 }
6180 else {
6181 *curpos = (U8 *) strbeg;
6182 return WB_EDGE;
6183 }
6184 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6185 }
6186 else {
6187 do {
6188 if (*curpos - 2 < strbeg) {
6189 *curpos = (U8 *) strbeg;
6190 return WB_EDGE;
6191 }
6192 (*curpos)--;
6193 wb = getWB_VAL_CP(*(*curpos - 1));
6194 } while (wb == WB_Extend || wb == WB_Format);
6195 }
6196
6197 return wb;
6198 }
6199
6200 /* Macros for regmatch(), using its internal variables */
6201 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6202 #define NEXTCHR_IS_EOS (nextbyte < 0)
6203
6204 #define SET_nextchr \
6205 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6206
6207 #define SET_locinput(p) \
6208 locinput = (p); \
6209 SET_nextchr
6210
6211 #define sayYES goto yes
6212 #define sayNO goto no
6213 #define sayNO_SILENT goto no_silent
6214
6215 /* we don't use STMT_START/END here because it leads to
6216 "unreachable code" warnings, which are bogus, but distracting. */
6217 #define CACHEsayNO \
6218 if (ST.cache_mask) \
6219 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6220 sayNO
6221
6222 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6223 ( \
6224 ( ( st ) ) && \
6225 ( ( st )->u.eval.close_paren ) && \
6226 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6227 )
6228
6229 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6230 ( \
6231 ( ( st ) ) && \
6232 ( ( st )->u.eval.close_paren ) && \
6233 ( ( expr ) ) && \
6234 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6235 )
6236
6237
6238 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6239 (st)->u.eval.close_paren = ( (expr) + 1 )
6240
6241 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6242 (st)->u.eval.close_paren = 0
6243
6244 /* push a new state then goto it */
6245
6246 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6247 pushinput = input; \
6248 pusheol = eol; \
6249 pushsr0 = sr0; \
6250 scan = node; \
6251 st->resume_state = state; \
6252 goto push_state;
6253
6254 /* push a new state with success backtracking, then goto it */
6255
6256 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6257 pushinput = input; \
6258 pusheol = eol; \
6259 pushsr0 = sr0; \
6260 scan = node; \
6261 st->resume_state = state; \
6262 goto push_yes_state;
6263
6264 #define DEBUG_STATE_pp(pp) \
6265 DEBUG_STATE_r({ \
6266 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6267 Perl_re_printf( aTHX_ \
6268 "%*s" pp " %s%s%s%s%s\n", \
6269 INDENT_CHARS(depth), "", \
6270 REGNODE_NAME(st->resume_state), \
6271 ((st==yes_state||st==mark_state) ? "[" : ""), \
6272 ((st==yes_state) ? "Y" : ""), \
6273 ((st==mark_state) ? "M" : ""), \
6274 ((st==yes_state||st==mark_state) ? "]" : "") \
6275 ); \
6276 });
6277
6278 /*
6279
6280 regmatch() - main matching routine
6281
6282 This is basically one big switch statement in a loop. We execute an op,
6283 set 'next' to point the next op, and continue. If we come to a point which
6284 we may need to backtrack to on failure such as (A|B|C), we push a
6285 backtrack state onto the backtrack stack. On failure, we pop the top
6286 state, and re-enter the loop at the state indicated. If there are no more
6287 states to pop, we return failure.
6288
6289 Sometimes we also need to backtrack on success; for example /A+/, where
6290 after successfully matching one A, we need to go back and try to
6291 match another one; similarly for lookahead assertions: if the assertion
6292 completes successfully, we backtrack to the state just before the assertion
6293 and then carry on. In these cases, the pushed state is marked as
6294 'backtrack on success too'. This marking is in fact done by a chain of
6295 pointers, each pointing to the previous 'yes' state. On success, we pop to
6296 the nearest yes state, discarding any intermediate failure-only states.
6297 Sometimes a yes state is pushed just to force some cleanup code to be
6298 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6299 it to free the inner regex.
6300
6301 Note that failure backtracking rewinds the cursor position, while
6302 success backtracking leaves it alone.
6303
6304 A pattern is complete when the END op is executed, while a subpattern
6305 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6306 ops trigger the "pop to last yes state if any, otherwise return true"
6307 behaviour.
6308
6309 A common convention in this function is to use A and B to refer to the two
6310 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6311 the subpattern to be matched possibly multiple times, while B is the entire
6312 rest of the pattern. Variable and state names reflect this convention.
6313
6314 The states in the main switch are the union of ops and failure/success of
6315 substates associated with that op. For example, IFMATCH is the op
6316 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6317 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6318 successfully matched A and IFMATCH_A_fail is a state saying that we have
6319 just failed to match A. Resume states always come in pairs. The backtrack
6320 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6321 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6322 on success or failure.
6323
6324 The struct that holds a backtracking state is actually a big union, with
6325 one variant for each major type of op. The variable st points to the
6326 top-most backtrack struct. To make the code clearer, within each
6327 block of code we #define ST to alias the relevant union.
6328
6329 Here's a concrete example of a (vastly oversimplified) IFMATCH
6330 implementation:
6331
6332 switch (state) {
6333 ....
6334
6335 #define ST st->u.ifmatch
6336
6337 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6338 ST.foo = ...; // some state we wish to save
6339 ...
6340 // push a yes backtrack state with a resume value of
6341 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6342 // first node of A:
6343 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6344 // NOTREACHED
6345
6346 case IFMATCH_A: // we have successfully executed A; now continue with B
6347 next = B;
6348 bar = ST.foo; // do something with the preserved value
6349 break;
6350
6351 case IFMATCH_A_fail: // A failed, so the assertion failed
6352 ...; // do some housekeeping, then ...
6353 sayNO; // propagate the failure
6354
6355 #undef ST
6356
6357 ...
6358 }
6359
6360 For any old-timers reading this who are familiar with the old recursive
6361 approach, the code above is equivalent to:
6362
6363 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6364 {
6365 int foo = ...
6366 ...
6367 if (regmatch(A)) {
6368 next = B;
6369 bar = foo;
6370 break;
6371 }
6372 ...; // do some housekeeping, then ...
6373 sayNO; // propagate the failure
6374 }
6375
6376 The topmost backtrack state, pointed to by st, is usually free. If you
6377 want to claim it, populate any ST.foo fields in it with values you wish to
6378 save, then do one of
6379
6380 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6381 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6382
6383 which sets that backtrack state's resume value to 'resume_state', pushes a
6384 new free entry to the top of the backtrack stack, then goes to 'node'.
6385 On backtracking, the free slot is popped, and the saved state becomes the
6386 new free state. An ST.foo field in this new top state can be temporarily
6387 accessed to retrieve values, but once the main loop is re-entered, it
6388 becomes available for reuse.
6389
6390 Note that the depth of the backtrack stack constantly increases during the
6391 left-to-right execution of the pattern, rather than going up and down with
6392 the pattern nesting. For example the stack is at its maximum at Z at the
6393 end of the pattern, rather than at X in the following:
6394
6395 /(((X)+)+)+....(Y)+....Z/
6396
6397 The only exceptions to this are lookahead/behind assertions and the cut,
6398 (?>A), which pop all the backtrack states associated with A before
6399 continuing.
6400
6401 Backtrack state structs are allocated in slabs of about 4K in size.
6402 PL_regmatch_state and st always point to the currently active state,
6403 and PL_regmatch_slab points to the slab currently containing
6404 PL_regmatch_state. The first time regmatch() is called, the first slab is
6405 allocated, and is never freed until interpreter destruction. When the slab
6406 is full, a new one is allocated and chained to the end. At exit from
6407 regmatch(), slabs allocated since entry are freed.
6408
6409 In order to work with variable length lookbehinds, an upper limit is placed on
6410 lookbehinds which is set to where the match position is at the end of where the
6411 lookbehind would get to. Nothing in the lookbehind should match above that,
6412 except we should be able to look beyond if for things like \b, which need the
6413 next character in the string to be able to determine if this is a boundary or
6414 not. We also can't match the end of string/line unless we are also at the end
6415 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6416 that match a width, we have to add a condition that they are within the legal
6417 bounds of our window into the string.
6418
6419 */
6420
6421 /* returns -1 on failure, $+[0] on success */
6422 STATIC SSize_t
S_regmatch(pTHX_ regmatch_info * reginfo,char * startpos,regnode * prog)6423 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6424 {
6425 const bool utf8_target = reginfo->is_utf8_target;
6426 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6427 REGEXP *rex_sv = reginfo->prog;
6428 regexp *rex = ReANY(rex_sv);
6429 RXi_GET_DECL(rex,rexi);
6430 /* the current state. This is a cached copy of PL_regmatch_state */
6431 regmatch_state *st;
6432 /* cache heavy used fields of st in registers */
6433 regnode *scan;
6434 regnode *next;
6435 U32 n = 0; /* general value; init to avoid compiler warning */
6436 U32 utmp = 0; /* tmp variable - valid for at most one opcode */
6437 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6438 SSize_t endref = 0; /* offset of end of backref when ln is start */
6439 char *locinput = startpos;
6440 char *loceol = reginfo->strend;
6441 char *pushinput; /* where to continue after a PUSH */
6442 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6443 U8 *pushsr0; /* save starting pos of script run */
6444 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6445 at EOS */
6446
6447 bool result = 0; /* return value of S_regmatch */
6448 U32 depth = 0; /* depth of backtrack stack */
6449 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6450 const U32 max_nochange_depth =
6451 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6452 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6453 regmatch_state *yes_state = NULL; /* state to pop to on success of
6454 subpattern */
6455 /* mark_state piggy backs on the yes_state logic so that when we unwind
6456 the stack on success we can update the mark_state as we go */
6457 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6458 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6459 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6460 U32 state_num;
6461 bool no_final = 0; /* prevent failure from backtracking? */
6462 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6463 char *startpoint = locinput;
6464 SV *popmark = NULL; /* are we looking for a mark? */
6465 SV *sv_commit = NULL; /* last mark name seen in failure */
6466 SV *sv_yes_mark = NULL; /* last mark name we have seen
6467 during a successful match */
6468 U32 lastopen = 0; /* last open we saw */
6469 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6470 SV* const oreplsv = GvSVn(PL_replgv);
6471 /* these three flags are set by various ops to signal information to
6472 * the very next op. They have a useful lifetime of exactly one loop
6473 * iteration, and are not preserved or restored by state pushes/pops
6474 */
6475 bool sw = 0; /* the condition value in (?(cond)a|b) */
6476 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6477 int logical = 0; /* the following EVAL is:
6478 0: (?{...})
6479 1: (?(?{...})X|Y)
6480 2: (??{...})
6481 or the following IFMATCH/UNLESSM is:
6482 false: plain (?=foo)
6483 true: used as a condition: (?(?=foo))
6484 */
6485 PAD* last_pad = NULL;
6486 dMULTICALL;
6487 U8 gimme = G_SCALAR;
6488 CV *caller_cv = NULL; /* who called us */
6489 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6490 U32 maxopenparen = 0; /* max '(' index seen so far */
6491 int to_complement; /* Invert the result? */
6492 char_class_number_ classnum;
6493 bool is_utf8_pat = reginfo->is_utf8_pat;
6494 bool match = FALSE;
6495 I32 orig_savestack_ix = PL_savestack_ix;
6496 U8 * script_run_begin = NULL;
6497 char *match_end= NULL; /* where a match MUST end to be considered successful */
6498 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6499 re_fold_t folder = NULL; /* used by various EXACTish regops */
6500 const U8 * fold_array = NULL; /* used by various EXACTish regops */
6501
6502 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6503 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6504 # define SOLARIS_BAD_OPTIMIZER
6505 const U32 *pl_charclass_dup = PL_charclass;
6506 # define PL_charclass pl_charclass_dup
6507 #endif
6508
6509 #ifdef DEBUGGING
6510 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6511 #endif
6512
6513 /* protect against undef(*^R) */
6514 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6515
6516 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6517 multicall_oldcatch = 0;
6518 PERL_UNUSED_VAR(multicall_cop);
6519
6520 PERL_ARGS_ASSERT_REGMATCH;
6521
6522 st = PL_regmatch_state;
6523
6524 /* Note that nextbyte is a byte even in UTF */
6525 SET_nextchr;
6526 scan = prog;
6527
6528 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6529 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6530 Perl_re_printf( aTHX_ "regmatch start\n" );
6531 }));
6532
6533 while (scan != NULL) {
6534 next = scan + NEXT_OFF(scan);
6535 if (next == scan)
6536 next = NULL;
6537 state_num = OP(scan);
6538
6539 reenter_switch:
6540 DEBUG_EXECUTE_r(
6541 if (state_num <= REGNODE_MAX) {
6542 SV * const prop = sv_newmortal();
6543 regnode *rnext = regnext(scan);
6544
6545 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6546 regprop(rex, prop, scan, reginfo, NULL);
6547 Perl_re_printf( aTHX_
6548 "%*s%" IVdf ":%s(%" IVdf ")\n",
6549 INDENT_CHARS(depth), "",
6550 (IV)(scan - rexi->program),
6551 SvPVX_const(prop),
6552 (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6553 0 : (IV)(rnext - rexi->program));
6554 }
6555 );
6556
6557 to_complement = 0;
6558
6559 SET_nextchr;
6560 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6561
6562 switch (state_num) {
6563 SV * anyofh_list;
6564
6565 case SBOL: /* /^../ and /\A../ */
6566 if (locinput == reginfo->strbeg)
6567 break;
6568 sayNO;
6569
6570 case MBOL: /* /^../m */
6571 if (locinput == reginfo->strbeg ||
6572 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6573 {
6574 break;
6575 }
6576 sayNO;
6577
6578 case GPOS: /* \G */
6579 if (locinput == reginfo->ganch)
6580 break;
6581 sayNO;
6582
6583 case KEEPS: /* \K */
6584 /* update the startpoint */
6585 st->u.keeper.val = RXp_OFFS_START(rex,0);
6586 RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg;
6587 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6588 script_run_begin);
6589 NOT_REACHED; /* NOTREACHED */
6590
6591 case KEEPS_next_fail:
6592 /* rollback the start point change */
6593 RXp_OFFSp(rex)[0].start = st->u.keeper.val;
6594 sayNO_SILENT;
6595 NOT_REACHED; /* NOTREACHED */
6596
6597 case MEOL: /* /..$/m */
6598 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6599 sayNO;
6600 break;
6601
6602 case SEOL: /* /..$/ */
6603 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6604 sayNO;
6605 if (reginfo->strend - locinput > 1)
6606 sayNO;
6607 break;
6608
6609 case EOS: /* \z */
6610 if (!NEXTCHR_IS_EOS)
6611 sayNO;
6612 break;
6613
6614 case SANY: /* /./s */
6615 if (NEXTCHR_IS_EOS || locinput >= loceol)
6616 sayNO;
6617 goto increment_locinput;
6618
6619 case REG_ANY: /* /./ */
6620 if ( NEXTCHR_IS_EOS
6621 || locinput >= loceol
6622 || nextbyte == '\n')
6623 {
6624 sayNO;
6625 }
6626 goto increment_locinput;
6627
6628
6629 #undef ST
6630 #define ST st->u.trie
6631 case TRIEC: /* (ab|cd) with known charclass */
6632 /* In this case the charclass data is available inline so
6633 we can fail fast without a lot of extra overhead.
6634 */
6635 if ( ! NEXTCHR_IS_EOS
6636 && locinput < loceol
6637 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6638 {
6639 DEBUG_EXECUTE_r(
6640 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6641 depth, PL_colors[4], PL_colors[5])
6642 );
6643 sayNO_SILENT;
6644 NOT_REACHED; /* NOTREACHED */
6645 }
6646 /* FALLTHROUGH */
6647 case TRIE: /* (ab|cd) */
6648 /* the basic plan of execution of the trie is:
6649 * At the beginning, run though all the states, and
6650 * find the longest-matching word. Also remember the position
6651 * of the shortest matching word. For example, this pattern:
6652 * 1 2 3 4 5
6653 * ab|a|x|abcd|abc
6654 * when matched against the string "abcde", will generate
6655 * accept states for all words except 3, with the longest
6656 * matching word being 4, and the shortest being 2 (with
6657 * the position being after char 1 of the string).
6658 *
6659 * Then for each matching word, in word order (i.e. 1,2,4,5),
6660 * we run the remainder of the pattern; on each try setting
6661 * the current position to the character following the word,
6662 * returning to try the next word on failure.
6663 *
6664 * We avoid having to build a list of words at runtime by
6665 * using a compile-time structure, wordinfo[].prev, which
6666 * gives, for each word, the previous accepting word (if any).
6667 * In the case above it would contain the mappings 1->2, 2->0,
6668 * 3->0, 4->5, 5->1. We can use this table to generate, from
6669 * the longest word (4 above), a list of all words, by
6670 * following the list of prev pointers; this gives us the
6671 * unordered list 4,5,1,2. Then given the current word we have
6672 * just tried, we can go through the list and find the
6673 * next-biggest word to try (so if we just failed on word 2,
6674 * the next in the list is 4).
6675 *
6676 * Since at runtime we don't record the matching position in
6677 * the string for each word, we have to work that out for
6678 * each word we're about to process. The wordinfo table holds
6679 * the character length of each word; given that we recorded
6680 * at the start: the position of the shortest word and its
6681 * length in chars, we just need to move the pointer the
6682 * difference between the two char lengths. Depending on
6683 * Unicode status and folding, that's cheap or expensive.
6684 *
6685 * This algorithm is optimised for the case where are only a
6686 * small number of accept states, i.e. 0,1, or maybe 2.
6687 * With lots of accepts states, and having to try all of them,
6688 * it becomes quadratic on number of accept states to find all
6689 * the next words.
6690 */
6691
6692 {
6693 /* what type of TRIE am I? (utf8 makes this contextual) */
6694 DECL_TRIE_TYPE(scan);
6695
6696 /* what trie are we using right now */
6697 reg_trie_data * const trie
6698 = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ];
6699 ST.before_paren = trie->before_paren;
6700 ST.after_paren = trie->after_paren;
6701 assert(ST.before_paren<=rex->nparens);
6702 assert(ST.after_paren<=rex->nparens);
6703
6704 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]);
6705 U32 state = trie->startstate;
6706
6707 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) {
6708 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6709 if (utf8_target
6710 && ! NEXTCHR_IS_EOS
6711 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6712 && FLAGS(scan) == EXACTL)
6713 {
6714 /* We only output for EXACTL, as we let the folder
6715 * output this message for EXACTFLU8 to avoid
6716 * duplication */
6717 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6718 reginfo->strend);
6719 }
6720 }
6721 if ( trie->bitmap
6722 && ( NEXTCHR_IS_EOS
6723 || locinput >= loceol
6724 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6725 {
6726 if (trie->states[ state ].wordnum) {
6727 DEBUG_EXECUTE_r(
6728 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6729 depth, PL_colors[4], PL_colors[5])
6730 );
6731 if (!trie->jump)
6732 break;
6733 } else {
6734 DEBUG_EXECUTE_r(
6735 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6736 depth, PL_colors[4], PL_colors[5])
6737 );
6738 sayNO_SILENT;
6739 }
6740 }
6741
6742 {
6743 U8 *uc = ( U8* )locinput;
6744
6745 STRLEN len = 0;
6746 STRLEN foldlen = 0;
6747 U8 *uscan = (U8*)NULL;
6748 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6749 U32 charcount = 0; /* how many input chars we have matched */
6750 U32 accepted = 0; /* have we seen any accepting states? */
6751
6752 ST.jump = trie->jump;
6753 ST.j_before_paren = trie->j_before_paren;
6754 ST.j_after_paren= trie->j_after_paren;
6755 ST.me = scan;
6756 ST.firstpos = NULL;
6757 ST.longfold = FALSE; /* char longer if folded => it's harder */
6758 ST.nextword = 0;
6759
6760 /* fully traverse the TRIE; note the position of the
6761 shortest accept state and the wordnum of the longest
6762 accept state */
6763
6764 while ( state && uc <= (U8*)(loceol) ) {
6765 U32 base = trie->states[ state ].trans.base;
6766 UV uvc = 0;
6767 U16 charid = 0;
6768 U16 wordnum;
6769 wordnum = trie->states[ state ].wordnum;
6770
6771 if (wordnum) { /* it's an accept state */
6772 if (!accepted) {
6773 accepted = 1;
6774 /* record first match position */
6775 if (ST.longfold) {
6776 ST.firstpos = (U8*)locinput;
6777 ST.firstchars = 0;
6778 }
6779 else {
6780 ST.firstpos = uc;
6781 ST.firstchars = charcount;
6782 }
6783 }
6784 if (!ST.nextword || wordnum < ST.nextword)
6785 ST.nextword = wordnum;
6786 ST.topword = wordnum;
6787 }
6788
6789 DEBUG_TRIE_EXECUTE_r({
6790 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6791 /* HERE */
6792 PerlIO_printf( Perl_debug_log,
6793 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6794 INDENT_CHARS(depth), "", PL_colors[4],
6795 (UV)state, (accepted ? 'Y' : 'N'));
6796 });
6797
6798 /* read a char and goto next state */
6799 if ( base && (foldlen || uc < (U8*)(loceol))) {
6800 I32 offset;
6801 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6802 (U8 *) loceol, uscan,
6803 len, uvc, charid, foldlen,
6804 foldbuf, uniflags);
6805 charcount++;
6806 if (foldlen>0)
6807 ST.longfold = TRUE;
6808 if (charid &&
6809 ( ((offset =
6810 base + charid - 1 - trie->uniquecharcount)) >= 0)
6811
6812 && ((U32)offset < trie->lasttrans)
6813 && trie->trans[offset].check == state)
6814 {
6815 state = trie->trans[offset].next;
6816 }
6817 else {
6818 state = 0;
6819 }
6820 uc += len;
6821
6822 }
6823 else {
6824 state = 0;
6825 }
6826 DEBUG_TRIE_EXECUTE_r(
6827 Perl_re_printf( aTHX_
6828 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6829 charid, uvc, (UV)state, PL_colors[5] );
6830 );
6831 }
6832 if (!accepted)
6833 sayNO;
6834
6835 /* calculate total number of accept states */
6836 {
6837 U16 w = ST.topword;
6838 accepted = 0;
6839 while (w) {
6840 w = trie->wordinfo[w].prev;
6841 accepted++;
6842 }
6843 ST.accepted = accepted;
6844 }
6845
6846 DEBUG_EXECUTE_r(
6847 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6848 depth,
6849 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6850 );
6851 goto trie_first_try; /* jump into the fail handler */
6852 }}
6853 NOT_REACHED; /* NOTREACHED */
6854
6855 case TRIE_next_fail: /* we failed - try next alternative */
6856 {
6857 U8 *uc;
6858 if (RE_PESSIMISTIC_PARENS) {
6859 REGCP_UNWIND(ST.lastcp);
6860 regcppop(rex,&maxopenparen);
6861 }
6862 if ( ST.jump ) {
6863 /* undo any captures done in the tail part of a branch,
6864 * e.g.
6865 * /(?:X(.)(.)|Y(.)).../
6866 * where the trie just matches X then calls out to do the
6867 * rest of the branch */
6868 REGCP_UNWIND(ST.cp);
6869 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6870 if (ST.after_paren) {
6871 assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
6872 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail");
6873 }
6874 }
6875 if (!--ST.accepted) {
6876 DEBUG_EXECUTE_r({
6877 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6878 depth,
6879 PL_colors[4],
6880 PL_colors[5] );
6881 });
6882 sayNO_SILENT;
6883 }
6884 {
6885 /* Find next-highest word to process. Note that this code
6886 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6887 U16 min = 0;
6888 U16 word;
6889 U16 const nextword = ST.nextword;
6890 reg_trie_wordinfo * const wordinfo
6891 = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo;
6892 for (word=ST.topword; word; word=wordinfo[word].prev) {
6893 if (word > nextword && (!min || word < min))
6894 min = word;
6895 }
6896 ST.nextword = min;
6897 }
6898
6899 trie_first_try:
6900 if (do_cutgroup) {
6901 do_cutgroup = 0;
6902 no_final = 0;
6903 }
6904
6905 if ( ST.jump ) {
6906 ST.lastparen = RXp_LASTPAREN(rex);
6907 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
6908 REGCP_SET(ST.cp);
6909 }
6910
6911 /* find start char of end of current word */
6912 {
6913 U32 chars; /* how many chars to skip */
6914 reg_trie_data * const trie
6915 = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
6916
6917 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6918 >= ST.firstchars);
6919 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6920 - ST.firstchars;
6921 uc = ST.firstpos;
6922
6923 if (ST.longfold) {
6924 /* the hard option - fold each char in turn and find
6925 * its folded length (which may be different */
6926 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6927 STRLEN foldlen;
6928 STRLEN len;
6929 UV uvc;
6930 U8 *uscan;
6931
6932 while (chars) {
6933 if (utf8_target) {
6934 /* XXX This assumes the length is well-formed, as
6935 * does the UTF8SKIP below */
6936 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6937 uniflags);
6938 uc += len;
6939 }
6940 else {
6941 uvc = *uc;
6942 uc++;
6943 }
6944 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6945 uscan = foldbuf;
6946 while (foldlen) {
6947 if (!--chars)
6948 break;
6949 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6950 uniflags);
6951 uscan += len;
6952 foldlen -= len;
6953 }
6954 }
6955 }
6956 else {
6957 if (utf8_target)
6958 uc = utf8_hop(uc, chars);
6959 else
6960 uc += chars;
6961 }
6962 }
6963 if (ST.jump && ST.jump[ST.nextword]) {
6964 scan = ST.me + ST.jump[ST.nextword];
6965 ST.before_paren = ST.j_before_paren[ST.nextword];
6966 assert(ST.before_paren <= rex->nparens);
6967 ST.after_paren = ST.j_after_paren[ST.nextword];
6968 assert(ST.after_paren <= rex->nparens);
6969 } else {
6970 scan = ST.me + NEXT_OFF(ST.me);
6971 }
6972
6973
6974 DEBUG_EXECUTE_r({
6975 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6976 depth,
6977 PL_colors[4],
6978 ST.nextword,
6979 PL_colors[5]
6980 );
6981 });
6982
6983 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6984 if (RE_PESSIMISTIC_PARENS) {
6985 (void)regcppush(rex, 0, maxopenparen);
6986 REGCP_SET(ST.lastcp);
6987 }
6988 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6989 script_run_begin);
6990 NOT_REACHED; /* NOTREACHED */
6991 }
6992 /* only one choice left - just continue */
6993 DEBUG_EXECUTE_r({
6994 AV *const trie_words
6995 = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]);
6996 SV ** const tmp = trie_words
6997 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6998 SV *sv= tmp ? sv_newmortal() : NULL;
6999
7000 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
7001 depth, PL_colors[4],
7002 ST.nextword,
7003 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
7004 PL_colors[0], PL_colors[1],
7005 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
7006 )
7007 : "not compiled under -Dr",
7008 PL_colors[5] );
7009 });
7010
7011 locinput = (char*)uc;
7012 continue; /* execute rest of RE */
7013 /* NOTREACHED */
7014 }
7015 #undef ST
7016
7017 case LEXACT_REQ8:
7018 if (! utf8_target) {
7019 sayNO;
7020 }
7021 /* FALLTHROUGH */
7022
7023 case LEXACT:
7024 {
7025 char *s;
7026
7027 s = STRINGl(scan);
7028 ln = STR_LENl(scan);
7029 goto join_short_long_exact;
7030
7031 case EXACTL: /* /abc/l */
7032 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7033
7034 /* Complete checking would involve going through every character
7035 * matched by the string to see if any is above latin1. But the
7036 * comparison otherwise might very well be a fast assembly
7037 * language routine, and I (khw) don't think slowing things down
7038 * just to check for this warning is worth it. So this just checks
7039 * the first character */
7040 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
7041 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
7042 }
7043 goto do_exact;
7044 case EXACT_REQ8:
7045 if (! utf8_target) {
7046 sayNO;
7047 }
7048 /* FALLTHROUGH */
7049
7050 case EXACT: /* /abc/ */
7051 do_exact:
7052 s = STRINGs(scan);
7053 ln = STR_LENs(scan);
7054
7055 join_short_long_exact:
7056 if (utf8_target != is_utf8_pat) {
7057 /* The target and the pattern have differing utf8ness. */
7058 char *l = locinput;
7059 const char * const e = s + ln;
7060
7061 if (utf8_target) {
7062 /* The target is utf8, the pattern is not utf8.
7063 * Above-Latin1 code points can't match the pattern;
7064 * invariants match exactly, and the other Latin1 ones need
7065 * to be downgraded to a single byte in order to do the
7066 * comparison. (If we could be confident that the target
7067 * is not malformed, this could be refactored to have fewer
7068 * tests by just assuming that if the first bytes match, it
7069 * is an invariant, but there are tests in the test suite
7070 * dealing with (??{...}) which violate this) */
7071 while (s < e) {
7072 if ( l >= loceol
7073 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
7074 {
7075 sayNO;
7076 }
7077 if (UTF8_IS_INVARIANT(*(U8*)l)) {
7078 if (*l != *s) {
7079 sayNO;
7080 }
7081 l++;
7082 }
7083 else {
7084 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
7085 {
7086 sayNO;
7087 }
7088 l += 2;
7089 }
7090 s++;
7091 }
7092 }
7093 else {
7094 /* The target is not utf8, the pattern is utf8. */
7095 while (s < e) {
7096 if ( l >= loceol
7097 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
7098 {
7099 sayNO;
7100 }
7101 if (UTF8_IS_INVARIANT(*(U8*)s)) {
7102 if (*s != *l) {
7103 sayNO;
7104 }
7105 s++;
7106 }
7107 else {
7108 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7109 {
7110 sayNO;
7111 }
7112 s += 2;
7113 }
7114 l++;
7115 }
7116 }
7117 locinput = l;
7118 }
7119 else {
7120 /* The target and the pattern have the same utf8ness. */
7121 /* Inline the first character, for speed. */
7122 if ( loceol - locinput < ln
7123 || UCHARAT(s) != nextbyte
7124 || (ln > 1 && memNE(s, locinput, ln)))
7125 {
7126 sayNO;
7127 }
7128 locinput += ln;
7129 }
7130 break;
7131 }
7132
7133 case EXACTFL: /* /abc/il */
7134 {
7135 const char * s;
7136 U32 fold_utf8_flags;
7137
7138 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7139 folder = Perl_foldEQ_locale;
7140 fold_array = PL_fold_locale;
7141 fold_utf8_flags = FOLDEQ_LOCALE;
7142 goto do_exactf;
7143
7144 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
7145 is effectively /u; hence to match, target
7146 must be UTF-8. */
7147 if (! utf8_target) {
7148 sayNO;
7149 }
7150 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7151 | FOLDEQ_S2_FOLDS_SANE;
7152 folder = S_foldEQ_latin1_s2_folded;
7153 fold_array = PL_fold_latin1;
7154 goto do_exactf;
7155
7156 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
7157 if (! utf8_target) {
7158 sayNO;
7159 }
7160 assert(is_utf8_pat);
7161 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7162 #ifdef DEBUGGING
7163 /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7164 * In theory neither of these variables should be used in this mode. */
7165 folder = NULL;
7166 fold_array = NULL;
7167 #endif
7168 goto do_exactf;
7169
7170 case EXACTFUP: /* /foo/iu, and something is problematic in
7171 'foo' so can't take shortcuts. */
7172 assert(! is_utf8_pat);
7173 folder = Perl_foldEQ_latin1;
7174 fold_array = PL_fold_latin1;
7175 fold_utf8_flags = 0;
7176 goto do_exactf;
7177
7178 case EXACTFU: /* /abc/iu */
7179 folder = S_foldEQ_latin1_s2_folded;
7180 fold_array = PL_fold_latin1;
7181 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7182 goto do_exactf;
7183
7184 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7185 patterns */
7186 assert(! is_utf8_pat);
7187 /* FALLTHROUGH */
7188 case EXACTFAA: /* /abc/iaa */
7189 folder = S_foldEQ_latin1_s2_folded;
7190 fold_array = PL_fold_latin1;
7191 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7192 if (is_utf8_pat || ! utf8_target) {
7193
7194 /* The possible presence of a MICRO SIGN in the pattern forbids
7195 * us to view a non-UTF-8 pattern as folded when there is a
7196 * UTF-8 target */
7197 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7198 |FOLDEQ_S2_FOLDS_SANE;
7199 }
7200 goto do_exactf;
7201
7202
7203 case EXACTF: /* /abc/i This node only generated for
7204 non-utf8 patterns */
7205 assert(! is_utf8_pat);
7206 folder = Perl_foldEQ;
7207 fold_array = PL_fold;
7208 fold_utf8_flags = 0;
7209
7210 do_exactf:
7211 s = STRINGs(scan);
7212 ln = STR_LENs(scan);
7213
7214 if ( utf8_target
7215 || is_utf8_pat
7216 || state_num == EXACTFUP
7217 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7218 {
7219 /* Either target or the pattern are utf8, or has the issue where
7220 * the fold lengths may differ. */
7221 const char * const l = locinput;
7222 char *e = loceol;
7223
7224 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7225 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7226 {
7227 sayNO;
7228 }
7229 locinput = e;
7230 break;
7231 }
7232
7233 /* Neither the target nor the pattern are utf8 */
7234 assert(fold_array);
7235 if (UCHARAT(s) != nextbyte
7236 && !NEXTCHR_IS_EOS
7237 && UCHARAT(s) != fold_array[nextbyte])
7238 {
7239 sayNO;
7240 }
7241 if (loceol - locinput < ln)
7242 sayNO;
7243 assert(folder);
7244 if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7245 sayNO;
7246 locinput += ln;
7247 break;
7248 }
7249
7250 case NBOUNDL: /* /\B/l */
7251 to_complement = 1;
7252 /* FALLTHROUGH */
7253
7254 case BOUNDL: /* /\b/l */
7255 {
7256 bool b1, b2;
7257 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7258
7259 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7260 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7261 goto boundu;
7262 }
7263
7264 if (utf8_target) {
7265 if (locinput == reginfo->strbeg)
7266 b1 = isWORDCHAR_LC('\n');
7267 else {
7268 U8 *p = reghop3((U8*)locinput, -1,
7269 (U8*)(reginfo->strbeg));
7270 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7271 }
7272 b2 = (NEXTCHR_IS_EOS)
7273 ? isWORDCHAR_LC('\n')
7274 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7275 (U8*) reginfo->strend);
7276 }
7277 else { /* Here the string isn't utf8 */
7278 b1 = (locinput == reginfo->strbeg)
7279 ? isWORDCHAR_LC('\n')
7280 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7281 b2 = (NEXTCHR_IS_EOS)
7282 ? isWORDCHAR_LC('\n')
7283 : isWORDCHAR_LC(nextbyte);
7284 }
7285 if (to_complement ^ (b1 == b2)) {
7286 sayNO;
7287 }
7288 break;
7289 }
7290
7291 case NBOUND: /* /\B/ */
7292 to_complement = 1;
7293 /* FALLTHROUGH */
7294
7295 case BOUND: /* /\b/ */
7296 if (utf8_target) {
7297 goto bound_utf8;
7298 }
7299 goto bound_ascii_match_only;
7300
7301 case NBOUNDA: /* /\B/a */
7302 to_complement = 1;
7303 /* FALLTHROUGH */
7304
7305 case BOUNDA: /* /\b/a */
7306 {
7307 bool b1, b2;
7308
7309 bound_ascii_match_only:
7310 /* Here the string isn't utf8, or is utf8 and only ascii characters
7311 * are to match \w. In the latter case looking at the byte just
7312 * prior to the current one may be just the final byte of a
7313 * multi-byte character. This is ok. There are two cases:
7314 * 1) it is a single byte character, and then the test is doing
7315 * just what it's supposed to.
7316 * 2) it is a multi-byte character, in which case the final byte is
7317 * never mistakable for ASCII, and so the test will say it is
7318 * not a word character, which is the correct answer. */
7319 b1 = (locinput == reginfo->strbeg)
7320 ? isWORDCHAR_A('\n')
7321 : isWORDCHAR_A(UCHARAT(locinput - 1));
7322 b2 = (NEXTCHR_IS_EOS)
7323 ? isWORDCHAR_A('\n')
7324 : isWORDCHAR_A(nextbyte);
7325 if (to_complement ^ (b1 == b2)) {
7326 sayNO;
7327 }
7328 break;
7329 }
7330
7331 case NBOUNDU: /* /\B/u */
7332 to_complement = 1;
7333 /* FALLTHROUGH */
7334
7335 case BOUNDU: /* /\b/u */
7336
7337 boundu:
7338 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7339 match = FALSE;
7340 }
7341 else if (utf8_target) {
7342 bound_utf8:
7343 switch((bound_type) FLAGS(scan)) {
7344 case TRADITIONAL_BOUND:
7345 {
7346 bool b1, b2;
7347 if (locinput == reginfo->strbeg) {
7348 b1 = 0 /* isWORDCHAR_L1('\n') */;
7349 }
7350 else {
7351 U8 *p = reghop3((U8*)locinput, -1,
7352 (U8*)(reginfo->strbeg));
7353
7354 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7355 }
7356 b2 = (NEXTCHR_IS_EOS)
7357 ? 0 /* isWORDCHAR_L1('\n') */
7358 : isWORDCHAR_utf8_safe((U8*)locinput,
7359 (U8*) reginfo->strend);
7360 match = cBOOL(b1 != b2);
7361 break;
7362 }
7363 case GCB_BOUND:
7364 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7365 match = TRUE; /* GCB always matches at begin and
7366 end */
7367 }
7368 else {
7369 /* Find the gcb values of previous and current
7370 * chars, then see if is a break point */
7371 match = isGCB(getGCB_VAL_UTF8(
7372 reghop3((U8*)locinput,
7373 -1,
7374 (U8*)(reginfo->strbeg)),
7375 (U8*) reginfo->strend),
7376 getGCB_VAL_UTF8((U8*) locinput,
7377 (U8*) reginfo->strend),
7378 (U8*) reginfo->strbeg,
7379 (U8*) locinput,
7380 utf8_target);
7381 }
7382 break;
7383
7384 case LB_BOUND:
7385 if (locinput == reginfo->strbeg) {
7386 match = FALSE;
7387 }
7388 else if (NEXTCHR_IS_EOS) {
7389 match = TRUE;
7390 }
7391 else {
7392 match = isLB(getLB_VAL_UTF8(
7393 reghop3((U8*)locinput,
7394 -1,
7395 (U8*)(reginfo->strbeg)),
7396 (U8*) reginfo->strend),
7397 getLB_VAL_UTF8((U8*) locinput,
7398 (U8*) reginfo->strend),
7399 (U8*) reginfo->strbeg,
7400 (U8*) locinput,
7401 (U8*) reginfo->strend,
7402 utf8_target);
7403 }
7404 break;
7405
7406 case SB_BOUND: /* Always matches at begin and end */
7407 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7408 match = TRUE;
7409 }
7410 else {
7411 match = isSB(getSB_VAL_UTF8(
7412 reghop3((U8*)locinput,
7413 -1,
7414 (U8*)(reginfo->strbeg)),
7415 (U8*) reginfo->strend),
7416 getSB_VAL_UTF8((U8*) locinput,
7417 (U8*) reginfo->strend),
7418 (U8*) reginfo->strbeg,
7419 (U8*) locinput,
7420 (U8*) reginfo->strend,
7421 utf8_target);
7422 }
7423 break;
7424
7425 case WB_BOUND:
7426 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7427 match = TRUE;
7428 }
7429 else {
7430 match = isWB(WB_UNKNOWN,
7431 getWB_VAL_UTF8(
7432 reghop3((U8*)locinput,
7433 -1,
7434 (U8*)(reginfo->strbeg)),
7435 (U8*) reginfo->strend),
7436 getWB_VAL_UTF8((U8*) locinput,
7437 (U8*) reginfo->strend),
7438 (U8*) reginfo->strbeg,
7439 (U8*) locinput,
7440 (U8*) reginfo->strend,
7441 utf8_target);
7442 }
7443 break;
7444 }
7445 }
7446 else { /* Not utf8 target */
7447 switch((bound_type) FLAGS(scan)) {
7448 case TRADITIONAL_BOUND:
7449 {
7450 bool b1, b2;
7451 b1 = (locinput == reginfo->strbeg)
7452 ? 0 /* isWORDCHAR_L1('\n') */
7453 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7454 b2 = (NEXTCHR_IS_EOS)
7455 ? 0 /* isWORDCHAR_L1('\n') */
7456 : isWORDCHAR_L1(nextbyte);
7457 match = cBOOL(b1 != b2);
7458 break;
7459 }
7460
7461 case GCB_BOUND:
7462 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7463 match = TRUE; /* GCB always matches at begin and
7464 end */
7465 }
7466 else { /* Only CR-LF combo isn't a GCB in 0-255
7467 range */
7468 match = UCHARAT(locinput - 1) != '\r'
7469 || UCHARAT(locinput) != '\n';
7470 }
7471 break;
7472
7473 case LB_BOUND:
7474 if (locinput == reginfo->strbeg) {
7475 match = FALSE;
7476 }
7477 else if (NEXTCHR_IS_EOS) {
7478 match = TRUE;
7479 }
7480 else {
7481 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7482 getLB_VAL_CP(UCHARAT(locinput)),
7483 (U8*) reginfo->strbeg,
7484 (U8*) locinput,
7485 (U8*) reginfo->strend,
7486 utf8_target);
7487 }
7488 break;
7489
7490 case SB_BOUND: /* Always matches at begin and end */
7491 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7492 match = TRUE;
7493 }
7494 else {
7495 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7496 getSB_VAL_CP(UCHARAT(locinput)),
7497 (U8*) reginfo->strbeg,
7498 (U8*) locinput,
7499 (U8*) reginfo->strend,
7500 utf8_target);
7501 }
7502 break;
7503
7504 case WB_BOUND:
7505 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7506 match = TRUE;
7507 }
7508 else {
7509 match = isWB(WB_UNKNOWN,
7510 getWB_VAL_CP(UCHARAT(locinput -1)),
7511 getWB_VAL_CP(UCHARAT(locinput)),
7512 (U8*) reginfo->strbeg,
7513 (U8*) locinput,
7514 (U8*) reginfo->strend,
7515 utf8_target);
7516 }
7517 break;
7518 }
7519 }
7520
7521 if (to_complement ^ ! match) {
7522 sayNO;
7523 }
7524 break;
7525
7526 case ANYOFPOSIXL:
7527 case ANYOFL: /* /[abc]/l */
7528 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7529 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7530
7531 /* FALLTHROUGH */
7532 case ANYOFD: /* /[abc]/d */
7533 case ANYOF: /* /[abc]/ */
7534 if (NEXTCHR_IS_EOS || locinput >= loceol)
7535 sayNO;
7536 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7537 && ! ANYOF_FLAGS(scan)
7538 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7539 {
7540 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7541 sayNO;
7542 }
7543 locinput++;
7544 }
7545 else {
7546 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7547 utf8_target))
7548 {
7549 sayNO;
7550 }
7551 goto increment_locinput;
7552 }
7553 break;
7554
7555 case ANYOFM:
7556 if ( NEXTCHR_IS_EOS
7557 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan)
7558 || locinput >= loceol)
7559 {
7560 sayNO;
7561 }
7562 locinput++; /* ANYOFM is always single byte */
7563 break;
7564
7565 case NANYOFM:
7566 if ( NEXTCHR_IS_EOS
7567 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan)
7568 || locinput >= loceol)
7569 {
7570 sayNO;
7571 }
7572 goto increment_locinput;
7573 break;
7574
7575 case ANYOFH:
7576 if ( ! utf8_target
7577 || NEXTCHR_IS_EOS
7578 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7579 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7580 || ! _invlist_contains_cp(anyofh_list,
7581 utf8_to_uvchr_buf((U8 *) locinput,
7582 (U8 *) loceol,
7583 NULL)))
7584 {
7585 sayNO;
7586 }
7587 goto increment_locinput;
7588 break;
7589
7590 case ANYOFHb:
7591 if ( ! utf8_target
7592 || NEXTCHR_IS_EOS
7593 || ANYOF_FLAGS(scan) != (U8) *locinput
7594 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7595 || ! _invlist_contains_cp(anyofh_list,
7596 utf8_to_uvchr_buf((U8 *) locinput,
7597 (U8 *) loceol,
7598 NULL)))
7599 {
7600 sayNO;
7601 }
7602 goto increment_locinput;
7603 break;
7604
7605 case ANYOFHbbm:
7606 if ( ! utf8_target
7607 || NEXTCHR_IS_EOS
7608 || ANYOF_FLAGS(scan) != (U8) locinput[0]
7609 || locinput >= reginfo->strend
7610 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7611 (U8) locinput[1] & UTF_CONTINUATION_MASK))
7612 {
7613 sayNO;
7614 }
7615 goto increment_locinput;
7616 break;
7617
7618 case ANYOFHr:
7619 if ( ! utf8_target
7620 || NEXTCHR_IS_EOS
7621 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7622 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7623 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7624 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7625 || ! _invlist_contains_cp(anyofh_list,
7626 utf8_to_uvchr_buf((U8 *) locinput,
7627 (U8 *) loceol,
7628 NULL)))
7629 {
7630 sayNO;
7631 }
7632 goto increment_locinput;
7633 break;
7634
7635 case ANYOFHs:
7636 if ( ! utf8_target
7637 || NEXTCHR_IS_EOS
7638 || loceol - locinput < FLAGS(scan)
7639 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7640 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7641 || ! _invlist_contains_cp(anyofh_list,
7642 utf8_to_uvchr_buf((U8 *) locinput,
7643 (U8 *) loceol,
7644 NULL)))
7645 {
7646 sayNO;
7647 }
7648 goto increment_locinput;
7649 break;
7650
7651 case ANYOFR:
7652 if (NEXTCHR_IS_EOS) {
7653 sayNO;
7654 }
7655
7656 if (utf8_target) {
7657 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7658 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7659 (U8 *) reginfo->strend,
7660 NULL),
7661 ANYOFRbase(scan), ANYOFRdelta(scan)))
7662 {
7663 sayNO;
7664 }
7665 }
7666 else {
7667 if (! withinCOUNT((U8) *locinput,
7668 ANYOFRbase(scan), ANYOFRdelta(scan)))
7669 {
7670 sayNO;
7671 }
7672 }
7673 goto increment_locinput;
7674 break;
7675
7676 case ANYOFRb:
7677 if (NEXTCHR_IS_EOS) {
7678 sayNO;
7679 }
7680
7681 if (utf8_target) {
7682 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7683 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7684 (U8 *) reginfo->strend,
7685 NULL),
7686 ANYOFRbase(scan), ANYOFRdelta(scan)))
7687 {
7688 sayNO;
7689 }
7690 }
7691 else {
7692 if (! withinCOUNT((U8) *locinput,
7693 ANYOFRbase(scan), ANYOFRdelta(scan)))
7694 {
7695 sayNO;
7696 }
7697 }
7698 goto increment_locinput;
7699 break;
7700
7701 /* The argument (FLAGS) to all the POSIX node types is the class number
7702 * */
7703
7704 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7705 to_complement = 1;
7706 /* FALLTHROUGH */
7707
7708 case POSIXL: /* \w or [:punct:] etc. under /l */
7709 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7710 if (NEXTCHR_IS_EOS || locinput >= loceol)
7711 sayNO;
7712
7713 /* Use isFOO_lc() for characters within Latin1. (Note that
7714 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7715 * wouldn't be invariant) */
7716 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7717 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7718 sayNO;
7719 }
7720
7721 locinput++;
7722 break;
7723 }
7724
7725 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7726 /* An above Latin-1 code point, or malformed */
7727 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7728 reginfo->strend);
7729 goto utf8_posix_above_latin1;
7730 }
7731
7732 /* Here is a UTF-8 variant code point below 256 and the target is
7733 * UTF-8 */
7734 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7735 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7736 *(locinput + 1))))))
7737 {
7738 sayNO;
7739 }
7740
7741 goto increment_locinput;
7742
7743 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7744 to_complement = 1;
7745 /* FALLTHROUGH */
7746
7747 case POSIXD: /* \w or [:punct:] etc. under /d */
7748 if (utf8_target) {
7749 goto utf8_posix;
7750 }
7751 goto posixa;
7752
7753 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7754
7755 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7756 sayNO;
7757 }
7758
7759 /* All UTF-8 variants match */
7760 if (! UTF8_IS_INVARIANT(nextbyte)) {
7761 goto increment_locinput;
7762 }
7763
7764 to_complement = 1;
7765 goto join_nposixa;
7766
7767 case POSIXA: /* \w or [:punct:] etc. under /a */
7768
7769 posixa:
7770 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7771 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7772 * character is a single byte */
7773
7774 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7775 sayNO;
7776 }
7777
7778 join_nposixa:
7779
7780 if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7781 FLAGS(scan)))))
7782 {
7783 sayNO;
7784 }
7785
7786 /* Here we are either not in utf8, or we matched a utf8-invariant,
7787 * so the next char is the next byte */
7788 locinput++;
7789 break;
7790
7791 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7792 to_complement = 1;
7793 /* FALLTHROUGH */
7794
7795 case POSIXU: /* \w or [:punct:] etc. under /u */
7796 utf8_posix:
7797 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7798 sayNO;
7799 }
7800
7801 /* Use generic_isCC_() for characters within Latin1. (Note that
7802 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7803 * wouldn't be invariant) */
7804 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7805 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7806 FLAGS(scan)))))
7807 {
7808 sayNO;
7809 }
7810 locinput++;
7811 }
7812 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7813 if (! (to_complement
7814 ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7815 *(locinput + 1)),
7816 FLAGS(scan)))))
7817 {
7818 sayNO;
7819 }
7820 locinput += 2;
7821 }
7822 else { /* Handle above Latin-1 code points */
7823 utf8_posix_above_latin1:
7824 classnum = (char_class_number_) FLAGS(scan);
7825 switch (classnum) {
7826 default:
7827 if (! (to_complement
7828 ^ cBOOL(_invlist_contains_cp(
7829 PL_XPosix_ptrs[classnum],
7830 utf8_to_uvchr_buf((U8 *) locinput,
7831 (U8 *) reginfo->strend,
7832 NULL)))))
7833 {
7834 sayNO;
7835 }
7836 break;
7837 case CC_ENUM_SPACE_:
7838 if (! (to_complement
7839 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7840 {
7841 sayNO;
7842 }
7843 break;
7844 case CC_ENUM_BLANK_:
7845 if (! (to_complement
7846 ^ cBOOL(is_HORIZWS_high(locinput))))
7847 {
7848 sayNO;
7849 }
7850 break;
7851 case CC_ENUM_XDIGIT_:
7852 if (! (to_complement
7853 ^ cBOOL(is_XDIGIT_high(locinput))))
7854 {
7855 sayNO;
7856 }
7857 break;
7858 case CC_ENUM_VERTSPACE_:
7859 if (! (to_complement
7860 ^ cBOOL(is_VERTWS_high(locinput))))
7861 {
7862 sayNO;
7863 }
7864 break;
7865 case CC_ENUM_CNTRL_: /* These can't match above Latin1 */
7866 case CC_ENUM_ASCII_:
7867 if (! to_complement) {
7868 sayNO;
7869 }
7870 break;
7871 }
7872 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7873 }
7874 break;
7875
7876 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7877 a Unicode extended Grapheme Cluster */
7878 if (NEXTCHR_IS_EOS || locinput >= loceol)
7879 sayNO;
7880 if (! utf8_target) {
7881
7882 /* Match either CR LF or '.', as all the other possibilities
7883 * require utf8 */
7884 locinput++; /* Match the . or CR */
7885 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7886 match the LF */
7887 && locinput < loceol
7888 && UCHARAT(locinput) == '\n')
7889 {
7890 locinput++;
7891 }
7892 }
7893 else {
7894
7895 /* Get the gcb type for the current character */
7896 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7897 (U8*) reginfo->strend);
7898
7899 /* Then scan through the input until we get to the first
7900 * character whose type is supposed to be a gcb with the
7901 * current character. (There is always a break at the
7902 * end-of-input) */
7903 locinput += UTF8SKIP(locinput);
7904 while (locinput < loceol) {
7905 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7906 (U8*) reginfo->strend);
7907 if (isGCB(prev_gcb, cur_gcb,
7908 (U8*) reginfo->strbeg, (U8*) locinput,
7909 utf8_target))
7910 {
7911 break;
7912 }
7913
7914 prev_gcb = cur_gcb;
7915 locinput += UTF8SKIP(locinput);
7916 }
7917
7918
7919 }
7920 break;
7921
7922 case REFFLN: /* /\g{name}/il */
7923 { /* The capture buffer cases. The ones beginning with N for the
7924 named buffers just convert to the equivalent numbered and
7925 pretend they were called as the corresponding numbered buffer
7926 op. */
7927 /* don't initialize these in the declaration, it makes C++
7928 unhappy */
7929 const char *s;
7930 char type;
7931 re_fold_t folder;
7932 const U8 *fold_array;
7933 UV utf8_fold_flags;
7934
7935 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7936 folder = Perl_foldEQ_locale;
7937 fold_array = PL_fold_locale;
7938 type = REFFL;
7939 utf8_fold_flags = FOLDEQ_LOCALE;
7940 goto do_nref;
7941
7942 case REFFAN: /* /\g{name}/iaa */
7943 folder = Perl_foldEQ_latin1;
7944 fold_array = PL_fold_latin1;
7945 type = REFFA;
7946 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7947 goto do_nref;
7948
7949 case REFFUN: /* /\g{name}/iu */
7950 folder = Perl_foldEQ_latin1;
7951 fold_array = PL_fold_latin1;
7952 type = REFFU;
7953 utf8_fold_flags = 0;
7954 goto do_nref;
7955
7956 case REFFN: /* /\g{name}/i */
7957 folder = Perl_foldEQ;
7958 fold_array = PL_fold;
7959 type = REFF;
7960 utf8_fold_flags = 0;
7961 goto do_nref;
7962
7963 case REFN: /* /\g{name}/ */
7964 type = REF;
7965 folder = NULL;
7966 fold_array = NULL;
7967 utf8_fold_flags = 0;
7968 do_nref:
7969
7970 /* For the named back references, find the corresponding buffer
7971 * number */
7972 n = reg_check_named_buff_matched(rex,scan);
7973
7974 if ( ! n ) {
7975 sayNO;
7976 }
7977 goto do_nref_ref_common;
7978
7979 case REFFL: /* /\1/il */
7980 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7981 folder = Perl_foldEQ_locale;
7982 fold_array = PL_fold_locale;
7983 utf8_fold_flags = FOLDEQ_LOCALE;
7984 goto do_ref;
7985
7986 case REFFA: /* /\1/iaa */
7987 folder = Perl_foldEQ_latin1;
7988 fold_array = PL_fold_latin1;
7989 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7990 goto do_ref;
7991
7992 case REFFU: /* /\1/iu */
7993 folder = Perl_foldEQ_latin1;
7994 fold_array = PL_fold_latin1;
7995 utf8_fold_flags = 0;
7996 goto do_ref;
7997
7998 case REFF: /* /\1/i */
7999 folder = Perl_foldEQ;
8000 fold_array = PL_fold;
8001 utf8_fold_flags = 0;
8002 goto do_ref;
8003
8004 #undef ST
8005 #define ST st->u.backref
8006 case REF: /* /\1/ */
8007 folder = NULL;
8008 fold_array = NULL;
8009 utf8_fold_flags = 0;
8010
8011 do_ref:
8012 type = OP(scan);
8013 n = ARG1u(scan); /* which paren pair */
8014 if (rex->logical_to_parno) {
8015 n = rex->logical_to_parno[n];
8016 do {
8017 if ( RXp_LASTPAREN(rex) < n ||
8018 RXp_OFFS_START(rex,n) == -1 ||
8019 RXp_OFFS_END(rex,n) == -1
8020 ) {
8021 n = rex->parno_to_logical_next[n];
8022 }
8023 else {
8024 break;
8025 }
8026 } while(n);
8027
8028 if (!n) /* this means there is nothing that matched */
8029 sayNO;
8030 }
8031
8032 do_nref_ref_common:
8033 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8034 if (RXp_LASTPAREN(rex) < n)
8035 sayNO;
8036
8037 ln = RXp_OFFSp(rex)[n].start;
8038 endref = RXp_OFFSp(rex)[n].end;
8039 if (ln == -1 || endref == -1)
8040 sayNO; /* Do not match unless seen CLOSEn. */
8041
8042 if (ln == endref)
8043 goto ref_yes;
8044
8045 s = reginfo->strbeg + ln;
8046 if (type != REF /* REF can do byte comparison */
8047 && (utf8_target || type == REFFU || type == REFFL))
8048 {
8049 char * limit = loceol;
8050
8051 /* This call case insensitively compares the entire buffer
8052 * at s, with the current input starting at locinput, but
8053 * not going off the end given by loceol, and
8054 * returns in <limit> upon success, how much of the
8055 * current input was matched */
8056 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
8057 locinput, &limit, 0, utf8_target, utf8_fold_flags))
8058 {
8059 sayNO;
8060 }
8061 locinput = limit;
8062 goto ref_yes;
8063 }
8064
8065 /* Not utf8: Inline the first character, for speed. */
8066 if ( ! NEXTCHR_IS_EOS
8067 && locinput < loceol
8068 && UCHARAT(s) != nextbyte
8069 && ( type == REF
8070 || UCHARAT(s) != fold_array[nextbyte]))
8071 {
8072 sayNO;
8073 }
8074 ln = endref - ln;
8075 if (locinput + ln > loceol)
8076 sayNO;
8077 if (ln > 1 && (type == REF
8078 ? memNE(s, locinput, ln)
8079 : ! folder(aTHX_ locinput, s, ln)))
8080 sayNO;
8081 locinput += ln;
8082 }
8083 ref_yes:
8084 if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */
8085 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen);
8086 REGCP_SET(ST.lastcp);
8087 PUSH_STATE_GOTO(REF_next, next, locinput, loceol,
8088 script_run_begin);
8089 }
8090 break;
8091 NOT_REACHED; /* NOTREACHED */
8092
8093 case REF_next:
8094 sayYES;
8095 break;
8096
8097 case REF_next_fail:
8098 REGCP_UNWIND(ST.lastcp);
8099 regcppop(rex, &maxopenparen);
8100 sayNO;
8101 break;
8102
8103 case NOTHING: /* null op; e.g. the 'nothing' following
8104 * the '*' in m{(a+|b)*}' */
8105 break;
8106 case TAIL: /* placeholder while compiling (A|B|C) */
8107 break;
8108
8109 #undef ST
8110 #define ST st->u.eval
8111 #define CUR_EVAL cur_eval->u.eval
8112
8113 {
8114 SV *ret;
8115 REGEXP *re_sv;
8116 regexp *re;
8117 regexp_internal *rei;
8118 regnode *startpoint;
8119 U32 arg;
8120
8121 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
8122 arg = ARG1u(scan);
8123 if (cur_eval && cur_eval->locinput == locinput) {
8124 if ( ++nochange_depth > max_nochange_depth )
8125 Perl_croak(aTHX_
8126 "Pattern subroutine nesting without pos change"
8127 " exceeded limit in regex");
8128 } else {
8129 nochange_depth = 0;
8130 }
8131 re_sv = rex_sv;
8132 re = rex;
8133 rei = rexi;
8134 startpoint = scan + ARG2i(scan);
8135 EVAL_CLOSE_PAREN_SET( st, arg );
8136 /* Detect infinite recursion
8137 *
8138 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
8139 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
8140 * So we track the position in the string we are at each time
8141 * we recurse and if we try to enter the same routine twice from
8142 * the same position we throw an error.
8143 */
8144 if ( rex->recurse_locinput[arg] == locinput ) {
8145 /* FIXME: we should show the regop that is failing as part
8146 * of the error message. */
8147 Perl_croak(aTHX_ "Infinite recursion in regex");
8148 } else {
8149 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8150 rex->recurse_locinput[arg]= locinput;
8151
8152 DEBUG_r({
8153 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8154 DEBUG_STACK_r({
8155 Perl_re_exec_indentf( aTHX_
8156 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8157 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8158 );
8159 });
8160 });
8161 }
8162
8163 /* Save all the positions seen so far. */
8164 ST.cp = regcppush(rex, 0, maxopenparen);
8165 REGCP_SET(ST.lastcp);
8166
8167 /* and then jump to the code we share with EVAL */
8168 goto eval_recurse_doit;
8169 /* NOTREACHED */
8170
8171 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
8172 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8173 if ( ++nochange_depth > max_nochange_depth )
8174 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8175 } else {
8176 nochange_depth = 0;
8177 }
8178 {
8179 /* execute the code in the {...} */
8180
8181 dSP;
8182 IV before;
8183 OP * const oop = PL_op;
8184 COP * const ocurcop = PL_curcop;
8185 OP *nop;
8186 CV *newcv;
8187
8188 /* save *all* paren positions */
8189 ST.cp = regcppush(rex, 0, maxopenparen);
8190 REGCP_SET(ST.lastcp);
8191
8192 if (!caller_cv)
8193 caller_cv = find_runcv(NULL);
8194
8195 n = ARG1u(scan);
8196
8197 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8198 newcv = (ReANY(
8199 (REGEXP*)(rexi->data->data[n])
8200 ))->qr_anoncv;
8201 nop = (OP*)rexi->data->data[n+1];
8202 }
8203 else if (rexi->data->what[n] == 'l') { /* literal code */
8204 newcv = caller_cv;
8205 nop = (OP*)rexi->data->data[n];
8206 assert(CvDEPTH(newcv));
8207 }
8208 else {
8209 /* literal with own CV */
8210 assert(rexi->data->what[n] == 'L');
8211 newcv = rex->qr_anoncv;
8212 nop = (OP*)rexi->data->data[n];
8213 }
8214
8215 /* Some notes about MULTICALL and the context and save stacks.
8216 *
8217 * In something like
8218 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8219 * since codeblocks don't introduce a new scope (so that
8220 * local() etc accumulate), at the end of a successful
8221 * match there will be a SAVEt_CLEARSV on the savestack
8222 * for each of $x, $y, $z. If the three code blocks above
8223 * happen to have come from different CVs (e.g. via
8224 * embedded qr//s), then we must ensure that during any
8225 * savestack unwinding, PL_comppad always points to the
8226 * right pad at each moment. We achieve this by
8227 * interleaving SAVEt_COMPPAD's on the savestack whenever
8228 * there is a change of pad.
8229 * In theory whenever we call a code block, we should
8230 * push a CXt_SUB context, then pop it on return from
8231 * that code block. This causes a bit of an issue in that
8232 * normally popping a context also clears the savestack
8233 * back to cx->blk_oldsaveix, but here we specifically
8234 * don't want to clear the save stack on exit from the
8235 * code block.
8236 * Also for efficiency we don't want to keep pushing and
8237 * popping the single SUB context as we backtrack etc.
8238 * So instead, we push a single context the first time
8239 * we need, it, then hang onto it until the end of this
8240 * function. Whenever we encounter a new code block, we
8241 * update the CV etc if that's changed. During the times
8242 * in this function where we're not executing a code
8243 * block, having the SUB context still there is a bit
8244 * naughty - but we hope that no-one notices.
8245 * When the SUB context is initially pushed, we fake up
8246 * cx->blk_oldsaveix to be as if we'd pushed this context
8247 * on first entry to S_regmatch rather than at some random
8248 * point during the regexe execution. That way if we
8249 * croak, popping the context stack will ensure that
8250 * *everything* SAVEd by this function is undone and then
8251 * the context popped, rather than e.g., popping the
8252 * context (and restoring the original PL_comppad) then
8253 * popping more of the savestack and restoring a bad
8254 * PL_comppad.
8255 */
8256
8257 /* If this is the first EVAL, push a MULTICALL. On
8258 * subsequent calls, if we're executing a different CV, or
8259 * if PL_comppad has got messed up from backtracking
8260 * through SAVECOMPPADs, then refresh the context.
8261 */
8262 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8263 {
8264 U8 flags = (CXp_SUB_RE |
8265 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8266 SAVECOMPPAD();
8267 if (last_pushed_cv) {
8268 CHANGE_MULTICALL_FLAGS(newcv, flags);
8269 }
8270 else {
8271 PUSH_MULTICALL_FLAGS(newcv, flags);
8272 }
8273 /* see notes above */
8274 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8275
8276 last_pushed_cv = newcv;
8277 }
8278 else {
8279 /* these assignments are just to silence compiler
8280 * warnings */
8281 multicall_cop = NULL;
8282 }
8283 last_pad = PL_comppad;
8284
8285 /* the initial nextstate you would normally execute
8286 * at the start of an eval (which would cause error
8287 * messages to come from the eval), may be optimised
8288 * away from the execution path in the regex code blocks;
8289 * so manually set PL_curcop to it initially */
8290 {
8291 OP *o = cUNOPx(nop)->op_first;
8292 assert(o->op_type == OP_NULL);
8293 if (o->op_targ == OP_SCOPE) {
8294 o = cUNOPo->op_first;
8295 }
8296 else {
8297 assert(o->op_targ == OP_LEAVE);
8298 o = cUNOPo->op_first;
8299 assert(o->op_type == OP_ENTER);
8300 o = OpSIBLING(o);
8301 }
8302
8303 if (o->op_type != OP_STUB) {
8304 assert( o->op_type == OP_NEXTSTATE
8305 || o->op_type == OP_DBSTATE
8306 || (o->op_type == OP_NULL
8307 && ( o->op_targ == OP_NEXTSTATE
8308 || o->op_targ == OP_DBSTATE
8309 )
8310 )
8311 );
8312 PL_curcop = (COP*)o;
8313 }
8314 }
8315 nop = nop->op_next;
8316
8317 DEBUG_STATE_r( Perl_re_printf( aTHX_
8318 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8319
8320 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg;
8321 if (reginfo->info_aux_eval->pos_magic)
8322 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8323 reginfo->sv, reginfo->strbeg,
8324 locinput - reginfo->strbeg);
8325
8326 if (sv_yes_mark) {
8327 SV *sv_mrk = get_sv("REGMARK", 1);
8328 sv_setsv(sv_mrk, sv_yes_mark);
8329 }
8330
8331 /* we don't use MULTICALL here as we want to call the
8332 * first op of the block of interest, rather than the
8333 * first op of the sub. Also, we don't want to free
8334 * the savestack frame */
8335 before = (IV)(SP-PL_stack_base);
8336 PL_op = nop;
8337 CALLRUNOPS(aTHX); /* Scalar context. */
8338 SPAGAIN;
8339 if ((IV)(SP-PL_stack_base) == before)
8340 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8341 else {
8342 ret = POPs;
8343 PUTBACK;
8344 }
8345
8346 /* before restoring everything, evaluate the returned
8347 * value, so that 'uninit' warnings don't use the wrong
8348 * PL_op or pad. Also need to process any magic vars
8349 * (e.g. $1) *before* parentheses are restored */
8350
8351 PL_op = NULL;
8352
8353 re_sv = NULL;
8354 if (logical == 0) { /* /(?{ ... })/ and /(*{ ... })/ */
8355 SV *replsv = save_scalar(PL_replgv);
8356 sv_setsv(replsv, ret); /* $^R */
8357 SvSETMAGIC(replsv);
8358 }
8359 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8360 sw = cBOOL(SvTRUE_NN(ret));
8361 logical = 0;
8362 }
8363 else { /* /(??{ ... }) */
8364 /* if its overloaded, let the regex compiler handle
8365 * it; otherwise extract regex, or stringify */
8366 if (SvGMAGICAL(ret))
8367 ret = sv_mortalcopy(ret);
8368 if (!SvAMAGIC(ret)) {
8369 SV *sv = ret;
8370 if (SvROK(sv))
8371 sv = SvRV(sv);
8372 if (SvTYPE(sv) == SVt_REGEXP)
8373 re_sv = (REGEXP*) sv;
8374 else if (SvSMAGICAL(ret)) {
8375 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8376 if (mg)
8377 re_sv = (REGEXP *) mg->mg_obj;
8378 }
8379
8380 /* force any undef warnings here */
8381 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8382 ret = sv_mortalcopy(ret);
8383 (void) SvPV_force_nolen(ret);
8384 }
8385 }
8386
8387 }
8388
8389 /* *** Note that at this point we don't restore
8390 * PL_comppad, (or pop the CxSUB) on the assumption it may
8391 * be used again soon. This is safe as long as nothing
8392 * in the regexp code uses the pad ! */
8393 PL_op = oop;
8394 PL_curcop = ocurcop;
8395 regcp_restore(rex, ST.lastcp, &maxopenparen);
8396 PL_curpm_under = PL_curpm;
8397 PL_curpm = PL_reg_curpm;
8398
8399 if (logical != 2) {
8400 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8401 script_run_begin);
8402 /* NOTREACHED */
8403 }
8404 }
8405
8406 /* only /(??{ ... })/ from now on */
8407 logical = 0;
8408 {
8409 /* extract RE object from returned value; compiling if
8410 * necessary */
8411
8412 if (re_sv) {
8413 re_sv = reg_temp_copy(NULL, re_sv);
8414 }
8415 else {
8416 U32 pm_flags = 0;
8417
8418 if (SvUTF8(ret) && IN_BYTES) {
8419 /* In use 'bytes': make a copy of the octet
8420 * sequence, but without the flag on */
8421 STRLEN len;
8422 const char *const p = SvPV(ret, len);
8423 ret = newSVpvn_flags(p, len, SVs_TEMP);
8424 }
8425 if (rex->intflags & PREGf_USE_RE_EVAL)
8426 pm_flags |= PMf_USE_RE_EVAL;
8427
8428 /* if we got here, it should be an engine which
8429 * supports compiling code blocks and stuff */
8430 assert(rex->engine && rex->engine->op_comp);
8431 assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME));
8432 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8433 rex->engine, NULL, NULL,
8434 /* copy /msixn etc to inner pattern */
8435 ARG2i(scan),
8436 pm_flags);
8437
8438 if (!(SvFLAGS(ret)
8439 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8440 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8441 /* This isn't a first class regexp. Instead, it's
8442 caching a regexp onto an existing, Perl visible
8443 scalar. */
8444 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8445 }
8446 }
8447 SAVEFREESV(re_sv);
8448 re = ReANY(re_sv);
8449 }
8450 RXp_MATCH_COPIED_off(re);
8451 RXp_SUBBEG(re) = RXp_SUBBEG(rex);
8452 RXp_SUBLEN(re) = RXp_SUBLEN(rex);
8453 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex);
8454 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex);
8455 RXp_LASTPAREN(re) = 0;
8456 RXp_LASTCLOSEPAREN(re) = 0;
8457 rei = RXi_GET(re);
8458 DEBUG_EXECUTE_r(
8459 debug_start_match(re_sv, utf8_target, locinput,
8460 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8461 );
8462 startpoint = rei->program + 1;
8463 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8464 * close_paren only for GOSUB */
8465 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8466
8467 /* note we saved the paren state earlier:
8468 ST.cp = regcppush(rex, 0, maxopenparen);
8469 REGCP_SET(ST.lastcp);
8470 */
8471 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8472 maxopenparen = 0;
8473 /* run the pattern returned from (??{...}) */
8474
8475 eval_recurse_doit: /* Share code with GOSUB below this line
8476 * At this point we expect the stack context to be
8477 * set up correctly */
8478
8479 /* invalidate the S-L poscache. We're now executing a
8480 * different set of WHILEM ops (and their associated
8481 * indexes) against the same string, so the bits in the
8482 * cache are meaningless. Setting maxiter to zero forces
8483 * the cache to be invalidated and zeroed before reuse.
8484 * XXX This is too dramatic a measure. Ideally we should
8485 * save the old cache and restore when running the outer
8486 * pattern again */
8487 reginfo->poscache_maxiter = 0;
8488
8489 /* the new regexp might have a different is_utf8_pat than we do */
8490 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8491
8492 ST.prev_rex = rex_sv;
8493 ST.prev_curlyx = cur_curlyx;
8494 rex_sv = re_sv;
8495 SET_reg_curpm(rex_sv);
8496 rex = re;
8497 rexi = rei;
8498 cur_curlyx = NULL;
8499 ST.B = next;
8500 ST.prev_eval = cur_eval;
8501 cur_eval = st;
8502 /* now continue from first node in postoned RE */
8503 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8504 loceol, script_run_begin);
8505 NOT_REACHED; /* NOTREACHED */
8506 }
8507
8508 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8509 /* note: this is called twice; first after popping B, then A */
8510 DEBUG_STACK_r({
8511 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8512 depth, cur_eval, ST.prev_eval);
8513 });
8514
8515 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8516 if ( cur_eval && CUR_EVAL.close_paren ) {\
8517 DEBUG_STACK_r({ \
8518 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8519 depth, \
8520 CUR_EVAL.close_paren - 1,\
8521 cur_eval, \
8522 VAL); \
8523 }); \
8524 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8525 }
8526
8527 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8528
8529 rex_sv = ST.prev_rex;
8530 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8531 SET_reg_curpm(rex_sv);
8532 rex = ReANY(rex_sv);
8533 rexi = RXi_GET(rex);
8534 {
8535 /* preserve $^R across LEAVE's. See Bug 121070. */
8536 SV *save_sv= GvSV(PL_replgv);
8537 SV *replsv;
8538 SvREFCNT_inc(save_sv);
8539 regcpblow(ST.cp); /* LEAVE in disguise */
8540 /* don't move this initialization up */
8541 replsv = GvSV(PL_replgv);
8542 sv_setsv(replsv, save_sv);
8543 SvSETMAGIC(replsv);
8544 SvREFCNT_dec(save_sv);
8545 }
8546 cur_eval = ST.prev_eval;
8547 cur_curlyx = ST.prev_curlyx;
8548
8549 /* Invalidate cache. See "invalidate" comment above. */
8550 reginfo->poscache_maxiter = 0;
8551 if ( nochange_depth )
8552 nochange_depth--;
8553
8554 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8555 sayYES;
8556
8557
8558 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8559 REGCP_UNWIND(ST.lastcp);
8560 regcppop(rex, &maxopenparen);
8561 sayNO;
8562
8563 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8564 /* note: this is called twice; first after popping B, then A */
8565 DEBUG_STACK_r({
8566 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8567 depth, cur_eval, ST.prev_eval);
8568 });
8569
8570 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8571
8572 rex_sv = ST.prev_rex;
8573 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8574 SET_reg_curpm(rex_sv);
8575 rex = ReANY(rex_sv);
8576 rexi = RXi_GET(rex);
8577
8578 REGCP_UNWIND(ST.lastcp);
8579 regcppop(rex, &maxopenparen);
8580 cur_eval = ST.prev_eval;
8581 cur_curlyx = ST.prev_curlyx;
8582
8583 /* Invalidate cache. See "invalidate" comment above. */
8584 reginfo->poscache_maxiter = 0;
8585 if ( nochange_depth )
8586 nochange_depth--;
8587
8588 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8589 sayNO_SILENT;
8590 #undef ST
8591
8592 case OPEN: /* ( */
8593 n = PARNO(scan); /* which paren pair */
8594 RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg;
8595 if (n > maxopenparen)
8596 maxopenparen = n;
8597 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8598 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8599 depth,
8600 PTR2UV(rex),
8601 PTR2UV(RXp_OFFSp(rex)),
8602 (UV)n,
8603 (IV)RXp_OFFSp(rex)[n].start_tmp,
8604 (UV)maxopenparen
8605 ));
8606 lastopen = n;
8607 break;
8608
8609 case SROPEN: /* (*SCRIPT_RUN: */
8610 script_run_begin = (U8 *) locinput;
8611 break;
8612
8613
8614 case CLOSE: /* ) */
8615 n = PARNO(scan); /* which paren pair */
8616 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8617 locinput - reginfo->strbeg);
8618 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8619 goto fake_end;
8620
8621 break;
8622
8623 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8624
8625 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8626 {
8627 sayNO;
8628 }
8629
8630 break;
8631
8632
8633 case ACCEPT: /* (*ACCEPT) */
8634 is_accepted = true;
8635 if (FLAGS(scan))
8636 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
8637 utmp = ARG2u(scan);
8638
8639 if ( utmp ) {
8640 regnode *cursor;
8641 for (
8642 cursor = scan;
8643 cursor && ( OP(cursor) != END );
8644 cursor = (
8645 REGNODE_TYPE( OP(cursor) ) == END
8646 || REGNODE_TYPE( OP(cursor) ) == WHILEM
8647 )
8648 ? REGNODE_AFTER(cursor)
8649 : regnext(cursor)
8650 ){
8651 if ( OP(cursor) != CLOSE )
8652 continue;
8653
8654 n = PARNO(cursor);
8655
8656 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8657 continue; /* so skip this one */
8658
8659 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp,
8660 locinput - reginfo->strbeg);
8661
8662 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8663 break;
8664 }
8665 }
8666 goto fake_end;
8667 /* NOTREACHED */
8668
8669 case GROUPP: /* (?(1)) */
8670 n = ARG1u(scan); /* which paren pair */
8671 sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1);
8672 break;
8673
8674 case GROUPPN: /* (?(<name>)) */
8675 /* reg_check_named_buff_matched returns 0 for no match */
8676 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8677 break;
8678
8679 case INSUBP: /* (?(R)) */
8680 n = ARG1u(scan);
8681 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8682 * of SCAN is already set up as matches a eval.close_paren */
8683 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8684 break;
8685
8686 case DEFINEP: /* (?(DEFINE)) */
8687 sw = 0;
8688 break;
8689
8690 case IFTHEN: /* (?(cond)A|B) */
8691 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8692 if (sw)
8693 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8694 else {
8695 next = scan + ARG1u(scan);
8696 if (OP(next) == IFTHEN) /* Fake one. */
8697 next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8698 }
8699 break;
8700
8701 case LOGICAL: /* modifier for EVAL and IFMATCH */
8702 logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */
8703 break;
8704
8705 /*******************************************************************
8706
8707 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8708 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8709 STAR/PLUS/CURLY/CURLYN are used instead.)
8710
8711 A*B is compiled as <CURLYX><A><WHILEM><B>
8712
8713 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8714 state, which contains the current count, initialised to -1. It also sets
8715 cur_curlyx to point to this state, with any previous value saved in the
8716 state block.
8717
8718 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8719 since the pattern may possibly match zero times (i.e. it's a while {} loop
8720 rather than a do {} while loop).
8721
8722 Each entry to WHILEM represents a successful match of A. The count in the
8723 CURLYX block is incremented, another WHILEM state is pushed, and execution
8724 passes to A or B depending on greediness and the current count.
8725
8726 For example, if matching against the string a1a2a3b (where the aN are
8727 substrings that match /A/), then the match progresses as follows: (the
8728 pushed states are interspersed with the bits of strings matched so far):
8729
8730 <CURLYX cnt=-1>
8731 <CURLYX cnt=0><WHILEM>
8732 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8733 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8734 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8735 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8736
8737 (Contrast this with something like CURLYM, which maintains only a single
8738 backtrack state:
8739
8740 <CURLYM cnt=0> a1
8741 a1 <CURLYM cnt=1> a2
8742 a1 a2 <CURLYM cnt=2> a3
8743 a1 a2 a3 <CURLYM cnt=3> b
8744 )
8745
8746 Each WHILEM state block marks a point to backtrack to upon partial failure
8747 of A or B, and also contains some minor state data related to that
8748 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8749 overall state, such as the count, and pointers to the A and B ops.
8750
8751 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8752 must always point to the *current* CURLYX block, the rules are:
8753
8754 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8755 and set cur_curlyx to point the new block.
8756
8757 When popping the CURLYX block after a successful or unsuccessful match,
8758 restore the previous cur_curlyx.
8759
8760 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8761 to the outer one saved in the CURLYX block.
8762
8763 When popping the WHILEM block after a successful or unsuccessful B match,
8764 restore the previous cur_curlyx.
8765
8766 Here's an example for the pattern (AI* BI)*BO
8767 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8768
8769 cur_
8770 curlyx backtrack stack
8771 ------ ---------------
8772 NULL
8773 CO <CO prev=NULL> <WO>
8774 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8775 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8776 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8777
8778 At this point the pattern succeeds, and we work back down the stack to
8779 clean up, restoring as we go:
8780
8781 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8782 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8783 CO <CO prev=NULL> <WO>
8784 NULL
8785
8786 *******************************************************************/
8787
8788 #define ST st->u.curlyx
8789
8790 case CURLYX: /* start of /A*B/ (for complex A) */
8791 {
8792 /* No need to save/restore up to this paren */
8793 I32 parenfloor = FLAGS(scan);
8794
8795 assert(next); /* keep Coverity happy */
8796 if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8797 next += ARG1u(next);
8798
8799 /* XXXX Probably it is better to teach regpush to support
8800 parenfloor > maxopenparen ... */
8801 if (parenfloor > (I32)RXp_LASTPAREN(rex))
8802 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */
8803
8804 ST.prev_curlyx= cur_curlyx;
8805 cur_curlyx = st;
8806 ST.cp = PL_savestack_ix;
8807
8808 /* these fields contain the state of the current curly.
8809 * they are accessed by subsequent WHILEMs */
8810 ST.parenfloor = parenfloor;
8811 ST.me = scan;
8812 ST.B = next;
8813 ST.minmod = minmod;
8814 minmod = 0;
8815 ST.count = -1; /* this will be updated by WHILEM */
8816 ST.lastloc = NULL; /* this will be updated by WHILEM */
8817
8818 PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8819 script_run_begin);
8820 NOT_REACHED; /* NOTREACHED */
8821 }
8822
8823 case CURLYX_end: /* just finished matching all of A*B */
8824 cur_curlyx = ST.prev_curlyx;
8825 sayYES;
8826 NOT_REACHED; /* NOTREACHED */
8827
8828 case CURLYX_end_fail: /* just failed to match all of A*B */
8829 regcpblow(ST.cp);
8830 cur_curlyx = ST.prev_curlyx;
8831 sayNO;
8832 NOT_REACHED; /* NOTREACHED */
8833
8834
8835 #undef ST
8836 #define ST st->u.whilem
8837
8838 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8839 {
8840 /* see the discussion above about CURLYX/WHILEM */
8841 I32 n;
8842 int min, max;
8843 /* U16 first_paren, last_paren; */
8844 regnode *A;
8845
8846 assert(cur_curlyx); /* keep Coverity happy */
8847
8848 min = ARG1i(cur_curlyx->u.curlyx.me);
8849 max = ARG2i(cur_curlyx->u.curlyx.me);
8850 /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */
8851 /* last_paren = ARG3b(cur_curlyx->u.curlyx.me); */
8852 A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8853 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8854 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8855 ST.cache_offset = 0;
8856 ST.cache_mask = 0;
8857
8858 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8859 depth, (long)n, min, max)
8860 );
8861
8862 /* First just match a string of min A's. */
8863 if (n < min) {
8864 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8865 cur_curlyx->u.curlyx.lastloc = locinput;
8866 REGCP_SET(ST.lastcp);
8867
8868 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8869 script_run_begin);
8870 NOT_REACHED; /* NOTREACHED */
8871 }
8872
8873 /* If degenerate A matches "", assume A done. */
8874
8875 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8876 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8877 depth)
8878 );
8879 goto do_whilem_B_max;
8880 }
8881
8882 /* super-linear cache processing.
8883 *
8884 * The idea here is that for certain types of CURLYX/WHILEM -
8885 * principally those whose upper bound is infinity (and
8886 * excluding regexes that have things like \1 and other very
8887 * non-regular expressiony things), then if a pattern like
8888 * /....A*.../ fails and we backtrack to the WHILEM, then we
8889 * make a note that this particular WHILEM op was at string
8890 * position 47 (say) when the rest of pattern failed. Then, if
8891 * we ever find ourselves back at that WHILEM, and at string
8892 * position 47 again, we can just fail immediately rather than
8893 * running the rest of the pattern again.
8894 *
8895 * This is very handy when patterns start to go
8896 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8897 * with a combinatorial explosion of backtracking.
8898 *
8899 * The cache is implemented as a bit array, with one bit per
8900 * string byte position per WHILEM op (up to 16) - so its
8901 * between 0.25 and 2x the string size.
8902 *
8903 * To avoid allocating a poscache buffer every time, we do an
8904 * initially countdown; only after we have executed a WHILEM
8905 * op (string-length x #WHILEMs) times do we allocate the
8906 * cache.
8907 *
8908 * The top 4 bits of FLAGS(scan) byte say how many different
8909 * relevant CURLLYX/WHILEM op pairs there are, while the
8910 * bottom 4-bits is the identifying index number of this
8911 * WHILEM.
8912 */
8913
8914 if (FLAGS(scan)) {
8915
8916 if (!reginfo->poscache_maxiter) {
8917 /* start the countdown: Postpone detection until we
8918 * know the match is not *that* much linear. */
8919 reginfo->poscache_maxiter
8920 = (reginfo->strend - reginfo->strbeg + 1)
8921 * (FLAGS(scan)>>4);
8922 /* possible overflow for long strings and many CURLYX's */
8923 if (reginfo->poscache_maxiter < 0)
8924 reginfo->poscache_maxiter = I32_MAX;
8925 reginfo->poscache_iter = reginfo->poscache_maxiter;
8926 }
8927
8928 if (reginfo->poscache_iter-- == 0) {
8929 /* initialise cache */
8930 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8931 regmatch_info_aux *const aux = reginfo->info_aux;
8932 if (aux->poscache) {
8933 if ((SSize_t)reginfo->poscache_size < size) {
8934 Renew(aux->poscache, size, char);
8935 reginfo->poscache_size = size;
8936 }
8937 Zero(aux->poscache, size, char);
8938 }
8939 else {
8940 reginfo->poscache_size = size;
8941 Newxz(aux->poscache, size, char);
8942 }
8943 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8944 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8945 PL_colors[4], PL_colors[5])
8946 );
8947 }
8948
8949 if (reginfo->poscache_iter < 0) {
8950 /* have we already failed at this position? */
8951 SSize_t offset, mask;
8952
8953 reginfo->poscache_iter = -1; /* stop eventual underflow */
8954 offset = (FLAGS(scan) & 0xf) - 1
8955 + (locinput - reginfo->strbeg)
8956 * (FLAGS(scan)>>4);
8957 mask = 1 << (offset % 8);
8958 offset /= 8;
8959 if (reginfo->info_aux->poscache[offset] & mask) {
8960 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8961 depth)
8962 );
8963 cur_curlyx->u.curlyx.count--;
8964 sayNO; /* cache records failure */
8965 }
8966 ST.cache_offset = offset;
8967 ST.cache_mask = mask;
8968 }
8969 }
8970
8971 /* Prefer B over A for minimal matching. */
8972
8973 if (cur_curlyx->u.curlyx.minmod) {
8974 ST.save_curlyx = cur_curlyx;
8975 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8976 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8977 locinput, loceol, script_run_begin);
8978 NOT_REACHED; /* NOTREACHED */
8979 }
8980
8981 /* Prefer A over B for maximal matching. */
8982
8983 if (n < max) { /* More greed allowed? */
8984 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8985 maxopenparen);
8986 cur_curlyx->u.curlyx.lastloc = locinput;
8987 REGCP_SET(ST.lastcp);
8988 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8989 script_run_begin);
8990 NOT_REACHED; /* NOTREACHED */
8991 }
8992 goto do_whilem_B_max;
8993 }
8994 NOT_REACHED; /* NOTREACHED */
8995
8996 case WHILEM_B_min: /* just matched B in a minimal match */
8997 case WHILEM_B_max: /* just matched B in a maximal match */
8998 cur_curlyx = ST.save_curlyx;
8999 sayYES;
9000 NOT_REACHED; /* NOTREACHED */
9001
9002 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
9003 cur_curlyx = ST.save_curlyx;
9004 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9005 cur_curlyx->u.curlyx.count--;
9006 CACHEsayNO;
9007 NOT_REACHED; /* NOTREACHED */
9008
9009 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
9010 /* FALLTHROUGH */
9011 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
9012 REGCP_UNWIND(ST.lastcp);
9013 regcppop(rex, &maxopenparen);
9014 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
9015 cur_curlyx->u.curlyx.count--;
9016 CACHEsayNO;
9017 NOT_REACHED; /* NOTREACHED */
9018
9019 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
9020 REGCP_UNWIND(ST.lastcp);
9021 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9022 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
9023 depth)
9024 );
9025
9026 do_whilem_B_max:
9027 /* now try B */
9028 ST.save_curlyx = cur_curlyx;
9029 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
9030 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
9031 locinput, loceol, script_run_begin);
9032 NOT_REACHED; /* NOTREACHED */
9033
9034 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
9035 cur_curlyx = ST.save_curlyx;
9036
9037 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) {
9038 /* Maximum greed exceeded */
9039 cur_curlyx->u.curlyx.count--;
9040 CACHEsayNO;
9041 }
9042
9043 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
9044 );
9045 /* Try grabbing another A and see if it helps. */
9046 cur_curlyx->u.curlyx.lastloc = locinput;
9047 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
9048 REGCP_SET(ST.lastcp);
9049 PUSH_STATE_GOTO(WHILEM_A_min,
9050 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
9051 locinput, loceol, script_run_begin);
9052 NOT_REACHED; /* NOTREACHED */
9053
9054 #undef ST
9055 #define ST st->u.branch
9056
9057 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
9058 next = scan + ARG1u(scan);
9059 if (next == scan)
9060 next = NULL;
9061 ST.before_paren = ARG2a(scan);
9062 ST.after_paren = ARG2b(scan);
9063 goto branch_logic;
9064 NOT_REACHED; /* NOTREACHED */
9065
9066 case BRANCH: /* /(...|A|...)/ */
9067 ST.before_paren = ARG1a(scan);
9068 ST.after_paren = ARG1b(scan);
9069 branch_logic:
9070 scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
9071 assert(scan);
9072 ST.lastparen = RXp_LASTPAREN(rex);
9073 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9074 ST.next_branch = next;
9075 REGCP_SET(ST.cp);
9076 if (RE_PESSIMISTIC_PARENS) {
9077 regcppush(rex, 0, maxopenparen);
9078 REGCP_SET(ST.lastcp);
9079 }
9080
9081 /* Now go into the branch */
9082 if (has_cutgroup) {
9083 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9084 script_run_begin);
9085 } else {
9086 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
9087 script_run_begin);
9088 }
9089 NOT_REACHED; /* NOTREACHED */
9090
9091 case CUTGROUP: /* /(*THEN)/ */
9092 sv_yes_mark = st->u.mark.mark_name = FLAGS(scan)
9093 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ])
9094 : NULL;
9095 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
9096 script_run_begin);
9097 NOT_REACHED; /* NOTREACHED */
9098
9099 case CUTGROUP_next_fail:
9100 do_cutgroup = 1;
9101 no_final = 1;
9102 if (st->u.mark.mark_name)
9103 sv_commit = st->u.mark.mark_name;
9104 sayNO;
9105 NOT_REACHED; /* NOTREACHED */
9106
9107 case BRANCH_next:
9108 sayYES;
9109 NOT_REACHED; /* NOTREACHED */
9110
9111 case BRANCH_next_fail: /* that branch failed; try the next, if any */
9112 if (do_cutgroup) {
9113 do_cutgroup = 0;
9114 no_final = 0;
9115 }
9116 if (RE_PESSIMISTIC_PARENS) {
9117 REGCP_UNWIND(ST.lastcp);
9118 regcppop(rex,&maxopenparen);
9119 }
9120 REGCP_UNWIND(ST.cp);
9121 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9122 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail");
9123 scan = ST.next_branch;
9124 /* no more branches? */
9125 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
9126 DEBUG_EXECUTE_r({
9127 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
9128 depth,
9129 PL_colors[4],
9130 PL_colors[5] );
9131 });
9132 sayNO_SILENT;
9133 }
9134 continue; /* execute next BRANCH[J] op */
9135 /* NOTREACHED */
9136
9137 case MINMOD: /* next op will be non-greedy, e.g. A*? */
9138 minmod = 1;
9139 break;
9140
9141 #undef ST
9142 #define ST st->u.curlym
9143
9144 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
9145
9146 /* This is an optimisation of CURLYX that enables us to push
9147 * only a single backtracking state, no matter how many matches
9148 * there are in {m,n}. It relies on the pattern being constant
9149 * length, with no parens to influence future backrefs
9150 */
9151
9152 ST.me = scan;
9153 scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
9154
9155 ST.lastparen = RXp_LASTPAREN(rex);
9156 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9157
9158 /* if paren positive, emulate an OPEN/CLOSE around A */
9159 if (FLAGS(ST.me)) {
9160 U32 paren = FLAGS(ST.me);
9161 lastopen = paren;
9162 if (paren > maxopenparen)
9163 maxopenparen = paren;
9164 scan += NEXT_OFF(scan); /* Skip former OPEN. */
9165 }
9166 ST.A = scan;
9167 ST.B = next;
9168 ST.alen = 0;
9169 ST.count = 0;
9170 ST.minmod = minmod;
9171 minmod = 0;
9172 ST.Binfo.count = -1;
9173 REGCP_SET(ST.cp);
9174
9175 if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */
9176 goto curlym_do_B;
9177
9178 curlym_do_A: /* execute the A in /A{m,n}B/ */
9179 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9180 script_run_begin);
9181 NOT_REACHED; /* NOTREACHED */
9182
9183 case CURLYM_A: /* we've just matched an A */
9184 ST.count++;
9185 /* after first match, determine A's length: u.curlym.alen */
9186 if (ST.count == 1) {
9187 if (reginfo->is_utf8_target) {
9188 char *s = st->locinput;
9189 while (s < locinput) {
9190 ST.alen++;
9191 s += UTF8SKIP(s);
9192 }
9193 }
9194 else {
9195 ST.alen = locinput - st->locinput;
9196 }
9197 if (ST.alen == 0)
9198 ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me);
9199 }
9200 DEBUG_EXECUTE_r(
9201 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9202 depth, (IV) ST.count, (IV)ST.alen)
9203 );
9204
9205 if (FLAGS(ST.me)) {
9206 /* emulate CLOSE: mark current A as captured */
9207 U32 paren = (U32)FLAGS(ST.me);
9208 CLOSE_CAPTURE(rex, paren,
9209 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9210 locinput - reginfo->strbeg);
9211 }
9212
9213 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9214 goto fake_end;
9215
9216
9217 if (!is_accepted) {
9218 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me));
9219 if ( max == REG_INFTY || ST.count < max )
9220 goto curlym_do_A; /* try to match another A */
9221 }
9222 goto curlym_do_B; /* try to match B */
9223
9224 case CURLYM_A_fail: /* just failed to match an A */
9225 REGCP_UNWIND(ST.cp);
9226
9227
9228 if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/
9229 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9230 sayNO;
9231
9232 curlym_do_B: /* execute the B in /A{m,n}B/ */
9233 if (is_accepted)
9234 goto curlym_close_B;
9235
9236 if (ST.Binfo.count < 0) {
9237 /* calculate possible match of 1st char following curly */
9238 assert(ST.B);
9239 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9240 regnode *text_node = ST.B;
9241 if (! HAS_TEXT(text_node))
9242 FIND_NEXT_IMPT(text_node);
9243 if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9244 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9245 &ST.Binfo, reginfo))
9246 {
9247 sayNO;
9248 }
9249 }
9250 }
9251 }
9252
9253 DEBUG_EXECUTE_r(
9254 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
9255 depth, (IV)ST.count)
9256 );
9257 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9258 assert(ST.Binfo.count > 0);
9259
9260 /* Do a quick test to hopefully rule out most non-matches */
9261 if ( locinput + ST.Binfo.min_length > loceol
9262 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9263 {
9264 DEBUG_OPTIMISE_r(
9265 Perl_re_exec_indentf( aTHX_
9266 "CURLYM Fast bail next target=0x%X anded==0x%X"
9267 " mask=0x%X\n",
9268 depth,
9269 (int) nextbyte, ST.Binfo.first_byte_anded,
9270 ST.Binfo.first_byte_mask)
9271 );
9272 state_num = CURLYM_B_fail;
9273 goto reenter_switch;
9274 }
9275 }
9276
9277 curlym_close_B:
9278 if (FLAGS(ST.me)) {
9279 /* emulate CLOSE: mark current A as captured */
9280 U32 paren = (U32)FLAGS(ST.me);
9281 if (ST.count || is_accepted) {
9282 CLOSE_CAPTURE(rex, paren,
9283 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9284 locinput - reginfo->strbeg);
9285 }
9286 else
9287 RXp_OFFSp(rex)[paren].end = -1;
9288
9289 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me)))
9290 {
9291 if (ST.count || is_accepted)
9292 goto fake_end;
9293 else
9294 sayNO;
9295 }
9296 }
9297
9298 if (is_accepted)
9299 goto fake_end;
9300
9301 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9302 script_run_begin);
9303 NOT_REACHED; /* NOTREACHED */
9304
9305 case CURLYM_B_fail: /* just failed to match a B */
9306 REGCP_UNWIND(ST.cp);
9307 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9308 if (ST.minmod) {
9309 I32 max = ARG2i(ST.me);
9310 if (max != REG_INFTY && ST.count == max)
9311 sayNO;
9312 goto curlym_do_A; /* try to match a further A */
9313 }
9314 /* backtrack one A */
9315 if (ST.count == ARG1i(ST.me) /* min */)
9316 sayNO;
9317 ST.count--;
9318 SET_locinput(HOPc(locinput, -ST.alen));
9319 goto curlym_do_B; /* try to match B */
9320
9321 #undef ST
9322 #define ST st->u.curly
9323
9324 #define CURLY_SETPAREN(paren, success) \
9325 if (paren) { \
9326 if (success) { \
9327 CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \
9328 locinput - reginfo->strbeg); \
9329 } \
9330 else { \
9331 RXp_OFFSp(rex)[paren].end = -1; \
9332 RXp_LASTPAREN(rex) = ST.lastparen; \
9333 RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen; \
9334 } \
9335 }
9336
9337 case STAR: /* /A*B/ where A is width 1 char */
9338 ST.paren = 0;
9339 ST.min = 0;
9340 ST.max = REG_INFTY;
9341 scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9342 goto repeat;
9343
9344 case PLUS: /* /A+B/ where A is width 1 char */
9345 ST.paren = 0;
9346 ST.min = 1;
9347 ST.max = REG_INFTY;
9348 scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9349 goto repeat;
9350
9351 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9352 ST.paren = FLAGS(scan); /* Which paren to set */
9353 ST.lastparen = RXp_LASTPAREN(rex);
9354 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex);
9355 if (ST.paren > maxopenparen)
9356 maxopenparen = ST.paren;
9357 ST.min = ARG1i(scan); /* min to match */
9358 ST.max = ARG2i(scan); /* max to match */
9359 scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9360
9361 /* handle the single-char capture called as a GOSUB etc */
9362 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9363 {
9364 char *li = locinput;
9365 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9366 sayNO;
9367 SET_locinput(li);
9368 goto fake_end;
9369 }
9370
9371 goto repeat;
9372
9373 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9374 ST.paren = 0;
9375 ST.min = ARG1i(scan); /* min to match */
9376 ST.max = ARG2i(scan); /* max to match */
9377 scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9378 repeat:
9379 /*
9380 * Lookahead to avoid useless match attempts
9381 * when we know what character comes next.
9382 *
9383 * Used to only do .*x and .*?x, but now it allows
9384 * for )'s, ('s and (?{ ... })'s to be in the way
9385 * of the quantifier and the EXACT-like node. -- japhy
9386 */
9387
9388 assert(ST.min <= ST.max);
9389 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9390 ST.Binfo.count = 0;
9391 }
9392 else {
9393 regnode *text_node = next;
9394
9395 if (! HAS_TEXT(text_node))
9396 FIND_NEXT_IMPT(text_node);
9397
9398 if (! HAS_TEXT(text_node))
9399 ST.Binfo.count = 0;
9400 else {
9401 if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9402 ST.Binfo.count = 0;
9403 }
9404 else {
9405 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9406 &ST.Binfo, reginfo))
9407 {
9408 sayNO;
9409 }
9410 }
9411 }
9412 }
9413
9414 ST.A = scan;
9415 ST.B = next;
9416 if (minmod) {
9417 char *li = locinput;
9418 minmod = 0;
9419 if (ST.min &&
9420 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9421 < ST.min)
9422 sayNO;
9423 SET_locinput(li);
9424 ST.count = ST.min;
9425 REGCP_SET(ST.cp);
9426
9427 if (ST.Binfo.count <= 0)
9428 goto curly_try_B_min;
9429
9430 ST.oldloc = locinput;
9431
9432 /* set ST.maxpos to the furthest point along the
9433 * string that could possibly match, i.e., that a match could
9434 * start at. */
9435 if (ST.max == REG_INFTY) {
9436 ST.maxpos = loceol - 1;
9437 if (utf8_target)
9438 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9439 ST.maxpos--;
9440 }
9441 else if (utf8_target) {
9442 int m = ST.max - ST.min;
9443 for (ST.maxpos = locinput;
9444 m >0 && ST.maxpos < loceol; m--)
9445 ST.maxpos += UTF8SKIP(ST.maxpos);
9446 }
9447 else {
9448 ST.maxpos = locinput + ST.max - ST.min;
9449 if (ST.maxpos >= loceol)
9450 ST.maxpos = loceol - 1;
9451 }
9452 goto curly_try_B_min_known;
9453
9454 }
9455 else {
9456 /* avoid taking address of locinput, so it can remain
9457 * a register var */
9458 char *li = locinput;
9459 if (ST.max)
9460 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9461 else
9462 ST.count = 0;
9463 if (ST.count < ST.min)
9464 sayNO;
9465 SET_locinput(li);
9466 if ((ST.count > ST.min)
9467 && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9468 {
9469 /* A{m,n} must come at the end of the string, there's
9470 * no point in backing off ... */
9471 ST.min = ST.count;
9472 /* ...except that $ and \Z can match before *and* after
9473 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9474 We may back off by one in this case. */
9475 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9476 ST.min--;
9477 }
9478 REGCP_SET(ST.cp);
9479 goto curly_try_B_max;
9480 }
9481 NOT_REACHED; /* NOTREACHED */
9482
9483 case CURLY_B_min_fail:
9484 /* failed to find B in a non-greedy match. */
9485 if (RE_PESSIMISTIC_PARENS) {
9486 REGCP_UNWIND(ST.lastcp);
9487 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9488 }
9489 REGCP_UNWIND(ST.cp);
9490 if (ST.paren) {
9491 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9492 }
9493
9494 if (ST.Binfo.count == 0) {
9495 /* failed -- move forward one */
9496 char *li = locinput;
9497 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9498 sayNO;
9499 }
9500 locinput = li;
9501 ST.count++;
9502 if (!( ST.count <= ST.max
9503 /* count overflow ? */
9504 || (ST.max == REG_INFTY && ST.count > 0))
9505 )
9506 sayNO;
9507 }
9508 else {
9509 int n;
9510 /* Couldn't or didn't -- move forward. */
9511 ST.oldloc = locinput;
9512 if (utf8_target)
9513 locinput += UTF8SKIP(locinput);
9514 else
9515 locinput++;
9516 ST.count++;
9517
9518 curly_try_B_min_known:
9519 /* find the next place where 'B' could work, then call B */
9520 if (locinput + ST.Binfo.initial_exact < loceol) {
9521 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9522
9523 /* Here, the mask is all 1's for the entire length of
9524 * any possible match. (That actually means that there
9525 * is only one possible match.) Look for the next
9526 * occurrence */
9527 locinput = ninstr(locinput, loceol,
9528 (char *) ST.Binfo.matches,
9529 (char *) ST.Binfo.matches
9530 + ST.Binfo.initial_exact);
9531 if (locinput == NULL) {
9532 sayNO;
9533 }
9534 }
9535 else do {
9536 /* If the first byte(s) of the mask are all ones, it
9537 * means those bytes must match identically, so can use
9538 * ninstr() to find the next possible matchpoint */
9539 if (ST.Binfo.initial_exact > 0) {
9540 locinput = ninstr(locinput, loceol,
9541 (char *) ST.Binfo.matches,
9542 (char *) ST.Binfo.matches
9543 + ST.Binfo.initial_exact);
9544 }
9545 else { /* Otherwise find the next byte that matches,
9546 masked */
9547 locinput = (char *) find_next_masked(
9548 (U8 *) locinput, (U8 *) loceol,
9549 ST.Binfo.first_byte_anded,
9550 ST.Binfo.first_byte_mask);
9551 /* Advance to the end of a multi-byte character */
9552 if (utf8_target) {
9553 while ( locinput < loceol
9554 && UTF8_IS_CONTINUATION(*locinput))
9555 {
9556 locinput++;
9557 }
9558 }
9559 }
9560 if ( locinput == NULL
9561 || locinput + ST.Binfo.min_length > loceol)
9562 {
9563 sayNO;
9564 }
9565
9566 /* Here, we have found a possible match point; if can't
9567 * rule it out, quit the loop so can check fully */
9568 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9569 break;
9570 }
9571
9572 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9573
9574 } while (locinput <= ST.maxpos);
9575 }
9576
9577 if (locinput > ST.maxpos)
9578 sayNO;
9579
9580 n = (utf8_target)
9581 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9582 : (STRLEN) (locinput - ST.oldloc);
9583
9584
9585 /* Here is at the beginning of a character that meets the mask
9586 * criteria. Need to make sure that some real possibility */
9587
9588 if (n) {
9589 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9590 * at what may be the beginning of b; check that everything
9591 * between oldloc and locinput matches */
9592 char *li = ST.oldloc;
9593 ST.count += n;
9594 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9595 sayNO;
9596 assert(n == REG_INFTY || locinput == li);
9597 }
9598 }
9599
9600 curly_try_B_min:
9601 if (RE_PESSIMISTIC_PARENS) {
9602 (void)regcppush(rex, 0, maxopenparen);
9603 REGCP_SET(ST.lastcp);
9604 }
9605 CURLY_SETPAREN(ST.paren, ST.count);
9606 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9607 script_run_begin);
9608 NOT_REACHED; /* NOTREACHED */
9609
9610
9611 curly_try_B_max:
9612 /* a successful greedy match: now try to match B */
9613 if ( ST.Binfo.count <= 0
9614 || ( ST.Binfo.count > 0
9615 && locinput + ST.Binfo.min_length <= loceol
9616 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9617 {
9618 if (RE_PESSIMISTIC_PARENS) {
9619 (void)regcppush(rex, 0, maxopenparen);
9620 REGCP_SET(ST.lastcp);
9621 }
9622 CURLY_SETPAREN(ST.paren, ST.count);
9623 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9624 script_run_begin);
9625 NOT_REACHED; /* NOTREACHED */
9626 }
9627 goto CURLY_B_all_failed;
9628 NOT_REACHED; /* NOTREACHED */
9629
9630 case CURLY_B_max_fail:
9631 /* failed to find B in a greedy match */
9632
9633 if (RE_PESSIMISTIC_PARENS) {
9634 REGCP_UNWIND(ST.lastcp);
9635 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9636 }
9637 CURLY_B_all_failed:
9638 REGCP_UNWIND(ST.cp);
9639 if (ST.paren) {
9640 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9641 }
9642 /* back up. */
9643 if (--ST.count < ST.min)
9644 sayNO;
9645 locinput = HOPc(locinput, -1);
9646 goto curly_try_B_max;
9647
9648 #undef ST
9649
9650 case END: /* last op of main pattern */
9651 fake_end:
9652 if (cur_eval) {
9653 /* we've just finished A in /(??{A})B/; now continue with B */
9654 is_accepted= false;
9655 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9656 st->u.eval.prev_rex = rex_sv; /* inner */
9657
9658 /* Save *all* the positions. */
9659 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9660 rex_sv = CUR_EVAL.prev_rex;
9661 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9662 SET_reg_curpm(rex_sv);
9663 rex = ReANY(rex_sv);
9664 rexi = RXi_GET(rex);
9665
9666 st->u.eval.prev_curlyx = cur_curlyx;
9667 cur_curlyx = CUR_EVAL.prev_curlyx;
9668
9669 REGCP_SET(st->u.eval.lastcp);
9670
9671 /* Restore parens of the outer rex without popping the
9672 * savestack */
9673 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9674
9675 st->u.eval.prev_eval = cur_eval;
9676 cur_eval = CUR_EVAL.prev_eval;
9677 DEBUG_EXECUTE_r(
9678 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9679 depth, cur_eval););
9680 if ( nochange_depth )
9681 nochange_depth--;
9682
9683 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9684
9685 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9686 st->u.eval.prev_eval->u.eval.B,
9687 locinput, loceol, script_run_begin);
9688 }
9689
9690 if (locinput < reginfo->till) {
9691 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9692 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9693 PL_colors[4],
9694 (long)(locinput - startpos),
9695 (long)(reginfo->till - startpos),
9696 PL_colors[5]));
9697
9698 sayNO_SILENT; /* Cannot match: too short. */
9699 }
9700 sayYES; /* Success! */
9701
9702 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9703 matches end at the right spot, required for
9704 variable length matches. */
9705 if (match_end && locinput != match_end)
9706 {
9707 DEBUG_EXECUTE_r(
9708 Perl_re_exec_indentf( aTHX_
9709 "%sLOOKBEHIND_END: subpattern failed...%s\n",
9710 depth, PL_colors[4], PL_colors[5]));
9711 sayNO; /* Variable length match didn't line up */
9712 }
9713 /* FALLTHROUGH */
9714
9715 case SUCCEED: /* successful SUSPEND/CURLYM and
9716 *lookahead* IFMATCH/UNLESSM*/
9717 DEBUG_EXECUTE_r(
9718 Perl_re_exec_indentf( aTHX_
9719 "%sSUCCEED: subpattern success...%s\n",
9720 depth, PL_colors[4], PL_colors[5]));
9721 sayYES; /* Success! */
9722
9723 #undef ST
9724 #define ST st->u.ifmatch
9725
9726 case SUSPEND: /* (?>A) */
9727 ST.wanted = 1;
9728 ST.start = locinput;
9729 ST.end = loceol;
9730 ST.count = 1;
9731 goto do_ifmatch;
9732
9733 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9734 ST.wanted = 0;
9735 goto ifmatch_trivial_fail_test;
9736
9737 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9738 ST.wanted = 1;
9739 ifmatch_trivial_fail_test:
9740 ST.prev_match_end= match_end;
9741 ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be
9742 lookbehind count, requires
9743 non-zero flags */
9744 if (! FLAGS(scan)) { /* 'flags' zero means lookahed */
9745
9746 /* Lookahead starts here and ends at the normal place */
9747 ST.start = locinput;
9748 ST.end = loceol;
9749 match_end = NULL;
9750 }
9751 else {
9752 PERL_UINT_FAST8_T back_count = FLAGS(scan);
9753 char * s;
9754 match_end = locinput;
9755
9756 /* Lookbehind can look beyond the current position */
9757 ST.end = loceol;
9758
9759 /* ... and starts at the first place in the input that is in
9760 * the range of the possible start positions */
9761 for (; ST.count > 0; ST.count--, back_count--) {
9762 s = HOPBACKc(locinput, back_count);
9763 if (s) {
9764 ST.start = s;
9765 goto do_ifmatch;
9766 }
9767 }
9768
9769 /* If the lookbehind doesn't start in the actual string, is a
9770 * trivial match failure */
9771 match_end = ST.prev_match_end;
9772 if (logical) {
9773 logical = 0;
9774 sw = 1 - cBOOL(ST.wanted);
9775 }
9776 else if (ST.wanted)
9777 sayNO;
9778
9779 /* Here, we didn't want it to match, so is actually success */
9780 next = scan + ARG1u(scan);
9781 if (next == scan)
9782 next = NULL;
9783 break;
9784 }
9785
9786 do_ifmatch:
9787 ST.me = scan;
9788 ST.logical = logical;
9789 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9790
9791 /* execute body of (?...A) */
9792 PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9793 ST.end, script_run_begin);
9794 NOT_REACHED; /* NOTREACHED */
9795
9796 {
9797 bool matched;
9798
9799 case IFMATCH_A_fail: /* body of (?...A) failed */
9800 if (! ST.logical && ST.count > 1) {
9801
9802 /* It isn't a real failure until we've tried all starting
9803 * positions. Move to the next starting position and retry */
9804 ST.count--;
9805 ST.start = HOPc(ST.start, 1);
9806 scan = ST.me;
9807 logical = ST.logical;
9808 goto do_ifmatch;
9809 }
9810
9811 /* Here, all starting positions have been tried. */
9812 matched = FALSE;
9813 goto ifmatch_done;
9814
9815 case IFMATCH_A: /* body of (?...A) succeeded */
9816 matched = TRUE;
9817 ifmatch_done:
9818 sw = matched == ST.wanted;
9819 match_end = ST.prev_match_end;
9820 if (! ST.logical && !sw) {
9821 sayNO;
9822 }
9823
9824 if (OP(ST.me) != SUSPEND) {
9825 /* restore old position except for (?>...) */
9826 locinput = st->locinput;
9827 loceol = st->loceol;
9828 script_run_begin = st->sr0;
9829 }
9830 scan = ST.me + ARG1u(ST.me);
9831 if (scan == ST.me)
9832 scan = NULL;
9833 continue; /* execute B */
9834 }
9835
9836 #undef ST
9837
9838 case LONGJMP: /* alternative with many branches compiles to
9839 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9840 next = scan + ARG1u(scan);
9841 if (next == scan)
9842 next = NULL;
9843 break;
9844
9845 case COMMIT: /* (*COMMIT) */
9846 reginfo->cutpoint = loceol;
9847 /* FALLTHROUGH */
9848
9849 case PRUNE: /* (*PRUNE) */
9850 if (FLAGS(scan))
9851 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9852 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9853 script_run_begin);
9854 NOT_REACHED; /* NOTREACHED */
9855
9856 case COMMIT_next_fail:
9857 no_final = 1;
9858 /* FALLTHROUGH */
9859 sayNO;
9860 NOT_REACHED; /* NOTREACHED */
9861
9862 case OPFAIL: /* (*FAIL) */
9863 if (FLAGS(scan))
9864 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9865 if (logical) {
9866 /* deal with (?(?!)X|Y) properly,
9867 * make sure we trigger the no branch
9868 * of the trailing IFTHEN structure*/
9869 sw= 0;
9870 break;
9871 } else {
9872 sayNO;
9873 }
9874 NOT_REACHED; /* NOTREACHED */
9875
9876 #define ST st->u.mark
9877 case MARKPOINT: /* (*MARK:foo) */
9878 ST.prev_mark = mark_state;
9879 ST.mark_name = sv_commit = sv_yes_mark
9880 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9881 mark_state = st;
9882 ST.mark_loc = locinput;
9883 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9884 script_run_begin);
9885 NOT_REACHED; /* NOTREACHED */
9886
9887 case MARKPOINT_next:
9888 mark_state = ST.prev_mark;
9889 sayYES;
9890 NOT_REACHED; /* NOTREACHED */
9891
9892 case MARKPOINT_next_fail:
9893 if (popmark && sv_eq(ST.mark_name,popmark))
9894 {
9895 if (ST.mark_loc > startpoint)
9896 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9897 popmark = NULL; /* we found our mark */
9898 sv_commit = ST.mark_name;
9899
9900 DEBUG_EXECUTE_r({
9901 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9902 depth,
9903 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9904 });
9905 }
9906 mark_state = ST.prev_mark;
9907 sv_yes_mark = mark_state ?
9908 mark_state->u.mark.mark_name : NULL;
9909 sayNO;
9910 NOT_REACHED; /* NOTREACHED */
9911
9912 case SKIP: /* (*SKIP) */
9913 if (!FLAGS(scan)) {
9914 /* (*SKIP) : if we fail we cut here*/
9915 ST.mark_name = NULL;
9916 ST.mark_loc = locinput;
9917 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9918 script_run_begin);
9919 } else {
9920 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9921 otherwise do nothing. Meaning we need to scan
9922 */
9923 regmatch_state *cur = mark_state;
9924 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]);
9925
9926 while (cur) {
9927 if ( sv_eq( cur->u.mark.mark_name,
9928 find ) )
9929 {
9930 ST.mark_name = find;
9931 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9932 script_run_begin);
9933 }
9934 cur = cur->u.mark.prev_mark;
9935 }
9936 }
9937 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9938 break;
9939
9940 case SKIP_next_fail:
9941 if (ST.mark_name) {
9942 /* (*CUT:NAME) - Set up to search for the name as we
9943 collapse the stack*/
9944 popmark = ST.mark_name;
9945 } else {
9946 /* (*CUT) - No name, we cut here.*/
9947 if (ST.mark_loc > startpoint)
9948 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9949 /* but we set sv_commit to latest mark_name if there
9950 is one so they can test to see how things lead to this
9951 cut */
9952 if (mark_state)
9953 sv_commit=mark_state->u.mark.mark_name;
9954 }
9955 no_final = 1;
9956 sayNO;
9957 NOT_REACHED; /* NOTREACHED */
9958 #undef ST
9959
9960 case LNBREAK: /* \R */
9961 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9962 locinput += n;
9963 } else
9964 sayNO;
9965 break;
9966
9967 default:
9968 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9969 PTR2UV(scan), OP(scan));
9970 Perl_croak(aTHX_ "regexp memory corruption");
9971
9972 /* this is a point to jump to in order to increment
9973 * locinput by one character */
9974 increment_locinput:
9975 assert(!NEXTCHR_IS_EOS);
9976 if (utf8_target) {
9977 locinput += PL_utf8skip[nextbyte];
9978 /* locinput is allowed to go 1 char off the end (signifying
9979 * EOS), but not 2+ */
9980 if (locinput > loceol)
9981 sayNO;
9982 }
9983 else
9984 locinput++;
9985 break;
9986
9987 } /* end switch */
9988
9989 /* switch break jumps here */
9990 scan = next; /* prepare to execute the next op and ... */
9991 continue; /* ... jump back to the top, reusing st */
9992 /* NOTREACHED */
9993
9994 push_yes_state:
9995 /* push a state that backtracks on success */
9996 st->u.yes.prev_yes_state = yes_state;
9997 yes_state = st;
9998 /* FALLTHROUGH */
9999 push_state:
10000 /* push a new regex state, then continue at scan */
10001 {
10002 regmatch_state *newst;
10003 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10004
10005 DEBUG_r( /* DEBUG_STACK_r */
10006 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
10007 regmatch_state *cur = st;
10008 regmatch_state *curyes = yes_state;
10009 U32 i;
10010 regmatch_slab *slab = PL_regmatch_slab;
10011 for (i = 0; i < 3 && i <= depth; cur--,i++) {
10012 if (cur < SLAB_FIRST(slab)) {
10013 slab = slab->prev;
10014 cur = SLAB_LAST(slab);
10015 }
10016 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
10017 depth,
10018 i ? " " : "push",
10019 depth - i, REGNODE_NAME(cur->resume_state),
10020 (curyes == cur) ? "yes" : ""
10021 );
10022 if (curyes == cur)
10023 curyes = cur->u.yes.prev_yes_state;
10024 }
10025 } else {
10026 DEBUG_STATE_pp("push")
10027 });
10028 depth++;
10029 st->locinput = locinput;
10030 st->loceol = loceol;
10031 st->sr0 = script_run_begin;
10032 newst = st+1;
10033 if (newst > SLAB_LAST(PL_regmatch_slab))
10034 newst = S_push_slab(aTHX);
10035 PL_regmatch_state = newst;
10036
10037 locinput = pushinput;
10038 loceol = pusheol;
10039 script_run_begin = pushsr0;
10040 st = newst;
10041 continue;
10042 /* NOTREACHED */
10043 }
10044 }
10045 #ifdef SOLARIS_BAD_OPTIMIZER
10046 # undef PL_charclass
10047 #endif
10048
10049 /*
10050 * We get here only if there's trouble -- normally "case END" is
10051 * the terminating point.
10052 */
10053 Perl_croak(aTHX_ "corrupted regexp pointers");
10054 NOT_REACHED; /* NOTREACHED */
10055
10056 yes:
10057 if (yes_state) {
10058 /* we have successfully completed a subexpression, but we must now
10059 * pop to the state marked by yes_state and continue from there */
10060 assert(st != yes_state);
10061 #ifdef DEBUGGING
10062 while (st != yes_state) {
10063 st--;
10064 if (st < SLAB_FIRST(PL_regmatch_slab)) {
10065 PL_regmatch_slab = PL_regmatch_slab->prev;
10066 st = SLAB_LAST(PL_regmatch_slab);
10067 }
10068 DEBUG_STATE_r({
10069 if (no_final) {
10070 DEBUG_STATE_pp("pop (no final)");
10071 } else {
10072 DEBUG_STATE_pp("pop (yes)");
10073 }
10074 });
10075 depth--;
10076 }
10077 #else
10078 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
10079 || yes_state > SLAB_LAST(PL_regmatch_slab))
10080 {
10081 /* not in this slab, pop slab */
10082 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
10083 PL_regmatch_slab = PL_regmatch_slab->prev;
10084 st = SLAB_LAST(PL_regmatch_slab);
10085 }
10086 depth -= (st - yes_state);
10087 #endif
10088 st = yes_state;
10089 yes_state = st->u.yes.prev_yes_state;
10090 PL_regmatch_state = st;
10091
10092 if (no_final) {
10093 locinput= st->locinput;
10094 loceol= st->loceol;
10095 script_run_begin = st->sr0;
10096 }
10097 state_num = st->resume_state + no_final;
10098 goto reenter_switch;
10099 }
10100
10101 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
10102 PL_colors[4], PL_colors[5]));
10103
10104 if (reginfo->info_aux_eval) {
10105 /* each successfully executed (?{...}) block does the equivalent of
10106 * local $^R = do {...}
10107 * When popping the save stack, all these locals would be undone;
10108 * bypass this by setting the outermost saved $^R to the latest
10109 * value */
10110 /* I don't know if this is needed or works properly now.
10111 * see code related to PL_replgv elsewhere in this file.
10112 * Yves
10113 */
10114 if (oreplsv != GvSV(PL_replgv)) {
10115 sv_setsv(oreplsv, GvSV(PL_replgv));
10116 SvSETMAGIC(oreplsv);
10117 }
10118 }
10119 result = 1;
10120 goto final_exit;
10121
10122 no:
10123 DEBUG_EXECUTE_r(
10124 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
10125 depth,
10126 PL_colors[4], PL_colors[5])
10127 );
10128
10129 no_silent:
10130 if (no_final) {
10131 if (yes_state) {
10132 goto yes;
10133 } else {
10134 goto final_exit;
10135 }
10136 }
10137 if (depth) {
10138 /* there's a previous state to backtrack to */
10139 st--;
10140 if (st < SLAB_FIRST(PL_regmatch_slab)) {
10141 PL_regmatch_slab = PL_regmatch_slab->prev;
10142 st = SLAB_LAST(PL_regmatch_slab);
10143 }
10144 PL_regmatch_state = st;
10145 locinput= st->locinput;
10146 loceol= st->loceol;
10147 script_run_begin = st->sr0;
10148
10149 DEBUG_STATE_pp("pop");
10150 depth--;
10151 if (yes_state == st)
10152 yes_state = st->u.yes.prev_yes_state;
10153
10154 state_num = st->resume_state + 1; /* failure = success + 1 */
10155 PERL_ASYNC_CHECK();
10156 goto reenter_switch;
10157 }
10158 result = 0;
10159
10160 final_exit:
10161 if (rex->intflags & PREGf_VERBARG_SEEN) {
10162 SV *sv_err = get_sv("REGERROR", 1);
10163 SV *sv_mrk = get_sv("REGMARK", 1);
10164 if (result) {
10165 sv_commit = &PL_sv_no;
10166 if (!sv_yes_mark)
10167 sv_yes_mark = &PL_sv_yes;
10168 } else {
10169 if (!sv_commit)
10170 sv_commit = &PL_sv_yes;
10171 sv_yes_mark = &PL_sv_no;
10172 }
10173 assert(sv_err);
10174 assert(sv_mrk);
10175 sv_setsv(sv_err, sv_commit);
10176 sv_setsv(sv_mrk, sv_yes_mark);
10177 }
10178
10179
10180 if (last_pushed_cv) {
10181 dSP;
10182 /* see "Some notes about MULTICALL" above */
10183 POP_MULTICALL;
10184 PERL_UNUSED_VAR(SP);
10185 }
10186 else
10187 LEAVE_SCOPE(orig_savestack_ix);
10188
10189 assert(!result || locinput - reginfo->strbeg >= 0);
10190 return result ? locinput - reginfo->strbeg : -1;
10191 }
10192
10193 /*
10194 - regrepeat - repeatedly match something simple, report how many
10195 *
10196 * What 'simple' means is a node which can be the operand of a quantifier like
10197 * '+', or {1,3}
10198 *
10199 * startposp - pointer to a pointer to the start position. This is updated
10200 * to point to the byte following the highest successful
10201 * match.
10202 * p - the regnode to be repeatedly matched against.
10203 * loceol - pointer to the end position beyond which we aren't supposed to
10204 * look.
10205 * reginfo - struct holding match state, such as utf8_target
10206 * max - maximum number of things to match.
10207 * depth - (for debugging) backtracking depth.
10208 */
10209 STATIC I32
S_regrepeat(pTHX_ regexp * prog,char ** startposp,const regnode * p,char * loceol,regmatch_info * const reginfo,I32 max comma_pDEPTH)10210 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10211 char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH)
10212 {
10213 char *scan; /* Pointer to current position in target string */
10214 I32 c;
10215 char *this_eol = loceol; /* potentially adjusted version. */
10216 I32 hardcount = 0; /* How many matches so far */
10217 bool utf8_target = reginfo->is_utf8_target;
10218 unsigned int to_complement = 0; /* Invert the result? */
10219 char_class_number_ classnum;
10220
10221 PERL_ARGS_ASSERT_REGREPEAT;
10222
10223 /* This routine is structured so that we switch on the input OP. Each OP
10224 * case: statement contains a loop to repeatedly apply the OP, advancing
10225 * the input until it fails, or reaches the end of the input, or until it
10226 * reaches the upper limit of matches. */
10227
10228 scan = *startposp;
10229 if (max == REG_INFTY) /* This is a special marker to go to the platform's
10230 max */
10231 max = I32_MAX;
10232 else if (! utf8_target && this_eol - scan > max)
10233 this_eol = scan + max;
10234
10235 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10236 * down to the maximum of how far we should go in it (but leaving it set to
10237 * the real end if the maximum permissible would take us beyond that).
10238 * This allows us to make the loop exit condition that we haven't gone past
10239 * <this_eol> to also mean that we haven't exceeded the max permissible
10240 * count, saving a test each time through the loop. But it assumes that
10241 * the OP matches a single byte, which is true for most of the OPs below
10242 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
10243 * have this characteristic have to compensate.
10244 *
10245 * There is no such adjustment for UTF-8 targets, since the number of bytes
10246 * per character can vary. OPs will have to test both that the count is
10247 * less than the max permissible (using <hardcount> to keep track), and
10248 * that we are still within the bounds of the string (using <this_eol>. A
10249 * few OPs match a single byte no matter what the encoding. They can omit
10250 * the max test if, for the UTF-8 case, they do the adjustment that was
10251 * skipped above.
10252 *
10253 * Thus, the code above sets things up for the common case; and exceptional
10254 * cases need extra work; the common case is to make sure <scan> doesn't go
10255 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10256 * count doesn't exceed the maximum permissible */
10257
10258 switch (with_t_UTF8ness(OP(p), utf8_target)) {
10259 SV * anyofh_list;
10260
10261 case REG_ANY_t8:
10262 while (scan < this_eol && hardcount < max && *scan != '\n') {
10263 scan += UTF8SKIP(scan);
10264 hardcount++;
10265 }
10266 break;
10267
10268 case REG_ANY_tb:
10269 scan = (char *) memchr(scan, '\n', this_eol - scan);
10270 if (! scan) {
10271 scan = this_eol;
10272 }
10273 break;
10274
10275 case SANY_t8:
10276 while (scan < this_eol && hardcount < max) {
10277 scan += UTF8SKIP(scan);
10278 hardcount++;
10279 }
10280 break;
10281
10282 case SANY_tb:
10283 scan = this_eol;
10284 break;
10285
10286 case EXACT_REQ8_tb:
10287 case LEXACT_REQ8_tb:
10288 case EXACTFU_REQ8_tb:
10289 break;
10290
10291 case EXACTL_t8:
10292 if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10293 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10294 }
10295 /* FALLTHROUGH */
10296
10297 case EXACTL_tb:
10298 case EXACTFL_t8:
10299 case EXACTFL_tb:
10300 case EXACTFLU8_t8:
10301 case EXACTFLU8_tb:
10302 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10303 /* FALLTHROUGH */
10304
10305 case EXACT_REQ8_t8:
10306 case LEXACT_REQ8_t8:
10307 case EXACTFU_REQ8_t8:
10308 case LEXACT_t8:
10309 case LEXACT_tb:
10310 case EXACT_t8:
10311 case EXACT_tb:
10312 case EXACTF_t8:
10313 case EXACTF_tb:
10314 case EXACTFAA_NO_TRIE_t8:
10315 case EXACTFAA_NO_TRIE_tb:
10316 case EXACTFAA_t8:
10317 case EXACTFAA_tb:
10318 case EXACTFU_t8:
10319 case EXACTFU_tb:
10320 case EXACTFUP_t8:
10321 case EXACTFUP_tb:
10322
10323 {
10324 struct next_matchable_info Binfo;
10325 PERL_UINT_FAST8_T definitive_len;
10326
10327 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10328
10329 /* Set up termination info, and quit if we can rule out that we've
10330 * gotten a match of the termination criteria */
10331 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10332 || scan + Binfo.min_length > this_eol
10333 || ! S_test_EXACTISH_ST(scan, Binfo))
10334 {
10335 break;
10336 }
10337
10338 definitive_len = Binfo.initial_definitive;
10339
10340 /* Here there are potential matches, and the first byte(s) matched our
10341 * filter
10342 *
10343 * If we got a definitive match of some initial bytes, there is no
10344 * possibility of false positives as far as it got */
10345 if (definitive_len > 0) {
10346
10347 /* If as far as it got is the maximum possible, there were no false
10348 * positives at all. Since we have everything set up, see how many
10349 * repeats there are. */
10350 if (definitive_len >= Binfo.max_length) {
10351
10352 /* We've already found one match */
10353 scan += definitive_len;
10354 hardcount++;
10355
10356 /* If want more than the one match, and there is room for more,
10357 * see if there are any */
10358 if (hardcount < max && scan + definitive_len <= this_eol) {
10359
10360 /* If the character is only a single byte long, just span
10361 * all such bytes. */
10362 if (definitive_len == 1) {
10363 const char * orig_scan = scan;
10364
10365 if (this_eol - (scan - hardcount) > max) {
10366 this_eol = scan - hardcount + max;
10367 }
10368
10369 /* Use different routines depending on whether it's an
10370 * exact match or matches with a mask */
10371 if (Binfo.initial_exact == 1) {
10372 scan = (char *) find_span_end((U8 *) scan,
10373 (U8 *) this_eol,
10374 Binfo.matches[0]);
10375 }
10376 else {
10377 scan = (char *) find_span_end_mask(
10378 (U8 *) scan,
10379 (U8 *) this_eol,
10380 Binfo.first_byte_anded,
10381 Binfo.first_byte_mask);
10382 }
10383
10384 hardcount += scan - orig_scan;
10385 }
10386 else { /* Here, the full character definitive match is more
10387 than one byte */
10388 while ( hardcount < max
10389 && scan + definitive_len <= this_eol
10390 && S_test_EXACTISH_ST(scan, Binfo))
10391 {
10392 scan += definitive_len;
10393 hardcount++;
10394 }
10395 }
10396 }
10397
10398 break;
10399 } /* End of a full character is definitively matched */
10400
10401 /* Here, an initial portion of the character matched definitively,
10402 * and the rest matched as well, but could have false positives */
10403
10404 do {
10405 int i;
10406 U8 * matches = Binfo.matches;
10407
10408 /* The first bytes were definitive. Look at the remaining */
10409 for (i = 0; i < Binfo.count; i++) {
10410 if (memEQ(scan + definitive_len,
10411 matches + definitive_len,
10412 Binfo.lengths[i] - definitive_len))
10413 {
10414 goto found_a_completion;
10415 }
10416
10417 matches += Binfo.lengths[i];
10418 }
10419
10420 /* Didn't find anything to complete our initial match. Stop
10421 * here */
10422 break;
10423
10424 found_a_completion:
10425
10426 /* Here, matched a full character, Include it in the result,
10427 * and then look to see if the next char matches */
10428 hardcount++;
10429 scan += Binfo.lengths[i];
10430
10431 } while ( hardcount < max
10432 && scan + definitive_len < this_eol
10433 && S_test_EXACTISH_ST(scan, Binfo));
10434
10435 /* Here, have advanced as far as possible */
10436 break;
10437 } /* End of found some initial bytes that definitively matched */
10438
10439 /* Here, we can't rule out that we have found the beginning of 'B', but
10440 * there were no initial bytes that could rule out anything
10441 * definitively. Use brute force to examine all the possibilities */
10442 while (scan < this_eol && hardcount < max) {
10443 int i;
10444 U8 * matches = Binfo.matches;
10445
10446 for (i = 0; i < Binfo.count; i++) {
10447 if (memEQ(scan, matches, Binfo.lengths[i])) {
10448 goto found1;
10449 }
10450
10451 matches += Binfo.lengths[i];
10452 }
10453
10454 break;
10455
10456 found1:
10457 hardcount++;
10458 scan += Binfo.lengths[i];
10459 }
10460
10461 break;
10462 }
10463
10464 case ANYOFPOSIXL_t8:
10465 case ANYOFL_t8:
10466 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10467 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10468
10469 /* FALLTHROUGH */
10470 case ANYOFD_t8:
10471 case ANYOF_t8:
10472 while ( hardcount < max
10473 && scan < this_eol
10474 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10475 {
10476 scan += UTF8SKIP(scan);
10477 hardcount++;
10478 }
10479 break;
10480
10481 case ANYOFPOSIXL_tb:
10482 case ANYOFL_tb:
10483 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10484 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10485 /* FALLTHROUGH */
10486
10487 case ANYOFD_tb:
10488 case ANYOF_tb:
10489 if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10490 while ( scan < this_eol
10491 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10492 scan++;
10493 }
10494 else {
10495 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10496 scan++;
10497 }
10498 break;
10499
10500 case ANYOFM_t8:
10501 if (this_eol - scan > max) {
10502
10503 /* We didn't adjust <this_eol> at the beginning of this routine
10504 * because is UTF-8, but it is actually ok to do so, since here, to
10505 * match, 1 char == 1 byte. */
10506 this_eol = scan + max;
10507 }
10508 /* FALLTHROUGH */
10509
10510 case ANYOFM_tb:
10511 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10512 (U8) ARG1u(p), FLAGS(p));
10513 break;
10514
10515 case NANYOFM_t8:
10516 while ( hardcount < max
10517 && scan < this_eol
10518 && (*scan & FLAGS(p)) != ARG1u(p))
10519 {
10520 scan += UTF8SKIP(scan);
10521 hardcount++;
10522 }
10523 break;
10524
10525 case NANYOFM_tb:
10526 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10527 (U8) ARG1u(p), FLAGS(p));
10528 break;
10529
10530 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10531 case ANYOFHb_tb:
10532 case ANYOFHbbm_tb:
10533 case ANYOFHr_tb:
10534 case ANYOFHs_tb:
10535 break;
10536
10537 case ANYOFH_t8:
10538 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10539 while ( hardcount < max
10540 && scan < this_eol
10541 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10542 && _invlist_contains_cp(anyofh_list,
10543 utf8_to_uvchr_buf((U8 *) scan,
10544 (U8 *) this_eol,
10545 NULL)))
10546 {
10547 scan += UTF8SKIP(scan);
10548 hardcount++;
10549 }
10550 break;
10551
10552 case ANYOFHb_t8:
10553 /* we know the first byte must be the FLAGS field */
10554 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10555 while ( hardcount < max
10556 && scan < this_eol
10557 && (U8) *scan == ANYOF_FLAGS(p)
10558 && _invlist_contains_cp(anyofh_list,
10559 utf8_to_uvchr_buf((U8 *) scan,
10560 (U8 *) this_eol,
10561 NULL)))
10562 {
10563 scan += UTF8SKIP(scan);
10564 hardcount++;
10565 }
10566 break;
10567
10568 case ANYOFHbbm_t8:
10569 while ( hardcount < max
10570 && scan + 1 < this_eol
10571 && (U8) *scan == ANYOF_FLAGS(p)
10572 && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10573 (U8) scan[1] & UTF_CONTINUATION_MASK))
10574 {
10575 scan += 2; /* This node only matces 2-byte UTF-8 */
10576 hardcount++;
10577 }
10578 break;
10579
10580 case ANYOFHr_t8:
10581 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10582 while ( hardcount < max
10583 && scan < this_eol
10584 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10585 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10586 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10587 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10588 && _invlist_contains_cp(anyofh_list,
10589 utf8_to_uvchr_buf((U8 *) scan,
10590 (U8 *) this_eol,
10591 NULL)))
10592 {
10593 scan += UTF8SKIP(scan);
10594 hardcount++;
10595 }
10596 break;
10597
10598 case ANYOFHs_t8:
10599 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10600 while ( hardcount < max
10601 && scan + FLAGS(p) < this_eol
10602 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10603 && _invlist_contains_cp(anyofh_list,
10604 utf8_to_uvchr_buf((U8 *) scan,
10605 (U8 *) this_eol,
10606 NULL)))
10607 {
10608 scan += UTF8SKIP(scan);
10609 hardcount++;
10610 }
10611 break;
10612
10613 case ANYOFR_t8:
10614 while ( hardcount < max
10615 && scan < this_eol
10616 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10617 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10618 (U8 *) this_eol,
10619 NULL),
10620 ANYOFRbase(p), ANYOFRdelta(p)))
10621 {
10622 scan += UTF8SKIP(scan);
10623 hardcount++;
10624 }
10625 break;
10626
10627 case ANYOFR_tb:
10628 while ( hardcount < max
10629 && scan < this_eol
10630 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10631 {
10632 scan++;
10633 hardcount++;
10634 }
10635 break;
10636
10637 case ANYOFRb_t8:
10638 while ( hardcount < max
10639 && scan < this_eol
10640 && (U8) *scan == ANYOF_FLAGS(p)
10641 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10642 (U8 *) this_eol,
10643 NULL),
10644 ANYOFRbase(p), ANYOFRdelta(p)))
10645 {
10646 scan += UTF8SKIP(scan);
10647 hardcount++;
10648 }
10649 break;
10650
10651 case ANYOFRb_tb:
10652 while ( hardcount < max
10653 && scan < this_eol
10654 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10655 {
10656 scan++;
10657 hardcount++;
10658 }
10659 break;
10660
10661 /* The argument (FLAGS) to all the POSIX node types is the class number */
10662
10663 case NPOSIXL_tb:
10664 to_complement = 1;
10665 /* FALLTHROUGH */
10666
10667 case POSIXL_tb:
10668 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10669 while ( scan < this_eol
10670 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10671 {
10672 scan++;
10673 }
10674 break;
10675
10676 case NPOSIXL_t8:
10677 to_complement = 1;
10678 /* FALLTHROUGH */
10679
10680 case POSIXL_t8:
10681 while ( hardcount < max && scan < this_eol
10682 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10683 (U8 *) scan,
10684 (U8 *) this_eol)))
10685 {
10686 scan += UTF8SKIP(scan);
10687 hardcount++;
10688 }
10689 break;
10690
10691 case POSIXD_tb:
10692 /* FALLTHROUGH */
10693
10694 case POSIXA_t8:
10695 if (this_eol - scan > max) {
10696
10697 /* We didn't adjust <this_eol> at the beginning of this routine
10698 * because is UTF-8, but it is actually ok to do so, since here, to
10699 * match, 1 char == 1 byte. */
10700 this_eol = scan + max;
10701 }
10702 /* FALLTHROUGH */
10703
10704 case POSIXA_tb:
10705 while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10706 scan++;
10707 }
10708 break;
10709
10710 case NPOSIXD_tb:
10711 /* FALLTHROUGH */
10712
10713 case NPOSIXA_tb:
10714 while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10715 scan++;
10716 }
10717 break;
10718
10719 case NPOSIXA_t8:
10720
10721 /* The complement of something that matches only ASCII matches all
10722 * non-ASCII, plus everything in ASCII that isn't in the class. */
10723 while ( hardcount < max && scan < this_eol
10724 && ( ! isASCII_utf8_safe(scan, loceol)
10725 || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10726 {
10727 scan += UTF8SKIP(scan);
10728 hardcount++;
10729 }
10730 break;
10731
10732 case NPOSIXU_tb:
10733 to_complement = 1;
10734 /* FALLTHROUGH */
10735
10736 case POSIXU_tb:
10737 while ( scan < this_eol
10738 && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10739 {
10740 scan++;
10741 }
10742 break;
10743
10744 case NPOSIXU_t8:
10745 case NPOSIXD_t8:
10746 to_complement = 1;
10747 /* FALLTHROUGH */
10748
10749 case POSIXD_t8:
10750 case POSIXU_t8:
10751 classnum = (char_class_number_) FLAGS(p);
10752 switch (classnum) {
10753 default:
10754 while ( hardcount < max && scan < this_eol
10755 && to_complement
10756 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10757 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10758 {
10759 scan += UTF8SKIP(scan);
10760 hardcount++;
10761 }
10762 break;
10763
10764 /* For the classes below, the knowledge of how to handle every code
10765 * point is compiled into Perl via a macro. This code is written
10766 * for making the loops as tight as possible. It could be
10767 * refactored to save space instead. */
10768
10769 case CC_ENUM_SPACE_:
10770 while ( hardcount < max
10771 && scan < this_eol
10772 && (to_complement
10773 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10774 {
10775 scan += UTF8SKIP(scan);
10776 hardcount++;
10777 }
10778 break;
10779 case CC_ENUM_BLANK_:
10780 while ( hardcount < max
10781 && scan < this_eol
10782 && (to_complement
10783 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10784 {
10785 scan += UTF8SKIP(scan);
10786 hardcount++;
10787 }
10788 break;
10789 case CC_ENUM_XDIGIT_:
10790 while ( hardcount < max
10791 && scan < this_eol
10792 && (to_complement
10793 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10794 {
10795 scan += UTF8SKIP(scan);
10796 hardcount++;
10797 }
10798 break;
10799 case CC_ENUM_VERTSPACE_:
10800 while ( hardcount < max
10801 && scan < this_eol
10802 && (to_complement
10803 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10804 {
10805 scan += UTF8SKIP(scan);
10806 hardcount++;
10807 }
10808 break;
10809 case CC_ENUM_CNTRL_:
10810 while ( hardcount < max
10811 && scan < this_eol
10812 && (to_complement
10813 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10814 {
10815 scan += UTF8SKIP(scan);
10816 hardcount++;
10817 }
10818 break;
10819 }
10820 break;
10821
10822 case LNBREAK_t8:
10823 while ( hardcount < max && scan < this_eol
10824 && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10825 {
10826 scan += c;
10827 hardcount++;
10828 }
10829 break;
10830
10831 case LNBREAK_tb:
10832 /* LNBREAK can match one or two latin chars, which is ok, but we have
10833 * to use hardcount in this situation, and throw away the adjustment to
10834 * <this_eol> done before the switch statement */
10835 while (
10836 hardcount < max && scan < loceol
10837 && (c = is_LNBREAK_latin1_safe(scan, loceol))
10838 ) {
10839 scan += c;
10840 hardcount++;
10841 }
10842 break;
10843
10844 default:
10845 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10846 " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10847 NOT_REACHED; /* NOTREACHED */
10848
10849 }
10850
10851 if (hardcount)
10852 c = hardcount;
10853 else
10854 c = scan - *startposp;
10855 *startposp = scan;
10856
10857 DEBUG_r({
10858 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10859 DEBUG_EXECUTE_r({
10860 SV * const prop = sv_newmortal();
10861 regprop(prog, prop, p, reginfo, NULL);
10862 Perl_re_exec_indentf( aTHX_
10863 "%s can match %" IVdf " times out of %" IVdf "...\n",
10864 depth, SvPVX_const(prop),(IV)c,(IV)max);
10865 });
10866 });
10867
10868 return(c);
10869 }
10870
10871 /*
10872 - reginclass - determine if a character falls into a character class
10873
10874 n is the ANYOF-type regnode
10875 p is the target string
10876 p_end points to one byte beyond the end of the target string
10877 utf8_target tells whether p is in UTF-8.
10878
10879 Returns true if matched; false otherwise.
10880
10881 Note that this can be a synthetic start class, a combination of various
10882 nodes, so things you think might be mutually exclusive, such as locale,
10883 aren't. It can match both locale and non-locale
10884
10885 */
10886
10887 STATIC bool
S_reginclass(pTHX_ regexp * const prog,const regnode * const n,const U8 * const p,const U8 * const p_end,const bool utf8_target)10888 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10889 {
10890 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10891 ? 0
10892 : ANYOF_FLAGS(n);
10893 bool match = FALSE;
10894 UV c = *p;
10895
10896 PERL_ARGS_ASSERT_REGINCLASS;
10897
10898 /* If c is not already the code point, get it. Note that
10899 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10900 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10901 STRLEN c_len = 0;
10902 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10903 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10904 if (c_len == (STRLEN)-1) {
10905 _force_out_malformed_utf8_message(p, p_end,
10906 utf8n_flags,
10907 1 /* 1 means die */ );
10908 NOT_REACHED; /* NOTREACHED */
10909 }
10910 if ( c > 255
10911 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10912 && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10913 {
10914 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10915 }
10916 }
10917
10918 /* If this character is potentially in the bitmap, check it */
10919 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10920 if (ANYOF_BITMAP_TEST(n, c))
10921 match = TRUE;
10922 else if ( (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10923 && OP(n) == ANYOFD
10924 && ! utf8_target
10925 && ! isASCII(c))
10926 {
10927 match = TRUE;
10928 }
10929 else if (flags & ANYOF_LOCALE_FLAGS) {
10930 if ( (flags & ANYOFL_FOLD)
10931 && c < 256
10932 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10933 {
10934 match = TRUE;
10935 }
10936 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10937 && c <= U8_MAX /* param to isFOO_lc() */
10938 ) {
10939 /* The data structure is arranged so bits 0, 2, 4, ... are set
10940 * if the class includes the Posix character class given by
10941 * bit/2; and 1, 3, 5, ... are set if the class includes the
10942 * complemented Posix class given by int(bit/2), so the
10943 * remainder modulo 2 tells us if to complement or not.
10944 *
10945 * Note that this code assumes that all the classes are closed
10946 * under folding. For example, if a character matches \w, then
10947 * its fold does too; and vice versa. This should be true for
10948 * any well-behaved locale for all the currently defined Posix
10949 * classes, except for :lower: and :upper:, which are handled
10950 * by the pseudo-class :cased: which matches if either of the
10951 * other two does. To get rid of this assumption, an outer
10952 * loop could be used below to iterate over both the source
10953 * character, and its fold (if different) */
10954
10955 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10956
10957 do {
10958 /* Find the next set bit indicating a class to try matching
10959 * against */
10960 U8 bit_pos = lsbit_pos32(posixl_bits);
10961
10962 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10963 match = TRUE;
10964 break;
10965 }
10966
10967 /* Remove this class from consideration; repeat */
10968 POSIXL_CLEAR(posixl_bits, bit_pos);
10969 } while(posixl_bits != 0);
10970 }
10971 }
10972 }
10973
10974 /* If the bitmap didn't (or couldn't) match, and something outside the
10975 * bitmap could match, try that. */
10976 if (!match) {
10977 if ( c >= NUM_ANYOF_CODE_POINTS
10978 && ANYOF_ONLY_HAS_BITMAP(n)
10979 && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10980 {
10981 /* In this case, the ARG is set up so that the final bit indicates
10982 * whether it matches or not */
10983 match = ARG1u(n) & 1;
10984 }
10985 else
10986 /* Here, the main way it could match is if the code point is
10987 * outside the bitmap and an inversion list exists to handle such
10988 * things. The other ways all occur when a flag is set to indicate
10989 * we need special handling. That handling doesn't come in to
10990 * effect for ANYOFD nodes unless the target string is UTF-8 and
10991 * that matters to code point being matched. */
10992 if ( c >= NUM_ANYOF_CODE_POINTS
10993 || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
10994 && ( UNLIKELY(OP(n) != ANYOFD)
10995 || (utf8_target && ! isASCII_uvchr(c)
10996 # if NUM_ANYOF_CODE_POINTS > 256
10997 && c < 256
10998 # endif
10999 ))))
11000 {
11001 /* Here, we have an inversion list for outside-the-bitmap code
11002 * points and/or the flag is set indicating special handling is
11003 * needed. The flag is set when there is auxiliary data beyond the
11004 * normal inversion list, or if there is the possibility of a
11005 * special Turkic locale match, even without auxiliary data.
11006 *
11007 * First check if there is an inversion list and/or auxiliary data.
11008 * */
11009 if (ANYOF_HAS_AUX(n)) {
11010 SV* only_utf8_locale = NULL;
11011
11012 /* This call will return in 'definition' the union of the base
11013 * non-bitmap inversion list, if any, plus the deferred
11014 * definitions of user-defined properties, if any. It croaks
11015 * if there is such a property but which still has no definition
11016 * available */
11017 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
11018 &only_utf8_locale, NULL);
11019 if (definition) {
11020 /* Most likely is the outside-the-bitmap inversion list. */
11021 if (_invlist_contains_cp(definition, c)) {
11022 match = TRUE;
11023 }
11024 else /* Failing that, hardcode the two tests for a Turkic
11025 locale */
11026 if ( UNLIKELY(IN_UTF8_TURKIC_LOCALE)
11027 && isALPHA_FOLD_EQ(*p, 'i'))
11028 {
11029 /* Turkish locales have these hard-coded rules
11030 * overriding normal ones */
11031 if (*p == 'i') {
11032 if (_invlist_contains_cp(definition,
11033 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11034 {
11035 match = TRUE;
11036 }
11037 }
11038 else if (_invlist_contains_cp(definition,
11039 LATIN_SMALL_LETTER_DOTLESS_I))
11040 {
11041 match = TRUE;
11042 }
11043 }
11044 }
11045
11046 if ( UNLIKELY(only_utf8_locale)
11047 && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
11048 && ! match)
11049 {
11050 match = _invlist_contains_cp(only_utf8_locale, c);
11051 }
11052 }
11053
11054 /* In a Turkic locale under folding, hard-code the I i case pair
11055 * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
11056 * flag set unless [Ii] were match possibilities */
11057 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) {
11058 if (utf8_target) {
11059 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11060 if (ANYOF_BITMAP_TEST(n, 'i')) {
11061 match = TRUE;
11062 }
11063 }
11064 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
11065 if (ANYOF_BITMAP_TEST(n, 'I')) {
11066 match = TRUE;
11067 }
11068 }
11069 }
11070
11071 #if NUM_ANYOF_CODE_POINTS > 256
11072 /* Larger bitmap means these special cases aren't handled
11073 * outside the bitmap above. */
11074 if (*p == 'i') {
11075 if (ANYOF_BITMAP_TEST(n,
11076 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
11077 {
11078 match = TRUE;
11079 }
11080 }
11081 else if (*p == 'I') {
11082 if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
11083 match = TRUE;
11084 }
11085 }
11086 #endif
11087 }
11088 }
11089
11090 if ( UNICODE_IS_SUPER(c)
11091 && (flags & ANYOF_WARN_SUPER__shared)
11092 && OP(n) != ANYOFD
11093 && ckWARN_d(WARN_NON_UNICODE))
11094 {
11095 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
11096 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
11097 }
11098 }
11099
11100 #if ANYOF_INVERT != 1
11101 /* Depending on compiler optimization cBOOL takes time, so if don't have to
11102 * use it, don't */
11103 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
11104 #endif
11105
11106 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
11107 return (flags & ANYOF_INVERT) ^ match;
11108 }
11109
11110 STATIC U8 *
S_reghop3(U8 * s,SSize_t off,const U8 * lim)11111 S_reghop3(U8 *s, SSize_t off, const U8* lim)
11112 {
11113 /* return the position 'off' UTF-8 characters away from 's', forward if
11114 * 'off' >= 0, backwards if negative. But don't go outside of position
11115 * 'lim', which better be < s if off < 0 */
11116
11117 PERL_ARGS_ASSERT_REGHOP3;
11118
11119 if (off >= 0) {
11120 while (off-- && s < lim) {
11121 /* XXX could check well-formedness here */
11122 U8 *new_s = s + UTF8SKIP(s);
11123 if (new_s > lim) /* lim may be in the middle of a long character */
11124 return s;
11125 s = new_s;
11126 }
11127 }
11128 else {
11129 while (off++ && s > lim) {
11130 s--;
11131 if (UTF8_IS_CONTINUED(*s)) {
11132 while (s > lim && UTF8_IS_CONTINUATION(*s))
11133 s--;
11134 if (! UTF8_IS_START(*s)) {
11135 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11136 }
11137 }
11138 /* XXX could check well-formedness here */
11139 }
11140 }
11141 return s;
11142 }
11143
11144 STATIC U8 *
S_reghop4(U8 * s,SSize_t off,const U8 * llim,const U8 * rlim)11145 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
11146 {
11147 PERL_ARGS_ASSERT_REGHOP4;
11148
11149 if (off >= 0) {
11150 while (off-- && s < rlim) {
11151 /* XXX could check well-formedness here */
11152 s += UTF8SKIP(s);
11153 }
11154 }
11155 else {
11156 while (off++ && s > llim) {
11157 s--;
11158 if (UTF8_IS_CONTINUED(*s)) {
11159 while (s > llim && UTF8_IS_CONTINUATION(*s))
11160 s--;
11161 if (! UTF8_IS_START(*s)) {
11162 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11163 }
11164 }
11165 /* XXX could check well-formedness here */
11166 }
11167 }
11168 return s;
11169 }
11170
11171 /* like reghop3, but returns NULL on overrun, rather than returning last
11172 * char pos */
11173
11174 STATIC U8 *
S_reghopmaybe3(U8 * s,SSize_t off,const U8 * const lim)11175 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
11176 {
11177 PERL_ARGS_ASSERT_REGHOPMAYBE3;
11178
11179 if (off >= 0) {
11180 while (off-- && s < lim) {
11181 /* XXX could check well-formedness here */
11182 s += UTF8SKIP(s);
11183 }
11184 if (off >= 0)
11185 return NULL;
11186 }
11187 else {
11188 while (off++ && s > lim) {
11189 s--;
11190 if (UTF8_IS_CONTINUED(*s)) {
11191 while (s > lim && UTF8_IS_CONTINUATION(*s))
11192 s--;
11193 if (! UTF8_IS_START(*s)) {
11194 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
11195 }
11196 }
11197 /* XXX could check well-formedness here */
11198 }
11199 if (off <= 0)
11200 return NULL;
11201 }
11202 return s;
11203 }
11204
11205
11206 /* when executing a regex that may have (?{}), extra stuff needs setting
11207 up that will be visible to the called code, even before the current
11208 match has finished. In particular:
11209
11210 * $_ is localised to the SV currently being matched;
11211 * pos($_) is created if necessary, ready to be updated on each call-out
11212 to code;
11213 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11214 isn't set until the current pattern is successfully finished), so that
11215 $1 etc of the match-so-far can be seen;
11216 * save the old values of subbeg etc of the current regex, and set then
11217 to the current string (again, this is normally only done at the end
11218 of execution)
11219 */
11220
11221 static void
S_setup_eval_state(pTHX_ regmatch_info * const reginfo)11222 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11223 {
11224 MAGIC *mg;
11225 regexp *const rex = ReANY(reginfo->prog);
11226 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11227
11228 eval_state->rex = rex;
11229 eval_state->sv = reginfo->sv;
11230
11231 if (reginfo->sv) {
11232 /* Make $_ available to executed code. */
11233 if (reginfo->sv != DEFSV) {
11234 SAVE_DEFSV;
11235 DEFSV_set(reginfo->sv);
11236 }
11237 /* will be dec'd by S_cleanup_regmatch_info_aux */
11238 SvREFCNT_inc_NN(reginfo->sv);
11239
11240 if (!(mg = mg_find_mglob(reginfo->sv))) {
11241 /* prepare for quick setting of pos */
11242 mg = sv_magicext_mglob(reginfo->sv);
11243 mg->mg_len = -1;
11244 }
11245 eval_state->pos_magic = mg;
11246 eval_state->pos = mg->mg_len;
11247 eval_state->pos_flags = mg->mg_flags;
11248 }
11249 else
11250 eval_state->pos_magic = NULL;
11251
11252 if (!PL_reg_curpm) {
11253 /* PL_reg_curpm is a fake PMOP that we can attach the current
11254 * regex to and point PL_curpm at, so that $1 et al are visible
11255 * within a /(?{})/. It's just allocated once per interpreter the
11256 * first time its needed */
11257 Newxz(PL_reg_curpm, 1, PMOP);
11258 #ifdef USE_ITHREADS
11259 {
11260 SV* const repointer = &PL_sv_undef;
11261 /* this regexp is also owned by the new PL_reg_curpm, which
11262 will try to free it. */
11263 av_push(PL_regex_padav, repointer);
11264 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11265 PL_regex_pad = AvARRAY(PL_regex_padav);
11266 }
11267 #endif
11268 }
11269 SET_reg_curpm(reginfo->prog);
11270 eval_state->curpm = PL_curpm;
11271 PL_curpm_under = PL_curpm;
11272 PL_curpm = PL_reg_curpm;
11273 if (RXp_MATCH_COPIED(rex)) {
11274 /* Here is a serious problem: we cannot rewrite subbeg,
11275 since it may be needed if this match fails. Thus
11276 $` inside (?{}) could fail... */
11277 eval_state->subbeg = RXp_SUBBEG(rex);
11278 eval_state->sublen = RXp_SUBLEN(rex);
11279 eval_state->suboffset = RXp_SUBOFFSET(rex);
11280 eval_state->subcoffset = RXp_SUBCOFFSET(rex);
11281 #ifdef PERL_ANY_COW
11282 eval_state->saved_copy = RXp_SAVED_COPY(rex);
11283 #endif
11284 RXp_MATCH_COPIED_off(rex);
11285 }
11286 else
11287 eval_state->subbeg = NULL;
11288 RXp_SUBBEG(rex) = (char *)reginfo->strbeg;
11289 RXp_SUBOFFSET(rex) = 0;
11290 RXp_SUBCOFFSET(rex) = 0;
11291 RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg;
11292 }
11293
11294
11295 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11296
11297 static void
S_cleanup_regmatch_info_aux(pTHX_ void * arg)11298 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11299 {
11300 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11301 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
11302 regmatch_slab *s;
11303
11304 Safefree(aux->poscache);
11305
11306 if (eval_state) {
11307
11308 /* undo the effects of S_setup_eval_state() */
11309
11310 if (eval_state->subbeg) {
11311 regexp * const rex = eval_state->rex;
11312 RXp_SUBBEG(rex) = eval_state->subbeg;
11313 RXp_SUBLEN(rex) = eval_state->sublen;
11314 RXp_SUBOFFSET(rex) = eval_state->suboffset;
11315 RXp_SUBCOFFSET(rex) = eval_state->subcoffset;
11316 #ifdef PERL_ANY_COW
11317 RXp_SAVED_COPY(rex) = eval_state->saved_copy;
11318 #endif
11319 RXp_MATCH_COPIED_on(rex);
11320 }
11321 if (eval_state->pos_magic)
11322 {
11323 eval_state->pos_magic->mg_len = eval_state->pos;
11324 eval_state->pos_magic->mg_flags =
11325 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11326 | (eval_state->pos_flags & MGf_BYTES);
11327 }
11328
11329 PL_curpm = eval_state->curpm;
11330 SvREFCNT_dec(eval_state->sv);
11331 }
11332
11333 PL_regmatch_state = aux->old_regmatch_state;
11334 PL_regmatch_slab = aux->old_regmatch_slab;
11335
11336 /* free all slabs above current one - this must be the last action
11337 * of this function, as aux and eval_state are allocated within
11338 * slabs and may be freed here */
11339
11340 s = PL_regmatch_slab->next;
11341 if (s) {
11342 PL_regmatch_slab->next = NULL;
11343 while (s) {
11344 regmatch_slab * const osl = s;
11345 s = s->next;
11346 Safefree(osl);
11347 }
11348 }
11349 }
11350
11351
11352 STATIC void
S_to_utf8_substr(pTHX_ regexp * prog)11353 S_to_utf8_substr(pTHX_ regexp *prog)
11354 {
11355 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11356 * on the converted value */
11357
11358 int i = 1;
11359
11360 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11361
11362 do {
11363 if (prog->substrs->data[i].substr
11364 && !prog->substrs->data[i].utf8_substr) {
11365 SV* const sv = newSVsv(prog->substrs->data[i].substr);
11366 prog->substrs->data[i].utf8_substr = sv;
11367 sv_utf8_upgrade(sv);
11368 if (SvVALID(prog->substrs->data[i].substr)) {
11369 if (SvTAIL(prog->substrs->data[i].substr)) {
11370 /* Trim the trailing \n that fbm_compile added last
11371 time. */
11372 SvCUR_set(sv, SvCUR(sv) - 1);
11373 /* Whilst this makes the SV technically "invalid" (as its
11374 buffer is no longer followed by "\0") when fbm_compile()
11375 adds the "\n" back, a "\0" is restored. */
11376 fbm_compile(sv, FBMcf_TAIL);
11377 } else
11378 fbm_compile(sv, 0);
11379 }
11380 if (prog->substrs->data[i].substr == prog->check_substr)
11381 prog->check_utf8 = sv;
11382 }
11383 } while (i--);
11384 }
11385
11386 STATIC bool
S_to_byte_substr(pTHX_ regexp * prog)11387 S_to_byte_substr(pTHX_ regexp *prog)
11388 {
11389 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11390 * on the converted value; returns FALSE if can't be converted. */
11391
11392 int i = 1;
11393
11394 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11395
11396 do {
11397 if (prog->substrs->data[i].utf8_substr
11398 && !prog->substrs->data[i].substr) {
11399 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11400 if (! sv_utf8_downgrade(sv, TRUE)) {
11401 SvREFCNT_dec_NN(sv);
11402 return FALSE;
11403 }
11404 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11405 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11406 /* Trim the trailing \n that fbm_compile added last
11407 time. */
11408 SvCUR_set(sv, SvCUR(sv) - 1);
11409 fbm_compile(sv, FBMcf_TAIL);
11410 } else
11411 fbm_compile(sv, 0);
11412 }
11413 prog->substrs->data[i].substr = sv;
11414 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11415 prog->check_substr = sv;
11416 }
11417 } while (i--);
11418
11419 return TRUE;
11420 }
11421
11422 #ifndef PERL_IN_XSUB_RE
11423
11424 bool
Perl_is_grapheme(pTHX_ const U8 * strbeg,const U8 * s,const U8 * strend,const UV cp)11425 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11426 {
11427 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11428 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11429 * the larger string bounded by 'strbeg' and 'strend'.
11430 *
11431 * 'cp' needs to be assigned (if not, a future version of the Unicode
11432 * Standard could make it something that combines with adjacent characters,
11433 * so code using it would then break), and there has to be a GCB break
11434 * before and after the character. */
11435
11436
11437 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11438 const U8 * prev_cp_start;
11439
11440 PERL_ARGS_ASSERT_IS_GRAPHEME;
11441
11442 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11443 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11444 {
11445 /* These are considered graphemes */
11446 return TRUE;
11447 }
11448
11449 /* Otherwise, unassigned code points are forbidden */
11450 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11451 _invlist_search(PL_Assigned_invlist, cp))))
11452 {
11453 return FALSE;
11454 }
11455
11456 cp_gcb_val = getGCB_VAL_CP(cp);
11457
11458 /* Find the GCB value of the previous code point in the input */
11459 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11460 if (UNLIKELY(prev_cp_start == s)) {
11461 prev_cp_gcb_val = GCB_EDGE;
11462 }
11463 else {
11464 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11465 }
11466
11467 /* And check that is a grapheme boundary */
11468 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11469 TRUE /* is UTF-8 encoded */ ))
11470 {
11471 return FALSE;
11472 }
11473
11474 /* Similarly verify there is a break between the current character and the
11475 * following one */
11476 s += UTF8SKIP(s);
11477 if (s >= strend) {
11478 next_cp_gcb_val = GCB_EDGE;
11479 }
11480 else {
11481 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11482 }
11483
11484 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11485 }
11486
11487 /*
11488 =for apidoc_section $unicode
11489
11490 =for apidoc isSCRIPT_RUN
11491
11492 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11493 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11494 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11495 two degenerate cases given below, this function returns TRUE iff all code
11496 points in it come from any combination of three "scripts" given by the Unicode
11497 "Script Extensions" property: Common, Inherited, and possibly one other.
11498 Additionally all decimal digits must come from the same consecutive sequence of
11499 10.
11500
11501 For example, if all the characters in the sequence are Greek, or Common, or
11502 Inherited, this function will return TRUE, provided any decimal digits in it
11503 are from the same block of digits in Common. (These are the ASCII digits
11504 "0".."9" and additionally a block for full width forms of these, and several
11505 others used in mathematical notation.) For scripts (unlike Greek) that have
11506 their own digits defined this will accept either digits from that set or from
11507 one of the Common digit sets, but not a combination of the two. Some scripts,
11508 such as Arabic, have more than one set of digits. All digits must come from
11509 the same set for this function to return TRUE.
11510
11511 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11512 contain the script found, using the C<SCX_enum> typedef. Its value will be
11513 C<SCX_INVALID> if the function returns FALSE.
11514
11515 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11516 will be C<SCX_INVALID>.
11517
11518 If the sequence contains a single code point which is unassigned to a character
11519 in the version of Unicode being used, the function will return TRUE, and the
11520 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11521 in the input sequence will result in the function treating the input as not
11522 being a script run.
11523
11524 The returned script will be C<SCX_Inherited> iff all the code points in it are
11525 from the Inherited script.
11526
11527 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11528 it are from the Inherited or Common scripts.
11529
11530 =cut
11531
11532 */
11533
11534 bool
Perl_isSCRIPT_RUN(pTHX_ const U8 * s,const U8 * send,const bool utf8_target)11535 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11536 {
11537 /* Basically, it looks at each character in the sequence to see if the
11538 * above conditions are met; if not it fails. It uses an inversion map to
11539 * find the enum corresponding to the script of each character. But this
11540 * is complicated by the fact that a few code points can be in any of
11541 * several scripts. The data has been constructed so that there are
11542 * additional enum values (all negative) for these situations. The
11543 * absolute value of those is an index into another table which contains
11544 * pointers to auxiliary tables for each such situation. Each aux array
11545 * lists all the scripts for the given situation. There is another,
11546 * parallel, table that gives the number of entries in each aux table.
11547 * These are all defined in charclass_invlists.h */
11548
11549 /* XXX Here are the additional things UTS 39 says could be done:
11550 *
11551 * Forbid sequences of the same nonspacing mark
11552 *
11553 * Check to see that all the characters are in the sets of exemplar
11554 * characters for at least one language in the Unicode Common Locale Data
11555 * Repository [CLDR]. */
11556
11557
11558 /* Things that match /\d/u */
11559 SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11560 UV * decimals_array = invlist_array(decimals_invlist);
11561
11562 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11563 * not currently known) */
11564 UV zero_of_run = 0;
11565
11566 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11567 SCX_enum script_of_char = SCX_INVALID;
11568
11569 /* If the script remains not fully determined from iteration to iteration,
11570 * this is the current intersection of the possiblities. */
11571 SCX_enum * intersection = NULL;
11572 PERL_UINT_FAST8_T intersection_len = 0;
11573
11574 bool retval = TRUE;
11575 SCX_enum * ret_script = NULL;
11576
11577 assert(send >= s);
11578
11579 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11580
11581 /* All code points in 0..255 are either Common or Latin, so must be a
11582 * script run. We can return immediately unless we need to know which
11583 * script it is. */
11584 if (! utf8_target && LIKELY(send > s)) {
11585 if (ret_script == NULL) {
11586 return TRUE;
11587 }
11588
11589 /* If any character is Latin, the run is Latin */
11590 while (s < send) {
11591 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11592 *ret_script = SCX_Latin;
11593 return TRUE;
11594 }
11595 }
11596
11597 /* Here, all are Common */
11598 *ret_script = SCX_Common;
11599 return TRUE;
11600 }
11601
11602 /* Look at each character in the sequence */
11603 while (s < send) {
11604 /* If the current character being examined is a digit, this is the code
11605 * point of the zero for its sequence of 10 */
11606 UV zero_of_char;
11607
11608 UV cp;
11609
11610 /* The code allows all scripts to use the ASCII digits. This is
11611 * because they are in the Common script. Hence any ASCII ones found
11612 * are ok, unless and until a digit from another set has already been
11613 * encountered. digit ranges in Common are not similarly blessed) */
11614 if (UNLIKELY(isDIGIT(*s))) {
11615 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11616 retval = FALSE;
11617 break;
11618 }
11619 if (zero_of_run) {
11620 if (zero_of_run != '0') {
11621 retval = FALSE;
11622 break;
11623 }
11624 }
11625 else {
11626 zero_of_run = '0';
11627 }
11628 s++;
11629 continue;
11630 }
11631
11632 /* Here, isn't an ASCII digit. Find the code point of the character */
11633 if (! UTF8_IS_INVARIANT(*s)) {
11634 Size_t len;
11635 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11636 s += len;
11637 }
11638 else {
11639 cp = *(s++);
11640 }
11641
11642 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11643 * digit in that script. We can skip the rest of this code for this
11644 * character. */
11645 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11646 continue;
11647 }
11648
11649 /* Find the character's script. The correct values are hard-coded here
11650 * for small-enough code points. */
11651 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11652 unlikely to change */
11653 if ( cp > 255
11654 || ( isALPHA_L1(cp)
11655 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11656 {
11657 script_of_char = SCX_Latin;
11658 }
11659 else {
11660 script_of_char = SCX_Common;
11661 }
11662 }
11663 else {
11664 script_of_char = _Perl_SCX_invmap[
11665 _invlist_search(PL_SCX_invlist, cp)];
11666 }
11667
11668 /* We arbitrarily accept a single unassigned character, but not in
11669 * combination with anything else, and not a run of them. */
11670 if ( UNLIKELY(script_of_run == SCX_Unknown)
11671 || UNLIKELY( script_of_run != SCX_INVALID
11672 && script_of_char == SCX_Unknown))
11673 {
11674 retval = FALSE;
11675 break;
11676 }
11677
11678 /* For the first character, or the run is inherited, the run's script
11679 * is set to the char's */
11680 if ( UNLIKELY(script_of_run == SCX_INVALID)
11681 || UNLIKELY(script_of_run == SCX_Inherited))
11682 {
11683 script_of_run = script_of_char;
11684 }
11685
11686 /* For the character's script to be Unknown, it must be the first
11687 * character in the sequence (for otherwise a test above would have
11688 * prevented us from reaching here), and we have set the run's script
11689 * to it. Nothing further to be done for this character */
11690 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11691 continue;
11692 }
11693
11694 /* We accept 'inherited' script characters currently even at the
11695 * beginning. (We know that no characters in Inherited are digits, or
11696 * we'd have to check for that) */
11697 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11698 continue;
11699 }
11700
11701 /* If the run so far is Common, and the new character isn't, change the
11702 * run's script to that of this character */
11703 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11704 script_of_run = script_of_char;
11705 }
11706
11707 /* Now we can see if the script of the new character is the same as
11708 * that of the run */
11709 if (LIKELY(script_of_char == script_of_run)) {
11710 /* By far the most common case */
11711 goto scripts_match;
11712 }
11713
11714 /* Here, the script of the run isn't Common. But characters in Common
11715 * match any script */
11716 if (script_of_char == SCX_Common) {
11717 goto scripts_match;
11718 }
11719
11720 #ifndef HAS_SCX_AUX_TABLES
11721
11722 /* Too early a Unicode version to have a code point belonging to more
11723 * than one script, so, if the scripts don't exactly match, fail */
11724 PERL_UNUSED_VAR(intersection_len);
11725 retval = FALSE;
11726 break;
11727
11728 #else
11729
11730 /* Here there is no exact match between the character's script and the
11731 * run's. And we've handled the special cases of scripts Unknown,
11732 * Inherited, and Common.
11733 *
11734 * Negative script numbers signify that the value may be any of several
11735 * scripts, and we need to look at auxiliary information to make our
11736 * determination. But if both are non-negative, we can fail now */
11737 if (LIKELY(script_of_char >= 0)) {
11738 const SCX_enum * search_in;
11739 PERL_UINT_FAST8_T search_in_len;
11740 PERL_UINT_FAST8_T i;
11741
11742 if (LIKELY(script_of_run >= 0)) {
11743 retval = FALSE;
11744 break;
11745 }
11746
11747 /* Use the previously constructed set of possible scripts, if any.
11748 * */
11749 if (intersection) {
11750 search_in = intersection;
11751 search_in_len = intersection_len;
11752 }
11753 else {
11754 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11755 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11756 }
11757
11758 for (i = 0; i < search_in_len; i++) {
11759 if (search_in[i] == script_of_char) {
11760 script_of_run = script_of_char;
11761 goto scripts_match;
11762 }
11763 }
11764
11765 retval = FALSE;
11766 break;
11767 }
11768 else if (LIKELY(script_of_run >= 0)) {
11769 /* script of character could be one of several, but run is a single
11770 * script */
11771 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11772 const PERL_UINT_FAST8_T search_in_len
11773 = SCX_AUX_TABLE_lengths[-script_of_char];
11774 PERL_UINT_FAST8_T i;
11775
11776 for (i = 0; i < search_in_len; i++) {
11777 if (search_in[i] == script_of_run) {
11778 script_of_char = script_of_run;
11779 goto scripts_match;
11780 }
11781 }
11782
11783 retval = FALSE;
11784 break;
11785 }
11786 else {
11787 /* Both run and char could be in one of several scripts. If the
11788 * intersection is empty, then this character isn't in this script
11789 * run. Otherwise, we need to calculate the intersection to use
11790 * for future iterations of the loop, unless we are already at the
11791 * final character */
11792 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11793 const PERL_UINT_FAST8_T char_len
11794 = SCX_AUX_TABLE_lengths[-script_of_char];
11795 const SCX_enum * search_run;
11796 PERL_UINT_FAST8_T run_len;
11797
11798 SCX_enum * new_overlap = NULL;
11799 PERL_UINT_FAST8_T i, j;
11800
11801 if (intersection) {
11802 search_run = intersection;
11803 run_len = intersection_len;
11804 }
11805 else {
11806 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11807 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11808 }
11809
11810 intersection_len = 0;
11811
11812 for (i = 0; i < run_len; i++) {
11813 for (j = 0; j < char_len; j++) {
11814 if (search_run[i] == search_char[j]) {
11815
11816 /* Here, the script at i,j matches. That means this
11817 * character is in the run. But continue on to find
11818 * the complete intersection, for the next loop
11819 * iteration, and for the digit check after it.
11820 *
11821 * On the first found common script, we malloc space
11822 * for the intersection list for the worst case of the
11823 * intersection, which is the minimum of the number of
11824 * scripts remaining in each set. */
11825 if (intersection_len == 0) {
11826 Newx(new_overlap,
11827 MIN(run_len - i, char_len - j),
11828 SCX_enum);
11829 }
11830 new_overlap[intersection_len++] = search_run[i];
11831 }
11832 }
11833 }
11834
11835 /* Here we've looked through everything. If they have no scripts
11836 * in common, not a run */
11837 if (intersection_len == 0) {
11838 retval = FALSE;
11839 break;
11840 }
11841
11842 /* If there is only a single script in common, set to that.
11843 * Otherwise, use the intersection going forward */
11844 Safefree(intersection);
11845 intersection = NULL;
11846 if (intersection_len == 1) {
11847 script_of_run = script_of_char = new_overlap[0];
11848 Safefree(new_overlap);
11849 new_overlap = NULL;
11850 }
11851 else {
11852 intersection = new_overlap;
11853 }
11854 }
11855
11856 #endif
11857
11858 scripts_match:
11859
11860 /* Here, the script of the character is compatible with that of the
11861 * run. That means that in most cases, it continues the script run.
11862 * Either it and the run match exactly, or one or both can be in any of
11863 * several scripts, and the intersection is not empty. However, if the
11864 * character is a decimal digit, it could still mean failure if it is
11865 * from the wrong sequence of 10. So, we need to look at if it's a
11866 * digit. We've already handled the 10 digits [0-9], and the next
11867 * lowest one is this one: */
11868 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11869 continue; /* Not a digit; this character is part of the run */
11870 }
11871
11872 /* If we have a definitive '0' for the script of this character, we
11873 * know that for this to be a digit, it must be in the range of +0..+9
11874 * of that zero. */
11875 if ( script_of_char >= 0
11876 && (zero_of_char = script_zeros[script_of_char]))
11877 {
11878 if (! withinCOUNT(cp, zero_of_char, 9)) {
11879 continue; /* Not a digit; this character is part of the run
11880 */
11881 }
11882
11883 }
11884 else { /* Need to look up if this character is a digit or not */
11885 SSize_t index_of_zero_of_char;
11886 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11887 if ( UNLIKELY(index_of_zero_of_char < 0)
11888 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11889 {
11890 continue; /* Not a digit; this character is part of the run.
11891 */
11892 }
11893
11894 zero_of_char = decimals_array[index_of_zero_of_char];
11895 }
11896
11897 /* Here, the character is a decimal digit, and the zero of its sequence
11898 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11899 * they better be the same. */
11900 if (zero_of_run) {
11901 if (zero_of_run != zero_of_char) {
11902 retval = FALSE;
11903 break;
11904 }
11905 }
11906 else { /* Otherwise we now have a zero for this run */
11907 zero_of_run = zero_of_char;
11908 }
11909 } /* end of looping through CLOSESR text */
11910
11911 Safefree(intersection);
11912
11913 if (ret_script != NULL) {
11914 if (retval) {
11915 *ret_script = script_of_run;
11916 }
11917 else {
11918 *ret_script = SCX_INVALID;
11919 }
11920 }
11921
11922 return retval;
11923 }
11924 #endif /* ifndef PERL_IN_XSUB_RE */
11925
11926 /* Buffer logic. */
11927 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)11928 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
11929 const U32 flags)
11930 {
11931 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
11932
11933 PERL_UNUSED_ARG(value);
11934
11935 if (flags & RXapif_FETCH) {
11936 return reg_named_buff_fetch(rx, key, flags);
11937 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
11938 Perl_croak_no_modify();
11939 return NULL;
11940 } else if (flags & RXapif_EXISTS) {
11941 return reg_named_buff_exists(rx, key, flags)
11942 ? &PL_sv_yes
11943 : &PL_sv_no;
11944 } else if (flags & RXapif_REGNAMES) {
11945 return reg_named_buff_all(rx, flags);
11946 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
11947 return reg_named_buff_scalar(rx, flags);
11948 } else {
11949 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
11950 return NULL;
11951 }
11952 }
11953
11954 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)11955 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
11956 const U32 flags)
11957 {
11958 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
11959 PERL_UNUSED_ARG(lastkey);
11960
11961 if (flags & RXapif_FIRSTKEY)
11962 return reg_named_buff_firstkey(rx, flags);
11963 else if (flags & RXapif_NEXTKEY)
11964 return reg_named_buff_nextkey(rx, flags);
11965 else {
11966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
11967 (int)flags);
11968 return NULL;
11969 }
11970 }
11971
11972 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)11973 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
11974 const U32 flags)
11975 {
11976 SV *ret;
11977 struct regexp *const rx = ReANY(r);
11978
11979 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
11980
11981 if (rx && RXp_PAREN_NAMES(rx)) {
11982 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
11983 if (he_str) {
11984 IV i;
11985 SV* sv_dat=HeVAL(he_str);
11986 I32 *nums=(I32*)SvPVX(sv_dat);
11987 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
11988 for ( i=0; i<SvIVX(sv_dat); i++ ) {
11989 if ((I32)(rx->nparens) >= nums[i]
11990 && RXp_OFFS_VALID(rx,nums[i]))
11991 {
11992 ret = newSVpvs("");
11993 Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE);
11994 if (!retarray)
11995 return ret;
11996 } else {
11997 if (retarray)
11998 ret = newSV_type(SVt_NULL);
11999 }
12000 if (retarray)
12001 av_push_simple(retarray, ret);
12002 }
12003 if (retarray)
12004 return newRV_noinc(MUTABLE_SV(retarray));
12005 }
12006 }
12007 return NULL;
12008 }
12009
12010 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)12011 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
12012 const U32 flags)
12013 {
12014 struct regexp *const rx = ReANY(r);
12015
12016 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
12017
12018 if (rx && RXp_PAREN_NAMES(rx)) {
12019 if (flags & RXapif_ALL) {
12020 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
12021 } else {
12022 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
12023 if (sv) {
12024 SvREFCNT_dec_NN(sv);
12025 return TRUE;
12026 } else {
12027 return FALSE;
12028 }
12029 }
12030 } else {
12031 return FALSE;
12032 }
12033 }
12034
12035 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)12036 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
12037 {
12038 struct regexp *const rx = ReANY(r);
12039
12040 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
12041
12042 if ( rx && RXp_PAREN_NAMES(rx) ) {
12043 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
12044
12045 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
12046 } else {
12047 return FALSE;
12048 }
12049 }
12050
12051 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)12052 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
12053 {
12054 struct regexp *const rx = ReANY(r);
12055 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12056
12057 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
12058
12059 if (rx && RXp_PAREN_NAMES(rx)) {
12060 HV *hv = RXp_PAREN_NAMES(rx);
12061 HE *temphe;
12062 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12063 IV i;
12064 IV parno = 0;
12065 SV* sv_dat = HeVAL(temphe);
12066 I32 *nums = (I32*)SvPVX(sv_dat);
12067 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12068 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12069 RXp_OFFS_VALID(rx,nums[i]))
12070 {
12071 parno = nums[i];
12072 break;
12073 }
12074 }
12075 if (parno || flags & RXapif_ALL) {
12076 return newSVhek(HeKEY_hek(temphe));
12077 }
12078 }
12079 }
12080 return NULL;
12081 }
12082
12083 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)12084 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
12085 {
12086 SV *ret;
12087 AV *av;
12088 SSize_t length;
12089 struct regexp *const rx = ReANY(r);
12090
12091 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
12092
12093 if (rx && RXp_PAREN_NAMES(rx)) {
12094 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
12095 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
12096 } else if (flags & RXapif_ONE) {
12097 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
12098 av = MUTABLE_AV(SvRV(ret));
12099 length = av_count(av);
12100 SvREFCNT_dec_NN(ret);
12101 return newSViv(length);
12102 } else {
12103 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
12104 (int)flags);
12105 return NULL;
12106 }
12107 }
12108 return &PL_sv_undef;
12109 }
12110
12111 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)12112 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
12113 {
12114 struct regexp *const rx = ReANY(r);
12115 AV *av = newAV();
12116
12117 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
12118
12119 if (rx && RXp_PAREN_NAMES(rx)) {
12120 HV *hv= RXp_PAREN_NAMES(rx);
12121 HE *temphe;
12122 (void)hv_iterinit(hv);
12123 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
12124 IV i;
12125 IV parno = 0;
12126 SV* sv_dat = HeVAL(temphe);
12127 I32 *nums = (I32*)SvPVX(sv_dat);
12128 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
12129 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] &&
12130 RXp_OFFS_VALID(rx,nums[i]))
12131 {
12132 parno = nums[i];
12133 break;
12134 }
12135 }
12136 if (parno || flags & RXapif_ALL) {
12137 av_push_simple(av, newSVhek(HeKEY_hek(temphe)));
12138 }
12139 }
12140 }
12141
12142 return newRV_noinc(MUTABLE_SV(av));
12143 }
12144
12145 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re,const I32 paren,SV * const sv)12146 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren,
12147 SV * const sv)
12148 {
12149 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
12150 Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0);
12151 }
12152
12153 #ifndef PERL_IN_XSUB_RE
12154
12155 void
Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re,const I32 paren,SV * const sv,U32 flags)12156 Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren,
12157 SV * const sv, U32 flags)
12158 {
12159 struct regexp *const rx = ReANY(re);
12160 char *s = NULL;
12161 SSize_t i,t = 0;
12162 SSize_t s1, t1;
12163 I32 n = paren;
12164 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12165
12166 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS;
12167
12168 if ( n == RX_BUFF_IDX_CARET_PREMATCH
12169 || n == RX_BUFF_IDX_CARET_FULLMATCH
12170 || n == RX_BUFF_IDX_CARET_POSTMATCH
12171 )
12172 {
12173 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12174 if (!keepcopy) {
12175 /* on something like
12176 * $r = qr/.../;
12177 * /$qr/p;
12178 * the KEEPCOPY is set on the PMOP rather than the regex */
12179 if (PL_curpm && re == PM_GETRE(PL_curpm))
12180 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12181 }
12182 if (!keepcopy)
12183 goto ret_undef;
12184 }
12185
12186 if (!RXp_SUBBEG(rx))
12187 goto ret_undef;
12188
12189 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
12190 /* no need to distinguish between them any more */
12191 n = RX_BUFF_IDX_FULLMATCH;
12192
12193 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
12194 && (i = RXp_OFFS_START(rx,0)) != -1)
12195 {
12196 /* $`, ${^PREMATCH} */
12197 s = RXp_SUBBEG(rx);
12198 }
12199 else
12200 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
12201 && (t = RXp_OFFS_END(rx,0)) != -1)
12202 {
12203 /* $', ${^POSTMATCH} */
12204 s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t;
12205 i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t;
12206 }
12207 else /* when flags is true we do an absolute lookup, and compare against rx->nparens */
12208 if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) {
12209 I32 *map = (!flags && n) ? rx->logical_to_parno : NULL;
12210 I32 true_parno = map ? map[n] : n;
12211 do {
12212 if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1) &&
12213 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1))
12214 {
12215 /* $&, ${^MATCH}, $1 ... */
12216 i = t1 - s1;
12217 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx);
12218 goto found_it;
12219 }
12220 else if (map) {
12221 true_parno = rx->parno_to_logical_next[true_parno];
12222 }
12223 else {
12224 break;
12225 }
12226 } while (true_parno);
12227 goto ret_undef;
12228 } else {
12229 goto ret_undef;
12230 }
12231
12232 found_it:
12233 assert(s >= RXp_SUBBEG(rx));
12234 assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) );
12235 if (i >= 0) {
12236 #ifdef NO_TAINT_SUPPORT
12237 sv_setpvn(sv, s, i);
12238 #else
12239 const int oldtainted = TAINT_get;
12240 TAINT_NOT;
12241 sv_setpvn(sv, s, i);
12242 TAINT_set(oldtainted);
12243 #endif
12244 if (RXp_MATCH_UTF8(rx))
12245 SvUTF8_on(sv);
12246 else
12247 SvUTF8_off(sv);
12248 if (TAINTING_get) {
12249 if (RXp_MATCH_TAINTED(rx)) {
12250 if (SvTYPE(sv) >= SVt_PVMG) {
12251 MAGIC* const mg = SvMAGIC(sv);
12252 MAGIC* mgt;
12253 TAINT;
12254 SvMAGIC_set(sv, mg->mg_moremagic);
12255 SvTAINT(sv);
12256 if ((mgt = SvMAGIC(sv))) {
12257 mg->mg_moremagic = mgt;
12258 SvMAGIC_set(sv, mg);
12259 }
12260 } else {
12261 TAINT;
12262 SvTAINT(sv);
12263 }
12264 } else
12265 SvTAINTED_off(sv);
12266 }
12267 } else {
12268 ret_undef:
12269 sv_set_undef(sv);
12270 return;
12271 }
12272 }
12273
12274 #endif
12275
12276 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)12277 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
12278 SV const * const value)
12279 {
12280 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
12281
12282 PERL_UNUSED_ARG(rx);
12283 PERL_UNUSED_ARG(paren);
12284 PERL_UNUSED_ARG(value);
12285
12286 if (!PL_localizing)
12287 Perl_croak_no_modify();
12288 }
12289
12290 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)12291 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
12292 const I32 paren)
12293 {
12294 struct regexp *const rx = ReANY(r);
12295 I32 i,j;
12296 I32 s1, t1;
12297 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens;
12298
12299 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
12300
12301 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
12302 || paren == RX_BUFF_IDX_CARET_FULLMATCH
12303 || paren == RX_BUFF_IDX_CARET_POSTMATCH
12304 )
12305 {
12306 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
12307 if (!keepcopy) {
12308 /* on something like
12309 * $r = qr/.../;
12310 * /$qr/p;
12311 * the KEEPCOPY is set on the PMOP rather than the regex */
12312 if (PL_curpm && r == PM_GETRE(PL_curpm))
12313 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
12314 }
12315 if (!keepcopy)
12316 goto warn_undef;
12317 }
12318
12319 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
12320 switch (paren) {
12321 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
12322 case RX_BUFF_IDX_PREMATCH: /* $` */
12323 if ( (i = RXp_OFFS_START(rx,0)) != -1) {
12324 if (i > 0) {
12325 s1 = 0;
12326 t1 = i;
12327 goto getlen;
12328 }
12329 }
12330 return 0;
12331
12332 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
12333 case RX_BUFF_IDX_POSTMATCH: /* $' */
12334 if ( (j = RXp_OFFS_END(rx,0)) != -1 ) {
12335 i = RXp_SUBLEN(rx) - j;
12336 if (i > 0) {
12337 s1 = j;
12338 t1 = RXp_SUBLEN(rx);
12339 goto getlen;
12340 }
12341 }
12342 return 0;
12343
12344 default: /* $& / ${^MATCH}, $1, $2, ... */
12345 if (paren <= logical_nparens) {
12346 I32 true_paren = rx->logical_to_parno
12347 ? rx->logical_to_parno[paren]
12348 : paren;
12349 do {
12350 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) &&
12351 ((t1 = RXp_OFFS_END(rx,true_paren)) != -1))
12352 {
12353 i = t1 - s1;
12354 goto getlen;
12355 } else if (rx->parno_to_logical_next) {
12356 true_paren = rx->parno_to_logical_next[true_paren];
12357 } else {
12358 break;
12359 }
12360 } while(true_paren);
12361 }
12362 warn_undef:
12363 if (ckWARN(WARN_UNINITIALIZED))
12364 report_uninit((const SV *)sv);
12365 return 0;
12366 }
12367 getlen:
12368 if (i > 0 && RXp_MATCH_UTF8(rx)) {
12369 const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1;
12370 const U8 *ep;
12371 STRLEN el;
12372
12373 i = t1 - s1;
12374 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
12375 i = el;
12376 }
12377 return i;
12378 }
12379
12380 /*
12381 * ex: set ts=8 sts=4 sw=4 et:
12382 */
12383