1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_STUDY_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21 #define INIT_AND_WITHP \
22 assert(!and_withp); \
23 Newx(and_withp, 1, regnode_ssc); \
24 SAVEFREEPV(and_withp)
25
26
27 STATIC void
S_unwind_scan_frames(pTHX_ const void * p)28 S_unwind_scan_frames(pTHX_ const void *p)
29 {
30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31 scan_frame *f= (scan_frame *)p;
32 do {
33 scan_frame *n= f->next_frame;
34 Safefree(f);
35 f= n;
36 } while (f);
37 }
38
39 /* Follow the next-chain of the current node and optimize away
40 all the NOTHINGs from it.
41 */
42 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)43 S_rck_elide_nothing(pTHX_ regnode *node)
44 {
45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
46
47 if (OP(node) != CURLYX) {
48 const int max = (REGNODE_OFF_BY_ARG(OP(node))
49 ? I32_MAX
50 /* I32 may be smaller than U16 on CRAYs! */
51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node));
53 int noff;
54 regnode *n = node;
55
56 /* Skip NOTHING and LONGJMP. */
57 while (
58 (n = regnext(n))
59 && (
60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61 || ((OP(n) == LONGJMP) && (noff = ARG1u(n)))
62 )
63 && off + noff < max
64 ) {
65 off += noff;
66 }
67 if (REGNODE_OFF_BY_ARG(OP(node)))
68 ARG1u(node) = off;
69 else
70 NEXT_OFF(node) = off;
71 }
72 return;
73 }
74
75
76 /*
77 * As best we can, determine the characters that can match the start of
78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
79 * can be false positive matches
80 *
81 * Returns the invlist as a new SV*; it is the caller's responsibility to
82 * call SvREFCNT_dec() when done with it.
83 */
84 STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
86 {
87 const U8 * s = (U8*)STRING(node);
88 SSize_t bytelen = STR_LEN(node);
89 UV uc;
90 /* Start out big enough for 2 separate code points */
91 SV* invlist = _new_invlist(4);
92
93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
94
95 if (! UTF) {
96 uc = *s;
97
98 /* We punt and assume can match anything if the node begins
99 * with a multi-character fold. Things are complicated. For
100 * example, /ffi/i could match any of:
101 * "\N{LATIN SMALL LIGATURE FFI}"
102 * "\N{LATIN SMALL LIGATURE FF}I"
103 * "F\N{LATIN SMALL LIGATURE FI}"
104 * plus several other things; and making sure we have all the
105 * possibilities is hard. */
106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
108 }
109 else {
110 /* Any Latin1 range character can potentially match any
111 * other depending on the locale, and in Turkic locales, 'I' and
112 * 'i' can match U+130 and U+131 */
113 if (OP(node) == EXACTFL) {
114 _invlist_union(invlist, PL_Latin1, &invlist);
115 if (isALPHA_FOLD_EQ(uc, 'I')) {
116 invlist = add_cp_to_invlist(invlist,
117 LATIN_SMALL_LETTER_DOTLESS_I);
118 invlist = add_cp_to_invlist(invlist,
119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
120 }
121 }
122 else {
123 /* But otherwise, it matches at least itself. We can
124 * quickly tell if it has a distinct fold, and if so,
125 * it matches that as well */
126 invlist = add_cp_to_invlist(invlist, uc);
127 if (IS_IN_SOME_FOLD_L1(uc))
128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
129 }
130
131 /* Some characters match above-Latin1 ones under /i. This
132 * is true of EXACTFL ones when the locale is UTF-8 */
133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
135 EXACTFAA_NO_TRIE)))
136 {
137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
138 }
139 }
140 }
141 else { /* Pattern is UTF-8 */
142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143 const U8* e = s + bytelen;
144 IV fc;
145
146 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
147
148 /* The only code points that aren't folded in a UTF EXACTFish
149 * node are the problematic ones in EXACTFL nodes */
150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151 /* We need to check for the possibility that this EXACTFL
152 * node begins with a multi-char fold. Therefore we fold
153 * the first few characters of it so that we can make that
154 * check */
155 U8 *d = folded;
156 int i;
157
158 fc = -1;
159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
160 if (isASCII(*s)) {
161 *(d++) = (U8) toFOLD(*s);
162 if (fc < 0) { /* Save the first fold */
163 fc = *(d-1);
164 }
165 s++;
166 }
167 else {
168 STRLEN len;
169 UV fold = toFOLD_utf8_safe(s, e, d, &len);
170 if (fc < 0) { /* Save the first fold */
171 fc = fold;
172 }
173 d += len;
174 s += UTF8SKIP(s);
175 }
176 }
177
178 /* And set up so the code below that looks in this folded
179 * buffer instead of the node's string */
180 e = d;
181 s = folded;
182 }
183
184 /* When we reach here 's' points to the fold of the first
185 * character(s) of the node; and 'e' points to far enough along
186 * the folded string to be just past any possible multi-char
187 * fold.
188 *
189 * Like the non-UTF case above, we punt if the node begins with a
190 * multi-char fold */
191
192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
194 }
195 else { /* Single char fold */
196 unsigned int k;
197 U32 first_fold;
198 const U32 * remaining_folds;
199 Size_t folds_count;
200
201 /* It matches itself */
202 invlist = add_cp_to_invlist(invlist, fc);
203
204 /* ... plus all the things that fold to it, which are found in
205 * PL_utf8_foldclosures */
206 folds_count = _inverse_folds(fc, &first_fold,
207 &remaining_folds);
208 for (k = 0; k < folds_count; k++) {
209 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
210
211 /* /aa doesn't allow folds between ASCII and non- */
212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213 && isASCII(c) != isASCII(fc))
214 {
215 continue;
216 }
217
218 invlist = add_cp_to_invlist(invlist, c);
219 }
220
221 if (OP(node) == EXACTFL) {
222
223 /* If either [iI] are present in an EXACTFL node the above code
224 * should have added its normal case pair, but under a Turkish
225 * locale they could match instead the case pairs from it. Add
226 * those as potential matches as well */
227 if (isALPHA_FOLD_EQ(fc, 'I')) {
228 invlist = add_cp_to_invlist(invlist,
229 LATIN_SMALL_LETTER_DOTLESS_I);
230 invlist = add_cp_to_invlist(invlist,
231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
232 }
233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234 invlist = add_cp_to_invlist(invlist, 'I');
235 }
236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237 invlist = add_cp_to_invlist(invlist, 'i');
238 }
239 }
240 }
241 }
242
243 return invlist;
244 }
245
246
247 /* Mark that we cannot extend a found fixed substring at this point.
248 Update the longest found anchored substring or the longest found
249 floating substrings if needed. */
250
251 void
Perl_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253 SSize_t *minlenp, int is_inf)
254 {
255 const STRLEN l = CHR_SVLEN(data->last_found);
256 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257 const STRLEN old_l = CHR_SVLEN(longest_sv);
258 DECLARE_AND_GET_RE_DEBUG_FLAGS;
259
260 PERL_ARGS_ASSERT_SCAN_COMMIT;
261
262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263 const U8 i = data->cur_is_floating;
264 SvSetMagicSV(longest_sv, data->last_found);
265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
266
267 if (!i) /* fixed */
268 data->substrs[0].max_offset = data->substrs[0].min_offset;
269 else { /* float */
270 data->substrs[1].max_offset =
271 (is_inf)
272 ? OPTIMIZE_INFTY
273 : (l
274 ? data->last_start_max
275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
276 ? OPTIMIZE_INFTY
277 : data->pos_min + data->pos_delta));
278 }
279
280 data->substrs[i].flags &= ~SF_BEFORE_EOL;
281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282 data->substrs[i].minlenp = minlenp;
283 data->substrs[i].lookbehind = 0;
284 }
285
286 SvCUR_set(data->last_found, 0);
287 {
288 SV * const sv = data->last_found;
289 if (SvUTF8(sv) && SvMAGICAL(sv)) {
290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
291 if (mg)
292 mg->mg_len = 0;
293 }
294 }
295 data->last_end = -1;
296 data->flags &= ~SF_BEFORE_EOL;
297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
298 }
299
300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301 * list that describes which code points it matches */
302
303 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
305 {
306 /* Set the SSC 'ssc' to match an empty string or any code point */
307
308 PERL_ARGS_ASSERT_SSC_ANYTHING;
309
310 assert(is_ANYOF_SYNTHETIC(ssc));
311
312 /* mortalize so won't leak */
313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
315 }
316
317 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)318 S_ssc_is_anything(const regnode_ssc *ssc)
319 {
320 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
321 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
322 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
323 * in any way, so there's no point in using it */
324
325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
326 bool ret;
327
328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
329
330 assert(is_ANYOF_SYNTHETIC(ssc));
331
332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
333 return FALSE;
334 }
335
336 /* See if the list consists solely of the range 0 - Infinity */
337 invlist_iterinit(ssc->invlist);
338 ret = invlist_iternext(ssc->invlist, &start, &end)
339 && start == 0
340 && end == UV_MAX;
341
342 invlist_iterfinish(ssc->invlist);
343
344 if (ret) {
345 return TRUE;
346 }
347
348 /* If e.g., both \w and \W are set, matches everything */
349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
350 int i;
351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
353 return TRUE;
354 }
355 }
356 }
357
358 return FALSE;
359 }
360
361 void
Perl_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
363 {
364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
365 * string, any code point, or any posix class under locale */
366
367 PERL_ARGS_ASSERT_SSC_INIT;
368
369 Zero(ssc, 1, regnode_ssc);
370 set_ANYOF_SYNTHETIC(ssc);
371 ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
372 ssc_anything(ssc);
373
374 /* If any portion of the regex is to operate under locale rules that aren't
375 * fully known at compile time, initialization includes it. The reason
376 * this isn't done for all regexes is that the optimizer was written under
377 * the assumption that locale was all-or-nothing. Given the complexity and
378 * lack of documentation in the optimizer, and that there are inadequate
379 * test cases for locale, many parts of it may not work properly, it is
380 * safest to avoid locale unless necessary. */
381 if (RExC_contains_locale) {
382 ANYOF_POSIXL_SETALL(ssc);
383 }
384 else {
385 ANYOF_POSIXL_ZERO(ssc);
386 }
387 }
388
389 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391 const regnode_ssc *ssc)
392 {
393 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
394 * to the list of code points matched, and locale posix classes; hence does
395 * not check its flags) */
396
397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
398 bool ret;
399
400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
401
402 assert(is_ANYOF_SYNTHETIC(ssc));
403
404 invlist_iterinit(ssc->invlist);
405 ret = invlist_iternext(ssc->invlist, &start, &end)
406 && start == 0
407 && end == UV_MAX;
408
409 invlist_iterfinish(ssc->invlist);
410
411 if (! ret) {
412 return FALSE;
413 }
414
415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
416 return FALSE;
417 }
418
419 return TRUE;
420 }
421
422
423 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425 const regnode_charclass* const node)
426 {
427 /* Returns a mortal inversion list defining which code points are matched
428 * by 'node', which is of ANYOF-ish type . Handles complementing the
429 * result if appropriate. If some code points aren't knowable at this
430 * time, the returned list must, and will, contain every code point that is
431 * a possibility. */
432
433 SV* invlist = NULL;
434 SV* only_utf8_locale_invlist = NULL;
435 bool new_node_has_latin1 = FALSE;
436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
437 ? ANYOF_FLAGS(node)
438 : 0;
439
440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
441
442 /* Look at the data structure created by S_set_ANYOF_arg() */
443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444 invlist = sv_2mortal(_new_invlist(1));
445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
446 }
447 else if (ANYOF_HAS_AUX(node)) {
448 const U32 n = ARG1u(node);
449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450 AV * const av = MUTABLE_AV(SvRV(rv));
451 SV **const ary = AvARRAY(av);
452
453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
454
455 /* Here there are things that won't be known until runtime -- we
456 * have to assume it could be anything */
457 invlist = sv_2mortal(_new_invlist(1));
458 return _add_range_to_invlist(invlist, 0, UV_MAX);
459 }
460 else if (ary[INVLIST_INDEX]) {
461
462 /* Use the node's inversion list */
463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
464 }
465
466 /* Get the code points valid only under UTF-8 locales */
467 if ( (flags & ANYOFL_FOLD)
468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
469 {
470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
471 }
472 }
473
474 if (! invlist) {
475 invlist = sv_2mortal(_new_invlist(0));
476 }
477
478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479 * code points, and an inversion list for the others, but if there are code
480 * points that should match only conditionally on the target string being
481 * UTF-8, those are placed in the inversion list, and not the bitmap.
482 * Since there are circumstances under which they could match, they are
483 * included in the SSC. But if the ANYOF node is to be inverted, we have
484 * to exclude them here, so that when we invert below, the end result
485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
486 * have to do this here before we add the unconditionally matched code
487 * points */
488 if (flags & ANYOF_INVERT) {
489 _invlist_intersection_complement_2nd(invlist,
490 PL_UpperLatin1,
491 &invlist);
492 }
493
494 /* Add in the points from the bit map */
495 if (REGNODE_TYPE(OP(node)) == ANYOF){
496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497 if (ANYOF_BITMAP_TEST(node, i)) {
498 unsigned int start = i++;
499
500 for (; i < NUM_ANYOF_CODE_POINTS
501 && ANYOF_BITMAP_TEST(node, i); ++i)
502 {
503 /* empty */
504 }
505 invlist = _add_range_to_invlist(invlist, start, i-1);
506 new_node_has_latin1 = TRUE;
507 }
508 }
509 }
510
511 /* If this can match all upper Latin1 code points, have to add them
512 * as well. But don't add them if inverting, as when that gets done below,
513 * it would exclude all these characters, including the ones it shouldn't
514 * that were added just above */
515 if ( ! (flags & ANYOF_INVERT)
516 && OP(node) == ANYOFD
517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
518 {
519 _invlist_union(invlist, PL_UpperLatin1, &invlist);
520 }
521
522 /* Similarly for these */
523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
525 }
526
527 if (flags & ANYOF_INVERT) {
528 _invlist_invert(invlist);
529 }
530 else if (flags & ANYOFL_FOLD) {
531 if (new_node_has_latin1) {
532
533 /* These folds are potential in Turkic locales */
534 if (_invlist_contains_cp(invlist, 'i')) {
535 invlist = add_cp_to_invlist(invlist,
536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
537 }
538 if (_invlist_contains_cp(invlist, 'I')) {
539 invlist = add_cp_to_invlist(invlist,
540 LATIN_SMALL_LETTER_DOTLESS_I);
541 }
542
543 /* Under /li, any 0-255 could fold to any other 0-255, depending on
544 * the locale. We can skip this if there are no 0-255 at all. */
545 _invlist_union(invlist, PL_Latin1, &invlist);
546 }
547 else {
548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549 invlist = add_cp_to_invlist(invlist, 'I');
550 }
551 if (_invlist_contains_cp(invlist,
552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
553 {
554 invlist = add_cp_to_invlist(invlist, 'i');
555 }
556 }
557 }
558
559 /* Similarly add the UTF-8 locale possible matches. These have to be
560 * deferred until after the non-UTF-8 locale ones are taken care of just
561 * above, or it leads to wrong results under ANYOF_INVERT */
562 if (only_utf8_locale_invlist) {
563 _invlist_union_maybe_complement_2nd(invlist,
564 only_utf8_locale_invlist,
565 flags & ANYOF_INVERT,
566 &invlist);
567 }
568
569 return invlist;
570 }
571
572 /* 'AND' a given class with another one. Can create false positives. 'ssc'
573 * should not be inverted. */
574
575 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577 const regnode_charclass *and_with)
578 {
579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580 * another SSC or a regular ANYOF class. Can create false positives. */
581
582 SV* anded_cp_list;
583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584 ? ANYOF_FLAGS(and_with)
585 : 0;
586 U8 anded_flags;
587
588 PERL_ARGS_ASSERT_SSC_AND;
589
590 assert(is_ANYOF_SYNTHETIC(ssc));
591
592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593 * the code point inversion list and just the relevant flags */
594 if (is_ANYOF_SYNTHETIC(and_with)) {
595 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596 anded_flags = and_with_flags;
597
598 /* XXX This is a kludge around what appears to be deficiencies in the
599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
600 * there are paths through the optimizer where it doesn't get weeded
601 * out when it should. And if we don't make some extra provision for
602 * it like the code just below, it doesn't get added when it should.
603 * This solution is to add it only when AND'ing, which is here, and
604 * only when what is being AND'ed is the pristine, original node
605 * matching anything. Thus it is like adding it to ssc_anything() but
606 * only when the result is to be AND'ed. Probably the same solution
607 * could be adopted for the same problem we have with /l matching,
608 * which is solved differently in S_ssc_init(), and that would lead to
609 * fewer false positives than that solution has. But if this solution
610 * creates bugs, the consequences are only that a warning isn't raised
611 * that should be; while the consequences for having /l bugs is
612 * incorrect matches */
613 if (ssc_is_anything((regnode_ssc *)and_with)) {
614 anded_flags |= ANYOF_WARN_SUPER__shared;
615 }
616 }
617 else {
618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619 if (OP(and_with) == ANYOFD) {
620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
621 }
622 else {
623 anded_flags = and_with_flags
624 & ( ANYOF_COMMON_FLAGS
625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
629 }
630 }
631 }
632
633 ANYOF_FLAGS(ssc) &= anded_flags;
634
635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636 * C2 is the list of code points in 'and-with'; P2, its posix classes.
637 * 'and_with' may be inverted. When not inverted, we have the situation of
638 * computing:
639 * (C1 | P1) & (C2 | P2)
640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
643 * <= ((C1 & C2) | P1 | P2)
644 * Alternatively, the last few steps could be:
645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
647 * <= (C1 | C2 | (P1 & P2))
648 * We favor the second approach if either P1 or P2 is non-empty. This is
649 * because these components are a barrier to doing optimizations, as what
650 * they match cannot be known until the moment of matching as they are
651 * dependent on the current locale, 'AND"ing them likely will reduce or
652 * eliminate them.
653 * But we can do better if we know that C1,P1 are in their initial state (a
654 * frequent occurrence), each matching everything:
655 * (<everything>) & (C2 | P2) = C2 | P2
656 * Similarly, if C2,P2 are in their initial state (again a frequent
657 * occurrence), the result is a no-op
658 * (C1 | P1) & (<everything>) = C1 | P1
659 *
660 * Inverted, we have
661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663 * <= (C1 & ~C2) | (P1 & ~P2)
664 * */
665
666 if ((and_with_flags & ANYOF_INVERT)
667 && ! is_ANYOF_SYNTHETIC(and_with))
668 {
669 unsigned int i;
670
671 ssc_intersection(ssc,
672 anded_cp_list,
673 FALSE /* Has already been inverted */
674 );
675
676 /* If either P1 or P2 is empty, the intersection will be also; can skip
677 * the loop */
678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679 ANYOF_POSIXL_ZERO(ssc);
680 }
681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
682
683 /* Note that the Posix class component P from 'and_with' actually
684 * looks like:
685 * P = Pa | Pb | ... | Pn
686 * where each component is one posix class, such as in [\w\s].
687 * Thus
688 * ~P = ~(Pa | Pb | ... | Pn)
689 * = ~Pa & ~Pb & ... & ~Pn
690 * <= ~Pa | ~Pb | ... | ~Pn
691 * The last is something we can easily calculate, but unfortunately
692 * is likely to have many false positives. We could do better
693 * in some (but certainly not all) instances if two classes in
694 * P have known relationships. For example
695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
696 * So
697 * :lower: & :print: = :lower:
698 * And similarly for classes that must be disjoint. For example,
699 * since \s and \w can have no elements in common based on rules in
700 * the POSIX standard,
701 * \w & ^\S = nothing
702 * Unfortunately, some vendor locales do not meet the Posix
703 * standard, in particular almost everything by Microsoft.
704 * The loop below just changes e.g., \w into \W and vice versa */
705
706 regnode_charclass_posixl temp;
707 int add = 1; /* To calculate the index of the complement */
708
709 Zero(&temp, 1, regnode_charclass_posixl);
710 ANYOF_POSIXL_ZERO(&temp);
711 for (i = 0; i < ANYOF_MAX; i++) {
712 assert(i % 2 != 0
713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
715
716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717 ANYOF_POSIXL_SET(&temp, i + add);
718 }
719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
720 }
721 ANYOF_POSIXL_AND(&temp, ssc);
722
723 } /* else ssc already has no posixes */
724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
725 in its initial state */
726 else if (! is_ANYOF_SYNTHETIC(and_with)
727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
728 {
729 /* But if 'ssc' is in its initial state, the result is just 'and_with';
730 * copy it over 'ssc' */
731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732 if (is_ANYOF_SYNTHETIC(and_with)) {
733 StructCopy(and_with, ssc, regnode_ssc);
734 }
735 else {
736 ssc->invlist = anded_cp_list;
737 ANYOF_POSIXL_ZERO(ssc);
738 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
740 }
741 }
742 }
743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744 || (and_with_flags & ANYOF_MATCHES_POSIXL))
745 {
746 /* One or the other of P1, P2 is non-empty. */
747 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
749 }
750 ssc_union(ssc, anded_cp_list, FALSE);
751 }
752 else { /* P1 = P2 = empty */
753 ssc_intersection(ssc, anded_cp_list, FALSE);
754 }
755 }
756 }
757
758 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760 const regnode_charclass *or_with)
761 {
762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763 * another SSC or a regular ANYOF class. Can create false positives if
764 * 'or_with' is to be inverted. */
765
766 SV* ored_cp_list;
767 U8 ored_flags;
768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769 ? ANYOF_FLAGS(or_with)
770 : 0;
771
772 PERL_ARGS_ASSERT_SSC_OR;
773
774 assert(is_ANYOF_SYNTHETIC(ssc));
775
776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777 * the code point inversion list and just the relevant flags */
778 if (is_ANYOF_SYNTHETIC(or_with)) {
779 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780 ored_flags = or_with_flags;
781 }
782 else {
783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785 if (OP(or_with) != ANYOFD) {
786 ored_flags |=
787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
791 }
792 }
793 }
794
795 ANYOF_FLAGS(ssc) |= ored_flags;
796
797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798 * C2 is the list of code points in 'or-with'; P2, its posix classes.
799 * 'or_with' may be inverted. When not inverted, we have the simple
800 * situation of computing:
801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
802 * If P1|P2 yields a situation with both a class and its complement are
803 * set, like having both \w and \W, this matches all code points, and we
804 * can delete these from the P component of the ssc going forward. XXX We
805 * might be able to delete all the P components, but I (khw) am not certain
806 * about this, and it is better to be safe.
807 *
808 * Inverted, we have
809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
810 * <= (C1 | P1) | ~C2
811 * <= (C1 | ~C2) | P1
812 * (which results in actually simpler code than the non-inverted case)
813 * */
814
815 if ((or_with_flags & ANYOF_INVERT)
816 && ! is_ANYOF_SYNTHETIC(or_with))
817 {
818 /* We ignore P2, leaving P1 going forward */
819 } /* else Not inverted */
820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
823 unsigned int i;
824 for (i = 0; i < ANYOF_MAX; i += 2) {
825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
826 {
827 ssc_match_all_cp(ssc);
828 ANYOF_POSIXL_CLEAR(ssc, i);
829 ANYOF_POSIXL_CLEAR(ssc, i+1);
830 }
831 }
832 }
833 }
834
835 ssc_union(ssc,
836 ored_cp_list,
837 FALSE /* Already has been inverted */
838 );
839 }
840
841 STATIC void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
843 {
844 PERL_ARGS_ASSERT_SSC_UNION;
845
846 assert(is_ANYOF_SYNTHETIC(ssc));
847
848 _invlist_union_maybe_complement_2nd(ssc->invlist,
849 invlist,
850 invert2nd,
851 &ssc->invlist);
852 }
853
854 STATIC void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
856 SV* const invlist,
857 const bool invert2nd)
858 {
859 PERL_ARGS_ASSERT_SSC_INTERSECTION;
860
861 assert(is_ANYOF_SYNTHETIC(ssc));
862
863 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
864 invlist,
865 invert2nd,
866 &ssc->invlist);
867 }
868
869 STATIC void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
871 {
872 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
873
874 assert(is_ANYOF_SYNTHETIC(ssc));
875
876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
877 }
878
879 STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
881 {
882 /* AND just the single code point 'cp' into the SSC 'ssc' */
883
884 SV* cp_list = _new_invlist(2);
885
886 PERL_ARGS_ASSERT_SSC_CP_AND;
887
888 assert(is_ANYOF_SYNTHETIC(ssc));
889
890 cp_list = add_cp_to_invlist(cp_list, cp);
891 ssc_intersection(ssc, cp_list,
892 FALSE /* Not inverted */
893 );
894 SvREFCNT_dec_NN(cp_list);
895 }
896
897 STATIC void
S_ssc_clear_locale(regnode_ssc * ssc)898 S_ssc_clear_locale(regnode_ssc *ssc)
899 {
900 /* Set the SSC 'ssc' to not match any locale things */
901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
902
903 assert(is_ANYOF_SYNTHETIC(ssc));
904
905 ANYOF_POSIXL_ZERO(ssc);
906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
907 }
908
909
910
911 /* The below joins as many adjacent EXACTish nodes as possible into a single
912 * one. The regop may be changed if the node(s) contain certain sequences that
913 * require special handling. The joining is only done if:
914 * 1) there is room in the current conglomerated node to entirely contain the
915 * next one.
916 * 2) they are compatible node types
917 *
918 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
919 * these get optimized out
920 *
921 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
922 * as possible, even if that means splitting an existing node so that its first
923 * part is moved to the preceding node. This would maximise the efficiency of
924 * memEQ during matching.
925 *
926 * If a node is to match under /i (folded), the number of characters it matches
927 * can be different than its character length if it contains a multi-character
928 * fold. *min_subtract is set to the total delta number of characters of the
929 * input nodes.
930 *
931 * And *unfolded_multi_char is set to indicate whether or not the node contains
932 * an unfolded multi-char fold. This happens when it won't be known until
933 * runtime whether the fold is valid or not; namely
934 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
935 * target string being matched against turns out to be UTF-8 is that fold
936 * valid; or
937 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
938 * runtime.
939 * (Multi-char folds whose components are all above the Latin1 range are not
940 * run-time locale dependent, and have already been folded by the time this
941 * function is called.)
942 *
943 * This is as good a place as any to discuss the design of handling these
944 * multi-character fold sequences. It's been wrong in Perl for a very long
945 * time. There are three code points in Unicode whose multi-character folds
946 * were long ago discovered to mess things up. The previous designs for
947 * dealing with these involved assigning a special node for them. This
948 * approach doesn't always work, as evidenced by this example:
949 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
950 * Both sides fold to "sss", but if the pattern is parsed to create a node that
951 * would match just the \xDF, it won't be able to handle the case where a
952 * successful match would have to cross the node's boundary. The new approach
953 * that hopefully generally solves the problem generates an EXACTFUP node
954 * that is "sss" in this case.
955 *
956 * It turns out that there are problems with all multi-character folds, and not
957 * just these three. Now the code is general, for all such cases. The
958 * approach taken is:
959 * 1) This routine examines each EXACTFish node that could contain multi-
960 * character folded sequences. Since a single character can fold into
961 * such a sequence, the minimum match length for this node is less than
962 * the number of characters in the node. This routine returns in
963 * *min_subtract how many characters to subtract from the actual
964 * length of the string to get a real minimum match length; it is 0 if
965 * there are no multi-char foldeds. This delta is used by the caller to
966 * adjust the min length of the match, and the delta between min and max,
967 * so that the optimizer doesn't reject these possibilities based on size
968 * constraints.
969 *
970 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
971 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
972 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
973 * EXACTFU nodes. The node type of such nodes is then changed to
974 * EXACTFUP, indicating it is problematic, and needs careful handling.
975 * (The procedures in step 1) above are sufficient to handle this case in
976 * UTF-8 encoded nodes.) The reason this is problematic is that this is
977 * the only case where there is a possible fold length change in non-UTF-8
978 * patterns. By reserving a special node type for problematic cases, the
979 * far more common regular EXACTFU nodes can be processed faster.
980 * regexec.c takes advantage of this.
981 *
982 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
983 * problematic cases. These all only occur when the pattern is not
984 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
985 * length change, it handles the situation where the string cannot be
986 * entirely folded. The strings in an EXACTFish node are folded as much
987 * as possible during compilation in regcomp.c. This saves effort in
988 * regex matching. By using an EXACTFUP node when it is not possible to
989 * fully fold at compile time, regexec.c can know that everything in an
990 * EXACTFU node is folded, so folding can be skipped at runtime. The only
991 * case where folding in EXACTFU nodes can't be done at compile time is
992 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
993 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
994 * handle two very different cases. Alternatively, there could have been
995 * a node type where there are length changes, one for unfolded, and one
996 * for both. If yet another special case needed to be created, the number
997 * of required node types would have to go to 7. khw figures that even
998 * though there are plenty of node types to spare, that the maintenance
999 * cost wasn't worth the small speedup of doing it that way, especially
1000 * since he thinks the MICRO SIGN is rarely encountered in practice.
1001 *
1002 * There are other cases where folding isn't done at compile time, but
1003 * none of them are under /u, and hence not for EXACTFU nodes. The folds
1004 * in EXACTFL nodes aren't known until runtime, and vary as the locale
1005 * changes. Some folds in EXACTF depend on if the runtime target string
1006 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
1007 * when no fold in it depends on the UTF-8ness of the target string.)
1008 *
1009 * 3) A problem remains for unfolded multi-char folds. (These occur when the
1010 * validity of the fold won't be known until runtime, and so must remain
1011 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
1012 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
1013 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
1014 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1015 * The reason this is a problem is that the optimizer part of regexec.c
1016 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1017 * that a character in the pattern corresponds to at most a single
1018 * character in the target string. (And I do mean character, and not byte
1019 * here, unlike other parts of the documentation that have never been
1020 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
1021 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1022 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
1023 * EXACTFL nodes, violate the assumption, and they are the only instances
1024 * where it is violated. I'm reluctant to try to change the assumption,
1025 * as the code involved is impenetrable to me (khw), so instead the code
1026 * here punts. This routine examines EXACTFL nodes, and (when the pattern
1027 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1028 * boolean indicating whether or not the node contains such a fold. When
1029 * it is true, the caller sets a flag that later causes the optimizer in
1030 * this file to not set values for the floating and fixed string lengths,
1031 * and thus avoids the optimizer code in regexec.c that makes the invalid
1032 * assumption. Thus, there is no optimization based on string lengths for
1033 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1034 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
1035 * assumption is wrong only in these cases is that all other non-UTF-8
1036 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1037 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
1038 * EXACTF nodes because we don't know at compile time if it actually
1039 * matches 'ss' or not. For EXACTF nodes it will match iff the target
1040 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
1041 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
1042 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1043 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1044 * string would require the pattern to be forced into UTF-8, the overhead
1045 * of which we want to avoid. Similarly the unfolded multi-char folds in
1046 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1047 * locale.)
1048 *
1049 * Similarly, the code that generates tries doesn't currently handle
1050 * not-already-folded multi-char folds, and it looks like a pain to change
1051 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
1052 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
1053 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
1054 * using /iaa matching will be doing so almost entirely with ASCII
1055 * strings, so this should rarely be encountered in practice */
1056
1057 U32
Perl_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)1058 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1059 UV *min_subtract, bool *unfolded_multi_char,
1060 U32 flags, regnode *val, U32 depth)
1061 {
1062 /* Merge several consecutive EXACTish nodes into one. */
1063
1064 regnode *n = regnext(scan);
1065 U32 stringok = 1;
1066 regnode *next = REGNODE_AFTER_varies(scan);
1067 U32 stopnow = 0;
1068 #ifdef DEBUGGING
1069 U32 merged = 0;
1070 regnode *stop = scan;
1071 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1072 #else
1073 PERL_UNUSED_ARG(depth);
1074 #endif
1075
1076 PERL_ARGS_ASSERT_JOIN_EXACT;
1077 #ifndef EXPERIMENTAL_INPLACESCAN
1078 PERL_UNUSED_ARG(flags);
1079 PERL_UNUSED_ARG(val);
1080 #endif
1081 DEBUG_PEEP("join", scan, depth, 0);
1082
1083 assert(REGNODE_TYPE(OP(scan)) == EXACT);
1084
1085 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
1086 * EXACT ones that are mergeable to the current one. */
1087 while ( n
1088 && ( REGNODE_TYPE(OP(n)) == NOTHING
1089 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1090 && NEXT_OFF(n)
1091 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1092 {
1093
1094 if (OP(n) == TAIL || n > next)
1095 stringok = 0;
1096 if (REGNODE_TYPE(OP(n)) == NOTHING) {
1097 DEBUG_PEEP("skip:", n, depth, 0);
1098 NEXT_OFF(scan) += NEXT_OFF(n);
1099 next = n + NODE_STEP_REGNODE;
1100 #ifdef DEBUGGING
1101 if (stringok)
1102 stop = n;
1103 #endif
1104 n = regnext(n);
1105 }
1106 else if (stringok) {
1107 const unsigned int oldl = STR_LEN(scan);
1108 regnode * const nnext = regnext(n);
1109
1110 /* XXX I (khw) kind of doubt that this works on platforms (should
1111 * Perl ever run on one) where U8_MAX is above 255 because of lots
1112 * of other assumptions */
1113 /* Don't join if the sum can't fit into a single node */
1114 if (oldl + STR_LEN(n) > U8_MAX)
1115 break;
1116
1117 /* Joining something that requires UTF-8 with something that
1118 * doesn't, means the result requires UTF-8. */
1119 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1120 OP(scan) = EXACT_REQ8;
1121 }
1122 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1123 ; /* join is compatible, no need to change OP */
1124 }
1125 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1126 OP(scan) = EXACTFU_REQ8;
1127 }
1128 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1129 ; /* join is compatible, no need to change OP */
1130 }
1131 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1132 ; /* join is compatible, no need to change OP */
1133 }
1134 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1135
1136 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1137 * which can join with EXACTFU ones. We check for this case
1138 * here. These need to be resolved to either EXACTFU or
1139 * EXACTF at joining time. They have nothing in them that
1140 * would forbid them from being the more desirable EXACTFU
1141 * nodes except that they begin and/or end with a single [Ss].
1142 * The reason this is problematic is because they could be
1143 * joined in this loop with an adjacent node that ends and/or
1144 * begins with [Ss] which would then form the sequence 'ss',
1145 * which matches differently under /di than /ui, in which case
1146 * EXACTFU can't be used. If the 'ss' sequence doesn't get
1147 * formed, the nodes get absorbed into any adjacent EXACTFU
1148 * node. And if the only adjacent node is EXACTF, they get
1149 * absorbed into that, under the theory that a longer node is
1150 * better than two shorter ones, even if one is EXACTFU. Note
1151 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1152 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
1153
1154 if (STRING(n)[STR_LEN(n)-1] == 's') {
1155
1156 /* Here the joined node would end with 's'. If the node
1157 * following the combination is an EXACTF one, it's better to
1158 * join this trailing edge 's' node with that one, leaving the
1159 * current one in 'scan' be the more desirable EXACTFU */
1160 if (OP(nnext) == EXACTF) {
1161 break;
1162 }
1163
1164 OP(scan) = EXACTFU_S_EDGE;
1165
1166 } /* Otherwise, the beginning 's' of the 2nd node just
1167 becomes an interior 's' in 'scan' */
1168 }
1169 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1170 ; /* join is compatible, no need to change OP */
1171 }
1172 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1173
1174 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1175 * nodes. But the latter nodes can be also joined with EXACTFU
1176 * ones, and that is a better outcome, so if the node following
1177 * 'n' is EXACTFU, quit now so that those two can be joined
1178 * later */
1179 if (OP(nnext) == EXACTFU) {
1180 break;
1181 }
1182
1183 /* The join is compatible, and the combined node will be
1184 * EXACTF. (These don't care if they begin or end with 's' */
1185 }
1186 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1187 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
1188 && STRING(n)[0] == 's')
1189 {
1190 /* When combined, we have the sequence 'ss', which means we
1191 * have to remain /di */
1192 OP(scan) = EXACTF;
1193 }
1194 }
1195 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1196 if (STRING(n)[0] == 's') {
1197 ; /* Here the join is compatible and the combined node
1198 starts with 's', no need to change OP */
1199 }
1200 else { /* Now the trailing 's' is in the interior */
1201 OP(scan) = EXACTFU;
1202 }
1203 }
1204 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1205
1206 /* The join is compatible, and the combined node will be
1207 * EXACTF. (These don't care if they begin or end with 's' */
1208 OP(scan) = EXACTF;
1209 }
1210 else if (OP(scan) != OP(n)) {
1211
1212 /* The only other compatible joinings are the same node type */
1213 break;
1214 }
1215
1216 DEBUG_PEEP("merg", n, depth, 0);
1217 #ifdef DEBUGGING
1218 merged++;
1219 #endif
1220
1221 next = REGNODE_AFTER_varies(n);
1222 NEXT_OFF(scan) += NEXT_OFF(n);
1223 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1224 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1225 /* Now we can overwrite *n : */
1226 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1227 #ifdef DEBUGGING
1228 stop = next - 1;
1229 #endif
1230 n = nnext;
1231 if (stopnow) break;
1232 }
1233
1234 #ifdef EXPERIMENTAL_INPLACESCAN
1235 if (flags && !NEXT_OFF(n)) {
1236 DEBUG_PEEP("atch", val, depth, 0);
1237 if (REGNODE_OFF_BY_ARG(OP(n))) {
1238 ARG1u_SET(n, val - n);
1239 }
1240 else {
1241 NEXT_OFF(n) = val - n;
1242 }
1243 stopnow = 1;
1244 }
1245 #endif
1246 }
1247
1248 /* This temporary node can now be turned into EXACTFU, and must, as
1249 * regexec.c doesn't handle it */
1250 if (OP(scan) == EXACTFU_S_EDGE) {
1251 OP(scan) = EXACTFU;
1252 }
1253
1254 *min_subtract = 0;
1255 *unfolded_multi_char = FALSE;
1256
1257 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
1258 * can now analyze for sequences of problematic code points. (Prior to
1259 * this final joining, sequences could have been split over boundaries, and
1260 * hence missed). The sequences only happen in folding, hence for any
1261 * non-EXACT EXACTish node */
1262 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1263 U8* s0 = (U8*) STRING(scan);
1264 U8* s = s0;
1265 U8* s_end = s0 + STR_LEN(scan);
1266
1267 int total_count_delta = 0; /* Total delta number of characters that
1268 multi-char folds expand to */
1269
1270 /* One pass is made over the node's string looking for all the
1271 * possibilities. To avoid some tests in the loop, there are two main
1272 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1273 * non-UTF-8 */
1274 if (UTF) {
1275 U8* folded = NULL;
1276
1277 if (OP(scan) == EXACTFL) {
1278 U8 *d;
1279
1280 /* An EXACTFL node would already have been changed to another
1281 * node type unless there is at least one character in it that
1282 * is problematic; likely a character whose fold definition
1283 * won't be known until runtime, and so has yet to be folded.
1284 * For all but the UTF-8 locale, folds are 1-1 in length, but
1285 * to handle the UTF-8 case, we need to create a temporary
1286 * folded copy using UTF-8 locale rules in order to analyze it.
1287 * This is because our macros that look to see if a sequence is
1288 * a multi-char fold assume everything is folded (otherwise the
1289 * tests in those macros would be too complicated and slow).
1290 * Note that here, the non-problematic folds will have already
1291 * been done, so we can just copy such characters. We actually
1292 * don't completely fold the EXACTFL string. We skip the
1293 * unfolded multi-char folds, as that would just create work
1294 * below to figure out the size they already are */
1295
1296 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1297 d = folded;
1298 while (s < s_end) {
1299 STRLEN s_len = UTF8SKIP(s);
1300 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1301 Copy(s, d, s_len, U8);
1302 d += s_len;
1303 }
1304 else if (is_FOLDS_TO_MULTI_utf8(s)) {
1305 *unfolded_multi_char = TRUE;
1306 Copy(s, d, s_len, U8);
1307 d += s_len;
1308 }
1309 else if (isASCII(*s)) {
1310 *(d++) = toFOLD(*s);
1311 }
1312 else {
1313 STRLEN len;
1314 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1315 d += len;
1316 }
1317 s += s_len;
1318 }
1319
1320 /* Point the remainder of the routine to look at our temporary
1321 * folded copy */
1322 s = folded;
1323 s_end = d;
1324 } /* End of creating folded copy of EXACTFL string */
1325
1326 /* Examine the string for a multi-character fold sequence. UTF-8
1327 * patterns have all characters pre-folded by the time this code is
1328 * executed */
1329 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1330 length sequence we are looking for is 2 */
1331 {
1332 int count = 0; /* How many characters in a multi-char fold */
1333 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1334 if (! len) { /* Not a multi-char fold: get next char */
1335 s += UTF8SKIP(s);
1336 continue;
1337 }
1338
1339 { /* Here is a generic multi-char fold. */
1340 U8* multi_end = s + len;
1341
1342 /* Count how many characters are in it. In the case of
1343 * /aa, no folds which contain ASCII code points are
1344 * allowed, so check for those, and skip if found. */
1345 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1346 count = utf8_length(s, multi_end);
1347 s = multi_end;
1348 }
1349 else {
1350 while (s < multi_end) {
1351 if (isASCII(*s)) {
1352 s++;
1353 goto next_iteration;
1354 }
1355 else {
1356 s += UTF8SKIP(s);
1357 }
1358 count++;
1359 }
1360 }
1361 }
1362
1363 /* The delta is how long the sequence is minus 1 (1 is how long
1364 * the character that folds to the sequence is) */
1365 total_count_delta += count - 1;
1366 next_iteration: ;
1367 }
1368
1369 /* We created a temporary folded copy of the string in EXACTFL
1370 * nodes. Therefore we need to be sure it doesn't go below zero,
1371 * as the real string could be shorter */
1372 if (OP(scan) == EXACTFL) {
1373 int total_chars = utf8_length((U8*) STRING(scan),
1374 (U8*) STRING(scan) + STR_LEN(scan));
1375 if (total_count_delta > total_chars) {
1376 total_count_delta = total_chars;
1377 }
1378 }
1379
1380 *min_subtract += total_count_delta;
1381 Safefree(folded);
1382 }
1383 else if (OP(scan) == EXACTFAA) {
1384
1385 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
1386 * fold to the ASCII range (and there are no existing ones in the
1387 * upper latin1 range). But, as outlined in the comments preceding
1388 * this function, we need to flag any occurrences of the sharp s.
1389 * This character forbids trie formation (because of added
1390 * complexity) */
1391 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1392 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1393 || UNICODE_DOT_DOT_VERSION > 0)
1394 while (s < s_end) {
1395 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1396 OP(scan) = EXACTFAA_NO_TRIE;
1397 *unfolded_multi_char = TRUE;
1398 break;
1399 }
1400 s++;
1401 }
1402 }
1403 else if (OP(scan) != EXACTFAA_NO_TRIE) {
1404
1405 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
1406 * folds that are all Latin1. As explained in the comments
1407 * preceding this function, we look also for the sharp s in EXACTF
1408 * and EXACTFL nodes; it can be in the final position. Otherwise
1409 * we can stop looking 1 byte earlier because have to find at least
1410 * two characters for a multi-fold */
1411 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1412 ? s_end
1413 : s_end -1;
1414
1415 while (s < upper) {
1416 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1417 if (! len) { /* Not a multi-char fold. */
1418 if (*s == LATIN_SMALL_LETTER_SHARP_S
1419 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1420 {
1421 *unfolded_multi_char = TRUE;
1422 }
1423 s++;
1424 continue;
1425 }
1426
1427 if (len == 2
1428 && isALPHA_FOLD_EQ(*s, 's')
1429 && isALPHA_FOLD_EQ(*(s+1), 's'))
1430 {
1431
1432 /* EXACTF nodes need to know that the minimum length
1433 * changed so that a sharp s in the string can match this
1434 * ss in the pattern, but they remain EXACTF nodes, as they
1435 * won't match this unless the target string is in UTF-8,
1436 * which we don't know until runtime. EXACTFL nodes can't
1437 * transform into EXACTFU nodes */
1438 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1439 OP(scan) = EXACTFUP;
1440 }
1441 }
1442
1443 *min_subtract += len - 1;
1444 s += len;
1445 }
1446 #endif
1447 }
1448 }
1449
1450 #ifdef DEBUGGING
1451 /* Allow dumping but overwriting the collection of skipped
1452 * ops and/or strings with fake optimized ops */
1453 n = REGNODE_AFTER_varies(scan);
1454 while (n <= stop) {
1455 OP(n) = OPTIMIZED;
1456 FLAGS(n) = 0;
1457 NEXT_OFF(n) = 0;
1458 n++;
1459 }
1460 #endif
1461 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1462 return stopnow;
1463 }
1464
1465 /* REx optimizer. Converts nodes into quicker variants "in place".
1466 Finds fixed substrings. */
1467
1468
1469 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1470 to the position after last scanned or to NULL. */
1471
1472 /* the return from this sub is the minimum length that could possibly match */
1473 SSize_t
Perl_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)1474 Perl_study_chunk(pTHX_
1475 RExC_state_t *pRExC_state,
1476 regnode **scanp, /* Start here (read-write). */
1477 SSize_t *minlenp, /* used for the minlen of substrings? */
1478 SSize_t *deltap, /* Write maxlen-minlen here. */
1479 regnode *last, /* Stop before this one. */
1480 scan_data_t *data, /* string data about the pattern */
1481 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
1482 U32 recursed_depth, /* how deep have we recursed via GOSUB */
1483 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1484 U32 flags, /* flags controlling this call, see SCF_ flags */
1485 U32 depth, /* how deep have we recursed period */
1486 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
1487 FALSE only if the caller (recursively) was
1488 prohibited from modifying the regops, because
1489 a higher caller is holding a ptr to them. */
1490 )
1491 {
1492 /* vars about the regnodes we are working with */
1493 regnode *scan = *scanp; /* the current opcode we are inspecting */
1494 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
1495 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1496 the first non open regop, if the init
1497 val IS an OPEN then we will skip past
1498 it just after the var decls section */
1499 I32 code = 0; /* temp var used to hold the optype of a regop */
1500
1501 /* vars about the min and max length of the pattern */
1502 SSize_t min = 0; /* min length of this part of the pattern */
1503 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1504 this is adjusted down if we find
1505 an ACCEPT */
1506 SSize_t delta = 0; /* difference between min and max length
1507 (not accounting for stopmin) */
1508
1509 /* vars about capture buffers in the pattern */
1510 I32 pars = 0; /* count of OPEN opcodes */
1511 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1512
1513 /* vars about whether this pattern contains something that can match
1514 * infinitely long strings, eg, X* or X+ */
1515 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1516 int is_inf_internal = 0; /* The studied chunk is infinite */
1517
1518 /* scan_data_t (struct) is used to hold information about the substrings
1519 * and start class we have extracted from the string */
1520 scan_data_t data_fake; /* temp var used for recursing in some cases */
1521
1522 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1523 trie optimizations */
1524
1525 scan_frame *frame = NULL; /* used as part of fake recursion */
1526
1527 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1528
1529 PERL_ARGS_ASSERT_STUDY_CHUNK;
1530 RExC_study_started= 1;
1531
1532 Zero(&data_fake, 1, scan_data_t);
1533
1534 if ( depth == 0 ) {
1535 while (first_non_open && OP(first_non_open) == OPEN)
1536 first_non_open=regnext(first_non_open);
1537 }
1538
1539 fake_study_recurse:
1540 DEBUG_r(
1541 RExC_study_chunk_recursed_count++;
1542 );
1543 DEBUG_OPTIMISE_MORE_r(
1544 {
1545 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
1546 depth, (long)stopparen,
1547 (unsigned long)RExC_study_chunk_recursed_count,
1548 (unsigned long)depth, (unsigned long)recursed_depth,
1549 scan,
1550 last);
1551 if (recursed_depth) {
1552 U32 i;
1553 U32 j;
1554 for ( j = 0 ; j < recursed_depth ; j++ ) {
1555 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1556 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1557 Perl_re_printf( aTHX_ " %d",(int)i);
1558 break;
1559 }
1560 }
1561 if ( j + 1 < recursed_depth ) {
1562 Perl_re_printf( aTHX_ ",");
1563 }
1564 }
1565 }
1566 Perl_re_printf( aTHX_ "\n");
1567 }
1568 );
1569 while ( scan && OP(scan) != END && scan < last ){
1570 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
1571 node length to get a real minimum (because
1572 the folded version may be shorter) */
1573 bool unfolded_multi_char = FALSE;
1574 /* avoid mutating ops if we are anywhere within the recursed or
1575 * enframed handling for a GOSUB: the outermost level will handle it.
1576 */
1577 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1578 /* Peephole optimizer: */
1579 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1580 DEBUG_PEEP("Peep", scan, depth, flags);
1581
1582
1583 /* The reason we do this here is that we need to deal with things like
1584 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1585 * parsing code, as each (?:..) is handled by a different invocation of
1586 * reg() -- Yves
1587 */
1588 if (REGNODE_TYPE(OP(scan)) == EXACT
1589 && OP(scan) != LEXACT
1590 && OP(scan) != LEXACT_REQ8
1591 && mutate_ok
1592 ) {
1593 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1594 0, NULL, depth + 1);
1595 }
1596
1597 /* Follow the next-chain of the current node and optimize
1598 away all the NOTHINGs from it.
1599 */
1600 rck_elide_nothing(scan);
1601
1602 /* The principal pseudo-switch. Cannot be a switch, since we look into
1603 * several different things. */
1604 if ( OP(scan) == DEFINEP ) {
1605 SSize_t minlen = 0;
1606 SSize_t deltanext = 0;
1607 SSize_t fake_last_close = 0;
1608 regnode *fake_last_close_op = NULL;
1609 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1610
1611 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1612 scan = regnext(scan);
1613 assert( OP(scan) == IFTHEN );
1614 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1615
1616 data_fake.last_closep= &fake_last_close;
1617 data_fake.last_close_opp= &fake_last_close_op;
1618 minlen = *minlenp;
1619 next = regnext(scan);
1620 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1621 DEBUG_PEEP("scan", scan, depth, flags);
1622 DEBUG_PEEP("next", next, depth, flags);
1623
1624 /* we suppose the run is continuous, last=next...
1625 * NOTE we dont use the return here! */
1626 /* DEFINEP study_chunk() recursion */
1627 (void)study_chunk(pRExC_state, &scan, &minlen,
1628 &deltanext, next, &data_fake, stopparen,
1629 recursed_depth, NULL, f, depth+1, mutate_ok);
1630
1631 scan = next;
1632 } else
1633 if (
1634 OP(scan) == BRANCH ||
1635 OP(scan) == BRANCHJ ||
1636 OP(scan) == IFTHEN
1637 ) {
1638 next = regnext(scan);
1639 code = OP(scan);
1640
1641 /* The op(next)==code check below is to see if we
1642 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1643 * IFTHEN is special as it might not appear in pairs.
1644 * Not sure whether BRANCH-BRANCHJ is possible, regardless
1645 * we dont handle it cleanly. */
1646 if (OP(next) == code || code == IFTHEN) {
1647 /* NOTE - There is similar code to this block below for
1648 * handling TRIE nodes on a re-study. If you change stuff here
1649 * check there too. */
1650 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1651 regnode_ssc accum;
1652 regnode * const startbranch=scan;
1653
1654 if (flags & SCF_DO_SUBSTR) {
1655 /* Cannot merge strings after this. */
1656 scan_commit(pRExC_state, data, minlenp, is_inf);
1657 }
1658
1659 if (flags & SCF_DO_STCLASS)
1660 ssc_init_zero(pRExC_state, &accum);
1661
1662 while (OP(scan) == code) {
1663 SSize_t deltanext, minnext, fake_last_close = 0;
1664 regnode *fake_last_close_op = NULL;
1665 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1666 regnode_ssc this_class;
1667
1668 DEBUG_PEEP("Branch", scan, depth, flags);
1669
1670 num++;
1671 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1672 if (data) {
1673 data_fake.whilem_c = data->whilem_c;
1674 data_fake.last_closep = data->last_closep;
1675 data_fake.last_close_opp = data->last_close_opp;
1676 }
1677 else {
1678 data_fake.last_closep = &fake_last_close;
1679 data_fake.last_close_opp = &fake_last_close_op;
1680 }
1681
1682 data_fake.pos_delta = delta;
1683 next = regnext(scan);
1684
1685 scan = REGNODE_AFTER_opcode(scan, code);
1686
1687 if (flags & SCF_DO_STCLASS) {
1688 ssc_init(pRExC_state, &this_class);
1689 data_fake.start_class = &this_class;
1690 f |= SCF_DO_STCLASS_AND;
1691 }
1692 if (flags & SCF_WHILEM_VISITED_POS)
1693 f |= SCF_WHILEM_VISITED_POS;
1694
1695 /* we suppose the run is continuous, last=next...*/
1696 /* recurse study_chunk() for each BRANCH in an alternation */
1697 minnext = study_chunk(pRExC_state, &scan, minlenp,
1698 &deltanext, next, &data_fake, stopparen,
1699 recursed_depth, NULL, f, depth+1,
1700 mutate_ok);
1701
1702 if (min1 > minnext)
1703 min1 = minnext;
1704 if (deltanext == OPTIMIZE_INFTY) {
1705 is_inf = is_inf_internal = 1;
1706 max1 = OPTIMIZE_INFTY;
1707 } else if (max1 < minnext + deltanext)
1708 max1 = minnext + deltanext;
1709 scan = next;
1710 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1711 pars++;
1712 if (data_fake.flags & SCF_SEEN_ACCEPT) {
1713 if ( stopmin > minnext)
1714 stopmin = min + min1;
1715 flags &= ~SCF_DO_SUBSTR;
1716 if (data)
1717 data->flags |= SCF_SEEN_ACCEPT;
1718 }
1719 if (data) {
1720 if (data_fake.flags & SF_HAS_EVAL)
1721 data->flags |= SF_HAS_EVAL;
1722 data->whilem_c = data_fake.whilem_c;
1723 }
1724 if (flags & SCF_DO_STCLASS)
1725 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1726 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1727 }
1728 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1729 min1 = 0;
1730 if (flags & SCF_DO_SUBSTR) {
1731 data->pos_min += min1;
1732 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1733 data->pos_delta = OPTIMIZE_INFTY;
1734 else
1735 data->pos_delta += max1 - min1;
1736 if (max1 != min1 || is_inf)
1737 data->cur_is_floating = 1;
1738 }
1739 min += min1;
1740 if (delta == OPTIMIZE_INFTY
1741 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1742 delta = OPTIMIZE_INFTY;
1743 else
1744 delta += max1 - min1;
1745 if (flags & SCF_DO_STCLASS_OR) {
1746 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1747 if (min1) {
1748 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1749 flags &= ~SCF_DO_STCLASS;
1750 }
1751 }
1752 else if (flags & SCF_DO_STCLASS_AND) {
1753 if (min1) {
1754 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1755 flags &= ~SCF_DO_STCLASS;
1756 }
1757 else {
1758 /* Switch to OR mode: cache the old value of
1759 * data->start_class */
1760 INIT_AND_WITHP;
1761 StructCopy(data->start_class, and_withp, regnode_ssc);
1762 flags &= ~SCF_DO_STCLASS_AND;
1763 StructCopy(&accum, data->start_class, regnode_ssc);
1764 flags |= SCF_DO_STCLASS_OR;
1765 }
1766 }
1767 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1768
1769 if (PERL_ENABLE_TRIE_OPTIMISATION
1770 && OP(startbranch) == BRANCH
1771 && mutate_ok
1772 ) {
1773 /* demq.
1774
1775 Assuming this was/is a branch we are dealing with: 'scan'
1776 now points at the item that follows the branch sequence,
1777 whatever it is. We now start at the beginning of the
1778 sequence and look for subsequences of
1779
1780 BRANCH->EXACT=>x1
1781 BRANCH->EXACT=>x2
1782 tail
1783
1784 which would be constructed from a pattern like
1785 /A|LIST|OF|WORDS/
1786
1787 If we can find such a subsequence we need to turn the first
1788 element into a trie and then add the subsequent branch exact
1789 strings to the trie.
1790
1791 We have two cases
1792
1793 1. patterns where the whole set of branches can be
1794 converted.
1795
1796 2. patterns where only a subset can be converted.
1797
1798 In case 1 we can replace the whole set with a single regop
1799 for the trie. In case 2 we need to keep the start and end
1800 branches so
1801
1802 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1803 becomes BRANCH TRIE; BRANCH X;
1804
1805 There is an additional case, that being where there is a
1806 common prefix, which gets split out into an EXACT like node
1807 preceding the TRIE node.
1808
1809 If X(1..n)==tail then we can do a simple trie, if not we make
1810 a "jump" trie, such that when we match the appropriate word
1811 we "jump" to the appropriate tail node. Essentially we turn
1812 a nested if into a case structure of sorts.
1813
1814 */
1815
1816 int made=0;
1817 if (!re_trie_maxbuff) {
1818 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1819 if (!SvIOK(re_trie_maxbuff))
1820 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1821 }
1822 if ( SvIV(re_trie_maxbuff)>=0 ) {
1823 regnode *cur;
1824 regnode *first = (regnode *)NULL;
1825 regnode *prev = (regnode *)NULL;
1826 regnode *tail = scan;
1827 U8 trietype = 0;
1828 U32 count=0;
1829
1830 /* var tail is used because there may be a TAIL
1831 regop in the way. Ie, the exacts will point to the
1832 thing following the TAIL, but the last branch will
1833 point at the TAIL. So we advance tail. If we
1834 have nested (?:) we may have to move through several
1835 tails.
1836 */
1837
1838 while ( OP( tail ) == TAIL ) {
1839 /* this is the TAIL generated by (?:) */
1840 tail = regnext( tail );
1841 }
1842
1843
1844 DEBUG_TRIE_COMPILE_r({
1845 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1846 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
1847 depth+1,
1848 "Looking for TRIE'able sequences. Tail node is ",
1849 (UV) REGNODE_OFFSET(tail),
1850 SvPV_nolen_const( RExC_mysv )
1851 );
1852 });
1853
1854 /*
1855
1856 Step through the branches
1857 cur represents each branch,
1858 noper is the first thing to be matched as part
1859 of that branch
1860 noper_next is the regnext() of that node.
1861
1862 We normally handle a case like this
1863 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1864 support building with NOJUMPTRIE, which restricts
1865 the trie logic to structures like /FOO|BAR/.
1866
1867 If noper is a trieable nodetype then the branch is
1868 a possible optimization target. If we are building
1869 under NOJUMPTRIE then we require that noper_next is
1870 the same as scan (our current position in the regex
1871 program).
1872
1873 Once we have two or more consecutive such branches
1874 we can create a trie of the EXACT's contents and
1875 stitch it in place into the program.
1876
1877 If the sequence represents all of the branches in
1878 the alternation we replace the entire thing with a
1879 single TRIE node.
1880
1881 Otherwise when it is a subsequence we need to
1882 stitch it in place and replace only the relevant
1883 branches. This means the first branch has to remain
1884 as it is used by the alternation logic, and its
1885 next pointer, and needs to be repointed at the item
1886 on the branch chain following the last branch we
1887 have optimized away.
1888
1889 This could be either a BRANCH, in which case the
1890 subsequence is internal, or it could be the item
1891 following the branch sequence in which case the
1892 subsequence is at the end (which does not
1893 necessarily mean the first node is the start of the
1894 alternation).
1895
1896 TRIE_TYPE(X) is a define which maps the optype to a
1897 trietype.
1898
1899 optype | trietype
1900 ----------------+-----------
1901 NOTHING | NOTHING
1902 EXACT | EXACT
1903 EXACT_REQ8 | EXACT
1904 EXACTFU | EXACTFU
1905 EXACTFU_REQ8 | EXACTFU
1906 EXACTFUP | EXACTFU
1907 EXACTFAA | EXACTFAA
1908 EXACTL | EXACTL
1909 EXACTFLU8 | EXACTFLU8
1910
1911
1912 */
1913 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
1914 ? NOTHING \
1915 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
1916 ? EXACT \
1917 : ( EXACTFU == (X) \
1918 || EXACTFU_REQ8 == (X) \
1919 || EXACTFUP == (X) ) \
1920 ? EXACTFU \
1921 : ( EXACTFAA == (X) ) \
1922 ? EXACTFAA \
1923 : ( EXACTL == (X) ) \
1924 ? EXACTL \
1925 : ( EXACTFLU8 == (X) ) \
1926 ? EXACTFLU8 \
1927 : 0 )
1928
1929 /* dont use tail as the end marker for this traverse */
1930 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1931 regnode * const noper = REGNODE_AFTER( cur );
1932 U8 noper_type = OP( noper );
1933 U8 noper_trietype = TRIE_TYPE( noper_type );
1934 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
1935 regnode * const noper_next = regnext( noper );
1936 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1937 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
1938 #endif
1939
1940 DEBUG_TRIE_COMPILE_r({
1941 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
1942 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
1943 depth+1,
1944 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
1945
1946 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
1947 Perl_re_printf( aTHX_ " -> %d:%s",
1948 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
1949
1950 if ( noper_next ) {
1951 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
1952 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
1953 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
1954 }
1955 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
1956 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
1957 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
1958 );
1959 });
1960
1961 /* Is noper a trieable nodetype that can be merged
1962 * with the current trie (if there is one)? */
1963 if ( noper_trietype
1964 &&
1965 (
1966 ( noper_trietype == NOTHING )
1967 || ( trietype == NOTHING )
1968 || ( trietype == noper_trietype )
1969 )
1970 #ifdef NOJUMPTRIE
1971 && noper_next >= tail
1972 #endif
1973 && count < U16_MAX)
1974 {
1975 /* Handle mergable triable node Either we are
1976 * the first node in a new trieable sequence,
1977 * in which case we do some bookkeeping,
1978 * otherwise we update the end pointer. */
1979 if ( !first ) {
1980 first = cur;
1981 if ( noper_trietype == NOTHING ) {
1982 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
1983 regnode * const noper_next = regnext( noper );
1984 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1985 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
1986 #endif
1987
1988 if ( noper_next_trietype ) {
1989 trietype = noper_next_trietype;
1990 } else if (noper_next_type) {
1991 /* a NOTHING regop is 1 regop wide.
1992 * We need at least two for a trie
1993 * so we can't merge this in */
1994 first = NULL;
1995 }
1996 } else {
1997 trietype = noper_trietype;
1998 }
1999 } else {
2000 if ( trietype == NOTHING )
2001 trietype = noper_trietype;
2002 prev = cur;
2003 }
2004 if (first)
2005 count++;
2006 } /* end handle mergable triable node */
2007 else {
2008 /* handle unmergable node -
2009 * noper may either be a triable node which can
2010 * not be tried together with the current trie,
2011 * or a non triable node */
2012 if ( prev ) {
2013 /* If last is set and trietype is not
2014 * NOTHING then we have found at least two
2015 * triable branch sequences in a row of a
2016 * similar trietype so we can turn them
2017 * into a trie. If/when we allow NOTHING to
2018 * start a trie sequence this condition
2019 * will be required, and it isn't expensive
2020 * so we leave it in for now. */
2021 if ( trietype && trietype != NOTHING )
2022 make_trie( pRExC_state,
2023 startbranch, first, cur, tail,
2024 count, trietype, depth+1 );
2025 prev = NULL; /* note: we clear/update
2026 first, trietype etc below,
2027 so we dont do it here */
2028 }
2029 if ( noper_trietype
2030 #ifdef NOJUMPTRIE
2031 && noper_next >= tail
2032 #endif
2033 ){
2034 /* noper is triable, so we can start a new
2035 * trie sequence */
2036 count = 1;
2037 first = cur;
2038 trietype = noper_trietype;
2039 } else if (first) {
2040 /* if we already saw a first but the
2041 * current node is not triable then we have
2042 * to reset the first information. */
2043 count = 0;
2044 first = NULL;
2045 trietype = 0;
2046 }
2047 } /* end handle unmergable node */
2048 } /* loop over branches */
2049 DEBUG_TRIE_COMPILE_r({
2050 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2051 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
2052 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2053 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
2054 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2055 REGNODE_NAME(trietype)
2056 );
2057
2058 });
2059 if ( prev && trietype ) {
2060 if ( trietype != NOTHING ) {
2061 /* the last branch of the sequence was part of
2062 * a trie, so we have to construct it here
2063 * outside of the loop */
2064 made= make_trie( pRExC_state, startbranch,
2065 first, scan, tail, count,
2066 trietype, depth+1 );
2067 #ifdef TRIE_STUDY_OPT
2068 if ( ((made == MADE_EXACT_TRIE &&
2069 startbranch == first)
2070 || ( first_non_open == first )) &&
2071 depth==0 ) {
2072 flags |= SCF_TRIE_RESTUDY;
2073 if ( startbranch == first
2074 && scan >= tail )
2075 {
2076 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2077 }
2078 }
2079 #endif
2080 } else {
2081 /* at this point we know whatever we have is a
2082 * NOTHING sequence/branch AND if 'startbranch'
2083 * is 'first' then we can turn the whole thing
2084 * into a NOTHING
2085 */
2086 if ( startbranch == first ) {
2087 regnode *opt;
2088 /* the entire thing is a NOTHING sequence,
2089 * something like this: (?:|) So we can
2090 * turn it into a plain NOTHING op. */
2091 DEBUG_TRIE_COMPILE_r({
2092 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2093 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2094 depth+1,
2095 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2096
2097 });
2098 OP(startbranch)= NOTHING;
2099 NEXT_OFF(startbranch)= tail - startbranch;
2100 for ( opt= startbranch + 1; opt < tail ; opt++ )
2101 OP(opt)= OPTIMIZED;
2102 }
2103 }
2104 } /* end if ( prev) */
2105 } /* TRIE_MAXBUF is non zero */
2106 } /* do trie */
2107 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2108 }
2109 else
2110 scan = REGNODE_AFTER_opcode(scan,code);
2111 continue;
2112 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2113 I32 paren = 0;
2114 regnode *start = NULL;
2115 regnode *end = NULL;
2116 U32 my_recursed_depth= recursed_depth;
2117
2118 if (OP(scan) != SUSPEND) { /* GOSUB */
2119 /* Do setup, note this code has side effects beyond
2120 * the rest of this block. Specifically setting
2121 * RExC_recurse[] must happen at least once during
2122 * study_chunk(). */
2123 paren = ARG1u(scan);
2124 RExC_recurse[ARG2i(scan)] = scan;
2125 start = REGNODE_p(RExC_open_parens[paren]);
2126 end = REGNODE_p(RExC_close_parens[paren]);
2127
2128 /* NOTE we MUST always execute the above code, even
2129 * if we do nothing with a GOSUB */
2130 if (
2131 ( flags & SCF_IN_DEFINE )
2132 ||
2133 (
2134 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2135 &&
2136 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2137 )
2138 ) {
2139 /* no need to do anything here if we are in a define. */
2140 /* or we are after some kind of infinite construct
2141 * so we can skip recursing into this item.
2142 * Since it is infinite we will not change the maxlen
2143 * or delta, and if we miss something that might raise
2144 * the minlen it will merely pessimise a little.
2145 *
2146 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2147 * might result in a minlen of 1 and not of 4,
2148 * but this doesn't make us mismatch, just try a bit
2149 * harder than we should.
2150 *
2151 * However we must assume this GOSUB is infinite, to
2152 * avoid wrongly applying other optimizations in the
2153 * enclosing scope - see GH 18096, for example.
2154 */
2155 is_inf = is_inf_internal = 1;
2156 scan= regnext(scan);
2157 continue;
2158 }
2159
2160 if (
2161 !recursed_depth
2162 || !PAREN_TEST(recursed_depth - 1, paren)
2163 ) {
2164 /* it is quite possible that there are more efficient ways
2165 * to do this. We maintain a bitmap per level of recursion
2166 * of which patterns we have entered so we can detect if a
2167 * pattern creates a possible infinite loop. When we
2168 * recurse down a level we copy the previous levels bitmap
2169 * down. When we are at recursion level 0 we zero the top
2170 * level bitmap. It would be nice to implement a different
2171 * more efficient way of doing this. In particular the top
2172 * level bitmap may be unnecessary.
2173 */
2174 if (!recursed_depth) {
2175 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2176 } else {
2177 Copy(PAREN_OFFSET(recursed_depth - 1),
2178 PAREN_OFFSET(recursed_depth),
2179 RExC_study_chunk_recursed_bytes, U8);
2180 }
2181 /* we havent recursed into this paren yet, so recurse into it */
2182 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2183 PAREN_SET(recursed_depth, paren);
2184 my_recursed_depth= recursed_depth + 1;
2185 } else {
2186 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2187 /* some form of infinite recursion, assume infinite length
2188 * */
2189 if (flags & SCF_DO_SUBSTR) {
2190 scan_commit(pRExC_state, data, minlenp, is_inf);
2191 data->cur_is_floating = 1;
2192 }
2193 is_inf = is_inf_internal = 1;
2194 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2195 ssc_anything(data->start_class);
2196 flags &= ~SCF_DO_STCLASS;
2197
2198 start= NULL; /* reset start so we dont recurse later on. */
2199 }
2200 } else {
2201 paren = stopparen;
2202 start = scan + 2;
2203 end = regnext(scan);
2204 }
2205 if (start) {
2206 scan_frame *newframe;
2207 assert(end);
2208 if (!RExC_frame_last) {
2209 Newxz(newframe, 1, scan_frame);
2210 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2211 RExC_frame_head= newframe;
2212 RExC_frame_count++;
2213 } else if (!RExC_frame_last->next_frame) {
2214 Newxz(newframe, 1, scan_frame);
2215 RExC_frame_last->next_frame= newframe;
2216 newframe->prev_frame= RExC_frame_last;
2217 RExC_frame_count++;
2218 } else {
2219 newframe= RExC_frame_last->next_frame;
2220 }
2221 RExC_frame_last= newframe;
2222
2223 newframe->next_regnode = regnext(scan);
2224 newframe->last_regnode = last;
2225 newframe->stopparen = stopparen;
2226 newframe->prev_recursed_depth = recursed_depth;
2227 newframe->this_prev_frame= frame;
2228 newframe->in_gosub = (
2229 (frame && frame->in_gosub) || OP(scan) == GOSUB
2230 );
2231
2232 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2233 DEBUG_PEEP("fnew", scan, depth, flags);
2234
2235 frame = newframe;
2236 scan = start;
2237 stopparen = paren;
2238 last = end;
2239 depth = depth + 1;
2240 recursed_depth= my_recursed_depth;
2241
2242 continue;
2243 }
2244 }
2245 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2246 SSize_t bytelen = STR_LEN(scan), charlen;
2247 UV uc;
2248 assert(bytelen);
2249 if (UTF) {
2250 const U8 * const s = (U8*)STRING(scan);
2251 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
2252 charlen = utf8_length(s, s + bytelen);
2253 } else {
2254 uc = *((U8*)STRING(scan));
2255 charlen = bytelen;
2256 }
2257 min += charlen;
2258 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2259 /* The code below prefers earlier match for fixed
2260 offset, later match for variable offset. */
2261 if (data->last_end == -1) { /* Update the start info. */
2262 data->last_start_min = data->pos_min;
2263 data->last_start_max =
2264 is_inf ? OPTIMIZE_INFTY
2265 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2266 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2267 }
2268 sv_catpvn(data->last_found, STRING(scan), bytelen);
2269 if (UTF)
2270 SvUTF8_on(data->last_found);
2271 {
2272 SV * const sv = data->last_found;
2273 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2274 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2275 if (mg && mg->mg_len >= 0)
2276 mg->mg_len += charlen;
2277 }
2278 data->last_end = data->pos_min + charlen;
2279 data->pos_min += charlen; /* As in the first entry. */
2280 data->flags &= ~SF_BEFORE_EOL;
2281 }
2282
2283 /* ANDing the code point leaves at most it, and not in locale, and
2284 * can't match null string */
2285 if (flags & SCF_DO_STCLASS_AND) {
2286 ssc_cp_and(data->start_class, uc);
2287 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2288 ssc_clear_locale(data->start_class);
2289 }
2290 else if (flags & SCF_DO_STCLASS_OR) {
2291 ssc_add_cp(data->start_class, uc);
2292 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2293
2294 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2295 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2296 }
2297 flags &= ~SCF_DO_STCLASS;
2298 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2299 }
2300 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2301 /* But OP != EXACT!, so is EXACTFish */
2302 SSize_t bytelen = STR_LEN(scan), charlen;
2303 const U8 * s = (U8*)STRING(scan);
2304
2305 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2306 * with the mask set to the complement of the bit that differs
2307 * between upper and lower case, and the lowest code point of the
2308 * pair (which the '&' forces) */
2309 if ( bytelen == 1
2310 && isALPHA_A(*s)
2311 && ( OP(scan) == EXACTFAA
2312 || ( OP(scan) == EXACTFU
2313 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2314 && mutate_ok
2315 ) {
2316 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2317
2318 OP(scan) = ANYOFM;
2319 ARG1u_SET(scan, *s & mask);
2320 FLAGS(scan) = mask;
2321 /* We're not EXACTFish any more, so restudy.
2322 * Search for "restudy" in this file to find
2323 * a comment with details. */
2324 continue;
2325 }
2326
2327 /* Search for fixed substrings supports EXACT only. */
2328 if (flags & SCF_DO_SUBSTR) {
2329 assert(data);
2330 scan_commit(pRExC_state, data, minlenp, is_inf);
2331 }
2332 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2333 if (unfolded_multi_char) {
2334 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2335 }
2336 min += charlen - min_subtract;
2337 assert (min >= 0);
2338 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2339 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2340 ) {
2341 delta += min_subtract;
2342 } else {
2343 delta = OPTIMIZE_INFTY;
2344 }
2345 if (flags & SCF_DO_SUBSTR) {
2346 data->pos_min += charlen - min_subtract;
2347 if (data->pos_min < 0) {
2348 data->pos_min = 0;
2349 }
2350 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2351 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2352 ) {
2353 data->pos_delta += min_subtract;
2354 } else {
2355 data->pos_delta = OPTIMIZE_INFTY;
2356 }
2357 if (min_subtract) {
2358 data->cur_is_floating = 1; /* float */
2359 }
2360 }
2361
2362 if (flags & SCF_DO_STCLASS) {
2363 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2364
2365 assert(EXACTF_invlist);
2366 if (flags & SCF_DO_STCLASS_AND) {
2367 if (OP(scan) != EXACTFL)
2368 ssc_clear_locale(data->start_class);
2369 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2370 ANYOF_POSIXL_ZERO(data->start_class);
2371 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
2372 }
2373 else { /* SCF_DO_STCLASS_OR */
2374 ssc_union(data->start_class, EXACTF_invlist, FALSE);
2375 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2376
2377 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2378 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2379 }
2380 flags &= ~SCF_DO_STCLASS;
2381 SvREFCNT_dec(EXACTF_invlist);
2382 }
2383 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2384 }
2385 else if (REGNODE_VARIES(OP(scan))) {
2386 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2387 I32 fl = 0;
2388 U32 f = flags;
2389 regnode * const oscan = scan;
2390 regnode_ssc this_class;
2391 regnode_ssc *oclass = NULL;
2392 I32 next_is_eval = 0;
2393
2394 switch (REGNODE_TYPE(OP(scan))) {
2395 case WHILEM: /* End of (?:...)* . */
2396 scan = REGNODE_AFTER(scan);
2397 goto finish;
2398 case PLUS:
2399 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2400 next = REGNODE_AFTER(scan);
2401 if ( ( REGNODE_TYPE(OP(next)) == EXACT
2402 && ! isEXACTFish(OP(next)))
2403 || (flags & SCF_DO_STCLASS))
2404 {
2405 mincount = 1;
2406 maxcount = REG_INFTY;
2407 next = regnext(scan);
2408 scan = REGNODE_AFTER(scan);
2409 goto do_curly;
2410 }
2411 }
2412 if (flags & SCF_DO_SUBSTR)
2413 data->pos_min++;
2414 /* This will bypass the formal 'min += minnext * mincount'
2415 * calculation in the do_curly path, so assumes min width
2416 * of the PLUS payload is exactly one. */
2417 min++;
2418 /* FALLTHROUGH */
2419 case STAR:
2420 next = REGNODE_AFTER(scan);
2421
2422 /* This temporary node can now be turned into EXACTFU, and
2423 * must, as regexec.c doesn't handle it */
2424 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2425 OP(next) = EXACTFU;
2426 }
2427
2428 if ( STR_LEN(next) == 1
2429 && isALPHA_A(* STRING(next))
2430 && ( OP(next) == EXACTFAA
2431 || ( OP(next) == EXACTFU
2432 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2433 && mutate_ok
2434 ) {
2435 /* These differ in just one bit */
2436 U8 mask = ~ ('A' ^ 'a');
2437
2438 assert(isALPHA_A(* STRING(next)));
2439
2440 /* Then replace it by an ANYOFM node, with
2441 * the mask set to the complement of the
2442 * bit that differs between upper and lower
2443 * case, and the lowest code point of the
2444 * pair (which the '&' forces) */
2445 OP(next) = ANYOFM;
2446 ARG1u_SET(next, *STRING(next) & mask);
2447 FLAGS(next) = mask;
2448 }
2449
2450 if (flags & SCF_DO_STCLASS) {
2451 mincount = 0;
2452 maxcount = REG_INFTY;
2453 next = regnext(scan);
2454 scan = REGNODE_AFTER(scan);
2455 goto do_curly;
2456 }
2457 if (flags & SCF_DO_SUBSTR) {
2458 scan_commit(pRExC_state, data, minlenp, is_inf);
2459 /* Cannot extend fixed substrings */
2460 data->cur_is_floating = 1; /* float */
2461 }
2462 is_inf = is_inf_internal = 1;
2463 scan = regnext(scan);
2464 goto optimize_curly_tail;
2465 case CURLY:
2466 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2467 && (FLAGS(scan) == stopparen))
2468 {
2469 mincount = 1;
2470 maxcount = 1;
2471 } else {
2472 mincount = ARG1i(scan);
2473 maxcount = ARG2i(scan);
2474 }
2475 next = regnext(scan);
2476 if (OP(scan) == CURLYX) {
2477 I32 lp = (data ? *(data->last_closep) : 0);
2478 FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2479 }
2480 scan = REGNODE_AFTER(scan);
2481 next_is_eval = (OP(scan) == EVAL);
2482 do_curly:
2483 if (flags & SCF_DO_SUBSTR) {
2484 if (mincount == 0)
2485 scan_commit(pRExC_state, data, minlenp, is_inf);
2486 /* Cannot extend fixed substrings */
2487 pos_before = data->pos_min;
2488 }
2489 if (data) {
2490 fl = data->flags;
2491 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2492 if (is_inf)
2493 data->flags |= SF_IS_INF;
2494 }
2495 if (flags & SCF_DO_STCLASS) {
2496 ssc_init(pRExC_state, &this_class);
2497 oclass = data->start_class;
2498 data->start_class = &this_class;
2499 f |= SCF_DO_STCLASS_AND;
2500 f &= ~SCF_DO_STCLASS_OR;
2501 }
2502 /* Exclude from super-linear cache processing any {n,m}
2503 regops for which the combination of input pos and regex
2504 pos is not enough information to determine if a match
2505 will be possible.
2506
2507 For example, in the regex /foo(bar\s*){4,8}baz/ with the
2508 regex pos at the \s*, the prospects for a match depend not
2509 only on the input position but also on how many (bar\s*)
2510 repeats into the {4,8} we are. */
2511 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2512 f &= ~SCF_WHILEM_VISITED_POS;
2513
2514 /* This will finish on WHILEM, setting scan, or on NULL: */
2515 /* recurse study_chunk() on loop bodies */
2516 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2517 last, data, stopparen, recursed_depth, NULL,
2518 (mincount == 0
2519 ? (f & ~SCF_DO_SUBSTR)
2520 : f)
2521 , depth+1, mutate_ok);
2522
2523 if (data && data->flags & SCF_SEEN_ACCEPT) {
2524 if (mincount > 1)
2525 mincount = 1;
2526 }
2527
2528 if (flags & SCF_DO_STCLASS)
2529 data->start_class = oclass;
2530 if (mincount == 0 || minnext == 0) {
2531 if (flags & SCF_DO_STCLASS_OR) {
2532 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2533 }
2534 else if (flags & SCF_DO_STCLASS_AND) {
2535 /* Switch to OR mode: cache the old value of
2536 * data->start_class */
2537 INIT_AND_WITHP;
2538 StructCopy(data->start_class, and_withp, regnode_ssc);
2539 flags &= ~SCF_DO_STCLASS_AND;
2540 StructCopy(&this_class, data->start_class, regnode_ssc);
2541 flags |= SCF_DO_STCLASS_OR;
2542 ANYOF_FLAGS(data->start_class)
2543 |= SSC_MATCHES_EMPTY_STRING;
2544 }
2545 } else { /* Non-zero len */
2546 if (flags & SCF_DO_STCLASS_OR) {
2547 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2548 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2549 }
2550 else if (flags & SCF_DO_STCLASS_AND)
2551 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2552 flags &= ~SCF_DO_STCLASS;
2553 }
2554 if (!scan) /* It was not CURLYX, but CURLY. */
2555 scan = next;
2556 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2557 /* ? quantifier ok, except for (?{ ... }) */
2558 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2559 && (minnext == 0) && (deltanext == 0)
2560 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2561 && maxcount <= REG_INFTY/3) /* Complement check for big
2562 count */
2563 {
2564 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2565 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2566 "Quantifier unexpected on zero-length expression "
2567 "in regex m/%" UTF8f "/",
2568 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2569 RExC_precomp)));
2570 }
2571
2572 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2573 || min >= SSize_t_MAX - minnext * mincount )
2574 {
2575 FAIL("Regexp out of space");
2576 }
2577
2578 min += minnext * mincount;
2579 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2580 || (maxcount == REG_INFTY && minnext + deltanext > 0);
2581 is_inf |= is_inf_internal;
2582 if (is_inf) {
2583 delta = OPTIMIZE_INFTY;
2584 } else {
2585 delta += (minnext + deltanext) * maxcount
2586 - minnext * mincount;
2587 }
2588
2589 if (data && data->flags & SCF_SEEN_ACCEPT) {
2590 if (flags & SCF_DO_SUBSTR) {
2591 scan_commit(pRExC_state, data, minlenp, is_inf);
2592 flags &= ~SCF_DO_SUBSTR;
2593 }
2594 if (stopmin > min)
2595 stopmin = min;
2596 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2597 }
2598 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2599 /* Try powerful optimization CURLYX => CURLYN. */
2600 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2601 && OP(oscan) == CURLYX
2602 && data
2603 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2604 non optimistic eval is seen
2605 anywhere.*/
2606 && ( data->flags & SF_IN_PAR ) /* has parens */
2607 && !deltanext
2608 && minnext == 1
2609 && mutate_ok
2610 ) {
2611 DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2612 /* Try to optimize to CURLYN. */
2613 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2614 regnode * const nxt1 = nxt;
2615 #ifdef DEBUGGING
2616 regnode *nxt2;
2617 #endif
2618 /* Skip open. */
2619 nxt = regnext(nxt);
2620 if (!REGNODE_SIMPLE(OP(nxt))
2621 && !(REGNODE_TYPE(OP(nxt)) == EXACT
2622 && STR_LEN(nxt) == 1))
2623 goto nogo;
2624 #ifdef DEBUGGING
2625 nxt2 = nxt;
2626 #endif
2627 nxt = regnext(nxt);
2628 if (OP(nxt) != CLOSE)
2629 goto nogo;
2630 if (RExC_open_parens) {
2631
2632 /*open->CURLYM*/
2633 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2634
2635 /*close->while*/
2636 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2637 }
2638 /* Now we know that nxt2 is the only contents: */
2639 FLAGS(oscan) = (U8)PARNO(nxt);
2640 OP(oscan) = CURLYN;
2641 OP(nxt1) = NOTHING; /* was OPEN. */
2642
2643 #ifdef DEBUGGING
2644 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2645 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2646 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2647 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2648 OP(nxt + 1) = OPTIMIZED; /* was count. */
2649 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2650 #endif
2651 }
2652 nogo:
2653
2654 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2655
2656 /* Try optimization CURLYX => CURLYM. */
2657 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2658 && OP(oscan) == CURLYX
2659 && data
2660 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2661 non optimistic eval is seen
2662 anywhere.*/
2663 && !(data->flags & SF_HAS_PAR) /* no parens! */
2664 && !deltanext /* atom is fixed width */
2665 && minnext != 0 /* CURLYM can't handle zero width */
2666 /* Nor characters whose fold at run-time may be
2667 * multi-character */
2668 && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2669 && mutate_ok
2670 ) {
2671 DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2672 /* XXXX How to optimize if data == 0? */
2673 /* Optimize to a simpler form. */
2674 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2675 regnode *nxt2;
2676
2677 OP(oscan) = CURLYM;
2678 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2679 && (OP(nxt2) != WHILEM))
2680 nxt = nxt2;
2681 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2682 /* Need to optimize away parenths. */
2683 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2684 /* Set the parenth number. */
2685 /* note that we have changed the type of oscan to CURLYM here */
2686 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2687
2688 FLAGS(oscan) = (U8)PARNO(nxt);
2689 if (RExC_open_parens) {
2690 /*open->CURLYM*/
2691 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2692
2693 /*close->NOTHING*/
2694 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2695 + 1;
2696 }
2697 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2698 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2699
2700 #ifdef DEBUGGING
2701 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2702 OP(nxt + 1) = OPTIMIZED; /* was count. */
2703 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2704 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2705 #endif
2706 #if 0
2707 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2708 regnode *nnxt = regnext(nxt1);
2709 if (nnxt == nxt) {
2710 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2711 ARG1u_SET(nxt1, nxt2 - nxt1);
2712 else if (nxt2 - nxt1 < U16_MAX)
2713 NEXT_OFF(nxt1) = nxt2 - nxt1;
2714 else
2715 OP(nxt) = NOTHING; /* Cannot beautify */
2716 }
2717 nxt1 = nnxt;
2718 }
2719 #endif
2720 /* Optimize again: */
2721 /* recurse study_chunk() on optimised CURLYX => CURLYM */
2722 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2723 NULL, stopparen, recursed_depth, NULL, 0,
2724 depth+1, mutate_ok);
2725 }
2726 else
2727 FLAGS(oscan) = 0;
2728 }
2729 else if ((OP(oscan) == CURLYX)
2730 && (flags & SCF_WHILEM_VISITED_POS)
2731 /* See the comment on a similar expression above.
2732 However, this time it's not a subexpression
2733 we care about, but the expression itself. */
2734 && (maxcount == REG_INFTY)
2735 && data) {
2736 /* This stays as CURLYX, we can put the count/of pair. */
2737 /* Find WHILEM (as in regexec.c) */
2738 regnode *nxt = oscan + NEXT_OFF(oscan);
2739
2740 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2741 nxt += ARG1u(nxt);
2742 nxt = REGNODE_BEFORE(nxt);
2743 if (FLAGS(nxt) & 0xf) {
2744 /* we've already set whilem count on this node */
2745 } else if (++data->whilem_c < 16) {
2746 assert(data->whilem_c <= RExC_whilem_seen);
2747 FLAGS(nxt) = (U8)(data->whilem_c
2748 | (RExC_whilem_seen << 4)); /* On WHILEM */
2749 }
2750 }
2751 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2752 pars++;
2753 if (flags & SCF_DO_SUBSTR) {
2754 SV *last_str = NULL;
2755 STRLEN last_chrs = 0;
2756 int counted = mincount != 0;
2757
2758 if (data->last_end > 0 && mincount != 0) { /* Ends with a
2759 string. */
2760 SSize_t b = pos_before >= data->last_start_min
2761 ? pos_before : data->last_start_min;
2762 STRLEN l;
2763 const char * const s = SvPV_const(data->last_found, l);
2764 SSize_t old = b - data->last_start_min;
2765 assert(old >= 0);
2766
2767 if (UTF)
2768 old = utf8_hop_forward((U8*)s, old,
2769 (U8 *) SvEND(data->last_found))
2770 - (U8*)s;
2771 l -= old;
2772 /* Get the added string: */
2773 last_str = newSVpvn_utf8(s + old, l, UTF);
2774 last_chrs = UTF ? utf8_length((U8*)(s + old),
2775 (U8*)(s + old + l)) : l;
2776 if (deltanext == 0 && pos_before == b) {
2777 /* What was added is a constant string */
2778 if (mincount > 1) {
2779
2780 SvGROW(last_str, (mincount * l) + 1);
2781 repeatcpy(SvPVX(last_str) + l,
2782 SvPVX_const(last_str), l,
2783 mincount - 1);
2784 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2785 /* Add additional parts. */
2786 SvCUR_set(data->last_found,
2787 SvCUR(data->last_found) - l);
2788 sv_catsv(data->last_found, last_str);
2789 {
2790 SV * sv = data->last_found;
2791 MAGIC *mg =
2792 SvUTF8(sv) && SvMAGICAL(sv) ?
2793 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2794 if (mg && mg->mg_len >= 0)
2795 mg->mg_len += last_chrs * (mincount-1);
2796 }
2797 last_chrs *= mincount;
2798 data->last_end += l * (mincount - 1);
2799 }
2800 } else {
2801 /* start offset must point into the last copy */
2802 data->last_start_min += minnext * (mincount - 1);
2803 data->last_start_max =
2804 is_inf
2805 ? OPTIMIZE_INFTY
2806 : data->last_start_max +
2807 (maxcount - 1) * (minnext + data->pos_delta);
2808 }
2809 }
2810 /* It is counted once already... */
2811 data->pos_min += minnext * (mincount - counted);
2812 #if 0
2813 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
2814 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
2815 " maxcount=%" UVuf " mincount=%" UVuf
2816 " data->pos_delta=%" UVuf "\n",
2817 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2818 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2819 if (deltanext != OPTIMIZE_INFTY)
2820 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
2821 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2822 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2823 #endif
2824 if (deltanext == OPTIMIZE_INFTY
2825 || data->pos_delta == OPTIMIZE_INFTY
2826 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2827 data->pos_delta = OPTIMIZE_INFTY;
2828 else
2829 data->pos_delta += - counted * deltanext +
2830 (minnext + deltanext) * maxcount - minnext * mincount;
2831 if (mincount != maxcount) {
2832 /* Cannot extend fixed substrings found inside
2833 the group. */
2834 scan_commit(pRExC_state, data, minlenp, is_inf);
2835 if (mincount && last_str) {
2836 SV * const sv = data->last_found;
2837 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2838 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2839
2840 if (mg)
2841 mg->mg_len = -1;
2842 sv_setsv(sv, last_str);
2843 data->last_end = data->pos_min;
2844 data->last_start_min = data->pos_min - last_chrs;
2845 data->last_start_max = is_inf
2846 ? OPTIMIZE_INFTY
2847 : data->pos_min + data->pos_delta - last_chrs;
2848 }
2849 data->cur_is_floating = 1; /* float */
2850 }
2851 SvREFCNT_dec(last_str);
2852 }
2853 if (data && (fl & SF_HAS_EVAL))
2854 data->flags |= SF_HAS_EVAL;
2855 optimize_curly_tail:
2856 rck_elide_nothing(oscan);
2857 continue;
2858
2859 default:
2860 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
2861 OP(scan));
2862 case REF:
2863 case CLUMP:
2864 if (flags & SCF_DO_SUBSTR) {
2865 /* Cannot expect anything... */
2866 scan_commit(pRExC_state, data, minlenp, is_inf);
2867 data->cur_is_floating = 1; /* float */
2868 }
2869 is_inf = is_inf_internal = 1;
2870 if (flags & SCF_DO_STCLASS_OR) {
2871 if (OP(scan) == CLUMP) {
2872 /* Actually is any start char, but very few code points
2873 * aren't start characters */
2874 ssc_match_all_cp(data->start_class);
2875 }
2876 else {
2877 ssc_anything(data->start_class);
2878 }
2879 }
2880 flags &= ~SCF_DO_STCLASS;
2881 break;
2882 }
2883 }
2884 else if (OP(scan) == LNBREAK) {
2885 if (flags & SCF_DO_STCLASS) {
2886 if (flags & SCF_DO_STCLASS_AND) {
2887 ssc_intersection(data->start_class,
2888 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
2889 ssc_clear_locale(data->start_class);
2890 ANYOF_FLAGS(data->start_class)
2891 &= ~SSC_MATCHES_EMPTY_STRING;
2892 }
2893 else if (flags & SCF_DO_STCLASS_OR) {
2894 ssc_union(data->start_class,
2895 PL_XPosix_ptrs[CC_VERTSPACE_],
2896 FALSE);
2897 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2898
2899 /* See commit msg for
2900 * 749e076fceedeb708a624933726e7989f2302f6a */
2901 ANYOF_FLAGS(data->start_class)
2902 &= ~SSC_MATCHES_EMPTY_STRING;
2903 }
2904 flags &= ~SCF_DO_STCLASS;
2905 }
2906 min++;
2907 if (delta != OPTIMIZE_INFTY)
2908 delta++; /* Because of the 2 char string cr-lf */
2909 if (flags & SCF_DO_SUBSTR) {
2910 /* Cannot expect anything... */
2911 scan_commit(pRExC_state, data, minlenp, is_inf);
2912 data->pos_min += 1;
2913 if (data->pos_delta != OPTIMIZE_INFTY) {
2914 data->pos_delta += 1;
2915 }
2916 data->cur_is_floating = 1; /* float */
2917 }
2918 }
2919 else if (REGNODE_SIMPLE(OP(scan))) {
2920
2921 if (flags & SCF_DO_SUBSTR) {
2922 scan_commit(pRExC_state, data, minlenp, is_inf);
2923 data->pos_min++;
2924 }
2925 min++;
2926 if (flags & SCF_DO_STCLASS) {
2927 bool invert = 0;
2928 SV* my_invlist = NULL;
2929 U8 namedclass;
2930
2931 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2932 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2933
2934 /* Some of the logic below assumes that switching
2935 locale on will only add false positives. */
2936 switch (OP(scan)) {
2937
2938 default:
2939 #ifdef DEBUGGING
2940 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
2941 OP(scan));
2942 #endif
2943 case SANY:
2944 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2945 ssc_match_all_cp(data->start_class);
2946 break;
2947
2948 case REG_ANY:
2949 {
2950 SV* REG_ANY_invlist = _new_invlist(2);
2951 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
2952 '\n');
2953 if (flags & SCF_DO_STCLASS_OR) {
2954 ssc_union(data->start_class,
2955 REG_ANY_invlist,
2956 TRUE /* TRUE => invert, hence all but \n
2957 */
2958 );
2959 }
2960 else if (flags & SCF_DO_STCLASS_AND) {
2961 ssc_intersection(data->start_class,
2962 REG_ANY_invlist,
2963 TRUE /* TRUE => invert */
2964 );
2965 ssc_clear_locale(data->start_class);
2966 }
2967 SvREFCNT_dec_NN(REG_ANY_invlist);
2968 }
2969 break;
2970
2971 case ANYOFD:
2972 case ANYOFL:
2973 case ANYOFPOSIXL:
2974 case ANYOFH:
2975 case ANYOFHb:
2976 case ANYOFHr:
2977 case ANYOFHs:
2978 case ANYOF:
2979 if (flags & SCF_DO_STCLASS_AND)
2980 ssc_and(pRExC_state, data->start_class,
2981 (regnode_charclass *) scan);
2982 else
2983 ssc_or(pRExC_state, data->start_class,
2984 (regnode_charclass *) scan);
2985 break;
2986
2987 case ANYOFHbbm:
2988 {
2989 SV* cp_list = get_ANYOFHbbm_contents(scan);
2990
2991 if (flags & SCF_DO_STCLASS_OR) {
2992 ssc_union(data->start_class, cp_list, invert);
2993 }
2994 else if (flags & SCF_DO_STCLASS_AND) {
2995 ssc_intersection(data->start_class, cp_list, invert);
2996 }
2997
2998 SvREFCNT_dec_NN(cp_list);
2999 break;
3000 }
3001
3002 case NANYOFM: /* NANYOFM already contains the inversion of the
3003 input ANYOF data, so, unlike things like
3004 NPOSIXA, don't change 'invert' to TRUE */
3005 /* FALLTHROUGH */
3006 case ANYOFM:
3007 {
3008 SV* cp_list = get_ANYOFM_contents(scan);
3009
3010 if (flags & SCF_DO_STCLASS_OR) {
3011 ssc_union(data->start_class, cp_list, invert);
3012 }
3013 else if (flags & SCF_DO_STCLASS_AND) {
3014 ssc_intersection(data->start_class, cp_list, invert);
3015 }
3016
3017 SvREFCNT_dec_NN(cp_list);
3018 break;
3019 }
3020
3021 case ANYOFR:
3022 case ANYOFRb:
3023 {
3024 SV* cp_list = NULL;
3025
3026 cp_list = _add_range_to_invlist(cp_list,
3027 ANYOFRbase(scan),
3028 ANYOFRbase(scan) + ANYOFRdelta(scan));
3029
3030 if (flags & SCF_DO_STCLASS_OR) {
3031 ssc_union(data->start_class, cp_list, invert);
3032 }
3033 else if (flags & SCF_DO_STCLASS_AND) {
3034 ssc_intersection(data->start_class, cp_list, invert);
3035 }
3036
3037 SvREFCNT_dec_NN(cp_list);
3038 break;
3039 }
3040
3041 case NPOSIXL:
3042 invert = 1;
3043 /* FALLTHROUGH */
3044
3045 case POSIXL:
3046 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3047 if (flags & SCF_DO_STCLASS_AND) {
3048 bool was_there = cBOOL(
3049 ANYOF_POSIXL_TEST(data->start_class,
3050 namedclass));
3051 ANYOF_POSIXL_ZERO(data->start_class);
3052 if (was_there) { /* Do an AND */
3053 ANYOF_POSIXL_SET(data->start_class, namedclass);
3054 }
3055 /* No individual code points can now match */
3056 data->start_class->invlist
3057 = sv_2mortal(_new_invlist(0));
3058 }
3059 else {
3060 int complement = namedclass + ((invert) ? -1 : 1);
3061
3062 assert(flags & SCF_DO_STCLASS_OR);
3063
3064 /* If the complement of this class was already there,
3065 * the result is that they match all code points,
3066 * (\d + \D == everything). Remove the classes from
3067 * future consideration. Locale is not relevant in
3068 * this case */
3069 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3070 ssc_match_all_cp(data->start_class);
3071 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3072 ANYOF_POSIXL_CLEAR(data->start_class, complement);
3073 }
3074 else { /* The usual case; just add this class to the
3075 existing set */
3076 ANYOF_POSIXL_SET(data->start_class, namedclass);
3077 }
3078 }
3079 break;
3080
3081 case NPOSIXA: /* For these, we always know the exact set of
3082 what's matched */
3083 invert = 1;
3084 /* FALLTHROUGH */
3085 case POSIXA:
3086 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3087 goto join_posix_and_ascii;
3088
3089 case NPOSIXD:
3090 case NPOSIXU:
3091 invert = 1;
3092 /* FALLTHROUGH */
3093 case POSIXD:
3094 case POSIXU:
3095 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3096
3097 /* NPOSIXD matches all upper Latin1 code points unless the
3098 * target string being matched is UTF-8, which is
3099 * unknowable until match time. Since we are going to
3100 * invert, we want to get rid of all of them so that the
3101 * inversion will match all */
3102 if (OP(scan) == NPOSIXD) {
3103 _invlist_subtract(my_invlist, PL_UpperLatin1,
3104 &my_invlist);
3105 }
3106
3107 join_posix_and_ascii:
3108
3109 if (flags & SCF_DO_STCLASS_AND) {
3110 ssc_intersection(data->start_class, my_invlist, invert);
3111 ssc_clear_locale(data->start_class);
3112 }
3113 else {
3114 assert(flags & SCF_DO_STCLASS_OR);
3115 ssc_union(data->start_class, my_invlist, invert);
3116 }
3117 SvREFCNT_dec(my_invlist);
3118 }
3119 if (flags & SCF_DO_STCLASS_OR)
3120 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3121 flags &= ~SCF_DO_STCLASS;
3122 }
3123 }
3124 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3125 data->flags |= (OP(scan) == MEOL
3126 ? SF_BEFORE_MEOL
3127 : SF_BEFORE_SEOL);
3128 scan_commit(pRExC_state, data, minlenp, is_inf);
3129
3130 }
3131 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
3132 /* Lookbehind, or need to calculate parens/evals/stclass: */
3133 && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS))
3134 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3135 {
3136 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3137 || OP(scan) == UNLESSM )
3138 {
3139 /* Negative Lookahead/lookbehind
3140 In this case we can't do fixed string optimisation.
3141 */
3142
3143 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3144 SSize_t deltanext, minnext;
3145 SSize_t fake_last_close = 0;
3146 regnode *fake_last_close_op = NULL;
3147 regnode *cur_last_close_op;
3148 regnode *nscan;
3149 regnode_ssc intrnl;
3150 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3151
3152 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3153 if (data) {
3154 data_fake.whilem_c = data->whilem_c;
3155 data_fake.last_closep = data->last_closep;
3156 data_fake.last_close_opp = data->last_close_opp;
3157 }
3158 else {
3159 data_fake.last_closep = &fake_last_close;
3160 data_fake.last_close_opp = &fake_last_close_op;
3161 }
3162
3163 /* remember the last_close_op we saw so we can see if
3164 * we are dealing with variable length lookbehind that
3165 * contains capturing buffers, which are considered
3166 * experimental */
3167 cur_last_close_op= *(data_fake.last_close_opp);
3168
3169 data_fake.pos_delta = delta;
3170 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3171 && OP(scan) == IFMATCH ) { /* Lookahead */
3172 ssc_init(pRExC_state, &intrnl);
3173 data_fake.start_class = &intrnl;
3174 f |= SCF_DO_STCLASS_AND;
3175 }
3176 if (flags & SCF_WHILEM_VISITED_POS)
3177 f |= SCF_WHILEM_VISITED_POS;
3178 next = regnext(scan);
3179 nscan = REGNODE_AFTER(scan);
3180
3181 /* recurse study_chunk() for lookahead body */
3182 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3183 last, &data_fake, stopparen,
3184 recursed_depth, NULL, f, depth+1,
3185 mutate_ok);
3186
3187 if (FLAGS(scan)) {
3188 if ( deltanext < 0
3189 || deltanext > (I32) U8_MAX
3190 || minnext > (I32)U8_MAX
3191 || minnext + deltanext > (I32)U8_MAX)
3192 {
3193 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3194 (UV)U8_MAX);
3195 }
3196
3197 /* The 'next_off' field has been repurposed to count the
3198 * additional starting positions to try beyond the initial
3199 * one. (This leaves it at 0 for non-variable length
3200 * matches to avoid breakage for those not using this
3201 * extension) */
3202 if (deltanext) {
3203 NEXT_OFF(scan) = deltanext;
3204 if (
3205 /* See a CLOSE op inside this lookbehind? */
3206 cur_last_close_op != *(data_fake.last_close_opp)
3207 /* and not doing restudy. see: restudied */
3208 && !(flags & SCF_TRIE_DOING_RESTUDY)
3209 ) {
3210 /* this is positive variable length lookbehind with
3211 * capture buffers inside of it */
3212 ckWARNexperimental_with_arg(RExC_parse,
3213 WARN_EXPERIMENTAL__VLB,
3214 "Variable length %s lookbehind with capturing is experimental",
3215 is_positive ? "positive" : "negative");
3216 }
3217 }
3218 FLAGS(scan) = (U8)minnext + deltanext;
3219 }
3220 if (data) {
3221 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3222 pars++;
3223 if (data_fake.flags & SF_HAS_EVAL)
3224 data->flags |= SF_HAS_EVAL;
3225 data->whilem_c = data_fake.whilem_c;
3226 }
3227 if (f & SCF_DO_STCLASS_AND) {
3228 if (flags & SCF_DO_STCLASS_OR) {
3229 /* OR before, AND after: ideally we would recurse with
3230 * data_fake to get the AND applied by study of the
3231 * remainder of the pattern, and then derecurse;
3232 * *** HACK *** for now just treat as "no information".
3233 * See [perl #56690].
3234 */
3235 ssc_init(pRExC_state, data->start_class);
3236 } else {
3237 /* AND before and after: combine and continue. These
3238 * assertions are zero-length, so can match an EMPTY
3239 * string */
3240 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3241 ANYOF_FLAGS(data->start_class)
3242 |= SSC_MATCHES_EMPTY_STRING;
3243 }
3244 }
3245 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3246 }
3247 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3248 else {
3249 /* Positive Lookahead/lookbehind
3250 In this case we can do fixed string optimisation,
3251 but we must be careful about it. Note in the case of
3252 lookbehind the positions will be offset by the minimum
3253 length of the pattern, something we won't know about
3254 until after the recurse.
3255 */
3256 SSize_t deltanext, fake_last_close = 0;
3257 regnode *last_close_op = NULL;
3258 regnode *nscan;
3259 regnode_ssc intrnl;
3260 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3261 /* We use SAVEFREEPV so that when the full compile
3262 is finished perl will clean up the allocated
3263 minlens when it's all done. This way we don't
3264 have to worry about freeing them when we know
3265 they wont be used, which would be a pain.
3266 */
3267 SSize_t *minnextp;
3268 Newx( minnextp, 1, SSize_t );
3269 SAVEFREEPV(minnextp);
3270
3271 if (data) {
3272 StructCopy(data, &data_fake, scan_data_t);
3273 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3274 f |= SCF_DO_SUBSTR;
3275 if (FLAGS(scan))
3276 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3277 data_fake.last_found=newSVsv(data->last_found);
3278 }
3279 }
3280 else {
3281 data_fake.last_closep = &fake_last_close;
3282 data_fake.last_close_opp = &fake_last_close_opp;
3283 }
3284 data_fake.flags = 0;
3285 data_fake.substrs[0].flags = 0;
3286 data_fake.substrs[1].flags = 0;
3287 data_fake.pos_delta = delta;
3288 if (is_inf)
3289 data_fake.flags |= SF_IS_INF;
3290 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3291 && OP(scan) == IFMATCH ) { /* Lookahead */
3292 ssc_init(pRExC_state, &intrnl);
3293 data_fake.start_class = &intrnl;
3294 f |= SCF_DO_STCLASS_AND;
3295 }
3296 if (flags & SCF_WHILEM_VISITED_POS)
3297 f |= SCF_WHILEM_VISITED_POS;
3298 next = regnext(scan);
3299 nscan = REGNODE_AFTER(scan);
3300
3301 /* positive lookahead study_chunk() recursion */
3302 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3303 &deltanext, last, &data_fake,
3304 stopparen, recursed_depth, NULL,
3305 f, depth+1, mutate_ok);
3306 if (FLAGS(scan)) {
3307 assert(0); /* This code has never been tested since this
3308 is normally not compiled */
3309 if ( deltanext < 0
3310 || deltanext > (I32) U8_MAX
3311 || *minnextp > (I32)U8_MAX
3312 || *minnextp + deltanext > (I32)U8_MAX)
3313 {
3314 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3315 (UV)U8_MAX);
3316 }
3317
3318 if (deltanext) {
3319 NEXT_OFF(scan) = deltanext;
3320 }
3321 FLAGS(scan) = (U8)*minnextp + deltanext;
3322 }
3323
3324 *minnextp += min;
3325
3326 if (f & SCF_DO_STCLASS_AND) {
3327 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3328 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3329 }
3330 if (data) {
3331 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3332 pars++;
3333 if (data_fake.flags & SF_HAS_EVAL)
3334 data->flags |= SF_HAS_EVAL;
3335 data->whilem_c = data_fake.whilem_c;
3336 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3337 int i;
3338 if (RExC_rx->minlen < *minnextp)
3339 RExC_rx->minlen = *minnextp;
3340 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3341 SvREFCNT_dec_NN(data_fake.last_found);
3342
3343 for (i = 0; i < 2; i++) {
3344 if (data_fake.substrs[i].minlenp != minlenp) {
3345 data->substrs[i].min_offset =
3346 data_fake.substrs[i].min_offset;
3347 data->substrs[i].max_offset =
3348 data_fake.substrs[i].max_offset;
3349 data->substrs[i].minlenp =
3350 data_fake.substrs[i].minlenp;
3351 data->substrs[i].lookbehind += FLAGS(scan);
3352 }
3353 }
3354 }
3355 }
3356 }
3357 #endif
3358 }
3359 else if (OP(scan) == OPEN) {
3360 if (stopparen != (I32)PARNO(scan))
3361 pars++;
3362 }
3363 else if (OP(scan) == CLOSE) {
3364 if (stopparen == (I32)PARNO(scan)) {
3365 break;
3366 }
3367 if ((I32)PARNO(scan) == is_par) {
3368 next = regnext(scan);
3369
3370 if ( next && (OP(next) != WHILEM) && next < last)
3371 is_par = 0; /* Disable optimization */
3372 }
3373 if (data) {
3374 *(data->last_closep) = PARNO(scan);
3375 *(data->last_close_opp) = scan;
3376 }
3377 }
3378 else if (OP(scan) == EVAL) {
3379 if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) )
3380 data->flags |= SF_HAS_EVAL;
3381 }
3382 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3383 if (flags & SCF_DO_SUBSTR) {
3384 scan_commit(pRExC_state, data, minlenp, is_inf);
3385 flags &= ~SCF_DO_SUBSTR;
3386 }
3387 if (OP(scan)==ACCEPT) {
3388 /* m{(*ACCEPT)x} does not have to start with 'x' */
3389 flags &= ~SCF_DO_STCLASS;
3390 if (data)
3391 data->flags |= SCF_SEEN_ACCEPT;
3392 if (stopmin > min)
3393 stopmin = min;
3394 }
3395 }
3396 else if (OP(scan) == COMMIT) {
3397 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3398 * must not end up with "abcxyz" as a fixed substring else we'll
3399 * skip straight to attempting to match at offset 4.
3400 */
3401 if (flags & SCF_DO_SUBSTR) {
3402 scan_commit(pRExC_state, data, minlenp, is_inf);
3403 flags &= ~SCF_DO_SUBSTR;
3404 }
3405 }
3406 else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */
3407 {
3408 if (flags & SCF_DO_SUBSTR) {
3409 scan_commit(pRExC_state, data, minlenp, is_inf);
3410 data->cur_is_floating = 1; /* float */
3411 }
3412 is_inf = is_inf_internal = 1;
3413 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3414 ssc_anything(data->start_class);
3415 flags &= ~SCF_DO_STCLASS;
3416 }
3417 else if (OP(scan) == GPOS) {
3418 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3419 !(delta || is_inf || (data && data->pos_delta)))
3420 {
3421 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3422 RExC_rx->intflags |= PREGf_ANCH_GPOS;
3423 if (RExC_rx->gofs < (STRLEN)min)
3424 RExC_rx->gofs = min;
3425 } else {
3426 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3427 RExC_rx->gofs = 0;
3428 }
3429 }
3430 #ifdef TRIE_STUDY_OPT
3431 #ifdef FULL_TRIE_STUDY
3432 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3433 /* NOTE - There is similar code to this block above for handling
3434 BRANCH nodes on the initial study. If you change stuff here
3435 check there too. */
3436 regnode *trie_node= scan;
3437 regnode *tail= regnext(scan);
3438 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3439 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3440 regnode_ssc accum;
3441
3442 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3443 /* Cannot merge strings after this. */
3444 scan_commit(pRExC_state, data, minlenp, is_inf);
3445 }
3446 if (flags & SCF_DO_STCLASS)
3447 ssc_init_zero(pRExC_state, &accum);
3448
3449 if (!trie->jump) {
3450 min1= trie->minlen;
3451 max1= trie->maxlen;
3452 } else {
3453 const regnode *nextbranch= NULL;
3454 U32 word;
3455
3456 for ( word=1 ; word <= trie->wordcount ; word++)
3457 {
3458 SSize_t deltanext = 0, minnext = 0;
3459 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3460 SSize_t fake_last_close = 0;
3461 regnode *fake_last_close_op = NULL;
3462 regnode_ssc this_class;
3463
3464 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3465 if (data) {
3466 data_fake.whilem_c = data->whilem_c;
3467 data_fake.last_closep = data->last_closep;
3468 data_fake.last_close_opp = data->last_close_opp;
3469 }
3470 else {
3471 data_fake.last_closep = &fake_last_close;
3472 data_fake.last_close_opp = &fake_last_close_op;
3473 }
3474 data_fake.pos_delta = delta;
3475 if (flags & SCF_DO_STCLASS) {
3476 ssc_init(pRExC_state, &this_class);
3477 data_fake.start_class = &this_class;
3478 f |= SCF_DO_STCLASS_AND;
3479 }
3480 if (flags & SCF_WHILEM_VISITED_POS)
3481 f |= SCF_WHILEM_VISITED_POS;
3482
3483 if (trie->jump[word]) {
3484 if (!nextbranch)
3485 nextbranch = trie_node + trie->jump[0];
3486 scan= trie_node + trie->jump[word];
3487 /* We go from the jump point to the branch that follows
3488 it. Note this means we need the vestigal unused
3489 branches even though they arent otherwise used. */
3490 /* optimise study_chunk() for TRIE */
3491 minnext = study_chunk(pRExC_state, &scan, minlenp,
3492 &deltanext, (regnode *)nextbranch, &data_fake,
3493 stopparen, recursed_depth, NULL, f, depth+1,
3494 mutate_ok);
3495 }
3496 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3497 nextbranch= regnext((regnode*)nextbranch);
3498
3499 if (min1 > (SSize_t)(minnext + trie->minlen))
3500 min1 = minnext + trie->minlen;
3501 if (deltanext == OPTIMIZE_INFTY) {
3502 is_inf = is_inf_internal = 1;
3503 max1 = OPTIMIZE_INFTY;
3504 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3505 max1 = minnext + deltanext + trie->maxlen;
3506
3507 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3508 pars++;
3509 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3510 if ( stopmin > min + min1)
3511 stopmin = min + min1;
3512 flags &= ~SCF_DO_SUBSTR;
3513 if (data)
3514 data->flags |= SCF_SEEN_ACCEPT;
3515 }
3516 if (data) {
3517 if (data_fake.flags & SF_HAS_EVAL)
3518 data->flags |= SF_HAS_EVAL;
3519 data->whilem_c = data_fake.whilem_c;
3520 }
3521 if (flags & SCF_DO_STCLASS)
3522 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3523 }
3524 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3525 }
3526 if (flags & SCF_DO_SUBSTR) {
3527 data->pos_min += min1;
3528 data->pos_delta += max1 - min1;
3529 if (max1 != min1 || is_inf)
3530 data->cur_is_floating = 1; /* float */
3531 }
3532 min += min1;
3533 if (delta != OPTIMIZE_INFTY) {
3534 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3535 delta += max1 - min1;
3536 else
3537 delta = OPTIMIZE_INFTY;
3538 }
3539 if (flags & SCF_DO_STCLASS_OR) {
3540 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3541 if (min1) {
3542 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3543 flags &= ~SCF_DO_STCLASS;
3544 }
3545 }
3546 else if (flags & SCF_DO_STCLASS_AND) {
3547 if (min1) {
3548 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3549 flags &= ~SCF_DO_STCLASS;
3550 }
3551 else {
3552 /* Switch to OR mode: cache the old value of
3553 * data->start_class */
3554 INIT_AND_WITHP;
3555 StructCopy(data->start_class, and_withp, regnode_ssc);
3556 flags &= ~SCF_DO_STCLASS_AND;
3557 StructCopy(&accum, data->start_class, regnode_ssc);
3558 flags |= SCF_DO_STCLASS_OR;
3559 }
3560 }
3561 scan= tail;
3562 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3563 continue;
3564 }
3565 #else
3566 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3567 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3568 U8*bang=NULL;
3569
3570 min += trie->minlen;
3571 delta += (trie->maxlen - trie->minlen);
3572 flags &= ~SCF_DO_STCLASS; /* xxx */
3573 if (flags & SCF_DO_SUBSTR) {
3574 /* Cannot expect anything... */
3575 scan_commit(pRExC_state, data, minlenp, is_inf);
3576 data->pos_min += trie->minlen;
3577 data->pos_delta += (trie->maxlen - trie->minlen);
3578 if (trie->maxlen != trie->minlen)
3579 data->cur_is_floating = 1; /* float */
3580 }
3581 if (trie->jump) /* no more substrings -- for now /grr*/
3582 flags &= ~SCF_DO_SUBSTR;
3583 }
3584
3585 #endif /* old or new */
3586 #endif /* TRIE_STUDY_OPT */
3587
3588 else if (OP(scan) == REGEX_SET) {
3589 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
3590 " before optimization", REGNODE_NAME(REGEX_SET));
3591 }
3592
3593 /* Else: zero-length, ignore. */
3594 scan = regnext(scan);
3595 }
3596
3597 finish:
3598 if (frame) {
3599 /* we need to unwind recursion. */
3600 depth = depth - 1;
3601
3602 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3603 DEBUG_PEEP("fend", scan, depth, flags);
3604
3605 /* restore previous context */
3606 last = frame->last_regnode;
3607 scan = frame->next_regnode;
3608 stopparen = frame->stopparen;
3609 recursed_depth = frame->prev_recursed_depth;
3610
3611 RExC_frame_last = frame->prev_frame;
3612 frame = frame->this_prev_frame;
3613 goto fake_study_recurse;
3614 }
3615
3616 assert(!frame);
3617 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3618
3619 /* is this pattern infinite? Eg, consider /(a|b+)/ */
3620 if (is_inf_internal)
3621 delta = OPTIMIZE_INFTY;
3622
3623 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3624 if (min > stopmin) {
3625 /*
3626 At this point 'min' represents the minimum length string we can
3627 match while *ignoring* the implication of ACCEPT, and 'delta'
3628 represents the difference between the minimum length and maximum
3629 length, and if the pattern matches an infinitely long string
3630 (consider the + and * quantifiers) then we use the special delta
3631 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3632 minimum length that can be matched *and* accepted.
3633
3634 A pattern is accepted when matching was successful *and*
3635 complete, and thus there is no further matching needing to be
3636 done, no backtracking to occur, etc. Prior to the introduction
3637 of ACCEPT the only opcode that signaled acceptance was the END
3638 opcode, which is always the very last opcode in a regex program.
3639 ACCEPT is thus conceptually an early successful return out of
3640 the matching process. stopmin starts out as OPTIMIZE_INFTY to
3641 represent "the entire pattern", and is ratched down to the
3642 "current min" if necessary when an ACCEPT opcode is encountered.
3643
3644 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3645 and we now need to account for it in both min and delta.
3646 Consider that in a pattern /AB/ normally the min length it can
3647 match can be computed as min(A)+min(B). But (*ACCEPT) means
3648 that it might be something else, not even neccesarily min(A) at
3649 all. Consider
3650
3651 A = /(foo(*ACCEPT)|x+)/
3652 B = /whop/
3653 AB = /(foo(*ACCEPT)|x+)whop/
3654
3655 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3656 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3657 "whop", and the delta of 0 as the pattern is of fixed length, the
3658 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3659 When handling AB we expect to see a min of 5 for "xwhop", and a
3660 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3661 for "foo". This should result in a final min of 3 for "foo", and
3662 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3663
3664 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3665 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3666 stop min would be 4 for "dude". This should result in a final
3667 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3668
3669 When min is smaller than stopmin then we can ignore it. In the
3670 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3671 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3672 the ACCEPT doesn't reduce the minimum length of the string that
3673 might be matched, nor affect the maximum length.
3674
3675 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3676 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3677 "foo". We currently turn this into a min of 3 for "foo" and a
3678 delta of 3 for "foobar" even though technically "foobar" isn't
3679 possible. ACCEPT affects some aspects of the optimizer, like
3680 length computations and mandatory substring optimizations, but
3681 there are other optimzations this routine perfoms that are not
3682 affected and this compromise simplifies implementation.
3683
3684 It might be helpful to consider that this C function is called
3685 recursively on the pattern in a bottom up fashion, and that the
3686 min returned by a nested call may be marked as coming from an
3687 ACCEPT, causing its callers to treat the returned min as a
3688 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3689 multiple calls into this function in different ways.
3690 */
3691
3692 if (OPTIMIZE_INFTY - delta >= min - stopmin)
3693 delta += min - stopmin;
3694 else
3695 delta = OPTIMIZE_INFTY;
3696 min = stopmin;
3697 }
3698
3699 *scanp = scan;
3700 *deltap = delta;
3701
3702 if (flags & SCF_DO_SUBSTR && is_inf)
3703 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3704 if (is_par > (I32)U8_MAX)
3705 is_par = 0;
3706 if (is_par && pars==1 && data) {
3707 data->flags |= SF_IN_PAR;
3708 data->flags &= ~SF_HAS_PAR;
3709 }
3710 else if (pars && data) {
3711 data->flags |= SF_HAS_PAR;
3712 data->flags &= ~SF_IN_PAR;
3713 }
3714 if (flags & SCF_DO_STCLASS_OR)
3715 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3716 if (flags & SCF_TRIE_RESTUDY)
3717 data->flags |= SCF_TRIE_RESTUDY;
3718
3719
3720 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3721 if (min > OPTIMIZE_INFTY - delta)
3722 RExC_maxlen = OPTIMIZE_INFTY;
3723 else if (RExC_maxlen < min + delta)
3724 RExC_maxlen = min + delta;
3725 }
3726 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
3727 return min;
3728 }
3729