1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_STUDY_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21 #define INIT_AND_WITHP \
22 assert(!and_withp); \
23 Newx(and_withp, 1, regnode_ssc); \
24 SAVEFREEPV(and_withp)
25
26
27 STATIC void
S_unwind_scan_frames(pTHX_ const void * p)28 S_unwind_scan_frames(pTHX_ const void *p)
29 {
30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31 scan_frame *f= (scan_frame *)p;
32 do {
33 scan_frame *n= f->next_frame;
34 Safefree(f);
35 f= n;
36 } while (f);
37 }
38
39 /* Follow the next-chain of the current node and optimize away
40 all the NOTHINGs from it.
41 */
42 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)43 S_rck_elide_nothing(pTHX_ regnode *node)
44 {
45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
46
47 if (OP(node) != CURLYX) {
48 const int max = (REGNODE_OFF_BY_ARG(OP(node))
49 ? I32_MAX
50 /* I32 may be smaller than U16 on CRAYs! */
51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node));
53 int noff;
54 regnode *n = node;
55
56 /* Skip NOTHING and LONGJMP. */
57 while (
58 (n = regnext(n))
59 && (
60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61 || ((OP(n) == LONGJMP) && (noff = ARG1u(n)))
62 )
63 && off + noff < max
64 ) {
65 off += noff;
66 }
67 if (REGNODE_OFF_BY_ARG(OP(node)))
68 ARG1u(node) = off;
69 else
70 NEXT_OFF(node) = off;
71 }
72 return;
73 }
74
75
76 /*
77 * As best we can, determine the characters that can match the start of
78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
79 * can be false positive matches
80 *
81 * Returns the invlist as a new SV*; it is the caller's responsibility to
82 * call SvREFCNT_dec() when done with it.
83 */
84 STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
86 {
87 const U8 * s = (U8*)STRING(node);
88 SSize_t bytelen = STR_LEN(node);
89 UV uc;
90 /* Start out big enough for 2 separate code points */
91 SV* invlist = _new_invlist(4);
92
93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
94
95 if (! UTF) {
96 uc = *s;
97
98 /* We punt and assume can match anything if the node begins
99 * with a multi-character fold. Things are complicated. For
100 * example, /ffi/i could match any of:
101 * "\N{LATIN SMALL LIGATURE FFI}"
102 * "\N{LATIN SMALL LIGATURE FF}I"
103 * "F\N{LATIN SMALL LIGATURE FI}"
104 * plus several other things; and making sure we have all the
105 * possibilities is hard. */
106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
108 }
109 else {
110 /* Any Latin1 range character can potentially match any
111 * other depending on the locale, and in Turkic locales, 'I' and
112 * 'i' can match U+130 and U+131 */
113 if (OP(node) == EXACTFL) {
114 _invlist_union(invlist, PL_Latin1, &invlist);
115 if (isALPHA_FOLD_EQ(uc, 'I')) {
116 invlist = add_cp_to_invlist(invlist,
117 LATIN_SMALL_LETTER_DOTLESS_I);
118 invlist = add_cp_to_invlist(invlist,
119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
120 }
121 }
122 else {
123 /* But otherwise, it matches at least itself. We can
124 * quickly tell if it has a distinct fold, and if so,
125 * it matches that as well */
126 invlist = add_cp_to_invlist(invlist, uc);
127 if (IS_IN_SOME_FOLD_L1(uc))
128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
129 }
130
131 /* Some characters match above-Latin1 ones under /i. This
132 * is true of EXACTFL ones when the locale is UTF-8 */
133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
135 EXACTFAA_NO_TRIE)))
136 {
137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
138 }
139 }
140 }
141 else { /* Pattern is UTF-8 */
142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143 const U8* e = s + bytelen;
144 IV fc;
145
146 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
147
148 /* The only code points that aren't folded in a UTF EXACTFish
149 * node are the problematic ones in EXACTFL nodes */
150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151 /* We need to check for the possibility that this EXACTFL
152 * node begins with a multi-char fold. Therefore we fold
153 * the first few characters of it so that we can make that
154 * check */
155 U8 *d = folded;
156 int i;
157
158 fc = -1;
159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
160 if (isASCII(*s)) {
161 *(d++) = (U8) toFOLD(*s);
162 if (fc < 0) { /* Save the first fold */
163 fc = *(d-1);
164 }
165 s++;
166 }
167 else {
168 STRLEN len;
169 UV fold = toFOLD_utf8_safe(s, e, d, &len);
170 if (fc < 0) { /* Save the first fold */
171 fc = fold;
172 }
173 d += len;
174 s += UTF8SKIP(s);
175 }
176 }
177
178 /* And set up so the code below that looks in this folded
179 * buffer instead of the node's string */
180 e = d;
181 s = folded;
182 }
183
184 /* When we reach here 's' points to the fold of the first
185 * character(s) of the node; and 'e' points to far enough along
186 * the folded string to be just past any possible multi-char
187 * fold.
188 *
189 * Like the non-UTF case above, we punt if the node begins with a
190 * multi-char fold */
191
192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
194 }
195 else { /* Single char fold */
196 unsigned int k;
197 U32 first_fold;
198 const U32 * remaining_folds;
199 Size_t folds_count;
200
201 /* It matches itself */
202 invlist = add_cp_to_invlist(invlist, fc);
203
204 /* ... plus all the things that fold to it, which are found in
205 * PL_utf8_foldclosures */
206 folds_count = _inverse_folds(fc, &first_fold,
207 &remaining_folds);
208 for (k = 0; k < folds_count; k++) {
209 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
210
211 /* /aa doesn't allow folds between ASCII and non- */
212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213 && isASCII(c) != isASCII(fc))
214 {
215 continue;
216 }
217
218 invlist = add_cp_to_invlist(invlist, c);
219 }
220
221 if (OP(node) == EXACTFL) {
222
223 /* If either [iI] are present in an EXACTFL node the above code
224 * should have added its normal case pair, but under a Turkish
225 * locale they could match instead the case pairs from it. Add
226 * those as potential matches as well */
227 if (isALPHA_FOLD_EQ(fc, 'I')) {
228 invlist = add_cp_to_invlist(invlist,
229 LATIN_SMALL_LETTER_DOTLESS_I);
230 invlist = add_cp_to_invlist(invlist,
231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
232 }
233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234 invlist = add_cp_to_invlist(invlist, 'I');
235 }
236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237 invlist = add_cp_to_invlist(invlist, 'i');
238 }
239 }
240 }
241 }
242
243 return invlist;
244 }
245
246
247 /* Mark that we cannot extend a found fixed substring at this point.
248 Update the longest found anchored substring or the longest found
249 floating substrings if needed. */
250
251 void
Perl_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253 SSize_t *minlenp, int is_inf)
254 {
255 const STRLEN l = CHR_SVLEN(data->last_found);
256 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257 const STRLEN old_l = CHR_SVLEN(longest_sv);
258 DECLARE_AND_GET_RE_DEBUG_FLAGS;
259
260 PERL_ARGS_ASSERT_SCAN_COMMIT;
261
262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263 const U8 i = data->cur_is_floating;
264 SvSetMagicSV(longest_sv, data->last_found);
265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
266
267 if (!i) /* fixed */
268 data->substrs[0].max_offset = data->substrs[0].min_offset;
269 else { /* float */
270 data->substrs[1].max_offset =
271 (is_inf)
272 ? OPTIMIZE_INFTY
273 : (l
274 ? data->last_start_max
275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
276 ? OPTIMIZE_INFTY
277 : data->pos_min + data->pos_delta));
278 }
279
280 data->substrs[i].flags &= ~SF_BEFORE_EOL;
281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282 data->substrs[i].minlenp = minlenp;
283 data->substrs[i].lookbehind = 0;
284 }
285
286 SvCUR_set(data->last_found, 0);
287 {
288 SV * const sv = data->last_found;
289 if (SvUTF8(sv) && SvMAGICAL(sv)) {
290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
291 if (mg)
292 mg->mg_len = 0;
293 }
294 }
295 data->last_end = -1;
296 data->flags &= ~SF_BEFORE_EOL;
297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
298 }
299
300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301 * list that describes which code points it matches */
302
303 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
305 {
306 /* Set the SSC 'ssc' to match an empty string or any code point */
307
308 PERL_ARGS_ASSERT_SSC_ANYTHING;
309
310 assert(is_ANYOF_SYNTHETIC(ssc));
311
312 /* mortalize so won't leak */
313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
315 }
316
317 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)318 S_ssc_is_anything(const regnode_ssc *ssc)
319 {
320 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
321 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
322 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
323 * in any way, so there's no point in using it */
324
325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
326 bool ret;
327
328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
329
330 assert(is_ANYOF_SYNTHETIC(ssc));
331
332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
333 return FALSE;
334 }
335
336 /* See if the list consists solely of the range 0 - Infinity */
337 invlist_iterinit(ssc->invlist);
338 ret = invlist_iternext(ssc->invlist, &start, &end)
339 && start == 0
340 && end == UV_MAX;
341
342 invlist_iterfinish(ssc->invlist);
343
344 if (ret) {
345 return TRUE;
346 }
347
348 /* If e.g., both \w and \W are set, matches everything */
349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
350 int i;
351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
353 return TRUE;
354 }
355 }
356 }
357
358 return FALSE;
359 }
360
361 void
Perl_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
363 {
364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
365 * string, any code point, or any posix class under locale */
366
367 PERL_ARGS_ASSERT_SSC_INIT;
368
369 Zero(ssc, 1, regnode_ssc);
370 set_ANYOF_SYNTHETIC(ssc);
371 ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
372 ssc_anything(ssc);
373
374 /* If any portion of the regex is to operate under locale rules that aren't
375 * fully known at compile time, initialization includes it. The reason
376 * this isn't done for all regexes is that the optimizer was written under
377 * the assumption that locale was all-or-nothing. Given the complexity and
378 * lack of documentation in the optimizer, and that there are inadequate
379 * test cases for locale, many parts of it may not work properly, it is
380 * safest to avoid locale unless necessary. */
381 if (RExC_contains_locale) {
382 ANYOF_POSIXL_SETALL(ssc);
383 }
384 else {
385 ANYOF_POSIXL_ZERO(ssc);
386 }
387 }
388
389 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391 const regnode_ssc *ssc)
392 {
393 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
394 * to the list of code points matched, and locale posix classes; hence does
395 * not check its flags) */
396
397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
398 bool ret;
399
400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
401
402 assert(is_ANYOF_SYNTHETIC(ssc));
403
404 invlist_iterinit(ssc->invlist);
405 ret = invlist_iternext(ssc->invlist, &start, &end)
406 && start == 0
407 && end == UV_MAX;
408
409 invlist_iterfinish(ssc->invlist);
410
411 if (! ret) {
412 return FALSE;
413 }
414
415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
416 return FALSE;
417 }
418
419 return TRUE;
420 }
421
422
423 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425 const regnode_charclass* const node)
426 {
427 /* Returns a mortal inversion list defining which code points are matched
428 * by 'node', which is of ANYOF-ish type . Handles complementing the
429 * result if appropriate. If some code points aren't knowable at this
430 * time, the returned list must, and will, contain every code point that is
431 * a possibility. */
432
433 SV* invlist = NULL;
434 SV* only_utf8_locale_invlist = NULL;
435 bool new_node_has_latin1 = FALSE;
436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
437 ? ANYOF_FLAGS(node)
438 : 0;
439
440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
441
442 /* Look at the data structure created by S_set_ANYOF_arg() */
443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444 invlist = sv_2mortal(_new_invlist(1));
445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
446 }
447 else if (ANYOF_HAS_AUX(node)) {
448 const U32 n = ARG1u(node);
449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450 AV * const av = MUTABLE_AV(SvRV(rv));
451 SV **const ary = AvARRAY(av);
452
453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
454
455 /* Here there are things that won't be known until runtime -- we
456 * have to assume it could be anything */
457 invlist = sv_2mortal(_new_invlist(1));
458 return _add_range_to_invlist(invlist, 0, UV_MAX);
459 }
460 else if (ary[INVLIST_INDEX]) {
461
462 /* Use the node's inversion list */
463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
464 }
465
466 /* Get the code points valid only under UTF-8 locales */
467 if ( (flags & ANYOFL_FOLD)
468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
469 {
470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
471 }
472 }
473
474 if (! invlist) {
475 invlist = sv_2mortal(_new_invlist(0));
476 }
477
478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479 * code points, and an inversion list for the others, but if there are code
480 * points that should match only conditionally on the target string being
481 * UTF-8, those are placed in the inversion list, and not the bitmap.
482 * Since there are circumstances under which they could match, they are
483 * included in the SSC. But if the ANYOF node is to be inverted, we have
484 * to exclude them here, so that when we invert below, the end result
485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
486 * have to do this here before we add the unconditionally matched code
487 * points */
488 if (flags & ANYOF_INVERT) {
489 _invlist_intersection_complement_2nd(invlist,
490 PL_UpperLatin1,
491 &invlist);
492 }
493
494 /* Add in the points from the bit map */
495 if (REGNODE_TYPE(OP(node)) == ANYOF){
496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497 if (ANYOF_BITMAP_TEST(node, i)) {
498 unsigned int start = i++;
499
500 for (; i < NUM_ANYOF_CODE_POINTS
501 && ANYOF_BITMAP_TEST(node, i); ++i)
502 {
503 /* empty */
504 }
505 invlist = _add_range_to_invlist(invlist, start, i-1);
506 new_node_has_latin1 = TRUE;
507 }
508 }
509 }
510
511 /* If this can match all upper Latin1 code points, have to add them
512 * as well. But don't add them if inverting, as when that gets done below,
513 * it would exclude all these characters, including the ones it shouldn't
514 * that were added just above */
515 if ( ! (flags & ANYOF_INVERT)
516 && OP(node) == ANYOFD
517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
518 {
519 _invlist_union(invlist, PL_UpperLatin1, &invlist);
520 }
521
522 /* Similarly for these */
523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
525 }
526
527 if (flags & ANYOF_INVERT) {
528 _invlist_invert(invlist);
529 }
530 else if (flags & ANYOFL_FOLD) {
531 if (new_node_has_latin1) {
532
533 /* These folds are potential in Turkic locales */
534 if (_invlist_contains_cp(invlist, 'i')) {
535 invlist = add_cp_to_invlist(invlist,
536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
537 }
538 if (_invlist_contains_cp(invlist, 'I')) {
539 invlist = add_cp_to_invlist(invlist,
540 LATIN_SMALL_LETTER_DOTLESS_I);
541 }
542
543 /* Under /li, any 0-255 could fold to any other 0-255, depending on
544 * the locale. We can skip this if there are no 0-255 at all. */
545 _invlist_union(invlist, PL_Latin1, &invlist);
546 }
547 else {
548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549 invlist = add_cp_to_invlist(invlist, 'I');
550 }
551 if (_invlist_contains_cp(invlist,
552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
553 {
554 invlist = add_cp_to_invlist(invlist, 'i');
555 }
556 }
557 }
558
559 /* Similarly add the UTF-8 locale possible matches. These have to be
560 * deferred until after the non-UTF-8 locale ones are taken care of just
561 * above, or it leads to wrong results under ANYOF_INVERT */
562 if (only_utf8_locale_invlist) {
563 _invlist_union_maybe_complement_2nd(invlist,
564 only_utf8_locale_invlist,
565 flags & ANYOF_INVERT,
566 &invlist);
567 }
568
569 return invlist;
570 }
571
572 /* 'AND' a given class with another one. Can create false positives. 'ssc'
573 * should not be inverted. */
574
575 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577 const regnode_charclass *and_with)
578 {
579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580 * another SSC or a regular ANYOF class. Can create false positives. */
581
582 SV* anded_cp_list;
583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584 ? ANYOF_FLAGS(and_with)
585 : 0;
586 U8 anded_flags;
587
588 PERL_ARGS_ASSERT_SSC_AND;
589
590 assert(is_ANYOF_SYNTHETIC(ssc));
591
592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593 * the code point inversion list and just the relevant flags */
594 if (is_ANYOF_SYNTHETIC(and_with)) {
595 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596 anded_flags = and_with_flags;
597
598 /* XXX This is a kludge around what appears to be deficiencies in the
599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
600 * there are paths through the optimizer where it doesn't get weeded
601 * out when it should. And if we don't make some extra provision for
602 * it like the code just below, it doesn't get added when it should.
603 * This solution is to add it only when AND'ing, which is here, and
604 * only when what is being AND'ed is the pristine, original node
605 * matching anything. Thus it is like adding it to ssc_anything() but
606 * only when the result is to be AND'ed. Probably the same solution
607 * could be adopted for the same problem we have with /l matching,
608 * which is solved differently in S_ssc_init(), and that would lead to
609 * fewer false positives than that solution has. But if this solution
610 * creates bugs, the consequences are only that a warning isn't raised
611 * that should be; while the consequences for having /l bugs is
612 * incorrect matches */
613 if (ssc_is_anything((regnode_ssc *)and_with)) {
614 anded_flags |= ANYOF_WARN_SUPER__shared;
615 }
616 }
617 else {
618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619 if (OP(and_with) == ANYOFD) {
620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
621 }
622 else {
623 anded_flags = and_with_flags
624 & ( ANYOF_COMMON_FLAGS
625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
629 }
630 }
631 }
632
633 ANYOF_FLAGS(ssc) &= anded_flags;
634
635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636 * C2 is the list of code points in 'and-with'; P2, its posix classes.
637 * 'and_with' may be inverted. When not inverted, we have the situation of
638 * computing:
639 * (C1 | P1) & (C2 | P2)
640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
643 * <= ((C1 & C2) | P1 | P2)
644 * Alternatively, the last few steps could be:
645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
647 * <= (C1 | C2 | (P1 & P2))
648 * We favor the second approach if either P1 or P2 is non-empty. This is
649 * because these components are a barrier to doing optimizations, as what
650 * they match cannot be known until the moment of matching as they are
651 * dependent on the current locale, 'AND"ing them likely will reduce or
652 * eliminate them.
653 * But we can do better if we know that C1,P1 are in their initial state (a
654 * frequent occurrence), each matching everything:
655 * (<everything>) & (C2 | P2) = C2 | P2
656 * Similarly, if C2,P2 are in their initial state (again a frequent
657 * occurrence), the result is a no-op
658 * (C1 | P1) & (<everything>) = C1 | P1
659 *
660 * Inverted, we have
661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663 * <= (C1 & ~C2) | (P1 & ~P2)
664 * */
665
666 if ((and_with_flags & ANYOF_INVERT)
667 && ! is_ANYOF_SYNTHETIC(and_with))
668 {
669 unsigned int i;
670
671 ssc_intersection(ssc,
672 anded_cp_list,
673 FALSE /* Has already been inverted */
674 );
675
676 /* If either P1 or P2 is empty, the intersection will be also; can skip
677 * the loop */
678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679 ANYOF_POSIXL_ZERO(ssc);
680 }
681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
682
683 /* Note that the Posix class component P from 'and_with' actually
684 * looks like:
685 * P = Pa | Pb | ... | Pn
686 * where each component is one posix class, such as in [\w\s].
687 * Thus
688 * ~P = ~(Pa | Pb | ... | Pn)
689 * = ~Pa & ~Pb & ... & ~Pn
690 * <= ~Pa | ~Pb | ... | ~Pn
691 * The last is something we can easily calculate, but unfortunately
692 * is likely to have many false positives. We could do better
693 * in some (but certainly not all) instances if two classes in
694 * P have known relationships. For example
695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
696 * So
697 * :lower: & :print: = :lower:
698 * And similarly for classes that must be disjoint. For example,
699 * since \s and \w can have no elements in common based on rules in
700 * the POSIX standard,
701 * \w & ^\S = nothing
702 * Unfortunately, some vendor locales do not meet the Posix
703 * standard, in particular almost everything by Microsoft.
704 * The loop below just changes e.g., \w into \W and vice versa */
705
706 regnode_charclass_posixl temp;
707 int add = 1; /* To calculate the index of the complement */
708
709 Zero(&temp, 1, regnode_charclass_posixl);
710 ANYOF_POSIXL_ZERO(&temp);
711 for (i = 0; i < ANYOF_MAX; i++) {
712 assert(i % 2 != 0
713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
715
716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717 ANYOF_POSIXL_SET(&temp, i + add);
718 }
719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
720 }
721 ANYOF_POSIXL_AND(&temp, ssc);
722
723 } /* else ssc already has no posixes */
724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
725 in its initial state */
726 else if (! is_ANYOF_SYNTHETIC(and_with)
727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
728 {
729 /* But if 'ssc' is in its initial state, the result is just 'and_with';
730 * copy it over 'ssc' */
731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732 if (is_ANYOF_SYNTHETIC(and_with)) {
733 StructCopy(and_with, ssc, regnode_ssc);
734 }
735 else {
736 ssc->invlist = anded_cp_list;
737 ANYOF_POSIXL_ZERO(ssc);
738 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
740 }
741 }
742 }
743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744 || (and_with_flags & ANYOF_MATCHES_POSIXL))
745 {
746 /* One or the other of P1, P2 is non-empty. */
747 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
749 }
750 ssc_union(ssc, anded_cp_list, FALSE);
751 }
752 else { /* P1 = P2 = empty */
753 ssc_intersection(ssc, anded_cp_list, FALSE);
754 }
755 }
756 }
757
758 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760 const regnode_charclass *or_with)
761 {
762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763 * another SSC or a regular ANYOF class. Can create false positives if
764 * 'or_with' is to be inverted. */
765
766 SV* ored_cp_list;
767 U8 ored_flags;
768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769 ? ANYOF_FLAGS(or_with)
770 : 0;
771
772 PERL_ARGS_ASSERT_SSC_OR;
773
774 assert(is_ANYOF_SYNTHETIC(ssc));
775
776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777 * the code point inversion list and just the relevant flags */
778 if (is_ANYOF_SYNTHETIC(or_with)) {
779 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780 ored_flags = or_with_flags;
781 }
782 else {
783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785 if (OP(or_with) != ANYOFD) {
786 ored_flags |=
787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
791 }
792 }
793 }
794
795 ANYOF_FLAGS(ssc) |= ored_flags;
796
797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798 * C2 is the list of code points in 'or-with'; P2, its posix classes.
799 * 'or_with' may be inverted. When not inverted, we have the simple
800 * situation of computing:
801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
802 * If P1|P2 yields a situation with both a class and its complement are
803 * set, like having both \w and \W, this matches all code points, and we
804 * can delete these from the P component of the ssc going forward. XXX We
805 * might be able to delete all the P components, but I (khw) am not certain
806 * about this, and it is better to be safe.
807 *
808 * Inverted, we have
809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
810 * <= (C1 | P1) | ~C2
811 * <= (C1 | ~C2) | P1
812 * (which results in actually simpler code than the non-inverted case)
813 * */
814
815 if ((or_with_flags & ANYOF_INVERT)
816 && ! is_ANYOF_SYNTHETIC(or_with))
817 {
818 /* We ignore P2, leaving P1 going forward */
819 } /* else Not inverted */
820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
823 unsigned int i;
824 for (i = 0; i < ANYOF_MAX; i += 2) {
825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
826 {
827 ssc_match_all_cp(ssc);
828 ANYOF_POSIXL_CLEAR(ssc, i);
829 ANYOF_POSIXL_CLEAR(ssc, i+1);
830 }
831 }
832 }
833 }
834
835 ssc_union(ssc,
836 ored_cp_list,
837 FALSE /* Already has been inverted */
838 );
839 }
840
841 STATIC void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
843 {
844 PERL_ARGS_ASSERT_SSC_UNION;
845
846 assert(is_ANYOF_SYNTHETIC(ssc));
847
848 _invlist_union_maybe_complement_2nd(ssc->invlist,
849 invlist,
850 invert2nd,
851 &ssc->invlist);
852 }
853
854 STATIC void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
856 SV* const invlist,
857 const bool invert2nd)
858 {
859 PERL_ARGS_ASSERT_SSC_INTERSECTION;
860
861 assert(is_ANYOF_SYNTHETIC(ssc));
862
863 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
864 invlist,
865 invert2nd,
866 &ssc->invlist);
867 }
868
869 STATIC void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
871 {
872 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
873
874 assert(is_ANYOF_SYNTHETIC(ssc));
875
876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
877 }
878
879 STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
881 {
882 /* AND just the single code point 'cp' into the SSC 'ssc' */
883
884 SV* cp_list = _new_invlist(2);
885
886 PERL_ARGS_ASSERT_SSC_CP_AND;
887
888 assert(is_ANYOF_SYNTHETIC(ssc));
889
890 cp_list = add_cp_to_invlist(cp_list, cp);
891 ssc_intersection(ssc, cp_list,
892 FALSE /* Not inverted */
893 );
894 SvREFCNT_dec_NN(cp_list);
895 }
896
897 STATIC void
S_ssc_clear_locale(regnode_ssc * ssc)898 S_ssc_clear_locale(regnode_ssc *ssc)
899 {
900 /* Set the SSC 'ssc' to not match any locale things */
901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
902
903 assert(is_ANYOF_SYNTHETIC(ssc));
904
905 ANYOF_POSIXL_ZERO(ssc);
906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
907 }
908
909 bool
Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)910 Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
911 {
912 /* The synthetic start class is used to hopefully quickly winnow down
913 * places where a pattern could start a match in the target string. If it
914 * doesn't really narrow things down that much, there isn't much point to
915 * having the overhead of using it. This function uses some very crude
916 * heuristics to decide if to use the ssc or not.
917 *
918 * It returns TRUE if 'ssc' rules out more than half what it considers to
919 * be the "likely" possible matches, but of course it doesn't know what the
920 * actual things being matched are going to be; these are only guesses
921 *
922 * For /l matches, it assumes that the only likely matches are going to be
923 * in the 0-255 range, uniformly distributed, so half of that is 127
924 * For /a and /d matches, it assumes that the likely matches will be just
925 * the ASCII range, so half of that is 63
926 * For /u and there isn't anything matching above the Latin1 range, it
927 * assumes that that is the only range likely to be matched, and uses
928 * half that as the cut-off: 127. If anything matches above Latin1,
929 * it assumes that all of Unicode could match (uniformly), except for
930 * non-Unicode code points and things in the General Category "Other"
931 * (unassigned, private use, surrogates, controls and formats). This
932 * is a much large number. */
933
934 U32 count = 0; /* Running total of number of code points matched by
935 'ssc' */
936 UV start, end; /* Start and end points of current range in inversion
937 XXX outdated. UTF-8 locales are common, what about invert? list */
938 const U32 max_code_points = (LOC)
939 ? 256
940 : (( ! UNI_SEMANTICS
941 || invlist_highest(ssc->invlist) < 256)
942 ? 128
943 : NON_OTHER_COUNT);
944 const U32 max_match = max_code_points / 2;
945
946 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
947
948 invlist_iterinit(ssc->invlist);
949 while (invlist_iternext(ssc->invlist, &start, &end)) {
950 if (start >= max_code_points) {
951 break;
952 }
953 end = MIN(end, max_code_points - 1);
954 count += end - start + 1;
955 if (count >= max_match) {
956 invlist_iterfinish(ssc->invlist);
957 return FALSE;
958 }
959 }
960
961 return TRUE;
962 }
963
964
965 void
Perl_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)966 Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
967 {
968 /* The inversion list in the SSC is marked mortal; now we need a more
969 * permanent copy, which is stored the same way that is done in a regular
970 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
971 * map */
972
973 SV* invlist = invlist_clone(ssc->invlist, NULL);
974
975 PERL_ARGS_ASSERT_SSC_FINALIZE;
976
977 assert(is_ANYOF_SYNTHETIC(ssc));
978
979 /* The code in this file assumes that all but these flags aren't relevant
980 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
981 * by the time we reach here */
982 assert(! (ANYOF_FLAGS(ssc)
983 & ~( ANYOF_COMMON_FLAGS
984 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
985 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
986
987 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
988
989 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
990 SvREFCNT_dec(invlist);
991
992 /* Make sure is clone-safe */
993 ssc->invlist = NULL;
994
995 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
996 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
997 OP(ssc) = ANYOFPOSIXL;
998 }
999 else if (RExC_contains_locale) {
1000 OP(ssc) = ANYOFL;
1001 }
1002
1003 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1004 }
1005
1006 /* The below joins as many adjacent EXACTish nodes as possible into a single
1007 * one. The regop may be changed if the node(s) contain certain sequences that
1008 * require special handling. The joining is only done if:
1009 * 1) there is room in the current conglomerated node to entirely contain the
1010 * next one.
1011 * 2) they are compatible node types
1012 *
1013 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
1014 * these get optimized out
1015 *
1016 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
1017 * as possible, even if that means splitting an existing node so that its first
1018 * part is moved to the preceding node. This would maximise the efficiency of
1019 * memEQ during matching.
1020 *
1021 * If a node is to match under /i (folded), the number of characters it matches
1022 * can be different than its character length if it contains a multi-character
1023 * fold. *min_subtract is set to the total delta number of characters of the
1024 * input nodes.
1025 *
1026 * And *unfolded_multi_char is set to indicate whether or not the node contains
1027 * an unfolded multi-char fold. This happens when it won't be known until
1028 * runtime whether the fold is valid or not; namely
1029 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
1030 * target string being matched against turns out to be UTF-8 is that fold
1031 * valid; or
1032 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
1033 * runtime.
1034 * (Multi-char folds whose components are all above the Latin1 range are not
1035 * run-time locale dependent, and have already been folded by the time this
1036 * function is called.)
1037 *
1038 * This is as good a place as any to discuss the design of handling these
1039 * multi-character fold sequences. It's been wrong in Perl for a very long
1040 * time. There are three code points in Unicode whose multi-character folds
1041 * were long ago discovered to mess things up. The previous designs for
1042 * dealing with these involved assigning a special node for them. This
1043 * approach doesn't always work, as evidenced by this example:
1044 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
1045 * Both sides fold to "sss", but if the pattern is parsed to create a node that
1046 * would match just the \xDF, it won't be able to handle the case where a
1047 * successful match would have to cross the node's boundary. The new approach
1048 * that hopefully generally solves the problem generates an EXACTFUP node
1049 * that is "sss" in this case.
1050 *
1051 * It turns out that there are problems with all multi-character folds, and not
1052 * just these three. Now the code is general, for all such cases. The
1053 * approach taken is:
1054 * 1) This routine examines each EXACTFish node that could contain multi-
1055 * character folded sequences. Since a single character can fold into
1056 * such a sequence, the minimum match length for this node is less than
1057 * the number of characters in the node. This routine returns in
1058 * *min_subtract how many characters to subtract from the actual
1059 * length of the string to get a real minimum match length; it is 0 if
1060 * there are no multi-char foldeds. This delta is used by the caller to
1061 * adjust the min length of the match, and the delta between min and max,
1062 * so that the optimizer doesn't reject these possibilities based on size
1063 * constraints.
1064 *
1065 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
1066 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
1067 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
1068 * EXACTFU nodes. The node type of such nodes is then changed to
1069 * EXACTFUP, indicating it is problematic, and needs careful handling.
1070 * (The procedures in step 1) above are sufficient to handle this case in
1071 * UTF-8 encoded nodes.) The reason this is problematic is that this is
1072 * the only case where there is a possible fold length change in non-UTF-8
1073 * patterns. By reserving a special node type for problematic cases, the
1074 * far more common regular EXACTFU nodes can be processed faster.
1075 * regexec.c takes advantage of this.
1076 *
1077 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
1078 * problematic cases. These all only occur when the pattern is not
1079 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
1080 * length change, it handles the situation where the string cannot be
1081 * entirely folded. The strings in an EXACTFish node are folded as much
1082 * as possible during compilation in regcomp.c. This saves effort in
1083 * regex matching. By using an EXACTFUP node when it is not possible to
1084 * fully fold at compile time, regexec.c can know that everything in an
1085 * EXACTFU node is folded, so folding can be skipped at runtime. The only
1086 * case where folding in EXACTFU nodes can't be done at compile time is
1087 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
1088 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
1089 * handle two very different cases. Alternatively, there could have been
1090 * a node type where there are length changes, one for unfolded, and one
1091 * for both. If yet another special case needed to be created, the number
1092 * of required node types would have to go to 7. khw figures that even
1093 * though there are plenty of node types to spare, that the maintenance
1094 * cost wasn't worth the small speedup of doing it that way, especially
1095 * since he thinks the MICRO SIGN is rarely encountered in practice.
1096 *
1097 * There are other cases where folding isn't done at compile time, but
1098 * none of them are under /u, and hence not for EXACTFU nodes. The folds
1099 * in EXACTFL nodes aren't known until runtime, and vary as the locale
1100 * changes. Some folds in EXACTF depend on if the runtime target string
1101 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
1102 * when no fold in it depends on the UTF-8ness of the target string.)
1103 *
1104 * 3) A problem remains for unfolded multi-char folds. (These occur when the
1105 * validity of the fold won't be known until runtime, and so must remain
1106 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
1107 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
1108 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
1109 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1110 * The reason this is a problem is that the optimizer part of regexec.c
1111 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1112 * that a character in the pattern corresponds to at most a single
1113 * character in the target string. (And I do mean character, and not byte
1114 * here, unlike other parts of the documentation that have never been
1115 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
1116 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1117 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
1118 * EXACTFL nodes, violate the assumption, and they are the only instances
1119 * where it is violated. I'm reluctant to try to change the assumption,
1120 * as the code involved is impenetrable to me (khw), so instead the code
1121 * here punts. This routine examines EXACTFL nodes, and (when the pattern
1122 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1123 * boolean indicating whether or not the node contains such a fold. When
1124 * it is true, the caller sets a flag that later causes the optimizer in
1125 * this file to not set values for the floating and fixed string lengths,
1126 * and thus avoids the optimizer code in regexec.c that makes the invalid
1127 * assumption. Thus, there is no optimization based on string lengths for
1128 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1129 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
1130 * assumption is wrong only in these cases is that all other non-UTF-8
1131 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1132 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
1133 * EXACTF nodes because we don't know at compile time if it actually
1134 * matches 'ss' or not. For EXACTF nodes it will match iff the target
1135 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
1136 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
1137 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1138 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1139 * string would require the pattern to be forced into UTF-8, the overhead
1140 * of which we want to avoid. Similarly the unfolded multi-char folds in
1141 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1142 * locale.)
1143 *
1144 * Similarly, the code that generates tries doesn't currently handle
1145 * not-already-folded multi-char folds, and it looks like a pain to change
1146 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
1147 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
1148 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
1149 * using /iaa matching will be doing so almost entirely with ASCII
1150 * strings, so this should rarely be encountered in practice */
1151
1152 U32
Perl_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)1153 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1154 UV *min_subtract, bool *unfolded_multi_char,
1155 U32 flags, regnode *val, U32 depth)
1156 {
1157 /* Merge several consecutive EXACTish nodes into one. */
1158
1159 regnode *n = regnext(scan);
1160 U32 stringok = 1;
1161 regnode *next = REGNODE_AFTER_varies(scan);
1162 U32 merged = 0;
1163 U32 stopnow = 0;
1164 #ifdef DEBUGGING
1165 regnode *stop = scan;
1166 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1167 #else
1168 PERL_UNUSED_ARG(depth);
1169 #endif
1170
1171 PERL_ARGS_ASSERT_JOIN_EXACT;
1172 #ifndef EXPERIMENTAL_INPLACESCAN
1173 PERL_UNUSED_ARG(flags);
1174 PERL_UNUSED_ARG(val);
1175 #endif
1176 DEBUG_PEEP("join", scan, depth, 0);
1177
1178 assert(REGNODE_TYPE(OP(scan)) == EXACT);
1179
1180 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
1181 * EXACT ones that are mergeable to the current one. */
1182 while ( n
1183 && ( REGNODE_TYPE(OP(n)) == NOTHING
1184 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1185 && NEXT_OFF(n)
1186 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1187 {
1188
1189 if (OP(n) == TAIL || n > next)
1190 stringok = 0;
1191 if (REGNODE_TYPE(OP(n)) == NOTHING) {
1192 DEBUG_PEEP("skip:", n, depth, 0);
1193 NEXT_OFF(scan) += NEXT_OFF(n);
1194 next = n + NODE_STEP_REGNODE;
1195 #ifdef DEBUGGING
1196 if (stringok)
1197 stop = n;
1198 #endif
1199 n = regnext(n);
1200 }
1201 else if (stringok) {
1202 const unsigned int oldl = STR_LEN(scan);
1203 regnode * const nnext = regnext(n);
1204
1205 /* XXX I (khw) kind of doubt that this works on platforms (should
1206 * Perl ever run on one) where U8_MAX is above 255 because of lots
1207 * of other assumptions */
1208 /* Don't join if the sum can't fit into a single node */
1209 if (oldl + STR_LEN(n) > U8_MAX)
1210 break;
1211
1212 /* Joining something that requires UTF-8 with something that
1213 * doesn't, means the result requires UTF-8. */
1214 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1215 OP(scan) = EXACT_REQ8;
1216 }
1217 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1218 ; /* join is compatible, no need to change OP */
1219 }
1220 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1221 OP(scan) = EXACTFU_REQ8;
1222 }
1223 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1224 ; /* join is compatible, no need to change OP */
1225 }
1226 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1227 ; /* join is compatible, no need to change OP */
1228 }
1229 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1230
1231 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1232 * which can join with EXACTFU ones. We check for this case
1233 * here. These need to be resolved to either EXACTFU or
1234 * EXACTF at joining time. They have nothing in them that
1235 * would forbid them from being the more desirable EXACTFU
1236 * nodes except that they begin and/or end with a single [Ss].
1237 * The reason this is problematic is because they could be
1238 * joined in this loop with an adjacent node that ends and/or
1239 * begins with [Ss] which would then form the sequence 'ss',
1240 * which matches differently under /di than /ui, in which case
1241 * EXACTFU can't be used. If the 'ss' sequence doesn't get
1242 * formed, the nodes get absorbed into any adjacent EXACTFU
1243 * node. And if the only adjacent node is EXACTF, they get
1244 * absorbed into that, under the theory that a longer node is
1245 * better than two shorter ones, even if one is EXACTFU. Note
1246 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1247 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
1248
1249 if (STRING(n)[STR_LEN(n)-1] == 's') {
1250
1251 /* Here the joined node would end with 's'. If the node
1252 * following the combination is an EXACTF one, it's better to
1253 * join this trailing edge 's' node with that one, leaving the
1254 * current one in 'scan' be the more desirable EXACTFU */
1255 if (OP(nnext) == EXACTF) {
1256 break;
1257 }
1258
1259 OP(scan) = EXACTFU_S_EDGE;
1260
1261 } /* Otherwise, the beginning 's' of the 2nd node just
1262 becomes an interior 's' in 'scan' */
1263 }
1264 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1265 ; /* join is compatible, no need to change OP */
1266 }
1267 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1268
1269 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1270 * nodes. But the latter nodes can be also joined with EXACTFU
1271 * ones, and that is a better outcome, so if the node following
1272 * 'n' is EXACTFU, quit now so that those two can be joined
1273 * later */
1274 if (OP(nnext) == EXACTFU) {
1275 break;
1276 }
1277
1278 /* The join is compatible, and the combined node will be
1279 * EXACTF. (These don't care if they begin or end with 's' */
1280 }
1281 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1282 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
1283 && STRING(n)[0] == 's')
1284 {
1285 /* When combined, we have the sequence 'ss', which means we
1286 * have to remain /di */
1287 OP(scan) = EXACTF;
1288 }
1289 }
1290 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1291 if (STRING(n)[0] == 's') {
1292 ; /* Here the join is compatible and the combined node
1293 starts with 's', no need to change OP */
1294 }
1295 else { /* Now the trailing 's' is in the interior */
1296 OP(scan) = EXACTFU;
1297 }
1298 }
1299 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1300
1301 /* The join is compatible, and the combined node will be
1302 * EXACTF. (These don't care if they begin or end with 's' */
1303 OP(scan) = EXACTF;
1304 }
1305 else if (OP(scan) != OP(n)) {
1306
1307 /* The only other compatible joinings are the same node type */
1308 break;
1309 }
1310
1311 DEBUG_PEEP("merg", n, depth, 0);
1312 merged++;
1313
1314 next = REGNODE_AFTER_varies(n);
1315 NEXT_OFF(scan) += NEXT_OFF(n);
1316 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1317 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1318 /* Now we can overwrite *n : */
1319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1320 #ifdef DEBUGGING
1321 stop = next - 1;
1322 #endif
1323 n = nnext;
1324 if (stopnow) break;
1325 }
1326
1327 #ifdef EXPERIMENTAL_INPLACESCAN
1328 if (flags && !NEXT_OFF(n)) {
1329 DEBUG_PEEP("atch", val, depth, 0);
1330 if (REGNODE_OFF_BY_ARG(OP(n))) {
1331 ARG1u_SET(n, val - n);
1332 }
1333 else {
1334 NEXT_OFF(n) = val - n;
1335 }
1336 stopnow = 1;
1337 }
1338 #endif
1339 }
1340
1341 /* This temporary node can now be turned into EXACTFU, and must, as
1342 * regexec.c doesn't handle it */
1343 if (OP(scan) == EXACTFU_S_EDGE) {
1344 OP(scan) = EXACTFU;
1345 }
1346
1347 *min_subtract = 0;
1348 *unfolded_multi_char = FALSE;
1349
1350 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
1351 * can now analyze for sequences of problematic code points. (Prior to
1352 * this final joining, sequences could have been split over boundaries, and
1353 * hence missed). The sequences only happen in folding, hence for any
1354 * non-EXACT EXACTish node */
1355 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1356 U8* s0 = (U8*) STRING(scan);
1357 U8* s = s0;
1358 U8* s_end = s0 + STR_LEN(scan);
1359
1360 int total_count_delta = 0; /* Total delta number of characters that
1361 multi-char folds expand to */
1362
1363 /* One pass is made over the node's string looking for all the
1364 * possibilities. To avoid some tests in the loop, there are two main
1365 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1366 * non-UTF-8 */
1367 if (UTF) {
1368 U8* folded = NULL;
1369
1370 if (OP(scan) == EXACTFL) {
1371 U8 *d;
1372
1373 /* An EXACTFL node would already have been changed to another
1374 * node type unless there is at least one character in it that
1375 * is problematic; likely a character whose fold definition
1376 * won't be known until runtime, and so has yet to be folded.
1377 * For all but the UTF-8 locale, folds are 1-1 in length, but
1378 * to handle the UTF-8 case, we need to create a temporary
1379 * folded copy using UTF-8 locale rules in order to analyze it.
1380 * This is because our macros that look to see if a sequence is
1381 * a multi-char fold assume everything is folded (otherwise the
1382 * tests in those macros would be too complicated and slow).
1383 * Note that here, the non-problematic folds will have already
1384 * been done, so we can just copy such characters. We actually
1385 * don't completely fold the EXACTFL string. We skip the
1386 * unfolded multi-char folds, as that would just create work
1387 * below to figure out the size they already are */
1388
1389 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1390 d = folded;
1391 while (s < s_end) {
1392 STRLEN s_len = UTF8SKIP(s);
1393 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1394 Copy(s, d, s_len, U8);
1395 d += s_len;
1396 }
1397 else if (is_FOLDS_TO_MULTI_utf8(s)) {
1398 *unfolded_multi_char = TRUE;
1399 Copy(s, d, s_len, U8);
1400 d += s_len;
1401 }
1402 else if (isASCII(*s)) {
1403 *(d++) = toFOLD(*s);
1404 }
1405 else {
1406 STRLEN len;
1407 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1408 d += len;
1409 }
1410 s += s_len;
1411 }
1412
1413 /* Point the remainder of the routine to look at our temporary
1414 * folded copy */
1415 s = folded;
1416 s_end = d;
1417 } /* End of creating folded copy of EXACTFL string */
1418
1419 /* Examine the string for a multi-character fold sequence. UTF-8
1420 * patterns have all characters pre-folded by the time this code is
1421 * executed */
1422 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1423 length sequence we are looking for is 2 */
1424 {
1425 int count = 0; /* How many characters in a multi-char fold */
1426 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1427 if (! len) { /* Not a multi-char fold: get next char */
1428 s += UTF8SKIP(s);
1429 continue;
1430 }
1431
1432 { /* Here is a generic multi-char fold. */
1433 U8* multi_end = s + len;
1434
1435 /* Count how many characters are in it. In the case of
1436 * /aa, no folds which contain ASCII code points are
1437 * allowed, so check for those, and skip if found. */
1438 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1439 count = utf8_length(s, multi_end);
1440 s = multi_end;
1441 }
1442 else {
1443 while (s < multi_end) {
1444 if (isASCII(*s)) {
1445 s++;
1446 goto next_iteration;
1447 }
1448 else {
1449 s += UTF8SKIP(s);
1450 }
1451 count++;
1452 }
1453 }
1454 }
1455
1456 /* The delta is how long the sequence is minus 1 (1 is how long
1457 * the character that folds to the sequence is) */
1458 total_count_delta += count - 1;
1459 next_iteration: ;
1460 }
1461
1462 /* We created a temporary folded copy of the string in EXACTFL
1463 * nodes. Therefore we need to be sure it doesn't go below zero,
1464 * as the real string could be shorter */
1465 if (OP(scan) == EXACTFL) {
1466 int total_chars = utf8_length((U8*) STRING(scan),
1467 (U8*) STRING(scan) + STR_LEN(scan));
1468 if (total_count_delta > total_chars) {
1469 total_count_delta = total_chars;
1470 }
1471 }
1472
1473 *min_subtract += total_count_delta;
1474 Safefree(folded);
1475 }
1476 else if (OP(scan) == EXACTFAA) {
1477
1478 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
1479 * fold to the ASCII range (and there are no existing ones in the
1480 * upper latin1 range). But, as outlined in the comments preceding
1481 * this function, we need to flag any occurrences of the sharp s.
1482 * This character forbids trie formation (because of added
1483 * complexity) */
1484 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1485 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1486 || UNICODE_DOT_DOT_VERSION > 0)
1487 while (s < s_end) {
1488 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1489 OP(scan) = EXACTFAA_NO_TRIE;
1490 *unfolded_multi_char = TRUE;
1491 break;
1492 }
1493 s++;
1494 }
1495 }
1496 else if (OP(scan) != EXACTFAA_NO_TRIE) {
1497
1498 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
1499 * folds that are all Latin1. As explained in the comments
1500 * preceding this function, we look also for the sharp s in EXACTF
1501 * and EXACTFL nodes; it can be in the final position. Otherwise
1502 * we can stop looking 1 byte earlier because have to find at least
1503 * two characters for a multi-fold */
1504 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1505 ? s_end
1506 : s_end -1;
1507
1508 while (s < upper) {
1509 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1510 if (! len) { /* Not a multi-char fold. */
1511 if (*s == LATIN_SMALL_LETTER_SHARP_S
1512 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1513 {
1514 *unfolded_multi_char = TRUE;
1515 }
1516 s++;
1517 continue;
1518 }
1519
1520 if (len == 2
1521 && isALPHA_FOLD_EQ(*s, 's')
1522 && isALPHA_FOLD_EQ(*(s+1), 's'))
1523 {
1524
1525 /* EXACTF nodes need to know that the minimum length
1526 * changed so that a sharp s in the string can match this
1527 * ss in the pattern, but they remain EXACTF nodes, as they
1528 * won't match this unless the target string is in UTF-8,
1529 * which we don't know until runtime. EXACTFL nodes can't
1530 * transform into EXACTFU nodes */
1531 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1532 OP(scan) = EXACTFUP;
1533 }
1534 }
1535
1536 *min_subtract += len - 1;
1537 s += len;
1538 }
1539 #endif
1540 }
1541 }
1542
1543 #ifdef DEBUGGING
1544 /* Allow dumping but overwriting the collection of skipped
1545 * ops and/or strings with fake optimized ops */
1546 n = REGNODE_AFTER_varies(scan);
1547 while (n <= stop) {
1548 OP(n) = OPTIMIZED;
1549 FLAGS(n) = 0;
1550 NEXT_OFF(n) = 0;
1551 n++;
1552 }
1553 #endif
1554 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1555 return stopnow;
1556 }
1557
1558 /* REx optimizer. Converts nodes into quicker variants "in place".
1559 Finds fixed substrings. */
1560
1561
1562 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1563 to the position after last scanned or to NULL. */
1564
1565 /* the return from this sub is the minimum length that could possibly match */
1566 SSize_t
Perl_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)1567 Perl_study_chunk(pTHX_
1568 RExC_state_t *pRExC_state,
1569 regnode **scanp, /* Start here (read-write). */
1570 SSize_t *minlenp, /* used for the minlen of substrings? */
1571 SSize_t *deltap, /* Write maxlen-minlen here. */
1572 regnode *last, /* Stop before this one. */
1573 scan_data_t *data, /* string data about the pattern */
1574 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
1575 U32 recursed_depth, /* how deep have we recursed via GOSUB */
1576 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1577 U32 flags, /* flags controlling this call, see SCF_ flags */
1578 U32 depth, /* how deep have we recursed period */
1579 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
1580 FALSE only if the caller (recursively) was
1581 prohibited from modifying the regops, because
1582 a higher caller is holding a ptr to them. */
1583 )
1584 {
1585 /* vars about the regnodes we are working with */
1586 regnode *scan = *scanp; /* the current opcode we are inspecting */
1587 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
1588 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1589 the first non open regop, if the init
1590 val IS an OPEN then we will skip past
1591 it just after the var decls section */
1592 I32 code = 0; /* temp var used to hold the optype of a regop */
1593
1594 /* vars about the min and max length of the pattern */
1595 SSize_t min = 0; /* min length of this part of the pattern */
1596 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1597 this is adjusted down if we find
1598 an ACCEPT */
1599 SSize_t delta = 0; /* difference between min and max length
1600 (not accounting for stopmin) */
1601
1602 /* vars about capture buffers in the pattern */
1603 I32 pars = 0; /* count of OPEN opcodes */
1604 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1605
1606 /* vars about whether this pattern contains something that can match
1607 * infinitely long strings, eg, X* or X+ */
1608 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1609 int is_inf_internal = 0; /* The studied chunk is infinite */
1610
1611 /* scan_data_t (struct) is used to hold information about the substrings
1612 * and start class we have extracted from the string */
1613 scan_data_t data_fake; /* temp var used for recursing in some cases */
1614
1615 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1616 trie optimizations */
1617
1618 scan_frame *frame = NULL; /* used as part of fake recursion */
1619
1620 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1621
1622 PERL_ARGS_ASSERT_STUDY_CHUNK;
1623 RExC_study_started= 1;
1624
1625 Zero(&data_fake, 1, scan_data_t);
1626
1627 if ( depth == 0 ) {
1628 while (first_non_open && OP(first_non_open) == OPEN)
1629 first_non_open=regnext(first_non_open);
1630 }
1631
1632 fake_study_recurse:
1633 DEBUG_r(
1634 RExC_study_chunk_recursed_count++;
1635 );
1636 DEBUG_OPTIMISE_MORE_r(
1637 {
1638 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
1639 depth, (long)stopparen,
1640 (unsigned long)RExC_study_chunk_recursed_count,
1641 (unsigned long)depth, (unsigned long)recursed_depth,
1642 scan,
1643 last);
1644 if (recursed_depth) {
1645 U32 i;
1646 U32 j;
1647 for ( j = 0 ; j < recursed_depth ; j++ ) {
1648 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1649 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1650 Perl_re_printf( aTHX_ " %d",(int)i);
1651 break;
1652 }
1653 }
1654 if ( j + 1 < recursed_depth ) {
1655 Perl_re_printf( aTHX_ ",");
1656 }
1657 }
1658 }
1659 Perl_re_printf( aTHX_ "\n");
1660 }
1661 );
1662 while ( scan && OP(scan) != END && scan < last ){
1663 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
1664 node length to get a real minimum (because
1665 the folded version may be shorter) */
1666 bool unfolded_multi_char = FALSE;
1667 /* avoid mutating ops if we are anywhere within the recursed or
1668 * enframed handling for a GOSUB: the outermost level will handle it.
1669 */
1670 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1671 /* Peephole optimizer: */
1672 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1673 DEBUG_PEEP("Peep", scan, depth, flags);
1674
1675
1676 /* The reason we do this here is that we need to deal with things like
1677 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1678 * parsing code, as each (?:..) is handled by a different invocation of
1679 * reg() -- Yves
1680 */
1681 if (REGNODE_TYPE(OP(scan)) == EXACT
1682 && OP(scan) != LEXACT
1683 && OP(scan) != LEXACT_REQ8
1684 && mutate_ok
1685 ) {
1686 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1687 0, NULL, depth + 1);
1688 }
1689
1690 /* Follow the next-chain of the current node and optimize
1691 away all the NOTHINGs from it.
1692 */
1693 rck_elide_nothing(scan);
1694
1695 /* The principal pseudo-switch. Cannot be a switch, since we look into
1696 * several different things. */
1697 if ( OP(scan) == DEFINEP ) {
1698 SSize_t minlen = 0;
1699 SSize_t deltanext = 0;
1700 SSize_t fake_last_close = 0;
1701 regnode *fake_last_close_op = NULL;
1702 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1703
1704 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1705 scan = regnext(scan);
1706 assert( OP(scan) == IFTHEN );
1707 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1708
1709 data_fake.last_closep= &fake_last_close;
1710 data_fake.last_close_opp= &fake_last_close_op;
1711 minlen = *minlenp;
1712 next = regnext(scan);
1713 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1714 DEBUG_PEEP("scan", scan, depth, flags);
1715 DEBUG_PEEP("next", next, depth, flags);
1716
1717 /* we suppose the run is continuous, last=next...
1718 * NOTE we dont use the return here! */
1719 /* DEFINEP study_chunk() recursion */
1720 (void)study_chunk(pRExC_state, &scan, &minlen,
1721 &deltanext, next, &data_fake, stopparen,
1722 recursed_depth, NULL, f, depth+1, mutate_ok);
1723
1724 scan = next;
1725 } else
1726 if (
1727 OP(scan) == BRANCH ||
1728 OP(scan) == BRANCHJ ||
1729 OP(scan) == IFTHEN
1730 ) {
1731 next = regnext(scan);
1732 code = OP(scan);
1733
1734 /* The op(next)==code check below is to see if we
1735 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1736 * IFTHEN is special as it might not appear in pairs.
1737 * Not sure whether BRANCH-BRANCHJ is possible, regardless
1738 * we dont handle it cleanly. */
1739 if (OP(next) == code || code == IFTHEN) {
1740 /* NOTE - There is similar code to this block below for
1741 * handling TRIE nodes on a re-study. If you change stuff here
1742 * check there too. */
1743 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1744 regnode_ssc accum;
1745 regnode * const startbranch=scan;
1746
1747 if (flags & SCF_DO_SUBSTR) {
1748 /* Cannot merge strings after this. */
1749 scan_commit(pRExC_state, data, minlenp, is_inf);
1750 }
1751
1752 if (flags & SCF_DO_STCLASS)
1753 ssc_init_zero(pRExC_state, &accum);
1754
1755 while (OP(scan) == code) {
1756 SSize_t deltanext, minnext, fake_last_close = 0;
1757 regnode *fake_last_close_op = NULL;
1758 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1759 regnode_ssc this_class;
1760
1761 DEBUG_PEEP("Branch", scan, depth, flags);
1762
1763 num++;
1764 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1765 if (data) {
1766 data_fake.whilem_c = data->whilem_c;
1767 data_fake.last_closep = data->last_closep;
1768 data_fake.last_close_opp = data->last_close_opp;
1769 }
1770 else {
1771 data_fake.last_closep = &fake_last_close;
1772 data_fake.last_close_opp = &fake_last_close_op;
1773 }
1774
1775 data_fake.pos_delta = delta;
1776 next = regnext(scan);
1777
1778 scan = REGNODE_AFTER_opcode(scan, code);
1779
1780 if (flags & SCF_DO_STCLASS) {
1781 ssc_init(pRExC_state, &this_class);
1782 data_fake.start_class = &this_class;
1783 f |= SCF_DO_STCLASS_AND;
1784 }
1785 if (flags & SCF_WHILEM_VISITED_POS)
1786 f |= SCF_WHILEM_VISITED_POS;
1787
1788 /* we suppose the run is continuous, last=next...*/
1789 /* recurse study_chunk() for each BRANCH in an alternation */
1790 minnext = study_chunk(pRExC_state, &scan, minlenp,
1791 &deltanext, next, &data_fake, stopparen,
1792 recursed_depth, NULL, f, depth+1,
1793 mutate_ok);
1794
1795 if (min1 > minnext)
1796 min1 = minnext;
1797 if (deltanext == OPTIMIZE_INFTY) {
1798 is_inf = is_inf_internal = 1;
1799 max1 = OPTIMIZE_INFTY;
1800 } else if (max1 < minnext + deltanext)
1801 max1 = minnext + deltanext;
1802 scan = next;
1803 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1804 pars++;
1805 if (data_fake.flags & SCF_SEEN_ACCEPT) {
1806 if ( stopmin > minnext)
1807 stopmin = min + min1;
1808 flags &= ~SCF_DO_SUBSTR;
1809 if (data)
1810 data->flags |= SCF_SEEN_ACCEPT;
1811 }
1812 if (data) {
1813 if (data_fake.flags & SF_HAS_EVAL)
1814 data->flags |= SF_HAS_EVAL;
1815 data->whilem_c = data_fake.whilem_c;
1816 }
1817 if (flags & SCF_DO_STCLASS)
1818 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1819 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1820 }
1821 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1822 min1 = 0;
1823 if (flags & SCF_DO_SUBSTR) {
1824 data->pos_min += min1;
1825 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1826 data->pos_delta = OPTIMIZE_INFTY;
1827 else
1828 data->pos_delta += max1 - min1;
1829 if (max1 != min1 || is_inf)
1830 data->cur_is_floating = 1;
1831 }
1832 min += min1;
1833 if (delta == OPTIMIZE_INFTY
1834 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1835 delta = OPTIMIZE_INFTY;
1836 else
1837 delta += max1 - min1;
1838 if (flags & SCF_DO_STCLASS_OR) {
1839 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1840 if (min1) {
1841 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1842 flags &= ~SCF_DO_STCLASS;
1843 }
1844 }
1845 else if (flags & SCF_DO_STCLASS_AND) {
1846 if (min1) {
1847 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1848 flags &= ~SCF_DO_STCLASS;
1849 }
1850 else {
1851 /* Switch to OR mode: cache the old value of
1852 * data->start_class */
1853 INIT_AND_WITHP;
1854 StructCopy(data->start_class, and_withp, regnode_ssc);
1855 flags &= ~SCF_DO_STCLASS_AND;
1856 StructCopy(&accum, data->start_class, regnode_ssc);
1857 flags |= SCF_DO_STCLASS_OR;
1858 }
1859 }
1860 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1861
1862 if (PERL_ENABLE_TRIE_OPTIMISATION
1863 && OP(startbranch) == BRANCH
1864 && mutate_ok
1865 ) {
1866 /* demq.
1867
1868 Assuming this was/is a branch we are dealing with: 'scan'
1869 now points at the item that follows the branch sequence,
1870 whatever it is. We now start at the beginning of the
1871 sequence and look for subsequences of
1872
1873 BRANCH->EXACT=>x1
1874 BRANCH->EXACT=>x2
1875 tail
1876
1877 which would be constructed from a pattern like
1878 /A|LIST|OF|WORDS/
1879
1880 If we can find such a subsequence we need to turn the first
1881 element into a trie and then add the subsequent branch exact
1882 strings to the trie.
1883
1884 We have two cases
1885
1886 1. patterns where the whole set of branches can be
1887 converted.
1888
1889 2. patterns where only a subset can be converted.
1890
1891 In case 1 we can replace the whole set with a single regop
1892 for the trie. In case 2 we need to keep the start and end
1893 branches so
1894
1895 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1896 becomes BRANCH TRIE; BRANCH X;
1897
1898 There is an additional case, that being where there is a
1899 common prefix, which gets split out into an EXACT like node
1900 preceding the TRIE node.
1901
1902 If X(1..n)==tail then we can do a simple trie, if not we make
1903 a "jump" trie, such that when we match the appropriate word
1904 we "jump" to the appropriate tail node. Essentially we turn
1905 a nested if into a case structure of sorts.
1906
1907 */
1908
1909 int made=0;
1910 if (!re_trie_maxbuff) {
1911 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1912 if (!SvIOK(re_trie_maxbuff))
1913 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1914 }
1915 if ( SvIV(re_trie_maxbuff)>=0 ) {
1916 regnode *cur;
1917 regnode *first = (regnode *)NULL;
1918 regnode *prev = (regnode *)NULL;
1919 regnode *tail = scan;
1920 U8 trietype = 0;
1921 U32 count=0;
1922
1923 /* var tail is used because there may be a TAIL
1924 regop in the way. Ie, the exacts will point to the
1925 thing following the TAIL, but the last branch will
1926 point at the TAIL. So we advance tail. If we
1927 have nested (?:) we may have to move through several
1928 tails.
1929 */
1930
1931 while ( OP( tail ) == TAIL ) {
1932 /* this is the TAIL generated by (?:) */
1933 tail = regnext( tail );
1934 }
1935
1936
1937 DEBUG_TRIE_COMPILE_r({
1938 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1939 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
1940 depth+1,
1941 "Looking for TRIE'able sequences. Tail node is ",
1942 (UV) REGNODE_OFFSET(tail),
1943 SvPV_nolen_const( RExC_mysv )
1944 );
1945 });
1946
1947 /*
1948
1949 Step through the branches
1950 cur represents each branch,
1951 noper is the first thing to be matched as part
1952 of that branch
1953 noper_next is the regnext() of that node.
1954
1955 We normally handle a case like this
1956 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1957 support building with NOJUMPTRIE, which restricts
1958 the trie logic to structures like /FOO|BAR/.
1959
1960 If noper is a trieable nodetype then the branch is
1961 a possible optimization target. If we are building
1962 under NOJUMPTRIE then we require that noper_next is
1963 the same as scan (our current position in the regex
1964 program).
1965
1966 Once we have two or more consecutive such branches
1967 we can create a trie of the EXACT's contents and
1968 stitch it in place into the program.
1969
1970 If the sequence represents all of the branches in
1971 the alternation we replace the entire thing with a
1972 single TRIE node.
1973
1974 Otherwise when it is a subsequence we need to
1975 stitch it in place and replace only the relevant
1976 branches. This means the first branch has to remain
1977 as it is used by the alternation logic, and its
1978 next pointer, and needs to be repointed at the item
1979 on the branch chain following the last branch we
1980 have optimized away.
1981
1982 This could be either a BRANCH, in which case the
1983 subsequence is internal, or it could be the item
1984 following the branch sequence in which case the
1985 subsequence is at the end (which does not
1986 necessarily mean the first node is the start of the
1987 alternation).
1988
1989 TRIE_TYPE(X) is a define which maps the optype to a
1990 trietype.
1991
1992 optype | trietype
1993 ----------------+-----------
1994 NOTHING | NOTHING
1995 EXACT | EXACT
1996 EXACT_REQ8 | EXACT
1997 EXACTFU | EXACTFU
1998 EXACTFU_REQ8 | EXACTFU
1999 EXACTFUP | EXACTFU
2000 EXACTFAA | EXACTFAA
2001 EXACTL | EXACTL
2002 EXACTFLU8 | EXACTFLU8
2003
2004
2005 */
2006 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
2007 ? NOTHING \
2008 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
2009 ? EXACT \
2010 : ( EXACTFU == (X) \
2011 || EXACTFU_REQ8 == (X) \
2012 || EXACTFUP == (X) ) \
2013 ? EXACTFU \
2014 : ( EXACTFAA == (X) ) \
2015 ? EXACTFAA \
2016 : ( EXACTL == (X) ) \
2017 ? EXACTL \
2018 : ( EXACTFLU8 == (X) ) \
2019 ? EXACTFLU8 \
2020 : 0 )
2021
2022 /* dont use tail as the end marker for this traverse */
2023 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2024 regnode * const noper = REGNODE_AFTER( cur );
2025 U8 noper_type = OP( noper );
2026 U8 noper_trietype = TRIE_TYPE( noper_type );
2027 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2028 regnode * const noper_next = regnext( noper );
2029 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2030 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
2031 #endif
2032
2033 DEBUG_TRIE_COMPILE_r({
2034 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2035 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
2036 depth+1,
2037 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
2038
2039 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
2040 Perl_re_printf( aTHX_ " -> %d:%s",
2041 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
2042
2043 if ( noper_next ) {
2044 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
2045 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
2046 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
2047 }
2048 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
2049 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2050 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
2051 );
2052 });
2053
2054 /* Is noper a trieable nodetype that can be merged
2055 * with the current trie (if there is one)? */
2056 if ( noper_trietype
2057 &&
2058 (
2059 ( noper_trietype == NOTHING )
2060 || ( trietype == NOTHING )
2061 || ( trietype == noper_trietype )
2062 )
2063 #ifdef NOJUMPTRIE
2064 && noper_next >= tail
2065 #endif
2066 && count < U16_MAX)
2067 {
2068 /* Handle mergable triable node Either we are
2069 * the first node in a new trieable sequence,
2070 * in which case we do some bookkeeping,
2071 * otherwise we update the end pointer. */
2072 if ( !first ) {
2073 first = cur;
2074 if ( noper_trietype == NOTHING ) {
2075 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
2076 regnode * const noper_next = regnext( noper );
2077 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2078 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
2079 #endif
2080
2081 if ( noper_next_trietype ) {
2082 trietype = noper_next_trietype;
2083 } else if (noper_next_type) {
2084 /* a NOTHING regop is 1 regop wide.
2085 * We need at least two for a trie
2086 * so we can't merge this in */
2087 first = NULL;
2088 }
2089 } else {
2090 trietype = noper_trietype;
2091 }
2092 } else {
2093 if ( trietype == NOTHING )
2094 trietype = noper_trietype;
2095 prev = cur;
2096 }
2097 if (first)
2098 count++;
2099 } /* end handle mergable triable node */
2100 else {
2101 /* handle unmergable node -
2102 * noper may either be a triable node which can
2103 * not be tried together with the current trie,
2104 * or a non triable node */
2105 if ( prev ) {
2106 /* If last is set and trietype is not
2107 * NOTHING then we have found at least two
2108 * triable branch sequences in a row of a
2109 * similar trietype so we can turn them
2110 * into a trie. If/when we allow NOTHING to
2111 * start a trie sequence this condition
2112 * will be required, and it isn't expensive
2113 * so we leave it in for now. */
2114 if ( trietype && trietype != NOTHING )
2115 make_trie( pRExC_state,
2116 startbranch, first, cur, tail,
2117 count, trietype, depth+1 );
2118 prev = NULL; /* note: we clear/update
2119 first, trietype etc below,
2120 so we dont do it here */
2121 }
2122 if ( noper_trietype
2123 #ifdef NOJUMPTRIE
2124 && noper_next >= tail
2125 #endif
2126 ){
2127 /* noper is triable, so we can start a new
2128 * trie sequence */
2129 count = 1;
2130 first = cur;
2131 trietype = noper_trietype;
2132 } else if (first) {
2133 /* if we already saw a first but the
2134 * current node is not triable then we have
2135 * to reset the first information. */
2136 count = 0;
2137 first = NULL;
2138 trietype = 0;
2139 }
2140 } /* end handle unmergable node */
2141 } /* loop over branches */
2142 DEBUG_TRIE_COMPILE_r({
2143 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2144 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
2145 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2146 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
2147 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2148 REGNODE_NAME(trietype)
2149 );
2150
2151 });
2152 if ( prev && trietype ) {
2153 if ( trietype != NOTHING ) {
2154 /* the last branch of the sequence was part of
2155 * a trie, so we have to construct it here
2156 * outside of the loop */
2157 made= make_trie( pRExC_state, startbranch,
2158 first, scan, tail, count,
2159 trietype, depth+1 );
2160 #ifdef TRIE_STUDY_OPT
2161 if ( ((made == MADE_EXACT_TRIE &&
2162 startbranch == first)
2163 || ( first_non_open == first )) &&
2164 depth==0 ) {
2165 flags |= SCF_TRIE_RESTUDY;
2166 if ( startbranch == first
2167 && scan >= tail )
2168 {
2169 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2170 }
2171 }
2172 #endif
2173 } else {
2174 /* at this point we know whatever we have is a
2175 * NOTHING sequence/branch AND if 'startbranch'
2176 * is 'first' then we can turn the whole thing
2177 * into a NOTHING
2178 */
2179 if ( startbranch == first ) {
2180 regnode *opt;
2181 /* the entire thing is a NOTHING sequence,
2182 * something like this: (?:|) So we can
2183 * turn it into a plain NOTHING op. */
2184 DEBUG_TRIE_COMPILE_r({
2185 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2186 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2187 depth+1,
2188 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2189
2190 });
2191 OP(startbranch)= NOTHING;
2192 NEXT_OFF(startbranch)= tail - startbranch;
2193 for ( opt= startbranch + 1; opt < tail ; opt++ )
2194 OP(opt)= OPTIMIZED;
2195 }
2196 }
2197 } /* end if ( prev) */
2198 } /* TRIE_MAXBUF is non zero */
2199 } /* do trie */
2200 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2201 }
2202 else
2203 scan = REGNODE_AFTER_opcode(scan,code);
2204 continue;
2205 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2206 I32 paren = 0;
2207 regnode *start = NULL;
2208 regnode *end = NULL;
2209 U32 my_recursed_depth= recursed_depth;
2210
2211 if (OP(scan) != SUSPEND) { /* GOSUB */
2212 /* Do setup, note this code has side effects beyond
2213 * the rest of this block. Specifically setting
2214 * RExC_recurse[] must happen at least once during
2215 * study_chunk(). */
2216 paren = ARG1u(scan);
2217 RExC_recurse[ARG2i(scan)] = scan;
2218 start = REGNODE_p(RExC_open_parens[paren]);
2219 end = REGNODE_p(RExC_close_parens[paren]);
2220
2221 /* NOTE we MUST always execute the above code, even
2222 * if we do nothing with a GOSUB */
2223 if (
2224 ( flags & SCF_IN_DEFINE )
2225 ||
2226 (
2227 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2228 &&
2229 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2230 )
2231 ) {
2232 /* no need to do anything here if we are in a define. */
2233 /* or we are after some kind of infinite construct
2234 * so we can skip recursing into this item.
2235 * Since it is infinite we will not change the maxlen
2236 * or delta, and if we miss something that might raise
2237 * the minlen it will merely pessimise a little.
2238 *
2239 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2240 * might result in a minlen of 1 and not of 4,
2241 * but this doesn't make us mismatch, just try a bit
2242 * harder than we should.
2243 *
2244 * However we must assume this GOSUB is infinite, to
2245 * avoid wrongly applying other optimizations in the
2246 * enclosing scope - see GH 18096, for example.
2247 */
2248 is_inf = is_inf_internal = 1;
2249 scan= regnext(scan);
2250 continue;
2251 }
2252
2253 if (
2254 !recursed_depth
2255 || !PAREN_TEST(recursed_depth - 1, paren)
2256 ) {
2257 /* it is quite possible that there are more efficient ways
2258 * to do this. We maintain a bitmap per level of recursion
2259 * of which patterns we have entered so we can detect if a
2260 * pattern creates a possible infinite loop. When we
2261 * recurse down a level we copy the previous levels bitmap
2262 * down. When we are at recursion level 0 we zero the top
2263 * level bitmap. It would be nice to implement a different
2264 * more efficient way of doing this. In particular the top
2265 * level bitmap may be unnecessary.
2266 */
2267 if (!recursed_depth) {
2268 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2269 } else {
2270 Copy(PAREN_OFFSET(recursed_depth - 1),
2271 PAREN_OFFSET(recursed_depth),
2272 RExC_study_chunk_recursed_bytes, U8);
2273 }
2274 /* we havent recursed into this paren yet, so recurse into it */
2275 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2276 PAREN_SET(recursed_depth, paren);
2277 my_recursed_depth= recursed_depth + 1;
2278 } else {
2279 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2280 /* some form of infinite recursion, assume infinite length
2281 * */
2282 if (flags & SCF_DO_SUBSTR) {
2283 scan_commit(pRExC_state, data, minlenp, is_inf);
2284 data->cur_is_floating = 1;
2285 }
2286 is_inf = is_inf_internal = 1;
2287 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2288 ssc_anything(data->start_class);
2289 flags &= ~SCF_DO_STCLASS;
2290
2291 start= NULL; /* reset start so we dont recurse later on. */
2292 }
2293 } else {
2294 paren = stopparen;
2295 start = scan + 2;
2296 end = regnext(scan);
2297 }
2298 if (start) {
2299 scan_frame *newframe;
2300 assert(end);
2301 if (!RExC_frame_last) {
2302 Newxz(newframe, 1, scan_frame);
2303 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2304 RExC_frame_head= newframe;
2305 RExC_frame_count++;
2306 } else if (!RExC_frame_last->next_frame) {
2307 Newxz(newframe, 1, scan_frame);
2308 RExC_frame_last->next_frame= newframe;
2309 newframe->prev_frame= RExC_frame_last;
2310 RExC_frame_count++;
2311 } else {
2312 newframe= RExC_frame_last->next_frame;
2313 }
2314 RExC_frame_last= newframe;
2315
2316 newframe->next_regnode = regnext(scan);
2317 newframe->last_regnode = last;
2318 newframe->stopparen = stopparen;
2319 newframe->prev_recursed_depth = recursed_depth;
2320 newframe->this_prev_frame= frame;
2321 newframe->in_gosub = (
2322 (frame && frame->in_gosub) || OP(scan) == GOSUB
2323 );
2324
2325 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2326 DEBUG_PEEP("fnew", scan, depth, flags);
2327
2328 frame = newframe;
2329 scan = start;
2330 stopparen = paren;
2331 last = end;
2332 depth = depth + 1;
2333 recursed_depth= my_recursed_depth;
2334
2335 continue;
2336 }
2337 }
2338 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2339 SSize_t bytelen = STR_LEN(scan), charlen;
2340 UV uc;
2341 assert(bytelen);
2342 if (UTF) {
2343 const U8 * const s = (U8*)STRING(scan);
2344 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
2345 charlen = utf8_length(s, s + bytelen);
2346 } else {
2347 uc = *((U8*)STRING(scan));
2348 charlen = bytelen;
2349 }
2350 min += charlen;
2351 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2352 /* The code below prefers earlier match for fixed
2353 offset, later match for variable offset. */
2354 if (data->last_end == -1) { /* Update the start info. */
2355 data->last_start_min = data->pos_min;
2356 data->last_start_max =
2357 is_inf ? OPTIMIZE_INFTY
2358 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2359 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2360 }
2361 sv_catpvn(data->last_found, STRING(scan), bytelen);
2362 if (UTF)
2363 SvUTF8_on(data->last_found);
2364 {
2365 SV * const sv = data->last_found;
2366 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2368 if (mg && mg->mg_len >= 0)
2369 mg->mg_len += charlen;
2370 }
2371 data->last_end = data->pos_min + charlen;
2372 data->pos_min += charlen; /* As in the first entry. */
2373 data->flags &= ~SF_BEFORE_EOL;
2374 }
2375
2376 /* ANDing the code point leaves at most it, and not in locale, and
2377 * can't match null string */
2378 if (flags & SCF_DO_STCLASS_AND) {
2379 ssc_cp_and(data->start_class, uc);
2380 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2381 ssc_clear_locale(data->start_class);
2382 }
2383 else if (flags & SCF_DO_STCLASS_OR) {
2384 ssc_add_cp(data->start_class, uc);
2385 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2386
2387 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2388 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2389 }
2390 flags &= ~SCF_DO_STCLASS;
2391 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2392 }
2393 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2394 /* But OP != EXACT!, so is EXACTFish */
2395 SSize_t bytelen = STR_LEN(scan), charlen;
2396 const U8 * s = (U8*)STRING(scan);
2397
2398 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2399 * with the mask set to the complement of the bit that differs
2400 * between upper and lower case, and the lowest code point of the
2401 * pair (which the '&' forces) */
2402 if ( bytelen == 1
2403 && isALPHA_A(*s)
2404 && ( OP(scan) == EXACTFAA
2405 || ( OP(scan) == EXACTFU
2406 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2407 && mutate_ok
2408 ) {
2409 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2410
2411 OP(scan) = ANYOFM;
2412 ARG1u_SET(scan, *s & mask);
2413 FLAGS(scan) = mask;
2414 /* We're not EXACTFish any more, so restudy.
2415 * Search for "restudy" in this file to find
2416 * a comment with details. */
2417 continue;
2418 }
2419
2420 /* Search for fixed substrings supports EXACT only. */
2421 if (flags & SCF_DO_SUBSTR) {
2422 assert(data);
2423 scan_commit(pRExC_state, data, minlenp, is_inf);
2424 }
2425 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2426 if (unfolded_multi_char) {
2427 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2428 }
2429 min += charlen - min_subtract;
2430 assert (min >= 0);
2431 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2432 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2433 ) {
2434 delta += min_subtract;
2435 } else {
2436 delta = OPTIMIZE_INFTY;
2437 }
2438 if (flags & SCF_DO_SUBSTR) {
2439 data->pos_min += charlen - min_subtract;
2440 if (data->pos_min < 0) {
2441 data->pos_min = 0;
2442 }
2443 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2444 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2445 ) {
2446 data->pos_delta += min_subtract;
2447 } else {
2448 data->pos_delta = OPTIMIZE_INFTY;
2449 }
2450 if (min_subtract) {
2451 data->cur_is_floating = 1; /* float */
2452 }
2453 }
2454
2455 if (flags & SCF_DO_STCLASS) {
2456 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2457
2458 assert(EXACTF_invlist);
2459 if (flags & SCF_DO_STCLASS_AND) {
2460 if (OP(scan) != EXACTFL)
2461 ssc_clear_locale(data->start_class);
2462 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2463 ANYOF_POSIXL_ZERO(data->start_class);
2464 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
2465 }
2466 else { /* SCF_DO_STCLASS_OR */
2467 ssc_union(data->start_class, EXACTF_invlist, FALSE);
2468 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2469
2470 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2471 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2472 }
2473 flags &= ~SCF_DO_STCLASS;
2474 SvREFCNT_dec(EXACTF_invlist);
2475 }
2476 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2477 }
2478 else if (REGNODE_VARIES(OP(scan))) {
2479 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2480 I32 fl = 0;
2481 U32 f = flags;
2482 regnode * const oscan = scan;
2483 regnode_ssc this_class;
2484 regnode_ssc *oclass = NULL;
2485 I32 next_is_eval = 0;
2486
2487 switch (REGNODE_TYPE(OP(scan))) {
2488 case WHILEM: /* End of (?:...)* . */
2489 scan = REGNODE_AFTER(scan);
2490 goto finish;
2491 case PLUS:
2492 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2493 next = REGNODE_AFTER(scan);
2494 if ( ( REGNODE_TYPE(OP(next)) == EXACT
2495 && ! isEXACTFish(OP(next)))
2496 || (flags & SCF_DO_STCLASS))
2497 {
2498 mincount = 1;
2499 maxcount = REG_INFTY;
2500 next = regnext(scan);
2501 scan = REGNODE_AFTER(scan);
2502 goto do_curly;
2503 }
2504 }
2505 if (flags & SCF_DO_SUBSTR)
2506 data->pos_min++;
2507 /* This will bypass the formal 'min += minnext * mincount'
2508 * calculation in the do_curly path, so assumes min width
2509 * of the PLUS payload is exactly one. */
2510 min++;
2511 /* FALLTHROUGH */
2512 case STAR:
2513 next = REGNODE_AFTER(scan);
2514
2515 /* This temporary node can now be turned into EXACTFU, and
2516 * must, as regexec.c doesn't handle it */
2517 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2518 OP(next) = EXACTFU;
2519 }
2520
2521 if ( STR_LEN(next) == 1
2522 && isALPHA_A(* STRING(next))
2523 && ( OP(next) == EXACTFAA
2524 || ( OP(next) == EXACTFU
2525 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2526 && mutate_ok
2527 ) {
2528 /* These differ in just one bit */
2529 U8 mask = ~ ('A' ^ 'a');
2530
2531 assert(isALPHA_A(* STRING(next)));
2532
2533 /* Then replace it by an ANYOFM node, with
2534 * the mask set to the complement of the
2535 * bit that differs between upper and lower
2536 * case, and the lowest code point of the
2537 * pair (which the '&' forces) */
2538 OP(next) = ANYOFM;
2539 ARG1u_SET(next, *STRING(next) & mask);
2540 FLAGS(next) = mask;
2541 }
2542
2543 if (flags & SCF_DO_STCLASS) {
2544 mincount = 0;
2545 maxcount = REG_INFTY;
2546 next = regnext(scan);
2547 scan = REGNODE_AFTER(scan);
2548 goto do_curly;
2549 }
2550 if (flags & SCF_DO_SUBSTR) {
2551 scan_commit(pRExC_state, data, minlenp, is_inf);
2552 /* Cannot extend fixed substrings */
2553 data->cur_is_floating = 1; /* float */
2554 }
2555 is_inf = is_inf_internal = 1;
2556 scan = regnext(scan);
2557 goto optimize_curly_tail;
2558 case CURLY:
2559 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2560 && (FLAGS(scan) == stopparen))
2561 {
2562 mincount = 1;
2563 maxcount = 1;
2564 } else {
2565 mincount = ARG1i(scan);
2566 maxcount = ARG2i(scan);
2567 }
2568 next = regnext(scan);
2569 if (OP(scan) == CURLYX) {
2570 I32 lp = (data ? *(data->last_closep) : 0);
2571 FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2572 }
2573 scan = REGNODE_AFTER(scan);
2574 next_is_eval = (OP(scan) == EVAL);
2575 do_curly:
2576 if (flags & SCF_DO_SUBSTR) {
2577 if (mincount == 0)
2578 scan_commit(pRExC_state, data, minlenp, is_inf);
2579 /* Cannot extend fixed substrings */
2580 pos_before = data->pos_min;
2581 }
2582 if (data) {
2583 fl = data->flags;
2584 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2585 if (is_inf)
2586 data->flags |= SF_IS_INF;
2587 }
2588 if (flags & SCF_DO_STCLASS) {
2589 ssc_init(pRExC_state, &this_class);
2590 oclass = data->start_class;
2591 data->start_class = &this_class;
2592 f |= SCF_DO_STCLASS_AND;
2593 f &= ~SCF_DO_STCLASS_OR;
2594 }
2595 /* Exclude from super-linear cache processing any {n,m}
2596 regops for which the combination of input pos and regex
2597 pos is not enough information to determine if a match
2598 will be possible.
2599
2600 For example, in the regex /foo(bar\s*){4,8}baz/ with the
2601 regex pos at the \s*, the prospects for a match depend not
2602 only on the input position but also on how many (bar\s*)
2603 repeats into the {4,8} we are. */
2604 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2605 f &= ~SCF_WHILEM_VISITED_POS;
2606
2607 /* This will finish on WHILEM, setting scan, or on NULL: */
2608 /* recurse study_chunk() on loop bodies */
2609 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2610 last, data, stopparen, recursed_depth, NULL,
2611 (mincount == 0
2612 ? (f & ~SCF_DO_SUBSTR)
2613 : f)
2614 , depth+1, mutate_ok);
2615
2616 if (data && data->flags & SCF_SEEN_ACCEPT) {
2617 if (mincount > 1)
2618 mincount = 1;
2619 }
2620
2621 if (flags & SCF_DO_STCLASS)
2622 data->start_class = oclass;
2623 if (mincount == 0 || minnext == 0) {
2624 if (flags & SCF_DO_STCLASS_OR) {
2625 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2626 }
2627 else if (flags & SCF_DO_STCLASS_AND) {
2628 /* Switch to OR mode: cache the old value of
2629 * data->start_class */
2630 INIT_AND_WITHP;
2631 StructCopy(data->start_class, and_withp, regnode_ssc);
2632 flags &= ~SCF_DO_STCLASS_AND;
2633 StructCopy(&this_class, data->start_class, regnode_ssc);
2634 flags |= SCF_DO_STCLASS_OR;
2635 ANYOF_FLAGS(data->start_class)
2636 |= SSC_MATCHES_EMPTY_STRING;
2637 }
2638 } else { /* Non-zero len */
2639 if (flags & SCF_DO_STCLASS_OR) {
2640 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2641 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2642 }
2643 else if (flags & SCF_DO_STCLASS_AND)
2644 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2645 flags &= ~SCF_DO_STCLASS;
2646 }
2647 if (!scan) /* It was not CURLYX, but CURLY. */
2648 scan = next;
2649 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2650 /* ? quantifier ok, except for (?{ ... }) */
2651 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2652 && (minnext == 0) && (deltanext == 0)
2653 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2654 && maxcount <= REG_INFTY/3) /* Complement check for big
2655 count */
2656 {
2657 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2659 "Quantifier unexpected on zero-length expression "
2660 "in regex m/%" UTF8f "/",
2661 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2662 RExC_precomp)));
2663 }
2664
2665 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2666 || min >= SSize_t_MAX - minnext * mincount )
2667 {
2668 FAIL("Regexp out of space");
2669 }
2670
2671 min += minnext * mincount;
2672 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2673 || (maxcount == REG_INFTY && minnext + deltanext > 0);
2674 is_inf |= is_inf_internal;
2675 if (is_inf) {
2676 delta = OPTIMIZE_INFTY;
2677 } else {
2678 delta += (minnext + deltanext) * maxcount
2679 - minnext * mincount;
2680 }
2681
2682 if (data && data->flags & SCF_SEEN_ACCEPT) {
2683 if (flags & SCF_DO_SUBSTR) {
2684 scan_commit(pRExC_state, data, minlenp, is_inf);
2685 flags &= ~SCF_DO_SUBSTR;
2686 }
2687 if (stopmin > min)
2688 stopmin = min;
2689 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2690 }
2691 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2692 /* Try powerful optimization CURLYX => CURLYN. */
2693 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2694 && OP(oscan) == CURLYX
2695 && data
2696 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2697 non optimistic eval is seen
2698 anywhere.*/
2699 && ( data->flags & SF_IN_PAR ) /* has parens */
2700 && !deltanext
2701 && minnext == 1
2702 && mutate_ok
2703 ) {
2704 DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2705 /* Try to optimize to CURLYN. */
2706 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2707 regnode * const nxt1 = nxt;
2708 #ifdef DEBUGGING
2709 regnode *nxt2;
2710 #endif
2711 /* Skip open. */
2712 nxt = regnext(nxt);
2713 if (!REGNODE_SIMPLE(OP(nxt))
2714 && !(REGNODE_TYPE(OP(nxt)) == EXACT
2715 && STR_LEN(nxt) == 1))
2716 goto nogo;
2717 #ifdef DEBUGGING
2718 nxt2 = nxt;
2719 #endif
2720 nxt = regnext(nxt);
2721 if (OP(nxt) != CLOSE)
2722 goto nogo;
2723 if (RExC_open_parens) {
2724
2725 /*open->CURLYM*/
2726 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2727
2728 /*close->while*/
2729 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2730 }
2731 /* Now we know that nxt2 is the only contents: */
2732 FLAGS(oscan) = (U8)PARNO(nxt);
2733 OP(oscan) = CURLYN;
2734 OP(nxt1) = NOTHING; /* was OPEN. */
2735
2736 #ifdef DEBUGGING
2737 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2738 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2739 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2740 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2741 OP(nxt + 1) = OPTIMIZED; /* was count. */
2742 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2743 #endif
2744 }
2745 nogo:
2746
2747 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2748
2749 /* Try optimization CURLYX => CURLYM. */
2750 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2751 && OP(oscan) == CURLYX
2752 && data
2753 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2754 non optimistic eval is seen
2755 anywhere.*/
2756 && !(data->flags & SF_HAS_PAR) /* no parens! */
2757 && !deltanext /* atom is fixed width */
2758 && minnext != 0 /* CURLYM can't handle zero width */
2759 /* Nor characters whose fold at run-time may be
2760 * multi-character */
2761 && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2762 && mutate_ok
2763 ) {
2764 DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2765 /* XXXX How to optimize if data == 0? */
2766 /* Optimize to a simpler form. */
2767 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2768 regnode *nxt2;
2769
2770 OP(oscan) = CURLYM;
2771 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2772 && (OP(nxt2) != WHILEM))
2773 nxt = nxt2;
2774 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2775 /* Need to optimize away parenths. */
2776 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2777 /* Set the parenth number. */
2778 /* note that we have changed the type of oscan to CURLYM here */
2779 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2780
2781 FLAGS(oscan) = (U8)PARNO(nxt);
2782 if (RExC_open_parens) {
2783 /*open->CURLYM*/
2784 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2785
2786 /*close->NOTHING*/
2787 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2788 + 1;
2789 }
2790 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2791 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2792
2793 #ifdef DEBUGGING
2794 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2795 OP(nxt + 1) = OPTIMIZED; /* was count. */
2796 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2797 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2798 #endif
2799 #if 0
2800 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2801 regnode *nnxt = regnext(nxt1);
2802 if (nnxt == nxt) {
2803 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2804 ARG1u_SET(nxt1, nxt2 - nxt1);
2805 else if (nxt2 - nxt1 < U16_MAX)
2806 NEXT_OFF(nxt1) = nxt2 - nxt1;
2807 else
2808 OP(nxt) = NOTHING; /* Cannot beautify */
2809 }
2810 nxt1 = nnxt;
2811 }
2812 #endif
2813 /* Optimize again: */
2814 /* recurse study_chunk() on optimised CURLYX => CURLYM */
2815 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2816 NULL, stopparen, recursed_depth, NULL, 0,
2817 depth+1, mutate_ok);
2818 }
2819 else
2820 FLAGS(oscan) = 0;
2821 }
2822 else if ((OP(oscan) == CURLYX)
2823 && (flags & SCF_WHILEM_VISITED_POS)
2824 /* See the comment on a similar expression above.
2825 However, this time it's not a subexpression
2826 we care about, but the expression itself. */
2827 && (maxcount == REG_INFTY)
2828 && data) {
2829 /* This stays as CURLYX, we can put the count/of pair. */
2830 /* Find WHILEM (as in regexec.c) */
2831 regnode *nxt = oscan + NEXT_OFF(oscan);
2832
2833 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2834 nxt += ARG1u(nxt);
2835 nxt = REGNODE_BEFORE(nxt);
2836 if (FLAGS(nxt) & 0xf) {
2837 /* we've already set whilem count on this node */
2838 } else if (++data->whilem_c < 16) {
2839 assert(data->whilem_c <= RExC_whilem_seen);
2840 FLAGS(nxt) = (U8)(data->whilem_c
2841 | (RExC_whilem_seen << 4)); /* On WHILEM */
2842 }
2843 }
2844 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2845 pars++;
2846 if (flags & SCF_DO_SUBSTR) {
2847 SV *last_str = NULL;
2848 STRLEN last_chrs = 0;
2849 int counted = mincount != 0;
2850
2851 if (data->last_end > 0 && mincount != 0) { /* Ends with a
2852 string. */
2853 SSize_t b = pos_before >= data->last_start_min
2854 ? pos_before : data->last_start_min;
2855 STRLEN l;
2856 const char * const s = SvPV_const(data->last_found, l);
2857 SSize_t old = b - data->last_start_min;
2858 assert(old >= 0);
2859
2860 if (UTF)
2861 old = utf8_hop_forward((U8*)s, old,
2862 (U8 *) SvEND(data->last_found))
2863 - (U8*)s;
2864 l -= old;
2865 /* Get the added string: */
2866 last_str = newSVpvn_utf8(s + old, l, UTF);
2867 last_chrs = UTF ? utf8_length((U8*)(s + old),
2868 (U8*)(s + old + l)) : l;
2869 if (deltanext == 0 && pos_before == b) {
2870 /* What was added is a constant string */
2871 if (mincount > 1) {
2872
2873 SvGROW(last_str, (mincount * l) + 1);
2874 repeatcpy(SvPVX(last_str) + l,
2875 SvPVX_const(last_str), l,
2876 mincount - 1);
2877 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2878 /* Add additional parts. */
2879 SvCUR_set(data->last_found,
2880 SvCUR(data->last_found) - l);
2881 sv_catsv(data->last_found, last_str);
2882 {
2883 SV * sv = data->last_found;
2884 MAGIC *mg =
2885 SvUTF8(sv) && SvMAGICAL(sv) ?
2886 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2887 if (mg && mg->mg_len >= 0)
2888 mg->mg_len += last_chrs * (mincount-1);
2889 }
2890 last_chrs *= mincount;
2891 data->last_end += l * (mincount - 1);
2892 }
2893 } else {
2894 /* start offset must point into the last copy */
2895 data->last_start_min += minnext * (mincount - 1);
2896 data->last_start_max =
2897 is_inf
2898 ? OPTIMIZE_INFTY
2899 : data->last_start_max +
2900 (maxcount - 1) * (minnext + data->pos_delta);
2901 }
2902 }
2903 /* It is counted once already... */
2904 data->pos_min += minnext * (mincount - counted);
2905 #if 0
2906 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
2907 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
2908 " maxcount=%" UVuf " mincount=%" UVuf
2909 " data->pos_delta=%" UVuf "\n",
2910 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2911 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2912 if (deltanext != OPTIMIZE_INFTY)
2913 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
2914 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2915 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2916 #endif
2917 if (deltanext == OPTIMIZE_INFTY
2918 || data->pos_delta == OPTIMIZE_INFTY
2919 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2920 data->pos_delta = OPTIMIZE_INFTY;
2921 else
2922 data->pos_delta += - counted * deltanext +
2923 (minnext + deltanext) * maxcount - minnext * mincount;
2924 if (mincount != maxcount) {
2925 /* Cannot extend fixed substrings found inside
2926 the group. */
2927 scan_commit(pRExC_state, data, minlenp, is_inf);
2928 if (mincount && last_str) {
2929 SV * const sv = data->last_found;
2930 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2931 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2932
2933 if (mg)
2934 mg->mg_len = -1;
2935 sv_setsv(sv, last_str);
2936 data->last_end = data->pos_min;
2937 data->last_start_min = data->pos_min - last_chrs;
2938 data->last_start_max = is_inf
2939 ? OPTIMIZE_INFTY
2940 : data->pos_min + data->pos_delta - last_chrs;
2941 }
2942 data->cur_is_floating = 1; /* float */
2943 }
2944 SvREFCNT_dec(last_str);
2945 }
2946 if (data && (fl & SF_HAS_EVAL))
2947 data->flags |= SF_HAS_EVAL;
2948 optimize_curly_tail:
2949 rck_elide_nothing(oscan);
2950 continue;
2951
2952 default:
2953 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
2954 OP(scan));
2955 case REF:
2956 case CLUMP:
2957 if (flags & SCF_DO_SUBSTR) {
2958 /* Cannot expect anything... */
2959 scan_commit(pRExC_state, data, minlenp, is_inf);
2960 data->cur_is_floating = 1; /* float */
2961 }
2962 is_inf = is_inf_internal = 1;
2963 if (flags & SCF_DO_STCLASS_OR) {
2964 if (OP(scan) == CLUMP) {
2965 /* Actually is any start char, but very few code points
2966 * aren't start characters */
2967 ssc_match_all_cp(data->start_class);
2968 }
2969 else {
2970 ssc_anything(data->start_class);
2971 }
2972 }
2973 flags &= ~SCF_DO_STCLASS;
2974 break;
2975 }
2976 }
2977 else if (OP(scan) == LNBREAK) {
2978 if (flags & SCF_DO_STCLASS) {
2979 if (flags & SCF_DO_STCLASS_AND) {
2980 ssc_intersection(data->start_class,
2981 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
2982 ssc_clear_locale(data->start_class);
2983 ANYOF_FLAGS(data->start_class)
2984 &= ~SSC_MATCHES_EMPTY_STRING;
2985 }
2986 else if (flags & SCF_DO_STCLASS_OR) {
2987 ssc_union(data->start_class,
2988 PL_XPosix_ptrs[CC_VERTSPACE_],
2989 FALSE);
2990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2991
2992 /* See commit msg for
2993 * 749e076fceedeb708a624933726e7989f2302f6a */
2994 ANYOF_FLAGS(data->start_class)
2995 &= ~SSC_MATCHES_EMPTY_STRING;
2996 }
2997 flags &= ~SCF_DO_STCLASS;
2998 }
2999 min++;
3000 if (delta != OPTIMIZE_INFTY)
3001 delta++; /* Because of the 2 char string cr-lf */
3002 if (flags & SCF_DO_SUBSTR) {
3003 /* Cannot expect anything... */
3004 scan_commit(pRExC_state, data, minlenp, is_inf);
3005 data->pos_min += 1;
3006 if (data->pos_delta != OPTIMIZE_INFTY) {
3007 data->pos_delta += 1;
3008 }
3009 data->cur_is_floating = 1; /* float */
3010 }
3011 }
3012 else if (REGNODE_SIMPLE(OP(scan))) {
3013
3014 if (flags & SCF_DO_SUBSTR) {
3015 scan_commit(pRExC_state, data, minlenp, is_inf);
3016 data->pos_min++;
3017 }
3018 min++;
3019 if (flags & SCF_DO_STCLASS) {
3020 bool invert = 0;
3021 SV* my_invlist = NULL;
3022 U8 namedclass;
3023
3024 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
3025 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
3026
3027 /* Some of the logic below assumes that switching
3028 locale on will only add false positives. */
3029 switch (OP(scan)) {
3030
3031 default:
3032 #ifdef DEBUGGING
3033 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
3034 OP(scan));
3035 #endif
3036 case SANY:
3037 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3038 ssc_match_all_cp(data->start_class);
3039 break;
3040
3041 case REG_ANY:
3042 {
3043 SV* REG_ANY_invlist = _new_invlist(2);
3044 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
3045 '\n');
3046 if (flags & SCF_DO_STCLASS_OR) {
3047 ssc_union(data->start_class,
3048 REG_ANY_invlist,
3049 TRUE /* TRUE => invert, hence all but \n
3050 */
3051 );
3052 }
3053 else if (flags & SCF_DO_STCLASS_AND) {
3054 ssc_intersection(data->start_class,
3055 REG_ANY_invlist,
3056 TRUE /* TRUE => invert */
3057 );
3058 ssc_clear_locale(data->start_class);
3059 }
3060 SvREFCNT_dec_NN(REG_ANY_invlist);
3061 }
3062 break;
3063
3064 case ANYOFD:
3065 case ANYOFL:
3066 case ANYOFPOSIXL:
3067 case ANYOFH:
3068 case ANYOFHb:
3069 case ANYOFHr:
3070 case ANYOFHs:
3071 case ANYOF:
3072 if (flags & SCF_DO_STCLASS_AND)
3073 ssc_and(pRExC_state, data->start_class,
3074 (regnode_charclass *) scan);
3075 else
3076 ssc_or(pRExC_state, data->start_class,
3077 (regnode_charclass *) scan);
3078 break;
3079
3080 case ANYOFHbbm:
3081 {
3082 SV* cp_list = get_ANYOFHbbm_contents(scan);
3083
3084 if (flags & SCF_DO_STCLASS_OR) {
3085 ssc_union(data->start_class, cp_list, invert);
3086 }
3087 else if (flags & SCF_DO_STCLASS_AND) {
3088 ssc_intersection(data->start_class, cp_list, invert);
3089 }
3090
3091 SvREFCNT_dec_NN(cp_list);
3092 break;
3093 }
3094
3095 case NANYOFM: /* NANYOFM already contains the inversion of the
3096 input ANYOF data, so, unlike things like
3097 NPOSIXA, don't change 'invert' to TRUE */
3098 /* FALLTHROUGH */
3099 case ANYOFM:
3100 {
3101 SV* cp_list = get_ANYOFM_contents(scan);
3102
3103 if (flags & SCF_DO_STCLASS_OR) {
3104 ssc_union(data->start_class, cp_list, invert);
3105 }
3106 else if (flags & SCF_DO_STCLASS_AND) {
3107 ssc_intersection(data->start_class, cp_list, invert);
3108 }
3109
3110 SvREFCNT_dec_NN(cp_list);
3111 break;
3112 }
3113
3114 case ANYOFR:
3115 case ANYOFRb:
3116 {
3117 SV* cp_list = NULL;
3118
3119 cp_list = _add_range_to_invlist(cp_list,
3120 ANYOFRbase(scan),
3121 ANYOFRbase(scan) + ANYOFRdelta(scan));
3122
3123 if (flags & SCF_DO_STCLASS_OR) {
3124 ssc_union(data->start_class, cp_list, invert);
3125 }
3126 else if (flags & SCF_DO_STCLASS_AND) {
3127 ssc_intersection(data->start_class, cp_list, invert);
3128 }
3129
3130 SvREFCNT_dec_NN(cp_list);
3131 break;
3132 }
3133
3134 case NPOSIXL:
3135 invert = 1;
3136 /* FALLTHROUGH */
3137
3138 case POSIXL:
3139 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3140 if (flags & SCF_DO_STCLASS_AND) {
3141 bool was_there = cBOOL(
3142 ANYOF_POSIXL_TEST(data->start_class,
3143 namedclass));
3144 ANYOF_POSIXL_ZERO(data->start_class);
3145 if (was_there) { /* Do an AND */
3146 ANYOF_POSIXL_SET(data->start_class, namedclass);
3147 }
3148 /* No individual code points can now match */
3149 data->start_class->invlist
3150 = sv_2mortal(_new_invlist(0));
3151 }
3152 else {
3153 int complement = namedclass + ((invert) ? -1 : 1);
3154
3155 assert(flags & SCF_DO_STCLASS_OR);
3156
3157 /* If the complement of this class was already there,
3158 * the result is that they match all code points,
3159 * (\d + \D == everything). Remove the classes from
3160 * future consideration. Locale is not relevant in
3161 * this case */
3162 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3163 ssc_match_all_cp(data->start_class);
3164 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3165 ANYOF_POSIXL_CLEAR(data->start_class, complement);
3166 }
3167 else { /* The usual case; just add this class to the
3168 existing set */
3169 ANYOF_POSIXL_SET(data->start_class, namedclass);
3170 }
3171 }
3172 break;
3173
3174 case NPOSIXA: /* For these, we always know the exact set of
3175 what's matched */
3176 invert = 1;
3177 /* FALLTHROUGH */
3178 case POSIXA:
3179 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3180 goto join_posix_and_ascii;
3181
3182 case NPOSIXD:
3183 case NPOSIXU:
3184 invert = 1;
3185 /* FALLTHROUGH */
3186 case POSIXD:
3187 case POSIXU:
3188 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3189
3190 /* NPOSIXD matches all upper Latin1 code points unless the
3191 * target string being matched is UTF-8, which is
3192 * unknowable until match time. Since we are going to
3193 * invert, we want to get rid of all of them so that the
3194 * inversion will match all */
3195 if (OP(scan) == NPOSIXD) {
3196 _invlist_subtract(my_invlist, PL_UpperLatin1,
3197 &my_invlist);
3198 }
3199
3200 join_posix_and_ascii:
3201
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 ssc_intersection(data->start_class, my_invlist, invert);
3204 ssc_clear_locale(data->start_class);
3205 }
3206 else {
3207 assert(flags & SCF_DO_STCLASS_OR);
3208 ssc_union(data->start_class, my_invlist, invert);
3209 }
3210 SvREFCNT_dec(my_invlist);
3211 }
3212 if (flags & SCF_DO_STCLASS_OR)
3213 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3214 flags &= ~SCF_DO_STCLASS;
3215 }
3216 }
3217 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3218 data->flags |= (OP(scan) == MEOL
3219 ? SF_BEFORE_MEOL
3220 : SF_BEFORE_SEOL);
3221 scan_commit(pRExC_state, data, minlenp, is_inf);
3222
3223 }
3224 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
3225 /* Lookbehind, or need to calculate parens/evals/stclass: */
3226 && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS))
3227 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3228 {
3229 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3230 || OP(scan) == UNLESSM )
3231 {
3232 /* Negative Lookahead/lookbehind
3233 In this case we can't do fixed string optimisation.
3234 */
3235
3236 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3237 SSize_t deltanext, minnext;
3238 SSize_t fake_last_close = 0;
3239 regnode *fake_last_close_op = NULL;
3240 regnode *cur_last_close_op;
3241 regnode *nscan;
3242 regnode_ssc intrnl;
3243 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3244
3245 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3246 if (data) {
3247 data_fake.whilem_c = data->whilem_c;
3248 data_fake.last_closep = data->last_closep;
3249 data_fake.last_close_opp = data->last_close_opp;
3250 }
3251 else {
3252 data_fake.last_closep = &fake_last_close;
3253 data_fake.last_close_opp = &fake_last_close_op;
3254 }
3255
3256 /* remember the last_close_op we saw so we can see if
3257 * we are dealing with variable length lookbehind that
3258 * contains capturing buffers, which are considered
3259 * experimental */
3260 cur_last_close_op= *(data_fake.last_close_opp);
3261
3262 data_fake.pos_delta = delta;
3263 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3264 && OP(scan) == IFMATCH ) { /* Lookahead */
3265 ssc_init(pRExC_state, &intrnl);
3266 data_fake.start_class = &intrnl;
3267 f |= SCF_DO_STCLASS_AND;
3268 }
3269 if (flags & SCF_WHILEM_VISITED_POS)
3270 f |= SCF_WHILEM_VISITED_POS;
3271 next = regnext(scan);
3272 nscan = REGNODE_AFTER(scan);
3273
3274 /* recurse study_chunk() for lookahead body */
3275 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3276 last, &data_fake, stopparen,
3277 recursed_depth, NULL, f, depth+1,
3278 mutate_ok);
3279
3280 if (FLAGS(scan)) {
3281 if ( deltanext < 0
3282 || deltanext > (I32) U8_MAX
3283 || minnext > (I32)U8_MAX
3284 || minnext + deltanext > (I32)U8_MAX)
3285 {
3286 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3287 (UV)U8_MAX);
3288 }
3289
3290 /* The 'next_off' field has been repurposed to count the
3291 * additional starting positions to try beyond the initial
3292 * one. (This leaves it at 0 for non-variable length
3293 * matches to avoid breakage for those not using this
3294 * extension) */
3295 if (deltanext) {
3296 NEXT_OFF(scan) = deltanext;
3297 if (
3298 /* See a CLOSE op inside this lookbehind? */
3299 cur_last_close_op != *(data_fake.last_close_opp)
3300 /* and not doing restudy. see: restudied */
3301 && !(flags & SCF_TRIE_DOING_RESTUDY)
3302 ) {
3303 /* this is positive variable length lookbehind with
3304 * capture buffers inside of it */
3305 ckWARNexperimental_with_arg(RExC_parse,
3306 WARN_EXPERIMENTAL__VLB,
3307 "Variable length %s lookbehind with capturing is experimental",
3308 is_positive ? "positive" : "negative");
3309 }
3310 }
3311 FLAGS(scan) = (U8)minnext + deltanext;
3312 }
3313 if (data) {
3314 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3315 pars++;
3316 if (data_fake.flags & SF_HAS_EVAL)
3317 data->flags |= SF_HAS_EVAL;
3318 data->whilem_c = data_fake.whilem_c;
3319 }
3320 if (f & SCF_DO_STCLASS_AND) {
3321 if (flags & SCF_DO_STCLASS_OR) {
3322 /* OR before, AND after: ideally we would recurse with
3323 * data_fake to get the AND applied by study of the
3324 * remainder of the pattern, and then derecurse;
3325 * *** HACK *** for now just treat as "no information".
3326 * See [perl #56690].
3327 */
3328 ssc_init(pRExC_state, data->start_class);
3329 } else {
3330 /* AND before and after: combine and continue. These
3331 * assertions are zero-length, so can match an EMPTY
3332 * string */
3333 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3334 ANYOF_FLAGS(data->start_class)
3335 |= SSC_MATCHES_EMPTY_STRING;
3336 }
3337 }
3338 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3339 }
3340 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3341 else {
3342 /* Positive Lookahead/lookbehind
3343 In this case we can do fixed string optimisation,
3344 but we must be careful about it. Note in the case of
3345 lookbehind the positions will be offset by the minimum
3346 length of the pattern, something we won't know about
3347 until after the recurse.
3348 */
3349 SSize_t deltanext, fake_last_close = 0;
3350 regnode *last_close_op = NULL;
3351 regnode *nscan;
3352 regnode_ssc intrnl;
3353 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3354 /* We use SAVEFREEPV so that when the full compile
3355 is finished perl will clean up the allocated
3356 minlens when it's all done. This way we don't
3357 have to worry about freeing them when we know
3358 they wont be used, which would be a pain.
3359 */
3360 SSize_t *minnextp;
3361 Newx( minnextp, 1, SSize_t );
3362 SAVEFREEPV(minnextp);
3363
3364 if (data) {
3365 StructCopy(data, &data_fake, scan_data_t);
3366 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3367 f |= SCF_DO_SUBSTR;
3368 if (FLAGS(scan))
3369 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3370 data_fake.last_found=newSVsv(data->last_found);
3371 }
3372 }
3373 else {
3374 data_fake.last_closep = &fake_last_close;
3375 data_fake.last_close_opp = &fake_last_close_opp;
3376 }
3377 data_fake.flags = 0;
3378 data_fake.substrs[0].flags = 0;
3379 data_fake.substrs[1].flags = 0;
3380 data_fake.pos_delta = delta;
3381 if (is_inf)
3382 data_fake.flags |= SF_IS_INF;
3383 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3384 && OP(scan) == IFMATCH ) { /* Lookahead */
3385 ssc_init(pRExC_state, &intrnl);
3386 data_fake.start_class = &intrnl;
3387 f |= SCF_DO_STCLASS_AND;
3388 }
3389 if (flags & SCF_WHILEM_VISITED_POS)
3390 f |= SCF_WHILEM_VISITED_POS;
3391 next = regnext(scan);
3392 nscan = REGNODE_AFTER(scan);
3393
3394 /* positive lookahead study_chunk() recursion */
3395 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3396 &deltanext, last, &data_fake,
3397 stopparen, recursed_depth, NULL,
3398 f, depth+1, mutate_ok);
3399 if (FLAGS(scan)) {
3400 assert(0); /* This code has never been tested since this
3401 is normally not compiled */
3402 if ( deltanext < 0
3403 || deltanext > (I32) U8_MAX
3404 || *minnextp > (I32)U8_MAX
3405 || *minnextp + deltanext > (I32)U8_MAX)
3406 {
3407 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3408 (UV)U8_MAX);
3409 }
3410
3411 if (deltanext) {
3412 NEXT_OFF(scan) = deltanext;
3413 }
3414 FLAGS(scan) = (U8)*minnextp + deltanext;
3415 }
3416
3417 *minnextp += min;
3418
3419 if (f & SCF_DO_STCLASS_AND) {
3420 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3421 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3422 }
3423 if (data) {
3424 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3425 pars++;
3426 if (data_fake.flags & SF_HAS_EVAL)
3427 data->flags |= SF_HAS_EVAL;
3428 data->whilem_c = data_fake.whilem_c;
3429 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3430 int i;
3431 if (RExC_rx->minlen < *minnextp)
3432 RExC_rx->minlen = *minnextp;
3433 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3434 SvREFCNT_dec_NN(data_fake.last_found);
3435
3436 for (i = 0; i < 2; i++) {
3437 if (data_fake.substrs[i].minlenp != minlenp) {
3438 data->substrs[i].min_offset =
3439 data_fake.substrs[i].min_offset;
3440 data->substrs[i].max_offset =
3441 data_fake.substrs[i].max_offset;
3442 data->substrs[i].minlenp =
3443 data_fake.substrs[i].minlenp;
3444 data->substrs[i].lookbehind += FLAGS(scan);
3445 }
3446 }
3447 }
3448 }
3449 }
3450 #endif
3451 }
3452 else if (OP(scan) == OPEN) {
3453 if (stopparen != (I32)PARNO(scan))
3454 pars++;
3455 }
3456 else if (OP(scan) == CLOSE) {
3457 if (stopparen == (I32)PARNO(scan)) {
3458 break;
3459 }
3460 if ((I32)PARNO(scan) == is_par) {
3461 next = regnext(scan);
3462
3463 if ( next && (OP(next) != WHILEM) && next < last)
3464 is_par = 0; /* Disable optimization */
3465 }
3466 if (data) {
3467 *(data->last_closep) = PARNO(scan);
3468 *(data->last_close_opp) = scan;
3469 }
3470 }
3471 else if (OP(scan) == EVAL) {
3472 if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) )
3473 data->flags |= SF_HAS_EVAL;
3474 }
3475 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3476 if (flags & SCF_DO_SUBSTR) {
3477 scan_commit(pRExC_state, data, minlenp, is_inf);
3478 flags &= ~SCF_DO_SUBSTR;
3479 }
3480 if (OP(scan)==ACCEPT) {
3481 /* m{(*ACCEPT)x} does not have to start with 'x' */
3482 flags &= ~SCF_DO_STCLASS;
3483 if (data)
3484 data->flags |= SCF_SEEN_ACCEPT;
3485 if (stopmin > min)
3486 stopmin = min;
3487 }
3488 }
3489 else if (OP(scan) == COMMIT) {
3490 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3491 * must not end up with "abcxyz" as a fixed substring else we'll
3492 * skip straight to attempting to match at offset 4.
3493 */
3494 if (flags & SCF_DO_SUBSTR) {
3495 scan_commit(pRExC_state, data, minlenp, is_inf);
3496 flags &= ~SCF_DO_SUBSTR;
3497 }
3498 }
3499 else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */
3500 {
3501 if (flags & SCF_DO_SUBSTR) {
3502 scan_commit(pRExC_state, data, minlenp, is_inf);
3503 data->cur_is_floating = 1; /* float */
3504 }
3505 is_inf = is_inf_internal = 1;
3506 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3507 ssc_anything(data->start_class);
3508 flags &= ~SCF_DO_STCLASS;
3509 }
3510 else if (OP(scan) == GPOS) {
3511 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3512 !(delta || is_inf || (data && data->pos_delta)))
3513 {
3514 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3515 RExC_rx->intflags |= PREGf_ANCH_GPOS;
3516 if (RExC_rx->gofs < (STRLEN)min)
3517 RExC_rx->gofs = min;
3518 } else {
3519 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3520 RExC_rx->gofs = 0;
3521 }
3522 }
3523 #ifdef TRIE_STUDY_OPT
3524 #ifdef FULL_TRIE_STUDY
3525 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3526 /* NOTE - There is similar code to this block above for handling
3527 BRANCH nodes on the initial study. If you change stuff here
3528 check there too. */
3529 regnode *trie_node= scan;
3530 regnode *tail= regnext(scan);
3531 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3532 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3533 regnode_ssc accum;
3534
3535 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3536 /* Cannot merge strings after this. */
3537 scan_commit(pRExC_state, data, minlenp, is_inf);
3538 }
3539 if (flags & SCF_DO_STCLASS)
3540 ssc_init_zero(pRExC_state, &accum);
3541
3542 if (!trie->jump) {
3543 min1= trie->minlen;
3544 max1= trie->maxlen;
3545 } else {
3546 const regnode *nextbranch= NULL;
3547 U32 word;
3548
3549 for ( word=1 ; word <= trie->wordcount ; word++)
3550 {
3551 SSize_t deltanext = 0, minnext = 0;
3552 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3553 SSize_t fake_last_close = 0;
3554 regnode *fake_last_close_op = NULL;
3555 regnode_ssc this_class;
3556
3557 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3558 if (data) {
3559 data_fake.whilem_c = data->whilem_c;
3560 data_fake.last_closep = data->last_closep;
3561 data_fake.last_close_opp = data->last_close_opp;
3562 }
3563 else {
3564 data_fake.last_closep = &fake_last_close;
3565 data_fake.last_close_opp = &fake_last_close_op;
3566 }
3567 data_fake.pos_delta = delta;
3568 if (flags & SCF_DO_STCLASS) {
3569 ssc_init(pRExC_state, &this_class);
3570 data_fake.start_class = &this_class;
3571 f |= SCF_DO_STCLASS_AND;
3572 }
3573 if (flags & SCF_WHILEM_VISITED_POS)
3574 f |= SCF_WHILEM_VISITED_POS;
3575
3576 if (trie->jump[word]) {
3577 if (!nextbranch)
3578 nextbranch = trie_node + trie->jump[0];
3579 scan= trie_node + trie->jump[word];
3580 /* We go from the jump point to the branch that follows
3581 it. Note this means we need the vestigal unused
3582 branches even though they arent otherwise used. */
3583 /* optimise study_chunk() for TRIE */
3584 minnext = study_chunk(pRExC_state, &scan, minlenp,
3585 &deltanext, (regnode *)nextbranch, &data_fake,
3586 stopparen, recursed_depth, NULL, f, depth+1,
3587 mutate_ok);
3588 }
3589 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3590 nextbranch= regnext((regnode*)nextbranch);
3591
3592 if (min1 > (SSize_t)(minnext + trie->minlen))
3593 min1 = minnext + trie->minlen;
3594 if (deltanext == OPTIMIZE_INFTY) {
3595 is_inf = is_inf_internal = 1;
3596 max1 = OPTIMIZE_INFTY;
3597 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3598 max1 = minnext + deltanext + trie->maxlen;
3599
3600 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3601 pars++;
3602 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3603 if ( stopmin > min + min1)
3604 stopmin = min + min1;
3605 flags &= ~SCF_DO_SUBSTR;
3606 if (data)
3607 data->flags |= SCF_SEEN_ACCEPT;
3608 }
3609 if (data) {
3610 if (data_fake.flags & SF_HAS_EVAL)
3611 data->flags |= SF_HAS_EVAL;
3612 data->whilem_c = data_fake.whilem_c;
3613 }
3614 if (flags & SCF_DO_STCLASS)
3615 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3616 }
3617 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3618 }
3619 if (flags & SCF_DO_SUBSTR) {
3620 data->pos_min += min1;
3621 data->pos_delta += max1 - min1;
3622 if (max1 != min1 || is_inf)
3623 data->cur_is_floating = 1; /* float */
3624 }
3625 min += min1;
3626 if (delta != OPTIMIZE_INFTY) {
3627 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3628 delta += max1 - min1;
3629 else
3630 delta = OPTIMIZE_INFTY;
3631 }
3632 if (flags & SCF_DO_STCLASS_OR) {
3633 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3634 if (min1) {
3635 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3636 flags &= ~SCF_DO_STCLASS;
3637 }
3638 }
3639 else if (flags & SCF_DO_STCLASS_AND) {
3640 if (min1) {
3641 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3642 flags &= ~SCF_DO_STCLASS;
3643 }
3644 else {
3645 /* Switch to OR mode: cache the old value of
3646 * data->start_class */
3647 INIT_AND_WITHP;
3648 StructCopy(data->start_class, and_withp, regnode_ssc);
3649 flags &= ~SCF_DO_STCLASS_AND;
3650 StructCopy(&accum, data->start_class, regnode_ssc);
3651 flags |= SCF_DO_STCLASS_OR;
3652 }
3653 }
3654 scan= tail;
3655 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3656 continue;
3657 }
3658 #else
3659 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3660 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3661 U8*bang=NULL;
3662
3663 min += trie->minlen;
3664 delta += (trie->maxlen - trie->minlen);
3665 flags &= ~SCF_DO_STCLASS; /* xxx */
3666 if (flags & SCF_DO_SUBSTR) {
3667 /* Cannot expect anything... */
3668 scan_commit(pRExC_state, data, minlenp, is_inf);
3669 data->pos_min += trie->minlen;
3670 data->pos_delta += (trie->maxlen - trie->minlen);
3671 if (trie->maxlen != trie->minlen)
3672 data->cur_is_floating = 1; /* float */
3673 }
3674 if (trie->jump) /* no more substrings -- for now /grr*/
3675 flags &= ~SCF_DO_SUBSTR;
3676 }
3677
3678 #endif /* old or new */
3679 #endif /* TRIE_STUDY_OPT */
3680
3681 else if (OP(scan) == REGEX_SET) {
3682 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
3683 " before optimization", REGNODE_NAME(REGEX_SET));
3684 }
3685
3686 /* Else: zero-length, ignore. */
3687 scan = regnext(scan);
3688 }
3689
3690 finish:
3691 if (frame) {
3692 /* we need to unwind recursion. */
3693 depth = depth - 1;
3694
3695 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3696 DEBUG_PEEP("fend", scan, depth, flags);
3697
3698 /* restore previous context */
3699 last = frame->last_regnode;
3700 scan = frame->next_regnode;
3701 stopparen = frame->stopparen;
3702 recursed_depth = frame->prev_recursed_depth;
3703
3704 RExC_frame_last = frame->prev_frame;
3705 frame = frame->this_prev_frame;
3706 goto fake_study_recurse;
3707 }
3708
3709 assert(!frame);
3710 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3711
3712 /* is this pattern infinite? Eg, consider /(a|b+)/ */
3713 if (is_inf_internal)
3714 delta = OPTIMIZE_INFTY;
3715
3716 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3717 if (min > stopmin) {
3718 /*
3719 At this point 'min' represents the minimum length string we can
3720 match while *ignoring* the implication of ACCEPT, and 'delta'
3721 represents the difference between the minimum length and maximum
3722 length, and if the pattern matches an infinitely long string
3723 (consider the + and * quantifiers) then we use the special delta
3724 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3725 minimum length that can be matched *and* accepted.
3726
3727 A pattern is accepted when matching was successful *and*
3728 complete, and thus there is no further matching needing to be
3729 done, no backtracking to occur, etc. Prior to the introduction
3730 of ACCEPT the only opcode that signaled acceptance was the END
3731 opcode, which is always the very last opcode in a regex program.
3732 ACCEPT is thus conceptually an early successful return out of
3733 the matching process. stopmin starts out as OPTIMIZE_INFTY to
3734 represent "the entire pattern", and is ratched down to the
3735 "current min" if necessary when an ACCEPT opcode is encountered.
3736
3737 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3738 and we now need to account for it in both min and delta.
3739 Consider that in a pattern /AB/ normally the min length it can
3740 match can be computed as min(A)+min(B). But (*ACCEPT) means
3741 that it might be something else, not even neccesarily min(A) at
3742 all. Consider
3743
3744 A = /(foo(*ACCEPT)|x+)/
3745 B = /whop/
3746 AB = /(foo(*ACCEPT)|x+)whop/
3747
3748 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3749 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3750 "whop", and the delta of 0 as the pattern is of fixed length, the
3751 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3752 When handling AB we expect to see a min of 5 for "xwhop", and a
3753 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3754 for "foo". This should result in a final min of 3 for "foo", and
3755 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3756
3757 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3758 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3759 stop min would be 4 for "dude". This should result in a final
3760 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3761
3762 When min is smaller than stopmin then we can ignore it. In the
3763 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3764 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3765 the ACCEPT doesn't reduce the minimum length of the string that
3766 might be matched, nor affect the maximum length.
3767
3768 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3769 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3770 "foo". We currently turn this into a min of 3 for "foo" and a
3771 delta of 3 for "foobar" even though technically "foobar" isn't
3772 possible. ACCEPT affects some aspects of the optimizer, like
3773 length computations and mandatory substring optimizations, but
3774 there are other optimzations this routine perfoms that are not
3775 affected and this compromise simplifies implementation.
3776
3777 It might be helpful to consider that this C function is called
3778 recursively on the pattern in a bottom up fashion, and that the
3779 min returned by a nested call may be marked as coming from an
3780 ACCEPT, causing its callers to treat the returned min as a
3781 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3782 multiple calls into this function in different ways.
3783 */
3784
3785 if (OPTIMIZE_INFTY - delta >= min - stopmin)
3786 delta += min - stopmin;
3787 else
3788 delta = OPTIMIZE_INFTY;
3789 min = stopmin;
3790 }
3791
3792 *scanp = scan;
3793 *deltap = delta;
3794
3795 if (flags & SCF_DO_SUBSTR && is_inf)
3796 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3797 if (is_par > (I32)U8_MAX)
3798 is_par = 0;
3799 if (is_par && pars==1 && data) {
3800 data->flags |= SF_IN_PAR;
3801 data->flags &= ~SF_HAS_PAR;
3802 }
3803 else if (pars && data) {
3804 data->flags |= SF_HAS_PAR;
3805 data->flags &= ~SF_IN_PAR;
3806 }
3807 if (flags & SCF_DO_STCLASS_OR)
3808 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3809 if (flags & SCF_TRIE_RESTUDY)
3810 data->flags |= SCF_TRIE_RESTUDY;
3811
3812
3813 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3814 if (min > OPTIMIZE_INFTY - delta)
3815 RExC_maxlen = OPTIMIZE_INFTY;
3816 else if (RExC_maxlen < min + delta)
3817 RExC_maxlen = min + delta;
3818 }
3819 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
3820 return min;
3821 }
3822