xref: /openbsd/gnu/usr.bin/perl/regcomp_study.c (revision 5486feef)
1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4 
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_STUDY_C
9 #include "perl.h"
10 
11 #ifdef PERL_IN_XSUB_RE
12 #  include "re_comp.h"
13 #else
14 #  include "regcomp.h"
15 #endif
16 
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20 
21 #define INIT_AND_WITHP \
22     assert(!and_withp); \
23     Newx(and_withp, 1, regnode_ssc); \
24     SAVEFREEPV(and_withp)
25 
26 
27 STATIC void
S_unwind_scan_frames(pTHX_ const void * p)28 S_unwind_scan_frames(pTHX_ const void *p)
29 {
30     PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31     scan_frame *f= (scan_frame *)p;
32     do {
33         scan_frame *n= f->next_frame;
34         Safefree(f);
35         f= n;
36     } while (f);
37 }
38 
39 /* Follow the next-chain of the current node and optimize away
40    all the NOTHINGs from it.
41  */
42 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)43 S_rck_elide_nothing(pTHX_ regnode *node)
44 {
45     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
46 
47     if (OP(node) != CURLYX) {
48         const int max = (REGNODE_OFF_BY_ARG(OP(node))
49                         ? I32_MAX
50                           /* I32 may be smaller than U16 on CRAYs! */
51                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52         int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node));
53         int noff;
54         regnode *n = node;
55 
56         /* Skip NOTHING and LONGJMP. */
57         while (
58             (n = regnext(n))
59             && (
60                 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61                 || ((OP(n) == LONGJMP) && (noff = ARG1u(n)))
62             )
63             && off + noff < max
64         ) {
65             off += noff;
66         }
67         if (REGNODE_OFF_BY_ARG(OP(node)))
68             ARG1u(node) = off;
69         else
70             NEXT_OFF(node) = off;
71     }
72     return;
73 }
74 
75 
76 /*
77  * As best we can, determine the characters that can match the start of
78  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
79  * can be false positive matches
80  *
81  * Returns the invlist as a new SV*; it is the caller's responsibility to
82  * call SvREFCNT_dec() when done with it.
83  */
84 STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
86 {
87     const U8 * s = (U8*)STRING(node);
88     SSize_t bytelen = STR_LEN(node);
89     UV uc;
90     /* Start out big enough for 2 separate code points */
91     SV* invlist = _new_invlist(4);
92 
93     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
94 
95     if (! UTF) {
96         uc = *s;
97 
98         /* We punt and assume can match anything if the node begins
99          * with a multi-character fold.  Things are complicated.  For
100          * example, /ffi/i could match any of:
101          *  "\N{LATIN SMALL LIGATURE FFI}"
102          *  "\N{LATIN SMALL LIGATURE FF}I"
103          *  "F\N{LATIN SMALL LIGATURE FI}"
104          *  plus several other things; and making sure we have all the
105          *  possibilities is hard. */
106         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
108         }
109         else {
110             /* Any Latin1 range character can potentially match any
111              * other depending on the locale, and in Turkic locales, 'I' and
112              * 'i' can match U+130 and U+131 */
113             if (OP(node) == EXACTFL) {
114                 _invlist_union(invlist, PL_Latin1, &invlist);
115                 if (isALPHA_FOLD_EQ(uc, 'I')) {
116                     invlist = add_cp_to_invlist(invlist,
117                                                 LATIN_SMALL_LETTER_DOTLESS_I);
118                     invlist = add_cp_to_invlist(invlist,
119                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
120                 }
121             }
122             else {
123                 /* But otherwise, it matches at least itself.  We can
124                  * quickly tell if it has a distinct fold, and if so,
125                  * it matches that as well */
126                 invlist = add_cp_to_invlist(invlist, uc);
127                 if (IS_IN_SOME_FOLD_L1(uc))
128                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
129             }
130 
131             /* Some characters match above-Latin1 ones under /i.  This
132              * is true of EXACTFL ones when the locale is UTF-8 */
133             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134                 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
135                                                          EXACTFAA_NO_TRIE)))
136             {
137                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
138             }
139         }
140     }
141     else {  /* Pattern is UTF-8 */
142         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143         const U8* e = s + bytelen;
144         IV fc;
145 
146         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
147 
148         /* The only code points that aren't folded in a UTF EXACTFish
149          * node are the problematic ones in EXACTFL nodes */
150         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151             /* We need to check for the possibility that this EXACTFL
152              * node begins with a multi-char fold.  Therefore we fold
153              * the first few characters of it so that we can make that
154              * check */
155             U8 *d = folded;
156             int i;
157 
158             fc = -1;
159             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
160                 if (isASCII(*s)) {
161                     *(d++) = (U8) toFOLD(*s);
162                     if (fc < 0) {       /* Save the first fold */
163                         fc = *(d-1);
164                     }
165                     s++;
166                 }
167                 else {
168                     STRLEN len;
169                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
170                     if (fc < 0) {       /* Save the first fold */
171                         fc = fold;
172                     }
173                     d += len;
174                     s += UTF8SKIP(s);
175                 }
176             }
177 
178             /* And set up so the code below that looks in this folded
179              * buffer instead of the node's string */
180             e = d;
181             s = folded;
182         }
183 
184         /* When we reach here 's' points to the fold of the first
185          * character(s) of the node; and 'e' points to far enough along
186          * the folded string to be just past any possible multi-char
187          * fold.
188          *
189          * Like the non-UTF case above, we punt if the node begins with a
190          * multi-char fold  */
191 
192         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
194         }
195         else {  /* Single char fold */
196             unsigned int k;
197             U32 first_fold;
198             const U32 * remaining_folds;
199             Size_t folds_count;
200 
201             /* It matches itself */
202             invlist = add_cp_to_invlist(invlist, fc);
203 
204             /* ... plus all the things that fold to it, which are found in
205              * PL_utf8_foldclosures */
206             folds_count = _inverse_folds(fc, &first_fold,
207                                                 &remaining_folds);
208             for (k = 0; k < folds_count; k++) {
209                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
210 
211                 /* /aa doesn't allow folds between ASCII and non- */
212                 if (   inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213                     && isASCII(c) != isASCII(fc))
214                 {
215                     continue;
216                 }
217 
218                 invlist = add_cp_to_invlist(invlist, c);
219             }
220 
221             if (OP(node) == EXACTFL) {
222 
223                 /* If either [iI] are present in an EXACTFL node the above code
224                  * should have added its normal case pair, but under a Turkish
225                  * locale they could match instead the case pairs from it.  Add
226                  * those as potential matches as well */
227                 if (isALPHA_FOLD_EQ(fc, 'I')) {
228                     invlist = add_cp_to_invlist(invlist,
229                                                 LATIN_SMALL_LETTER_DOTLESS_I);
230                     invlist = add_cp_to_invlist(invlist,
231                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
232                 }
233                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234                     invlist = add_cp_to_invlist(invlist, 'I');
235                 }
236                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237                     invlist = add_cp_to_invlist(invlist, 'i');
238                 }
239             }
240         }
241     }
242 
243     return invlist;
244 }
245 
246 
247 /* Mark that we cannot extend a found fixed substring at this point.
248    Update the longest found anchored substring or the longest found
249    floating substrings if needed. */
250 
251 void
Perl_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253                     SSize_t *minlenp, int is_inf)
254 {
255     const STRLEN l = CHR_SVLEN(data->last_found);
256     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257     const STRLEN old_l = CHR_SVLEN(longest_sv);
258     DECLARE_AND_GET_RE_DEBUG_FLAGS;
259 
260     PERL_ARGS_ASSERT_SCAN_COMMIT;
261 
262     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263         const U8 i = data->cur_is_floating;
264         SvSetMagicSV(longest_sv, data->last_found);
265         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
266 
267         if (!i) /* fixed */
268             data->substrs[0].max_offset = data->substrs[0].min_offset;
269         else { /* float */
270             data->substrs[1].max_offset =
271                       (is_inf)
272                        ? OPTIMIZE_INFTY
273                        : (l
274                           ? data->last_start_max
275                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
276                                          ? OPTIMIZE_INFTY
277                                          : data->pos_min + data->pos_delta));
278         }
279 
280         data->substrs[i].flags &= ~SF_BEFORE_EOL;
281         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282         data->substrs[i].minlenp = minlenp;
283         data->substrs[i].lookbehind = 0;
284     }
285 
286     SvCUR_set(data->last_found, 0);
287     {
288         SV * const sv = data->last_found;
289         if (SvUTF8(sv) && SvMAGICAL(sv)) {
290             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
291             if (mg)
292                 mg->mg_len = 0;
293         }
294     }
295     data->last_end = -1;
296     data->flags &= ~SF_BEFORE_EOL;
297     DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
298 }
299 
300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301  * list that describes which code points it matches */
302 
303 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
305 {
306     /* Set the SSC 'ssc' to match an empty string or any code point */
307 
308     PERL_ARGS_ASSERT_SSC_ANYTHING;
309 
310     assert(is_ANYOF_SYNTHETIC(ssc));
311 
312     /* mortalize so won't leak */
313     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
315 }
316 
317 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)318 S_ssc_is_anything(const regnode_ssc *ssc)
319 {
320     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
321      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
322      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
323      * in any way, so there's no point in using it */
324 
325     UV start = 0, end = 0;  /* Initialize due to messages from dumb compiler */
326     bool ret;
327 
328     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
329 
330     assert(is_ANYOF_SYNTHETIC(ssc));
331 
332     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
333         return FALSE;
334     }
335 
336     /* See if the list consists solely of the range 0 - Infinity */
337     invlist_iterinit(ssc->invlist);
338     ret = invlist_iternext(ssc->invlist, &start, &end)
339           && start == 0
340           && end == UV_MAX;
341 
342     invlist_iterfinish(ssc->invlist);
343 
344     if (ret) {
345         return TRUE;
346     }
347 
348     /* If e.g., both \w and \W are set, matches everything */
349     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
350         int i;
351         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
353                 return TRUE;
354             }
355         }
356     }
357 
358     return FALSE;
359 }
360 
361 void
Perl_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
363 {
364     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
365      * string, any code point, or any posix class under locale */
366 
367     PERL_ARGS_ASSERT_SSC_INIT;
368 
369     Zero(ssc, 1, regnode_ssc);
370     set_ANYOF_SYNTHETIC(ssc);
371     ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
372     ssc_anything(ssc);
373 
374     /* If any portion of the regex is to operate under locale rules that aren't
375      * fully known at compile time, initialization includes it.  The reason
376      * this isn't done for all regexes is that the optimizer was written under
377      * the assumption that locale was all-or-nothing.  Given the complexity and
378      * lack of documentation in the optimizer, and that there are inadequate
379      * test cases for locale, many parts of it may not work properly, it is
380      * safest to avoid locale unless necessary. */
381     if (RExC_contains_locale) {
382         ANYOF_POSIXL_SETALL(ssc);
383     }
384     else {
385         ANYOF_POSIXL_ZERO(ssc);
386     }
387 }
388 
389 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391                         const regnode_ssc *ssc)
392 {
393     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
394      * to the list of code points matched, and locale posix classes; hence does
395      * not check its flags) */
396 
397     UV start = 0, end = 0;  /* Initialize due to messages from dumb compiler */
398     bool ret;
399 
400     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
401 
402     assert(is_ANYOF_SYNTHETIC(ssc));
403 
404     invlist_iterinit(ssc->invlist);
405     ret = invlist_iternext(ssc->invlist, &start, &end)
406           && start == 0
407           && end == UV_MAX;
408 
409     invlist_iterfinish(ssc->invlist);
410 
411     if (! ret) {
412         return FALSE;
413     }
414 
415     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
416         return FALSE;
417     }
418 
419     return TRUE;
420 }
421 
422 
423 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425                                const regnode_charclass* const node)
426 {
427     /* Returns a mortal inversion list defining which code points are matched
428      * by 'node', which is of ANYOF-ish type .  Handles complementing the
429      * result if appropriate.  If some code points aren't knowable at this
430      * time, the returned list must, and will, contain every code point that is
431      * a possibility. */
432 
433     SV* invlist = NULL;
434     SV* only_utf8_locale_invlist = NULL;
435     bool new_node_has_latin1 = FALSE;
436     const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
437                       ? ANYOF_FLAGS(node)
438                       : 0;
439 
440     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
441 
442     /* Look at the data structure created by S_set_ANYOF_arg() */
443     if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444         invlist = sv_2mortal(_new_invlist(1));
445         invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
446     }
447     else if (ANYOF_HAS_AUX(node)) {
448         const U32 n = ARG1u(node);
449         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450         AV * const av = MUTABLE_AV(SvRV(rv));
451         SV **const ary = AvARRAY(av);
452 
453         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
454 
455             /* Here there are things that won't be known until runtime -- we
456              * have to assume it could be anything */
457             invlist = sv_2mortal(_new_invlist(1));
458             return _add_range_to_invlist(invlist, 0, UV_MAX);
459         }
460         else if (ary[INVLIST_INDEX]) {
461 
462             /* Use the node's inversion list */
463             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
464         }
465 
466         /* Get the code points valid only under UTF-8 locales */
467         if (   (flags & ANYOFL_FOLD)
468             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
469         {
470             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
471         }
472     }
473 
474     if (! invlist) {
475         invlist = sv_2mortal(_new_invlist(0));
476     }
477 
478     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479      * code points, and an inversion list for the others, but if there are code
480      * points that should match only conditionally on the target string being
481      * UTF-8, those are placed in the inversion list, and not the bitmap.
482      * Since there are circumstances under which they could match, they are
483      * included in the SSC.  But if the ANYOF node is to be inverted, we have
484      * to exclude them here, so that when we invert below, the end result
485      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
486      * have to do this here before we add the unconditionally matched code
487      * points */
488     if (flags & ANYOF_INVERT) {
489         _invlist_intersection_complement_2nd(invlist,
490                                              PL_UpperLatin1,
491                                              &invlist);
492     }
493 
494     /* Add in the points from the bit map */
495     if (REGNODE_TYPE(OP(node)) == ANYOF){
496         for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497             if (ANYOF_BITMAP_TEST(node, i)) {
498                 unsigned int start = i++;
499 
500                 for (;    i < NUM_ANYOF_CODE_POINTS
501                        && ANYOF_BITMAP_TEST(node, i); ++i)
502                 {
503                     /* empty */
504                 }
505                 invlist = _add_range_to_invlist(invlist, start, i-1);
506                 new_node_has_latin1 = TRUE;
507             }
508         }
509     }
510 
511     /* If this can match all upper Latin1 code points, have to add them
512      * as well.  But don't add them if inverting, as when that gets done below,
513      * it would exclude all these characters, including the ones it shouldn't
514      * that were added just above */
515     if ( ! (flags & ANYOF_INVERT)
516         &&  OP(node) == ANYOFD
517         && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
518     {
519         _invlist_union(invlist, PL_UpperLatin1, &invlist);
520     }
521 
522     /* Similarly for these */
523     if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
525     }
526 
527     if (flags & ANYOF_INVERT) {
528         _invlist_invert(invlist);
529     }
530     else if (flags & ANYOFL_FOLD) {
531         if (new_node_has_latin1) {
532 
533             /* These folds are potential in Turkic locales */
534             if (_invlist_contains_cp(invlist, 'i')) {
535                 invlist = add_cp_to_invlist(invlist,
536                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
537             }
538             if (_invlist_contains_cp(invlist, 'I')) {
539                 invlist = add_cp_to_invlist(invlist,
540                                                 LATIN_SMALL_LETTER_DOTLESS_I);
541             }
542 
543             /* Under /li, any 0-255 could fold to any other 0-255, depending on
544              * the locale.  We can skip this if there are no 0-255 at all. */
545             _invlist_union(invlist, PL_Latin1, &invlist);
546         }
547         else {
548             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549                 invlist = add_cp_to_invlist(invlist, 'I');
550             }
551             if (_invlist_contains_cp(invlist,
552                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
553             {
554                 invlist = add_cp_to_invlist(invlist, 'i');
555             }
556         }
557     }
558 
559     /* Similarly add the UTF-8 locale possible matches.  These have to be
560      * deferred until after the non-UTF-8 locale ones are taken care of just
561      * above, or it leads to wrong results under ANYOF_INVERT */
562     if (only_utf8_locale_invlist) {
563         _invlist_union_maybe_complement_2nd(invlist,
564                                             only_utf8_locale_invlist,
565                                             flags & ANYOF_INVERT,
566                                             &invlist);
567     }
568 
569     return invlist;
570 }
571 
572 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
573  * should not be inverted. */
574 
575 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577                 const regnode_charclass *and_with)
578 {
579     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580      * another SSC or a regular ANYOF class.  Can create false positives. */
581 
582     SV* anded_cp_list;
583     U8  and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584                           ? ANYOF_FLAGS(and_with)
585                           : 0;
586     U8  anded_flags;
587 
588     PERL_ARGS_ASSERT_SSC_AND;
589 
590     assert(is_ANYOF_SYNTHETIC(ssc));
591 
592     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593      * the code point inversion list and just the relevant flags */
594     if (is_ANYOF_SYNTHETIC(and_with)) {
595         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596         anded_flags = and_with_flags;
597 
598         /* XXX This is a kludge around what appears to be deficiencies in the
599          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
600          * there are paths through the optimizer where it doesn't get weeded
601          * out when it should.  And if we don't make some extra provision for
602          * it like the code just below, it doesn't get added when it should.
603          * This solution is to add it only when AND'ing, which is here, and
604          * only when what is being AND'ed is the pristine, original node
605          * matching anything.  Thus it is like adding it to ssc_anything() but
606          * only when the result is to be AND'ed.  Probably the same solution
607          * could be adopted for the same problem we have with /l matching,
608          * which is solved differently in S_ssc_init(), and that would lead to
609          * fewer false positives than that solution has.  But if this solution
610          * creates bugs, the consequences are only that a warning isn't raised
611          * that should be; while the consequences for having /l bugs is
612          * incorrect matches */
613         if (ssc_is_anything((regnode_ssc *)and_with)) {
614             anded_flags |= ANYOF_WARN_SUPER__shared;
615         }
616     }
617     else {
618         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619         if (OP(and_with) == ANYOFD) {
620             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
621         }
622         else {
623             anded_flags = and_with_flags
624                             & ( ANYOF_COMMON_FLAGS
625                                |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626                                |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627             if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628                 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
629             }
630         }
631     }
632 
633     ANYOF_FLAGS(ssc) &= anded_flags;
634 
635     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636      * C2 is the list of code points in 'and-with'; P2, its posix classes.
637      * 'and_with' may be inverted.  When not inverted, we have the situation of
638      * computing:
639      *  (C1 | P1) & (C2 | P2)
640      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
643      *                    <=  ((C1 & C2) | P1 | P2)
644      * Alternatively, the last few steps could be:
645      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
647      *                    <=  (C1 | C2 | (P1 & P2))
648      * We favor the second approach if either P1 or P2 is non-empty.  This is
649      * because these components are a barrier to doing optimizations, as what
650      * they match cannot be known until the moment of matching as they are
651      * dependent on the current locale, 'AND"ing them likely will reduce or
652      * eliminate them.
653      * But we can do better if we know that C1,P1 are in their initial state (a
654      * frequent occurrence), each matching everything:
655      *  (<everything>) & (C2 | P2) =  C2 | P2
656      * Similarly, if C2,P2 are in their initial state (again a frequent
657      * occurrence), the result is a no-op
658      *  (C1 | P1) & (<everything>) =  C1 | P1
659      *
660      * Inverted, we have
661      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
662      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663      *                         <=  (C1 & ~C2) | (P1 & ~P2)
664      * */
665 
666     if ((and_with_flags & ANYOF_INVERT)
667         && ! is_ANYOF_SYNTHETIC(and_with))
668     {
669         unsigned int i;
670 
671         ssc_intersection(ssc,
672                          anded_cp_list,
673                          FALSE /* Has already been inverted */
674                          );
675 
676         /* If either P1 or P2 is empty, the intersection will be also; can skip
677          * the loop */
678         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679             ANYOF_POSIXL_ZERO(ssc);
680         }
681         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
682 
683             /* Note that the Posix class component P from 'and_with' actually
684              * looks like:
685              *      P = Pa | Pb | ... | Pn
686              * where each component is one posix class, such as in [\w\s].
687              * Thus
688              *      ~P = ~(Pa | Pb | ... | Pn)
689              *         = ~Pa & ~Pb & ... & ~Pn
690              *        <= ~Pa | ~Pb | ... | ~Pn
691              * The last is something we can easily calculate, but unfortunately
692              * is likely to have many false positives.  We could do better
693              * in some (but certainly not all) instances if two classes in
694              * P have known relationships.  For example
695              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
696              * So
697              *      :lower: & :print: = :lower:
698              * And similarly for classes that must be disjoint.  For example,
699              * since \s and \w can have no elements in common based on rules in
700              * the POSIX standard,
701              *      \w & ^\S = nothing
702              * Unfortunately, some vendor locales do not meet the Posix
703              * standard, in particular almost everything by Microsoft.
704              * The loop below just changes e.g., \w into \W and vice versa */
705 
706             regnode_charclass_posixl temp;
707             int add = 1;    /* To calculate the index of the complement */
708 
709             Zero(&temp, 1, regnode_charclass_posixl);
710             ANYOF_POSIXL_ZERO(&temp);
711             for (i = 0; i < ANYOF_MAX; i++) {
712                 assert(i % 2 != 0
713                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
715 
716                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717                     ANYOF_POSIXL_SET(&temp, i + add);
718                 }
719                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
720             }
721             ANYOF_POSIXL_AND(&temp, ssc);
722 
723         } /* else ssc already has no posixes */
724     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
725          in its initial state */
726     else if (! is_ANYOF_SYNTHETIC(and_with)
727              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
728     {
729         /* But if 'ssc' is in its initial state, the result is just 'and_with';
730          * copy it over 'ssc' */
731         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732             if (is_ANYOF_SYNTHETIC(and_with)) {
733                 StructCopy(and_with, ssc, regnode_ssc);
734             }
735             else {
736                 ssc->invlist = anded_cp_list;
737                 ANYOF_POSIXL_ZERO(ssc);
738                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
740                 }
741             }
742         }
743         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
745         {
746             /* One or the other of P1, P2 is non-empty. */
747             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
749             }
750             ssc_union(ssc, anded_cp_list, FALSE);
751         }
752         else { /* P1 = P2 = empty */
753             ssc_intersection(ssc, anded_cp_list, FALSE);
754         }
755     }
756 }
757 
758 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760                const regnode_charclass *or_with)
761 {
762     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763      * another SSC or a regular ANYOF class.  Can create false positives if
764      * 'or_with' is to be inverted. */
765 
766     SV* ored_cp_list;
767     U8 ored_flags;
768     U8  or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769                          ? ANYOF_FLAGS(or_with)
770                          : 0;
771 
772     PERL_ARGS_ASSERT_SSC_OR;
773 
774     assert(is_ANYOF_SYNTHETIC(ssc));
775 
776     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777      * the code point inversion list and just the relevant flags */
778     if (is_ANYOF_SYNTHETIC(or_with)) {
779         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780         ored_flags = or_with_flags;
781     }
782     else {
783         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785         if (OP(or_with) != ANYOFD) {
786             ored_flags |=
787                 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788                                  |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789             if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790                 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
791             }
792         }
793     }
794 
795     ANYOF_FLAGS(ssc) |= ored_flags;
796 
797     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798      * C2 is the list of code points in 'or-with'; P2, its posix classes.
799      * 'or_with' may be inverted.  When not inverted, we have the simple
800      * situation of computing:
801      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
802      * If P1|P2 yields a situation with both a class and its complement are
803      * set, like having both \w and \W, this matches all code points, and we
804      * can delete these from the P component of the ssc going forward.  XXX We
805      * might be able to delete all the P components, but I (khw) am not certain
806      * about this, and it is better to be safe.
807      *
808      * Inverted, we have
809      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
810      *                         <=  (C1 | P1) | ~C2
811      *                         <=  (C1 | ~C2) | P1
812      * (which results in actually simpler code than the non-inverted case)
813      * */
814 
815     if ((or_with_flags & ANYOF_INVERT)
816         && ! is_ANYOF_SYNTHETIC(or_with))
817     {
818         /* We ignore P2, leaving P1 going forward */
819     }   /* else  Not inverted */
820     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
823             unsigned int i;
824             for (i = 0; i < ANYOF_MAX; i += 2) {
825                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
826                 {
827                     ssc_match_all_cp(ssc);
828                     ANYOF_POSIXL_CLEAR(ssc, i);
829                     ANYOF_POSIXL_CLEAR(ssc, i+1);
830                 }
831             }
832         }
833     }
834 
835     ssc_union(ssc,
836               ored_cp_list,
837               FALSE /* Already has been inverted */
838               );
839 }
840 
841 STATIC void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
843 {
844     PERL_ARGS_ASSERT_SSC_UNION;
845 
846     assert(is_ANYOF_SYNTHETIC(ssc));
847 
848     _invlist_union_maybe_complement_2nd(ssc->invlist,
849                                         invlist,
850                                         invert2nd,
851                                         &ssc->invlist);
852 }
853 
854 STATIC void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
856                          SV* const invlist,
857                          const bool invert2nd)
858 {
859     PERL_ARGS_ASSERT_SSC_INTERSECTION;
860 
861     assert(is_ANYOF_SYNTHETIC(ssc));
862 
863     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
864                                                invlist,
865                                                invert2nd,
866                                                &ssc->invlist);
867 }
868 
869 STATIC void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
871 {
872     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
873 
874     assert(is_ANYOF_SYNTHETIC(ssc));
875 
876     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
877 }
878 
879 STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
881 {
882     /* AND just the single code point 'cp' into the SSC 'ssc' */
883 
884     SV* cp_list = _new_invlist(2);
885 
886     PERL_ARGS_ASSERT_SSC_CP_AND;
887 
888     assert(is_ANYOF_SYNTHETIC(ssc));
889 
890     cp_list = add_cp_to_invlist(cp_list, cp);
891     ssc_intersection(ssc, cp_list,
892                      FALSE /* Not inverted */
893                      );
894     SvREFCNT_dec_NN(cp_list);
895 }
896 
897 STATIC void
S_ssc_clear_locale(regnode_ssc * ssc)898 S_ssc_clear_locale(regnode_ssc *ssc)
899 {
900     /* Set the SSC 'ssc' to not match any locale things */
901     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
902 
903     assert(is_ANYOF_SYNTHETIC(ssc));
904 
905     ANYOF_POSIXL_ZERO(ssc);
906     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
907 }
908 
909 
910 
911 /* The below joins as many adjacent EXACTish nodes as possible into a single
912  * one.  The regop may be changed if the node(s) contain certain sequences that
913  * require special handling.  The joining is only done if:
914  * 1) there is room in the current conglomerated node to entirely contain the
915  *    next one.
916  * 2) they are compatible node types
917  *
918  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
919  * these get optimized out
920  *
921  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
922  * as possible, even if that means splitting an existing node so that its first
923  * part is moved to the preceding node.  This would maximise the efficiency of
924  * memEQ during matching.
925  *
926  * If a node is to match under /i (folded), the number of characters it matches
927  * can be different than its character length if it contains a multi-character
928  * fold.  *min_subtract is set to the total delta number of characters of the
929  * input nodes.
930  *
931  * And *unfolded_multi_char is set to indicate whether or not the node contains
932  * an unfolded multi-char fold.  This happens when it won't be known until
933  * runtime whether the fold is valid or not; namely
934  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
935  *      target string being matched against turns out to be UTF-8 is that fold
936  *      valid; or
937  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
938  *      runtime.
939  * (Multi-char folds whose components are all above the Latin1 range are not
940  * run-time locale dependent, and have already been folded by the time this
941  * function is called.)
942  *
943  * This is as good a place as any to discuss the design of handling these
944  * multi-character fold sequences.  It's been wrong in Perl for a very long
945  * time.  There are three code points in Unicode whose multi-character folds
946  * were long ago discovered to mess things up.  The previous designs for
947  * dealing with these involved assigning a special node for them.  This
948  * approach doesn't always work, as evidenced by this example:
949  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
950  * Both sides fold to "sss", but if the pattern is parsed to create a node that
951  * would match just the \xDF, it won't be able to handle the case where a
952  * successful match would have to cross the node's boundary.  The new approach
953  * that hopefully generally solves the problem generates an EXACTFUP node
954  * that is "sss" in this case.
955  *
956  * It turns out that there are problems with all multi-character folds, and not
957  * just these three.  Now the code is general, for all such cases.  The
958  * approach taken is:
959  * 1)   This routine examines each EXACTFish node that could contain multi-
960  *      character folded sequences.  Since a single character can fold into
961  *      such a sequence, the minimum match length for this node is less than
962  *      the number of characters in the node.  This routine returns in
963  *      *min_subtract how many characters to subtract from the actual
964  *      length of the string to get a real minimum match length; it is 0 if
965  *      there are no multi-char foldeds.  This delta is used by the caller to
966  *      adjust the min length of the match, and the delta between min and max,
967  *      so that the optimizer doesn't reject these possibilities based on size
968  *      constraints.
969  *
970  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
971  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
972  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
973  *      EXACTFU nodes.  The node type of such nodes is then changed to
974  *      EXACTFUP, indicating it is problematic, and needs careful handling.
975  *      (The procedures in step 1) above are sufficient to handle this case in
976  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
977  *      the only case where there is a possible fold length change in non-UTF-8
978  *      patterns.  By reserving a special node type for problematic cases, the
979  *      far more common regular EXACTFU nodes can be processed faster.
980  *      regexec.c takes advantage of this.
981  *
982  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
983  *      problematic cases.   These all only occur when the pattern is not
984  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
985  *      length change, it handles the situation where the string cannot be
986  *      entirely folded.  The strings in an EXACTFish node are folded as much
987  *      as possible during compilation in regcomp.c.  This saves effort in
988  *      regex matching.  By using an EXACTFUP node when it is not possible to
989  *      fully fold at compile time, regexec.c can know that everything in an
990  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
991  *      case where folding in EXACTFU nodes can't be done at compile time is
992  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
993  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
994  *      handle two very different cases.  Alternatively, there could have been
995  *      a node type where there are length changes, one for unfolded, and one
996  *      for both.  If yet another special case needed to be created, the number
997  *      of required node types would have to go to 7.  khw figures that even
998  *      though there are plenty of node types to spare, that the maintenance
999  *      cost wasn't worth the small speedup of doing it that way, especially
1000  *      since he thinks the MICRO SIGN is rarely encountered in practice.
1001  *
1002  *      There are other cases where folding isn't done at compile time, but
1003  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
1004  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
1005  *      changes.  Some folds in EXACTF depend on if the runtime target string
1006  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
1007  *      when no fold in it depends on the UTF-8ness of the target string.)
1008  *
1009  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
1010  *      validity of the fold won't be known until runtime, and so must remain
1011  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
1012  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
1013  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
1014  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1015  *      The reason this is a problem is that the optimizer part of regexec.c
1016  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1017  *      that a character in the pattern corresponds to at most a single
1018  *      character in the target string.  (And I do mean character, and not byte
1019  *      here, unlike other parts of the documentation that have never been
1020  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
1021  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1022  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
1023  *      EXACTFL nodes, violate the assumption, and they are the only instances
1024  *      where it is violated.  I'm reluctant to try to change the assumption,
1025  *      as the code involved is impenetrable to me (khw), so instead the code
1026  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
1027  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1028  *      boolean indicating whether or not the node contains such a fold.  When
1029  *      it is true, the caller sets a flag that later causes the optimizer in
1030  *      this file to not set values for the floating and fixed string lengths,
1031  *      and thus avoids the optimizer code in regexec.c that makes the invalid
1032  *      assumption.  Thus, there is no optimization based on string lengths for
1033  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1034  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
1035  *      assumption is wrong only in these cases is that all other non-UTF-8
1036  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1037  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
1038  *      EXACTF nodes because we don't know at compile time if it actually
1039  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
1040  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
1041  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
1042  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1043  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1044  *      string would require the pattern to be forced into UTF-8, the overhead
1045  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
1046  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1047  *      locale.)
1048  *
1049  *      Similarly, the code that generates tries doesn't currently handle
1050  *      not-already-folded multi-char folds, and it looks like a pain to change
1051  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
1052  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
1053  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
1054  *      using /iaa matching will be doing so almost entirely with ASCII
1055  *      strings, so this should rarely be encountered in practice */
1056 
1057 U32
Perl_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)1058 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1059                    UV *min_subtract, bool *unfolded_multi_char,
1060                    U32 flags, regnode *val, U32 depth)
1061 {
1062     /* Merge several consecutive EXACTish nodes into one. */
1063 
1064     regnode *n = regnext(scan);
1065     U32 stringok = 1;
1066     regnode *next = REGNODE_AFTER_varies(scan);
1067     U32 stopnow = 0;
1068 #ifdef DEBUGGING
1069     U32 merged = 0;
1070     regnode *stop = scan;
1071     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1072 #else
1073     PERL_UNUSED_ARG(depth);
1074 #endif
1075 
1076     PERL_ARGS_ASSERT_JOIN_EXACT;
1077 #ifndef EXPERIMENTAL_INPLACESCAN
1078     PERL_UNUSED_ARG(flags);
1079     PERL_UNUSED_ARG(val);
1080 #endif
1081     DEBUG_PEEP("join", scan, depth, 0);
1082 
1083     assert(REGNODE_TYPE(OP(scan)) == EXACT);
1084 
1085     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
1086      * EXACT ones that are mergeable to the current one. */
1087     while (    n
1088            && (    REGNODE_TYPE(OP(n)) == NOTHING
1089                || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1090            && NEXT_OFF(n)
1091            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1092     {
1093 
1094         if (OP(n) == TAIL || n > next)
1095             stringok = 0;
1096         if (REGNODE_TYPE(OP(n)) == NOTHING) {
1097             DEBUG_PEEP("skip:", n, depth, 0);
1098             NEXT_OFF(scan) += NEXT_OFF(n);
1099             next = n + NODE_STEP_REGNODE;
1100 #ifdef DEBUGGING
1101             if (stringok)
1102                 stop = n;
1103 #endif
1104             n = regnext(n);
1105         }
1106         else if (stringok) {
1107             const unsigned int oldl = STR_LEN(scan);
1108             regnode * const nnext = regnext(n);
1109 
1110             /* XXX I (khw) kind of doubt that this works on platforms (should
1111              * Perl ever run on one) where U8_MAX is above 255 because of lots
1112              * of other assumptions */
1113             /* Don't join if the sum can't fit into a single node */
1114             if (oldl + STR_LEN(n) > U8_MAX)
1115                 break;
1116 
1117             /* Joining something that requires UTF-8 with something that
1118              * doesn't, means the result requires UTF-8. */
1119             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1120                 OP(scan) = EXACT_REQ8;
1121             }
1122             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1123                 ;   /* join is compatible, no need to change OP */
1124             }
1125             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1126                 OP(scan) = EXACTFU_REQ8;
1127             }
1128             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1129                 ;   /* join is compatible, no need to change OP */
1130             }
1131             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1132                 ;   /* join is compatible, no need to change OP */
1133             }
1134             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1135 
1136                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1137                   * which can join with EXACTFU ones.  We check for this case
1138                   * here.  These need to be resolved to either EXACTFU or
1139                   * EXACTF at joining time.  They have nothing in them that
1140                   * would forbid them from being the more desirable EXACTFU
1141                   * nodes except that they begin and/or end with a single [Ss].
1142                   * The reason this is problematic is because they could be
1143                   * joined in this loop with an adjacent node that ends and/or
1144                   * begins with [Ss] which would then form the sequence 'ss',
1145                   * which matches differently under /di than /ui, in which case
1146                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
1147                   * formed, the nodes get absorbed into any adjacent EXACTFU
1148                   * node.  And if the only adjacent node is EXACTF, they get
1149                   * absorbed into that, under the theory that a longer node is
1150                   * better than two shorter ones, even if one is EXACTFU.  Note
1151                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1152                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
1153 
1154                 if (STRING(n)[STR_LEN(n)-1] == 's') {
1155 
1156                     /* Here the joined node would end with 's'.  If the node
1157                      * following the combination is an EXACTF one, it's better to
1158                      * join this trailing edge 's' node with that one, leaving the
1159                      * current one in 'scan' be the more desirable EXACTFU */
1160                     if (OP(nnext) == EXACTF) {
1161                         break;
1162                     }
1163 
1164                     OP(scan) = EXACTFU_S_EDGE;
1165 
1166                 }   /* Otherwise, the beginning 's' of the 2nd node just
1167                        becomes an interior 's' in 'scan' */
1168             }
1169             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1170                 ;   /* join is compatible, no need to change OP */
1171             }
1172             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1173 
1174                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1175                  * nodes.  But the latter nodes can be also joined with EXACTFU
1176                  * ones, and that is a better outcome, so if the node following
1177                  * 'n' is EXACTFU, quit now so that those two can be joined
1178                  * later */
1179                 if (OP(nnext) == EXACTFU) {
1180                     break;
1181                 }
1182 
1183                 /* The join is compatible, and the combined node will be
1184                  * EXACTF.  (These don't care if they begin or end with 's' */
1185             }
1186             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1187                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
1188                     && STRING(n)[0] == 's')
1189                 {
1190                     /* When combined, we have the sequence 'ss', which means we
1191                      * have to remain /di */
1192                     OP(scan) = EXACTF;
1193                 }
1194             }
1195             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1196                 if (STRING(n)[0] == 's') {
1197                     ;   /* Here the join is compatible and the combined node
1198                            starts with 's', no need to change OP */
1199                 }
1200                 else {  /* Now the trailing 's' is in the interior */
1201                     OP(scan) = EXACTFU;
1202                 }
1203             }
1204             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1205 
1206                 /* The join is compatible, and the combined node will be
1207                  * EXACTF.  (These don't care if they begin or end with 's' */
1208                 OP(scan) = EXACTF;
1209             }
1210             else if (OP(scan) != OP(n)) {
1211 
1212                 /* The only other compatible joinings are the same node type */
1213                 break;
1214             }
1215 
1216             DEBUG_PEEP("merg", n, depth, 0);
1217 #ifdef DEBUGGING
1218             merged++;
1219 #endif
1220 
1221             next = REGNODE_AFTER_varies(n);
1222             NEXT_OFF(scan) += NEXT_OFF(n);
1223             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1224             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1225             /* Now we can overwrite *n : */
1226             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1227 #ifdef DEBUGGING
1228             stop = next - 1;
1229 #endif
1230             n = nnext;
1231             if (stopnow) break;
1232         }
1233 
1234 #ifdef EXPERIMENTAL_INPLACESCAN
1235         if (flags && !NEXT_OFF(n)) {
1236             DEBUG_PEEP("atch", val, depth, 0);
1237             if (REGNODE_OFF_BY_ARG(OP(n))) {
1238                 ARG1u_SET(n, val - n);
1239             }
1240             else {
1241                 NEXT_OFF(n) = val - n;
1242             }
1243             stopnow = 1;
1244         }
1245 #endif
1246     }
1247 
1248     /* This temporary node can now be turned into EXACTFU, and must, as
1249      * regexec.c doesn't handle it */
1250     if (OP(scan) == EXACTFU_S_EDGE) {
1251         OP(scan) = EXACTFU;
1252     }
1253 
1254     *min_subtract = 0;
1255     *unfolded_multi_char = FALSE;
1256 
1257     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
1258      * can now analyze for sequences of problematic code points.  (Prior to
1259      * this final joining, sequences could have been split over boundaries, and
1260      * hence missed).  The sequences only happen in folding, hence for any
1261      * non-EXACT EXACTish node */
1262     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1263         U8* s0 = (U8*) STRING(scan);
1264         U8* s = s0;
1265         U8* s_end = s0 + STR_LEN(scan);
1266 
1267         int total_count_delta = 0;  /* Total delta number of characters that
1268                                        multi-char folds expand to */
1269 
1270         /* One pass is made over the node's string looking for all the
1271          * possibilities.  To avoid some tests in the loop, there are two main
1272          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1273          * non-UTF-8 */
1274         if (UTF) {
1275             U8* folded = NULL;
1276 
1277             if (OP(scan) == EXACTFL) {
1278                 U8 *d;
1279 
1280                 /* An EXACTFL node would already have been changed to another
1281                  * node type unless there is at least one character in it that
1282                  * is problematic; likely a character whose fold definition
1283                  * won't be known until runtime, and so has yet to be folded.
1284                  * For all but the UTF-8 locale, folds are 1-1 in length, but
1285                  * to handle the UTF-8 case, we need to create a temporary
1286                  * folded copy using UTF-8 locale rules in order to analyze it.
1287                  * This is because our macros that look to see if a sequence is
1288                  * a multi-char fold assume everything is folded (otherwise the
1289                  * tests in those macros would be too complicated and slow).
1290                  * Note that here, the non-problematic folds will have already
1291                  * been done, so we can just copy such characters.  We actually
1292                  * don't completely fold the EXACTFL string.  We skip the
1293                  * unfolded multi-char folds, as that would just create work
1294                  * below to figure out the size they already are */
1295 
1296                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1297                 d = folded;
1298                 while (s < s_end) {
1299                     STRLEN s_len = UTF8SKIP(s);
1300                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1301                         Copy(s, d, s_len, U8);
1302                         d += s_len;
1303                     }
1304                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
1305                         *unfolded_multi_char = TRUE;
1306                         Copy(s, d, s_len, U8);
1307                         d += s_len;
1308                     }
1309                     else if (isASCII(*s)) {
1310                         *(d++) = toFOLD(*s);
1311                     }
1312                     else {
1313                         STRLEN len;
1314                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1315                         d += len;
1316                     }
1317                     s += s_len;
1318                 }
1319 
1320                 /* Point the remainder of the routine to look at our temporary
1321                  * folded copy */
1322                 s = folded;
1323                 s_end = d;
1324             } /* End of creating folded copy of EXACTFL string */
1325 
1326             /* Examine the string for a multi-character fold sequence.  UTF-8
1327              * patterns have all characters pre-folded by the time this code is
1328              * executed */
1329             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1330                                      length sequence we are looking for is 2 */
1331             {
1332                 int count = 0;  /* How many characters in a multi-char fold */
1333                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1334                 if (! len) {    /* Not a multi-char fold: get next char */
1335                     s += UTF8SKIP(s);
1336                     continue;
1337                 }
1338 
1339                 { /* Here is a generic multi-char fold. */
1340                     U8* multi_end  = s + len;
1341 
1342                     /* Count how many characters are in it.  In the case of
1343                      * /aa, no folds which contain ASCII code points are
1344                      * allowed, so check for those, and skip if found. */
1345                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1346                         count = utf8_length(s, multi_end);
1347                         s = multi_end;
1348                     }
1349                     else {
1350                         while (s < multi_end) {
1351                             if (isASCII(*s)) {
1352                                 s++;
1353                                 goto next_iteration;
1354                             }
1355                             else {
1356                                 s += UTF8SKIP(s);
1357                             }
1358                             count++;
1359                         }
1360                     }
1361                 }
1362 
1363                 /* The delta is how long the sequence is minus 1 (1 is how long
1364                  * the character that folds to the sequence is) */
1365                 total_count_delta += count - 1;
1366               next_iteration: ;
1367             }
1368 
1369             /* We created a temporary folded copy of the string in EXACTFL
1370              * nodes.  Therefore we need to be sure it doesn't go below zero,
1371              * as the real string could be shorter */
1372             if (OP(scan) == EXACTFL) {
1373                 int total_chars = utf8_length((U8*) STRING(scan),
1374                                            (U8*) STRING(scan) + STR_LEN(scan));
1375                 if (total_count_delta > total_chars) {
1376                     total_count_delta = total_chars;
1377                 }
1378             }
1379 
1380             *min_subtract += total_count_delta;
1381             Safefree(folded);
1382         }
1383         else if (OP(scan) == EXACTFAA) {
1384 
1385             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
1386              * fold to the ASCII range (and there are no existing ones in the
1387              * upper latin1 range).  But, as outlined in the comments preceding
1388              * this function, we need to flag any occurrences of the sharp s.
1389              * This character forbids trie formation (because of added
1390              * complexity) */
1391 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
1392    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
1393                                       || UNICODE_DOT_DOT_VERSION > 0)
1394             while (s < s_end) {
1395                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1396                     OP(scan) = EXACTFAA_NO_TRIE;
1397                     *unfolded_multi_char = TRUE;
1398                     break;
1399                 }
1400                 s++;
1401             }
1402         }
1403         else if (OP(scan) != EXACTFAA_NO_TRIE) {
1404 
1405             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
1406              * folds that are all Latin1.  As explained in the comments
1407              * preceding this function, we look also for the sharp s in EXACTF
1408              * and EXACTFL nodes; it can be in the final position.  Otherwise
1409              * we can stop looking 1 byte earlier because have to find at least
1410              * two characters for a multi-fold */
1411             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1412                               ? s_end
1413                               : s_end -1;
1414 
1415             while (s < upper) {
1416                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1417                 if (! len) {    /* Not a multi-char fold. */
1418                     if (*s == LATIN_SMALL_LETTER_SHARP_S
1419                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1420                     {
1421                         *unfolded_multi_char = TRUE;
1422                     }
1423                     s++;
1424                     continue;
1425                 }
1426 
1427                 if (len == 2
1428                     && isALPHA_FOLD_EQ(*s, 's')
1429                     && isALPHA_FOLD_EQ(*(s+1), 's'))
1430                 {
1431 
1432                     /* EXACTF nodes need to know that the minimum length
1433                      * changed so that a sharp s in the string can match this
1434                      * ss in the pattern, but they remain EXACTF nodes, as they
1435                      * won't match this unless the target string is in UTF-8,
1436                      * which we don't know until runtime.  EXACTFL nodes can't
1437                      * transform into EXACTFU nodes */
1438                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1439                         OP(scan) = EXACTFUP;
1440                     }
1441                 }
1442 
1443                 *min_subtract += len - 1;
1444                 s += len;
1445             }
1446 #endif
1447         }
1448     }
1449 
1450 #ifdef DEBUGGING
1451     /* Allow dumping but overwriting the collection of skipped
1452      * ops and/or strings with fake optimized ops */
1453     n = REGNODE_AFTER_varies(scan);
1454     while (n <= stop) {
1455         OP(n) = OPTIMIZED;
1456         FLAGS(n) = 0;
1457         NEXT_OFF(n) = 0;
1458         n++;
1459     }
1460 #endif
1461     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1462     return stopnow;
1463 }
1464 
1465 /* REx optimizer.  Converts nodes into quicker variants "in place".
1466    Finds fixed substrings.  */
1467 
1468 
1469 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1470    to the position after last scanned or to NULL. */
1471 
1472 /* the return from this sub is the minimum length that could possibly match */
1473 SSize_t
Perl_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)1474 Perl_study_chunk(pTHX_
1475     RExC_state_t *pRExC_state,
1476     regnode **scanp,        /* Start here (read-write). */
1477     SSize_t *minlenp,       /* used for the minlen of substrings? */
1478     SSize_t *deltap,        /* Write maxlen-minlen here. */
1479     regnode *last,          /* Stop before this one. */
1480     scan_data_t *data,      /* string data about the pattern */
1481     I32 stopparen,          /* treat CLOSE-N as END, see GOSUB */
1482     U32 recursed_depth,     /* how deep have we recursed via GOSUB */
1483     regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1484     U32 flags,              /* flags controlling this call, see SCF_ flags */
1485     U32 depth,              /* how deep have we recursed period */
1486     bool was_mutate_ok      /* TRUE if in-place optimizations are allowed.
1487                                FALSE only if the caller (recursively) was
1488                                prohibited from modifying the regops, because
1489                                a higher caller is holding a ptr to them. */
1490 )
1491 {
1492     /* vars about the regnodes we are working with */
1493     regnode *scan = *scanp; /* the current opcode we are inspecting */
1494     regnode *next = NULL;   /* the next opcode beyond scan, tmp var */
1495     regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1496                                        the first non open regop, if the init
1497                                        val IS an OPEN then we will skip past
1498                                        it just after the var decls section */
1499     I32 code = 0;           /* temp var used to hold the optype of a regop */
1500 
1501     /* vars about the min and max length of the pattern */
1502     SSize_t min = 0;    /* min length of this part of the pattern */
1503     SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1504                                          this is adjusted down if we find
1505                                          an ACCEPT */
1506     SSize_t delta = 0;  /* difference between min and max length
1507                            (not accounting for stopmin) */
1508 
1509     /* vars about capture buffers in the pattern */
1510     I32 pars = 0;       /* count of OPEN opcodes */
1511     I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1512 
1513     /* vars about whether this pattern contains something that can match
1514      * infinitely long strings, eg, X* or X+ */
1515     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1516     int is_inf_internal = 0;            /* The studied chunk is infinite */
1517 
1518     /* scan_data_t (struct) is used to hold information about the substrings
1519      * and start class we have extracted from the string */
1520     scan_data_t data_fake; /* temp var used for recursing in some cases */
1521 
1522     SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1523                                    trie optimizations */
1524 
1525     scan_frame *frame = NULL;  /* used as part of fake recursion */
1526 
1527     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1528 
1529     PERL_ARGS_ASSERT_STUDY_CHUNK;
1530     RExC_study_started= 1;
1531 
1532     Zero(&data_fake, 1, scan_data_t);
1533 
1534     if ( depth == 0 ) {
1535         while (first_non_open && OP(first_non_open) == OPEN)
1536             first_non_open=regnext(first_non_open);
1537     }
1538 
1539   fake_study_recurse:
1540     DEBUG_r(
1541         RExC_study_chunk_recursed_count++;
1542     );
1543     DEBUG_OPTIMISE_MORE_r(
1544     {
1545         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
1546             depth, (long)stopparen,
1547             (unsigned long)RExC_study_chunk_recursed_count,
1548             (unsigned long)depth, (unsigned long)recursed_depth,
1549             scan,
1550             last);
1551         if (recursed_depth) {
1552             U32 i;
1553             U32 j;
1554             for ( j = 0 ; j < recursed_depth ; j++ ) {
1555                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1556                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1557                         Perl_re_printf( aTHX_ " %d",(int)i);
1558                         break;
1559                     }
1560                 }
1561                 if ( j + 1 < recursed_depth ) {
1562                     Perl_re_printf( aTHX_  ",");
1563                 }
1564             }
1565         }
1566         Perl_re_printf( aTHX_ "\n");
1567     }
1568     );
1569     while ( scan && OP(scan) != END && scan < last ){
1570         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
1571                                    node length to get a real minimum (because
1572                                    the folded version may be shorter) */
1573         bool unfolded_multi_char = FALSE;
1574         /* avoid mutating ops if we are anywhere within the recursed or
1575          * enframed handling for a GOSUB: the outermost level will handle it.
1576          */
1577         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1578         /* Peephole optimizer: */
1579         DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1580         DEBUG_PEEP("Peep", scan, depth, flags);
1581 
1582 
1583         /* The reason we do this here is that we need to deal with things like
1584          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1585          * parsing code, as each (?:..) is handled by a different invocation of
1586          * reg() -- Yves
1587          */
1588         if (REGNODE_TYPE(OP(scan)) == EXACT
1589             && OP(scan) != LEXACT
1590             && OP(scan) != LEXACT_REQ8
1591             && mutate_ok
1592         ) {
1593             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1594                     0, NULL, depth + 1);
1595         }
1596 
1597         /* Follow the next-chain of the current node and optimize
1598            away all the NOTHINGs from it.
1599          */
1600         rck_elide_nothing(scan);
1601 
1602         /* The principal pseudo-switch.  Cannot be a switch, since we look into
1603          * several different things.  */
1604         if ( OP(scan) == DEFINEP ) {
1605             SSize_t minlen = 0;
1606             SSize_t deltanext = 0;
1607             SSize_t fake_last_close = 0;
1608             regnode *fake_last_close_op = NULL;
1609             U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1610 
1611             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1612             scan = regnext(scan);
1613             assert( OP(scan) == IFTHEN );
1614             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1615 
1616             data_fake.last_closep= &fake_last_close;
1617             data_fake.last_close_opp= &fake_last_close_op;
1618             minlen = *minlenp;
1619             next = regnext(scan);
1620             scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1621             DEBUG_PEEP("scan", scan, depth, flags);
1622             DEBUG_PEEP("next", next, depth, flags);
1623 
1624             /* we suppose the run is continuous, last=next...
1625              * NOTE we dont use the return here! */
1626             /* DEFINEP study_chunk() recursion */
1627             (void)study_chunk(pRExC_state, &scan, &minlen,
1628                               &deltanext, next, &data_fake, stopparen,
1629                               recursed_depth, NULL, f, depth+1, mutate_ok);
1630 
1631             scan = next;
1632         } else
1633         if (
1634             OP(scan) == BRANCH  ||
1635             OP(scan) == BRANCHJ ||
1636             OP(scan) == IFTHEN
1637         ) {
1638             next = regnext(scan);
1639             code = OP(scan);
1640 
1641             /* The op(next)==code check below is to see if we
1642              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1643              * IFTHEN is special as it might not appear in pairs.
1644              * Not sure whether BRANCH-BRANCHJ is possible, regardless
1645              * we dont handle it cleanly. */
1646             if (OP(next) == code || code == IFTHEN) {
1647                 /* NOTE - There is similar code to this block below for
1648                  * handling TRIE nodes on a re-study.  If you change stuff here
1649                  * check there too. */
1650                 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1651                 regnode_ssc accum;
1652                 regnode * const startbranch=scan;
1653 
1654                 if (flags & SCF_DO_SUBSTR) {
1655                     /* Cannot merge strings after this. */
1656                     scan_commit(pRExC_state, data, minlenp, is_inf);
1657                 }
1658 
1659                 if (flags & SCF_DO_STCLASS)
1660                     ssc_init_zero(pRExC_state, &accum);
1661 
1662                 while (OP(scan) == code) {
1663                     SSize_t deltanext, minnext, fake_last_close = 0;
1664                     regnode *fake_last_close_op = NULL;
1665                     U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1666                     regnode_ssc this_class;
1667 
1668                     DEBUG_PEEP("Branch", scan, depth, flags);
1669 
1670                     num++;
1671                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1672                     if (data) {
1673                         data_fake.whilem_c = data->whilem_c;
1674                         data_fake.last_closep = data->last_closep;
1675                         data_fake.last_close_opp = data->last_close_opp;
1676                     }
1677                     else {
1678                         data_fake.last_closep = &fake_last_close;
1679                         data_fake.last_close_opp = &fake_last_close_op;
1680                     }
1681 
1682                     data_fake.pos_delta = delta;
1683                     next = regnext(scan);
1684 
1685                     scan = REGNODE_AFTER_opcode(scan, code);
1686 
1687                     if (flags & SCF_DO_STCLASS) {
1688                         ssc_init(pRExC_state, &this_class);
1689                         data_fake.start_class = &this_class;
1690                         f |= SCF_DO_STCLASS_AND;
1691                     }
1692                     if (flags & SCF_WHILEM_VISITED_POS)
1693                         f |= SCF_WHILEM_VISITED_POS;
1694 
1695                     /* we suppose the run is continuous, last=next...*/
1696                     /* recurse study_chunk() for each BRANCH in an alternation */
1697                     minnext = study_chunk(pRExC_state, &scan, minlenp,
1698                                       &deltanext, next, &data_fake, stopparen,
1699                                       recursed_depth, NULL, f, depth+1,
1700                                       mutate_ok);
1701 
1702                     if (min1 > minnext)
1703                         min1 = minnext;
1704                     if (deltanext == OPTIMIZE_INFTY) {
1705                         is_inf = is_inf_internal = 1;
1706                         max1 = OPTIMIZE_INFTY;
1707                     } else if (max1 < minnext + deltanext)
1708                         max1 = minnext + deltanext;
1709                     scan = next;
1710                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1711                         pars++;
1712                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
1713                         if ( stopmin > minnext)
1714                             stopmin = min + min1;
1715                         flags &= ~SCF_DO_SUBSTR;
1716                         if (data)
1717                             data->flags |= SCF_SEEN_ACCEPT;
1718                     }
1719                     if (data) {
1720                         if (data_fake.flags & SF_HAS_EVAL)
1721                             data->flags |= SF_HAS_EVAL;
1722                         data->whilem_c = data_fake.whilem_c;
1723                     }
1724                     if (flags & SCF_DO_STCLASS)
1725                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1726                     DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1727                 }
1728                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1729                     min1 = 0;
1730                 if (flags & SCF_DO_SUBSTR) {
1731                     data->pos_min += min1;
1732                     if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1733                         data->pos_delta = OPTIMIZE_INFTY;
1734                     else
1735                         data->pos_delta += max1 - min1;
1736                     if (max1 != min1 || is_inf)
1737                         data->cur_is_floating = 1;
1738                 }
1739                 min += min1;
1740                 if (delta == OPTIMIZE_INFTY
1741                  || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1742                     delta = OPTIMIZE_INFTY;
1743                 else
1744                     delta += max1 - min1;
1745                 if (flags & SCF_DO_STCLASS_OR) {
1746                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1747                     if (min1) {
1748                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1749                         flags &= ~SCF_DO_STCLASS;
1750                     }
1751                 }
1752                 else if (flags & SCF_DO_STCLASS_AND) {
1753                     if (min1) {
1754                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1755                         flags &= ~SCF_DO_STCLASS;
1756                     }
1757                     else {
1758                         /* Switch to OR mode: cache the old value of
1759                          * data->start_class */
1760                         INIT_AND_WITHP;
1761                         StructCopy(data->start_class, and_withp, regnode_ssc);
1762                         flags &= ~SCF_DO_STCLASS_AND;
1763                         StructCopy(&accum, data->start_class, regnode_ssc);
1764                         flags |= SCF_DO_STCLASS_OR;
1765                     }
1766                 }
1767                 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1768 
1769                 if (PERL_ENABLE_TRIE_OPTIMISATION
1770                     && OP(startbranch) == BRANCH
1771                     && mutate_ok
1772                 ) {
1773                 /* demq.
1774 
1775                    Assuming this was/is a branch we are dealing with: 'scan'
1776                    now points at the item that follows the branch sequence,
1777                    whatever it is. We now start at the beginning of the
1778                    sequence and look for subsequences of
1779 
1780                    BRANCH->EXACT=>x1
1781                    BRANCH->EXACT=>x2
1782                    tail
1783 
1784                    which would be constructed from a pattern like
1785                    /A|LIST|OF|WORDS/
1786 
1787                    If we can find such a subsequence we need to turn the first
1788                    element into a trie and then add the subsequent branch exact
1789                    strings to the trie.
1790 
1791                    We have two cases
1792 
1793                      1. patterns where the whole set of branches can be
1794                         converted.
1795 
1796                      2. patterns where only a subset can be converted.
1797 
1798                    In case 1 we can replace the whole set with a single regop
1799                    for the trie. In case 2 we need to keep the start and end
1800                    branches so
1801 
1802                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1803                      becomes BRANCH TRIE; BRANCH X;
1804 
1805                   There is an additional case, that being where there is a
1806                   common prefix, which gets split out into an EXACT like node
1807                   preceding the TRIE node.
1808 
1809                   If X(1..n)==tail then we can do a simple trie, if not we make
1810                   a "jump" trie, such that when we match the appropriate word
1811                   we "jump" to the appropriate tail node. Essentially we turn
1812                   a nested if into a case structure of sorts.
1813 
1814                 */
1815 
1816                     int made=0;
1817                     if (!re_trie_maxbuff) {
1818                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1819                         if (!SvIOK(re_trie_maxbuff))
1820                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1821                     }
1822                     if ( SvIV(re_trie_maxbuff)>=0  ) {
1823                         regnode *cur;
1824                         regnode *first = (regnode *)NULL;
1825                         regnode *prev = (regnode *)NULL;
1826                         regnode *tail = scan;
1827                         U8 trietype = 0;
1828                         U32 count=0;
1829 
1830                         /* var tail is used because there may be a TAIL
1831                            regop in the way. Ie, the exacts will point to the
1832                            thing following the TAIL, but the last branch will
1833                            point at the TAIL. So we advance tail. If we
1834                            have nested (?:) we may have to move through several
1835                            tails.
1836                          */
1837 
1838                         while ( OP( tail ) == TAIL ) {
1839                             /* this is the TAIL generated by (?:) */
1840                             tail = regnext( tail );
1841                         }
1842 
1843 
1844                         DEBUG_TRIE_COMPILE_r({
1845                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1846                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
1847                               depth+1,
1848                               "Looking for TRIE'able sequences. Tail node is ",
1849                               (UV) REGNODE_OFFSET(tail),
1850                               SvPV_nolen_const( RExC_mysv )
1851                             );
1852                         });
1853 
1854                         /*
1855 
1856                             Step through the branches
1857                                 cur represents each branch,
1858                                 noper is the first thing to be matched as part
1859                                       of that branch
1860                                 noper_next is the regnext() of that node.
1861 
1862                             We normally handle a case like this
1863                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1864                             support building with NOJUMPTRIE, which restricts
1865                             the trie logic to structures like /FOO|BAR/.
1866 
1867                             If noper is a trieable nodetype then the branch is
1868                             a possible optimization target. If we are building
1869                             under NOJUMPTRIE then we require that noper_next is
1870                             the same as scan (our current position in the regex
1871                             program).
1872 
1873                             Once we have two or more consecutive such branches
1874                             we can create a trie of the EXACT's contents and
1875                             stitch it in place into the program.
1876 
1877                             If the sequence represents all of the branches in
1878                             the alternation we replace the entire thing with a
1879                             single TRIE node.
1880 
1881                             Otherwise when it is a subsequence we need to
1882                             stitch it in place and replace only the relevant
1883                             branches. This means the first branch has to remain
1884                             as it is used by the alternation logic, and its
1885                             next pointer, and needs to be repointed at the item
1886                             on the branch chain following the last branch we
1887                             have optimized away.
1888 
1889                             This could be either a BRANCH, in which case the
1890                             subsequence is internal, or it could be the item
1891                             following the branch sequence in which case the
1892                             subsequence is at the end (which does not
1893                             necessarily mean the first node is the start of the
1894                             alternation).
1895 
1896                             TRIE_TYPE(X) is a define which maps the optype to a
1897                             trietype.
1898 
1899                                 optype          |  trietype
1900                                 ----------------+-----------
1901                                 NOTHING         | NOTHING
1902                                 EXACT           | EXACT
1903                                 EXACT_REQ8      | EXACT
1904                                 EXACTFU         | EXACTFU
1905                                 EXACTFU_REQ8    | EXACTFU
1906                                 EXACTFUP        | EXACTFU
1907                                 EXACTFAA        | EXACTFAA
1908                                 EXACTL          | EXACTL
1909                                 EXACTFLU8       | EXACTFLU8
1910 
1911 
1912                         */
1913 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
1914                        ? NOTHING                                            \
1915                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
1916                          ? EXACT                                            \
1917                          : (     EXACTFU == (X)                             \
1918                               || EXACTFU_REQ8 == (X)                       \
1919                               || EXACTFUP == (X) )                          \
1920                            ? EXACTFU                                        \
1921                            : ( EXACTFAA == (X) )                            \
1922                              ? EXACTFAA                                     \
1923                              : ( EXACTL == (X) )                            \
1924                                ? EXACTL                                     \
1925                                : ( EXACTFLU8 == (X) )                       \
1926                                  ? EXACTFLU8                                \
1927                                  : 0 )
1928 
1929                         /* dont use tail as the end marker for this traverse */
1930                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1931                             regnode * const noper = REGNODE_AFTER( cur );
1932                             U8 noper_type = OP( noper );
1933                             U8 noper_trietype = TRIE_TYPE( noper_type );
1934 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
1935                             regnode * const noper_next = regnext( noper );
1936                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1937                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
1938 #endif
1939 
1940                             DEBUG_TRIE_COMPILE_r({
1941                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
1942                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
1943                                    depth+1,
1944                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
1945 
1946                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
1947                                 Perl_re_printf( aTHX_  " -> %d:%s",
1948                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
1949 
1950                                 if ( noper_next ) {
1951                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
1952                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
1953                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
1954                                 }
1955                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
1956                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
1957                                    REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
1958                                 );
1959                             });
1960 
1961                             /* Is noper a trieable nodetype that can be merged
1962                              * with the current trie (if there is one)? */
1963                             if ( noper_trietype
1964                                   &&
1965                                   (
1966                                         ( noper_trietype == NOTHING )
1967                                         || ( trietype == NOTHING )
1968                                         || ( trietype == noper_trietype )
1969                                   )
1970 #ifdef NOJUMPTRIE
1971                                   && noper_next >= tail
1972 #endif
1973                                   && count < U16_MAX)
1974                             {
1975                                 /* Handle mergable triable node Either we are
1976                                  * the first node in a new trieable sequence,
1977                                  * in which case we do some bookkeeping,
1978                                  * otherwise we update the end pointer. */
1979                                 if ( !first ) {
1980                                     first = cur;
1981                                     if ( noper_trietype == NOTHING ) {
1982 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
1983                                         regnode * const noper_next = regnext( noper );
1984                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1985                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
1986 #endif
1987 
1988                                         if ( noper_next_trietype ) {
1989                                             trietype = noper_next_trietype;
1990                                         } else if (noper_next_type)  {
1991                                             /* a NOTHING regop is 1 regop wide.
1992                                              * We need at least two for a trie
1993                                              * so we can't merge this in */
1994                                             first = NULL;
1995                                         }
1996                                     } else {
1997                                         trietype = noper_trietype;
1998                                     }
1999                                 } else {
2000                                     if ( trietype == NOTHING )
2001                                         trietype = noper_trietype;
2002                                     prev = cur;
2003                                 }
2004                                 if (first)
2005                                     count++;
2006                             } /* end handle mergable triable node */
2007                             else {
2008                                 /* handle unmergable node -
2009                                  * noper may either be a triable node which can
2010                                  * not be tried together with the current trie,
2011                                  * or a non triable node */
2012                                 if ( prev ) {
2013                                     /* If last is set and trietype is not
2014                                      * NOTHING then we have found at least two
2015                                      * triable branch sequences in a row of a
2016                                      * similar trietype so we can turn them
2017                                      * into a trie. If/when we allow NOTHING to
2018                                      * start a trie sequence this condition
2019                                      * will be required, and it isn't expensive
2020                                      * so we leave it in for now. */
2021                                     if ( trietype && trietype != NOTHING )
2022                                         make_trie( pRExC_state,
2023                                                 startbranch, first, cur, tail,
2024                                                 count, trietype, depth+1 );
2025                                     prev = NULL; /* note: we clear/update
2026                                                     first, trietype etc below,
2027                                                     so we dont do it here */
2028                                 }
2029                                 if ( noper_trietype
2030 #ifdef NOJUMPTRIE
2031                                      && noper_next >= tail
2032 #endif
2033                                 ){
2034                                     /* noper is triable, so we can start a new
2035                                      * trie sequence */
2036                                     count = 1;
2037                                     first = cur;
2038                                     trietype = noper_trietype;
2039                                 } else if (first) {
2040                                     /* if we already saw a first but the
2041                                      * current node is not triable then we have
2042                                      * to reset the first information. */
2043                                     count = 0;
2044                                     first = NULL;
2045                                     trietype = 0;
2046                                 }
2047                             } /* end handle unmergable node */
2048                         } /* loop over branches */
2049                         DEBUG_TRIE_COMPILE_r({
2050                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2051                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
2052                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2053                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
2054                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2055                                REGNODE_NAME(trietype)
2056                             );
2057 
2058                         });
2059                         if ( prev && trietype ) {
2060                             if ( trietype != NOTHING ) {
2061                                 /* the last branch of the sequence was part of
2062                                  * a trie, so we have to construct it here
2063                                  * outside of the loop */
2064                                 made= make_trie( pRExC_state, startbranch,
2065                                                  first, scan, tail, count,
2066                                                  trietype, depth+1 );
2067 #ifdef TRIE_STUDY_OPT
2068                                 if ( ((made == MADE_EXACT_TRIE &&
2069                                      startbranch == first)
2070                                      || ( first_non_open == first )) &&
2071                                      depth==0 ) {
2072                                     flags |= SCF_TRIE_RESTUDY;
2073                                     if ( startbranch == first
2074                                          && scan >= tail )
2075                                     {
2076                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2077                                     }
2078                                 }
2079 #endif
2080                             } else {
2081                                 /* at this point we know whatever we have is a
2082                                  * NOTHING sequence/branch AND if 'startbranch'
2083                                  * is 'first' then we can turn the whole thing
2084                                  * into a NOTHING
2085                                  */
2086                                 if ( startbranch == first ) {
2087                                     regnode *opt;
2088                                     /* the entire thing is a NOTHING sequence,
2089                                      * something like this: (?:|) So we can
2090                                      * turn it into a plain NOTHING op. */
2091                                     DEBUG_TRIE_COMPILE_r({
2092                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2093                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2094                                           depth+1,
2095                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2096 
2097                                     });
2098                                     OP(startbranch)= NOTHING;
2099                                     NEXT_OFF(startbranch)= tail - startbranch;
2100                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
2101                                         OP(opt)= OPTIMIZED;
2102                                 }
2103                             }
2104                         } /* end if ( prev) */
2105                     } /* TRIE_MAXBUF is non zero */
2106                 } /* do trie */
2107                 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2108             }
2109             else
2110                 scan = REGNODE_AFTER_opcode(scan,code);
2111             continue;
2112         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2113             I32 paren = 0;
2114             regnode *start = NULL;
2115             regnode *end = NULL;
2116             U32 my_recursed_depth= recursed_depth;
2117 
2118             if (OP(scan) != SUSPEND) { /* GOSUB */
2119                 /* Do setup, note this code has side effects beyond
2120                  * the rest of this block. Specifically setting
2121                  * RExC_recurse[] must happen at least once during
2122                  * study_chunk(). */
2123                 paren = ARG1u(scan);
2124                 RExC_recurse[ARG2i(scan)] = scan;
2125                 start = REGNODE_p(RExC_open_parens[paren]);
2126                 end   = REGNODE_p(RExC_close_parens[paren]);
2127 
2128                 /* NOTE we MUST always execute the above code, even
2129                  * if we do nothing with a GOSUB */
2130                 if (
2131                     ( flags & SCF_IN_DEFINE )
2132                     ||
2133                     (
2134                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2135                         &&
2136                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2137                     )
2138                 ) {
2139                     /* no need to do anything here if we are in a define. */
2140                     /* or we are after some kind of infinite construct
2141                      * so we can skip recursing into this item.
2142                      * Since it is infinite we will not change the maxlen
2143                      * or delta, and if we miss something that might raise
2144                      * the minlen it will merely pessimise a little.
2145                      *
2146                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2147                      * might result in a minlen of 1 and not of 4,
2148                      * but this doesn't make us mismatch, just try a bit
2149                      * harder than we should.
2150                      *
2151                      * However we must assume this GOSUB is infinite, to
2152                      * avoid wrongly applying other optimizations in the
2153                      * enclosing scope - see GH 18096, for example.
2154                      */
2155                     is_inf = is_inf_internal = 1;
2156                     scan= regnext(scan);
2157                     continue;
2158                 }
2159 
2160                 if (
2161                     !recursed_depth
2162                     || !PAREN_TEST(recursed_depth - 1, paren)
2163                 ) {
2164                     /* it is quite possible that there are more efficient ways
2165                      * to do this. We maintain a bitmap per level of recursion
2166                      * of which patterns we have entered so we can detect if a
2167                      * pattern creates a possible infinite loop. When we
2168                      * recurse down a level we copy the previous levels bitmap
2169                      * down. When we are at recursion level 0 we zero the top
2170                      * level bitmap. It would be nice to implement a different
2171                      * more efficient way of doing this. In particular the top
2172                      * level bitmap may be unnecessary.
2173                      */
2174                     if (!recursed_depth) {
2175                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2176                     } else {
2177                         Copy(PAREN_OFFSET(recursed_depth - 1),
2178                              PAREN_OFFSET(recursed_depth),
2179                              RExC_study_chunk_recursed_bytes, U8);
2180                     }
2181                     /* we havent recursed into this paren yet, so recurse into it */
2182                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2183                     PAREN_SET(recursed_depth, paren);
2184                     my_recursed_depth= recursed_depth + 1;
2185                 } else {
2186                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2187                     /* some form of infinite recursion, assume infinite length
2188                      * */
2189                     if (flags & SCF_DO_SUBSTR) {
2190                         scan_commit(pRExC_state, data, minlenp, is_inf);
2191                         data->cur_is_floating = 1;
2192                     }
2193                     is_inf = is_inf_internal = 1;
2194                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2195                         ssc_anything(data->start_class);
2196                     flags &= ~SCF_DO_STCLASS;
2197 
2198                     start= NULL; /* reset start so we dont recurse later on. */
2199                 }
2200             } else {
2201                 paren = stopparen;
2202                 start = scan + 2;
2203                 end = regnext(scan);
2204             }
2205             if (start) {
2206                 scan_frame *newframe;
2207                 assert(end);
2208                 if (!RExC_frame_last) {
2209                     Newxz(newframe, 1, scan_frame);
2210                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2211                     RExC_frame_head= newframe;
2212                     RExC_frame_count++;
2213                 } else if (!RExC_frame_last->next_frame) {
2214                     Newxz(newframe, 1, scan_frame);
2215                     RExC_frame_last->next_frame= newframe;
2216                     newframe->prev_frame= RExC_frame_last;
2217                     RExC_frame_count++;
2218                 } else {
2219                     newframe= RExC_frame_last->next_frame;
2220                 }
2221                 RExC_frame_last= newframe;
2222 
2223                 newframe->next_regnode = regnext(scan);
2224                 newframe->last_regnode = last;
2225                 newframe->stopparen = stopparen;
2226                 newframe->prev_recursed_depth = recursed_depth;
2227                 newframe->this_prev_frame= frame;
2228                 newframe->in_gosub = (
2229                     (frame && frame->in_gosub) || OP(scan) == GOSUB
2230                 );
2231 
2232                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2233                 DEBUG_PEEP("fnew", scan, depth, flags);
2234 
2235                 frame = newframe;
2236                 scan =  start;
2237                 stopparen = paren;
2238                 last = end;
2239                 depth = depth + 1;
2240                 recursed_depth= my_recursed_depth;
2241 
2242                 continue;
2243             }
2244         }
2245         else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2246             SSize_t bytelen = STR_LEN(scan), charlen;
2247             UV uc;
2248             assert(bytelen);
2249             if (UTF) {
2250                 const U8 * const s = (U8*)STRING(scan);
2251                 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
2252                 charlen = utf8_length(s, s + bytelen);
2253             } else {
2254                 uc = *((U8*)STRING(scan));
2255                 charlen = bytelen;
2256             }
2257             min += charlen;
2258             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2259                 /* The code below prefers earlier match for fixed
2260                    offset, later match for variable offset.  */
2261                 if (data->last_end == -1) { /* Update the start info. */
2262                     data->last_start_min = data->pos_min;
2263                     data->last_start_max =
2264                         is_inf ? OPTIMIZE_INFTY
2265                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2266                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2267                 }
2268                 sv_catpvn(data->last_found, STRING(scan), bytelen);
2269                 if (UTF)
2270                     SvUTF8_on(data->last_found);
2271                 {
2272                     SV * const sv = data->last_found;
2273                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2274                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2275                     if (mg && mg->mg_len >= 0)
2276                         mg->mg_len += charlen;
2277                 }
2278                 data->last_end = data->pos_min + charlen;
2279                 data->pos_min += charlen; /* As in the first entry. */
2280                 data->flags &= ~SF_BEFORE_EOL;
2281             }
2282 
2283             /* ANDing the code point leaves at most it, and not in locale, and
2284              * can't match null string */
2285             if (flags & SCF_DO_STCLASS_AND) {
2286                 ssc_cp_and(data->start_class, uc);
2287                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2288                 ssc_clear_locale(data->start_class);
2289             }
2290             else if (flags & SCF_DO_STCLASS_OR) {
2291                 ssc_add_cp(data->start_class, uc);
2292                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2293 
2294                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2295                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2296             }
2297             flags &= ~SCF_DO_STCLASS;
2298             DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2299         }
2300         else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2301             /* But OP != EXACT!, so is EXACTFish */
2302             SSize_t bytelen = STR_LEN(scan), charlen;
2303             const U8 * s = (U8*)STRING(scan);
2304 
2305             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2306              * with the mask set to the complement of the bit that differs
2307              * between upper and lower case, and the lowest code point of the
2308              * pair (which the '&' forces) */
2309             if (     bytelen == 1
2310                 &&   isALPHA_A(*s)
2311                 &&  (         OP(scan) == EXACTFAA
2312                      || (     OP(scan) == EXACTFU
2313                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2314                 &&   mutate_ok
2315             ) {
2316                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2317 
2318                 OP(scan) = ANYOFM;
2319                 ARG1u_SET(scan, *s & mask);
2320                 FLAGS(scan) = mask;
2321                 /* We're not EXACTFish any more, so restudy.
2322                  * Search for "restudy" in this file to find
2323                  * a comment with details. */
2324                 continue;
2325             }
2326 
2327             /* Search for fixed substrings supports EXACT only. */
2328             if (flags & SCF_DO_SUBSTR) {
2329                 assert(data);
2330                 scan_commit(pRExC_state, data, minlenp, is_inf);
2331             }
2332             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2333             if (unfolded_multi_char) {
2334                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2335             }
2336             min += charlen - min_subtract;
2337             assert (min >= 0);
2338             if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2339                 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2340             ) {
2341                 delta += min_subtract;
2342             } else {
2343                 delta = OPTIMIZE_INFTY;
2344             }
2345             if (flags & SCF_DO_SUBSTR) {
2346                 data->pos_min += charlen - min_subtract;
2347                 if (data->pos_min < 0) {
2348                     data->pos_min = 0;
2349                 }
2350                 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2351                     && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2352                 ) {
2353                     data->pos_delta += min_subtract;
2354                 } else {
2355                     data->pos_delta = OPTIMIZE_INFTY;
2356                 }
2357                 if (min_subtract) {
2358                     data->cur_is_floating = 1; /* float */
2359                 }
2360             }
2361 
2362             if (flags & SCF_DO_STCLASS) {
2363                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2364 
2365                 assert(EXACTF_invlist);
2366                 if (flags & SCF_DO_STCLASS_AND) {
2367                     if (OP(scan) != EXACTFL)
2368                         ssc_clear_locale(data->start_class);
2369                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2370                     ANYOF_POSIXL_ZERO(data->start_class);
2371                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
2372                 }
2373                 else {  /* SCF_DO_STCLASS_OR */
2374                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
2375                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2376 
2377                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2378                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2379                 }
2380                 flags &= ~SCF_DO_STCLASS;
2381                 SvREFCNT_dec(EXACTF_invlist);
2382             }
2383             DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2384         }
2385         else if (REGNODE_VARIES(OP(scan))) {
2386             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2387             I32 fl = 0;
2388             U32 f = flags;
2389             regnode * const oscan = scan;
2390             regnode_ssc this_class;
2391             regnode_ssc *oclass = NULL;
2392             I32 next_is_eval = 0;
2393 
2394             switch (REGNODE_TYPE(OP(scan))) {
2395             case WHILEM:                /* End of (?:...)* . */
2396                 scan = REGNODE_AFTER(scan);
2397                 goto finish;
2398             case PLUS:
2399                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2400                     next = REGNODE_AFTER(scan);
2401                     if (   (     REGNODE_TYPE(OP(next)) == EXACT
2402                             && ! isEXACTFish(OP(next)))
2403                         || (flags & SCF_DO_STCLASS))
2404                     {
2405                         mincount = 1;
2406                         maxcount = REG_INFTY;
2407                         next = regnext(scan);
2408                         scan = REGNODE_AFTER(scan);
2409                         goto do_curly;
2410                     }
2411                 }
2412                 if (flags & SCF_DO_SUBSTR)
2413                     data->pos_min++;
2414                 /* This will bypass the formal 'min += minnext * mincount'
2415                  * calculation in the do_curly path, so assumes min width
2416                  * of the PLUS payload is exactly one. */
2417                 min++;
2418                 /* FALLTHROUGH */
2419             case STAR:
2420                 next = REGNODE_AFTER(scan);
2421 
2422                 /* This temporary node can now be turned into EXACTFU, and
2423                  * must, as regexec.c doesn't handle it */
2424                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2425                     OP(next) = EXACTFU;
2426                 }
2427 
2428                 if (     STR_LEN(next) == 1
2429                     &&   isALPHA_A(* STRING(next))
2430                     && (         OP(next) == EXACTFAA
2431                         || (     OP(next) == EXACTFU
2432                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2433                     &&   mutate_ok
2434                 ) {
2435                     /* These differ in just one bit */
2436                     U8 mask = ~ ('A' ^ 'a');
2437 
2438                     assert(isALPHA_A(* STRING(next)));
2439 
2440                     /* Then replace it by an ANYOFM node, with
2441                     * the mask set to the complement of the
2442                     * bit that differs between upper and lower
2443                     * case, and the lowest code point of the
2444                     * pair (which the '&' forces) */
2445                     OP(next) = ANYOFM;
2446                     ARG1u_SET(next, *STRING(next) & mask);
2447                     FLAGS(next) = mask;
2448                 }
2449 
2450                 if (flags & SCF_DO_STCLASS) {
2451                     mincount = 0;
2452                     maxcount = REG_INFTY;
2453                     next = regnext(scan);
2454                     scan = REGNODE_AFTER(scan);
2455                     goto do_curly;
2456                 }
2457                 if (flags & SCF_DO_SUBSTR) {
2458                     scan_commit(pRExC_state, data, minlenp, is_inf);
2459                     /* Cannot extend fixed substrings */
2460                     data->cur_is_floating = 1; /* float */
2461                 }
2462                 is_inf = is_inf_internal = 1;
2463                 scan = regnext(scan);
2464                 goto optimize_curly_tail;
2465             case CURLY:
2466                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2467                     && (FLAGS(scan) == stopparen))
2468                 {
2469                     mincount = 1;
2470                     maxcount = 1;
2471                 } else {
2472                     mincount = ARG1i(scan);
2473                     maxcount = ARG2i(scan);
2474                 }
2475                 next = regnext(scan);
2476                 if (OP(scan) == CURLYX) {
2477                     I32 lp = (data ? *(data->last_closep) : 0);
2478                     FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2479                 }
2480                 scan = REGNODE_AFTER(scan);
2481                 next_is_eval = (OP(scan) == EVAL);
2482               do_curly:
2483                 if (flags & SCF_DO_SUBSTR) {
2484                     if (mincount == 0)
2485                         scan_commit(pRExC_state, data, minlenp, is_inf);
2486                     /* Cannot extend fixed substrings */
2487                     pos_before = data->pos_min;
2488                 }
2489                 if (data) {
2490                     fl = data->flags;
2491                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2492                     if (is_inf)
2493                         data->flags |= SF_IS_INF;
2494                 }
2495                 if (flags & SCF_DO_STCLASS) {
2496                     ssc_init(pRExC_state, &this_class);
2497                     oclass = data->start_class;
2498                     data->start_class = &this_class;
2499                     f |= SCF_DO_STCLASS_AND;
2500                     f &= ~SCF_DO_STCLASS_OR;
2501                 }
2502                 /* Exclude from super-linear cache processing any {n,m}
2503                    regops for which the combination of input pos and regex
2504                    pos is not enough information to determine if a match
2505                    will be possible.
2506 
2507                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
2508                    regex pos at the \s*, the prospects for a match depend not
2509                    only on the input position but also on how many (bar\s*)
2510                    repeats into the {4,8} we are. */
2511                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2512                     f &= ~SCF_WHILEM_VISITED_POS;
2513 
2514                 /* This will finish on WHILEM, setting scan, or on NULL: */
2515                 /* recurse study_chunk() on loop bodies */
2516                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2517                                   last, data, stopparen, recursed_depth, NULL,
2518                                   (mincount == 0
2519                                    ? (f & ~SCF_DO_SUBSTR)
2520                                    : f)
2521                                   , depth+1, mutate_ok);
2522 
2523                 if (data && data->flags & SCF_SEEN_ACCEPT) {
2524                     if (mincount > 1)
2525                         mincount = 1;
2526                 }
2527 
2528                 if (flags & SCF_DO_STCLASS)
2529                     data->start_class = oclass;
2530                 if (mincount == 0 || minnext == 0) {
2531                     if (flags & SCF_DO_STCLASS_OR) {
2532                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2533                     }
2534                     else if (flags & SCF_DO_STCLASS_AND) {
2535                         /* Switch to OR mode: cache the old value of
2536                          * data->start_class */
2537                         INIT_AND_WITHP;
2538                         StructCopy(data->start_class, and_withp, regnode_ssc);
2539                         flags &= ~SCF_DO_STCLASS_AND;
2540                         StructCopy(&this_class, data->start_class, regnode_ssc);
2541                         flags |= SCF_DO_STCLASS_OR;
2542                         ANYOF_FLAGS(data->start_class)
2543                                                 |= SSC_MATCHES_EMPTY_STRING;
2544                     }
2545                 } else {                /* Non-zero len */
2546                     if (flags & SCF_DO_STCLASS_OR) {
2547                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2548                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2549                     }
2550                     else if (flags & SCF_DO_STCLASS_AND)
2551                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2552                     flags &= ~SCF_DO_STCLASS;
2553                 }
2554                 if (!scan)              /* It was not CURLYX, but CURLY. */
2555                     scan = next;
2556                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2557                     /* ? quantifier ok, except for (?{ ... }) */
2558                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
2559                     && (minnext == 0) && (deltanext == 0)
2560                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2561                     && maxcount <= REG_INFTY/3) /* Complement check for big
2562                                                    count */
2563                 {
2564                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2565                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2566                             "Quantifier unexpected on zero-length expression "
2567                             "in regex m/%" UTF8f "/",
2568                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2569                                   RExC_precomp)));
2570                 }
2571 
2572                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2573                     || min >= SSize_t_MAX - minnext * mincount )
2574                 {
2575                     FAIL("Regexp out of space");
2576                 }
2577 
2578                 min += minnext * mincount;
2579                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2580                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
2581                 is_inf |= is_inf_internal;
2582                 if (is_inf) {
2583                     delta = OPTIMIZE_INFTY;
2584                 } else {
2585                     delta += (minnext + deltanext) * maxcount
2586                              - minnext * mincount;
2587                 }
2588 
2589                 if (data && data->flags & SCF_SEEN_ACCEPT) {
2590                     if (flags & SCF_DO_SUBSTR) {
2591                         scan_commit(pRExC_state, data, minlenp, is_inf);
2592                         flags &= ~SCF_DO_SUBSTR;
2593                     }
2594                     if (stopmin > min)
2595                         stopmin = min;
2596                     DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2597                 }
2598                 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2599                 /* Try powerful optimization CURLYX => CURLYN. */
2600                 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2601                      && OP(oscan) == CURLYX
2602                      && data
2603                      && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2604                                                             non optimistic eval is seen
2605                                                             anywhere.*/
2606                      && ( data->flags & SF_IN_PAR ) /* has parens */
2607                      && !deltanext
2608                      && minnext == 1
2609                      && mutate_ok
2610                 ) {
2611                     DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2612                     /* Try to optimize to CURLYN.  */
2613                     regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2614                     regnode * const nxt1 = nxt;
2615 #ifdef DEBUGGING
2616                     regnode *nxt2;
2617 #endif
2618                     /* Skip open. */
2619                     nxt = regnext(nxt);
2620                     if (!REGNODE_SIMPLE(OP(nxt))
2621                         && !(REGNODE_TYPE(OP(nxt)) == EXACT
2622                              && STR_LEN(nxt) == 1))
2623                         goto nogo;
2624 #ifdef DEBUGGING
2625                     nxt2 = nxt;
2626 #endif
2627                     nxt = regnext(nxt);
2628                     if (OP(nxt) != CLOSE)
2629                         goto nogo;
2630                     if (RExC_open_parens) {
2631 
2632                         /*open->CURLYM*/
2633                         RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2634 
2635                         /*close->while*/
2636                         RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2637                     }
2638                     /* Now we know that nxt2 is the only contents: */
2639                     FLAGS(oscan) = (U8)PARNO(nxt);
2640                     OP(oscan) = CURLYN;
2641                     OP(nxt1) = NOTHING; /* was OPEN. */
2642 
2643 #ifdef DEBUGGING
2644                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2645                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2646                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2647                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
2648                     OP(nxt + 1) = OPTIMIZED; /* was count. */
2649                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2650 #endif
2651                 }
2652               nogo:
2653 
2654                 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2655 
2656                 /* Try optimization CURLYX => CURLYM. */
2657                 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2658                      && OP(oscan) == CURLYX
2659                      && data
2660                      && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2661                                                             non optimistic eval is seen
2662                                                             anywhere.*/
2663                      && !(data->flags & SF_HAS_PAR) /* no parens! */
2664                      && !deltanext     /* atom is fixed width */
2665                      && minnext != 0  /* CURLYM can't handle zero width */
2666                          /* Nor characters whose fold at run-time may be
2667                           * multi-character */
2668                      && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2669                      && mutate_ok
2670                 ) {
2671                     DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2672                     /* XXXX How to optimize if data == 0? */
2673                     /* Optimize to a simpler form.  */
2674                     regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2675                     regnode *nxt2;
2676 
2677                     OP(oscan) = CURLYM;
2678                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2679                             && (OP(nxt2) != WHILEM))
2680                         nxt = nxt2;
2681                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
2682                     /* Need to optimize away parenths. */
2683                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2684                         /* Set the parenth number.  */
2685                         /* note that we have changed the type of oscan to CURLYM here */
2686                         regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2687 
2688                         FLAGS(oscan) = (U8)PARNO(nxt);
2689                         if (RExC_open_parens) {
2690                              /*open->CURLYM*/
2691                             RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2692 
2693                             /*close->NOTHING*/
2694                             RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2695                                                          + 1;
2696                         }
2697                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
2698                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
2699 
2700 #ifdef DEBUGGING
2701                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2702                         OP(nxt + 1) = OPTIMIZED; /* was count. */
2703                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2704                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2705 #endif
2706 #if 0
2707                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
2708                             regnode *nnxt = regnext(nxt1);
2709                             if (nnxt == nxt) {
2710                                 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2711                                     ARG1u_SET(nxt1, nxt2 - nxt1);
2712                                 else if (nxt2 - nxt1 < U16_MAX)
2713                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
2714                                 else
2715                                     OP(nxt) = NOTHING;  /* Cannot beautify */
2716                             }
2717                             nxt1 = nnxt;
2718                         }
2719 #endif
2720                         /* Optimize again: */
2721                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
2722                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2723                                     NULL, stopparen, recursed_depth, NULL, 0,
2724                                     depth+1, mutate_ok);
2725                     }
2726                     else
2727                         FLAGS(oscan) = 0;
2728                 }
2729                 else if ((OP(oscan) == CURLYX)
2730                          && (flags & SCF_WHILEM_VISITED_POS)
2731                          /* See the comment on a similar expression above.
2732                             However, this time it's not a subexpression
2733                             we care about, but the expression itself. */
2734                          && (maxcount == REG_INFTY)
2735                          && data) {
2736                     /* This stays as CURLYX, we can put the count/of pair. */
2737                     /* Find WHILEM (as in regexec.c) */
2738                     regnode *nxt = oscan + NEXT_OFF(oscan);
2739 
2740                     if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2741                         nxt += ARG1u(nxt);
2742                     nxt = REGNODE_BEFORE(nxt);
2743                     if (FLAGS(nxt) & 0xf) {
2744                         /* we've already set whilem count on this node */
2745                     } else if (++data->whilem_c < 16) {
2746                         assert(data->whilem_c <= RExC_whilem_seen);
2747                         FLAGS(nxt) = (U8)(data->whilem_c
2748                             | (RExC_whilem_seen << 4)); /* On WHILEM */
2749                     }
2750                 }
2751                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2752                     pars++;
2753                 if (flags & SCF_DO_SUBSTR) {
2754                     SV *last_str = NULL;
2755                     STRLEN last_chrs = 0;
2756                     int counted = mincount != 0;
2757 
2758                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
2759                                                                   string. */
2760                         SSize_t b = pos_before >= data->last_start_min
2761                             ? pos_before : data->last_start_min;
2762                         STRLEN l;
2763                         const char * const s = SvPV_const(data->last_found, l);
2764                         SSize_t old = b - data->last_start_min;
2765                         assert(old >= 0);
2766 
2767                         if (UTF)
2768                             old = utf8_hop_forward((U8*)s, old,
2769                                                (U8 *) SvEND(data->last_found))
2770                                 - (U8*)s;
2771                         l -= old;
2772                         /* Get the added string: */
2773                         last_str = newSVpvn_utf8(s  + old, l, UTF);
2774                         last_chrs = UTF ? utf8_length((U8*)(s + old),
2775                                             (U8*)(s + old + l)) : l;
2776                         if (deltanext == 0 && pos_before == b) {
2777                             /* What was added is a constant string */
2778                             if (mincount > 1) {
2779 
2780                                 SvGROW(last_str, (mincount * l) + 1);
2781                                 repeatcpy(SvPVX(last_str) + l,
2782                                           SvPVX_const(last_str), l,
2783                                           mincount - 1);
2784                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2785                                 /* Add additional parts. */
2786                                 SvCUR_set(data->last_found,
2787                                           SvCUR(data->last_found) - l);
2788                                 sv_catsv(data->last_found, last_str);
2789                                 {
2790                                     SV * sv = data->last_found;
2791                                     MAGIC *mg =
2792                                         SvUTF8(sv) && SvMAGICAL(sv) ?
2793                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2794                                     if (mg && mg->mg_len >= 0)
2795                                         mg->mg_len += last_chrs * (mincount-1);
2796                                 }
2797                                 last_chrs *= mincount;
2798                                 data->last_end += l * (mincount - 1);
2799                             }
2800                         } else {
2801                             /* start offset must point into the last copy */
2802                             data->last_start_min += minnext * (mincount - 1);
2803                             data->last_start_max =
2804                               is_inf
2805                                ? OPTIMIZE_INFTY
2806                                : data->last_start_max +
2807                                  (maxcount - 1) * (minnext + data->pos_delta);
2808                         }
2809                     }
2810                     /* It is counted once already... */
2811                     data->pos_min += minnext * (mincount - counted);
2812 #if 0
2813     Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
2814                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
2815                               " maxcount=%" UVuf " mincount=%" UVuf
2816                               " data->pos_delta=%" UVuf "\n",
2817         (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2818         (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2819     if (deltanext != OPTIMIZE_INFTY)
2820         Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
2821             (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2822             - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2823 #endif
2824                     if (deltanext == OPTIMIZE_INFTY
2825                         || data->pos_delta == OPTIMIZE_INFTY
2826                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2827                         data->pos_delta = OPTIMIZE_INFTY;
2828                     else
2829                         data->pos_delta += - counted * deltanext +
2830                         (minnext + deltanext) * maxcount - minnext * mincount;
2831                     if (mincount != maxcount) {
2832                          /* Cannot extend fixed substrings found inside
2833                             the group.  */
2834                         scan_commit(pRExC_state, data, minlenp, is_inf);
2835                         if (mincount && last_str) {
2836                             SV * const sv = data->last_found;
2837                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2838                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2839 
2840                             if (mg)
2841                                 mg->mg_len = -1;
2842                             sv_setsv(sv, last_str);
2843                             data->last_end = data->pos_min;
2844                             data->last_start_min = data->pos_min - last_chrs;
2845                             data->last_start_max = is_inf
2846                                 ? OPTIMIZE_INFTY
2847                                 : data->pos_min + data->pos_delta - last_chrs;
2848                         }
2849                         data->cur_is_floating = 1; /* float */
2850                     }
2851                     SvREFCNT_dec(last_str);
2852                 }
2853                 if (data && (fl & SF_HAS_EVAL))
2854                     data->flags |= SF_HAS_EVAL;
2855               optimize_curly_tail:
2856                 rck_elide_nothing(oscan);
2857                 continue;
2858 
2859             default:
2860                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
2861                                                                     OP(scan));
2862             case REF:
2863             case CLUMP:
2864                 if (flags & SCF_DO_SUBSTR) {
2865                     /* Cannot expect anything... */
2866                     scan_commit(pRExC_state, data, minlenp, is_inf);
2867                     data->cur_is_floating = 1; /* float */
2868                 }
2869                 is_inf = is_inf_internal = 1;
2870                 if (flags & SCF_DO_STCLASS_OR) {
2871                     if (OP(scan) == CLUMP) {
2872                         /* Actually is any start char, but very few code points
2873                          * aren't start characters */
2874                         ssc_match_all_cp(data->start_class);
2875                     }
2876                     else {
2877                         ssc_anything(data->start_class);
2878                     }
2879                 }
2880                 flags &= ~SCF_DO_STCLASS;
2881                 break;
2882             }
2883         }
2884         else if (OP(scan) == LNBREAK) {
2885             if (flags & SCF_DO_STCLASS) {
2886                 if (flags & SCF_DO_STCLASS_AND) {
2887                     ssc_intersection(data->start_class,
2888                                     PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
2889                     ssc_clear_locale(data->start_class);
2890                     ANYOF_FLAGS(data->start_class)
2891                                                 &= ~SSC_MATCHES_EMPTY_STRING;
2892                 }
2893                 else if (flags & SCF_DO_STCLASS_OR) {
2894                     ssc_union(data->start_class,
2895                               PL_XPosix_ptrs[CC_VERTSPACE_],
2896                               FALSE);
2897                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2898 
2899                     /* See commit msg for
2900                      * 749e076fceedeb708a624933726e7989f2302f6a */
2901                     ANYOF_FLAGS(data->start_class)
2902                                                 &= ~SSC_MATCHES_EMPTY_STRING;
2903                 }
2904                 flags &= ~SCF_DO_STCLASS;
2905             }
2906             min++;
2907             if (delta != OPTIMIZE_INFTY)
2908                 delta++;    /* Because of the 2 char string cr-lf */
2909             if (flags & SCF_DO_SUBSTR) {
2910                 /* Cannot expect anything... */
2911                 scan_commit(pRExC_state, data, minlenp, is_inf);
2912                 data->pos_min += 1;
2913                 if (data->pos_delta != OPTIMIZE_INFTY) {
2914                     data->pos_delta += 1;
2915                 }
2916                 data->cur_is_floating = 1; /* float */
2917             }
2918         }
2919         else if (REGNODE_SIMPLE(OP(scan))) {
2920 
2921             if (flags & SCF_DO_SUBSTR) {
2922                 scan_commit(pRExC_state, data, minlenp, is_inf);
2923                 data->pos_min++;
2924             }
2925             min++;
2926             if (flags & SCF_DO_STCLASS) {
2927                 bool invert = 0;
2928                 SV* my_invlist = NULL;
2929                 U8 namedclass;
2930 
2931                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2932                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2933 
2934                 /* Some of the logic below assumes that switching
2935                    locale on will only add false positives. */
2936                 switch (OP(scan)) {
2937 
2938                 default:
2939 #ifdef DEBUGGING
2940                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
2941                                                                      OP(scan));
2942 #endif
2943                 case SANY:
2944                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2945                         ssc_match_all_cp(data->start_class);
2946                     break;
2947 
2948                 case REG_ANY:
2949                     {
2950                         SV* REG_ANY_invlist = _new_invlist(2);
2951                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
2952                                                             '\n');
2953                         if (flags & SCF_DO_STCLASS_OR) {
2954                             ssc_union(data->start_class,
2955                                       REG_ANY_invlist,
2956                                       TRUE /* TRUE => invert, hence all but \n
2957                                             */
2958                                       );
2959                         }
2960                         else if (flags & SCF_DO_STCLASS_AND) {
2961                             ssc_intersection(data->start_class,
2962                                              REG_ANY_invlist,
2963                                              TRUE  /* TRUE => invert */
2964                                              );
2965                             ssc_clear_locale(data->start_class);
2966                         }
2967                         SvREFCNT_dec_NN(REG_ANY_invlist);
2968                     }
2969                     break;
2970 
2971                 case ANYOFD:
2972                 case ANYOFL:
2973                 case ANYOFPOSIXL:
2974                 case ANYOFH:
2975                 case ANYOFHb:
2976                 case ANYOFHr:
2977                 case ANYOFHs:
2978                 case ANYOF:
2979                     if (flags & SCF_DO_STCLASS_AND)
2980                         ssc_and(pRExC_state, data->start_class,
2981                                 (regnode_charclass *) scan);
2982                     else
2983                         ssc_or(pRExC_state, data->start_class,
2984                                                           (regnode_charclass *) scan);
2985                     break;
2986 
2987                 case ANYOFHbbm:
2988                   {
2989                     SV* cp_list = get_ANYOFHbbm_contents(scan);
2990 
2991                     if (flags & SCF_DO_STCLASS_OR) {
2992                         ssc_union(data->start_class, cp_list, invert);
2993                     }
2994                     else if (flags & SCF_DO_STCLASS_AND) {
2995                         ssc_intersection(data->start_class, cp_list, invert);
2996                     }
2997 
2998                     SvREFCNT_dec_NN(cp_list);
2999                     break;
3000                   }
3001 
3002                 case NANYOFM: /* NANYOFM already contains the inversion of the
3003                                  input ANYOF data, so, unlike things like
3004                                  NPOSIXA, don't change 'invert' to TRUE */
3005                     /* FALLTHROUGH */
3006                 case ANYOFM:
3007                   {
3008                     SV* cp_list = get_ANYOFM_contents(scan);
3009 
3010                     if (flags & SCF_DO_STCLASS_OR) {
3011                         ssc_union(data->start_class, cp_list, invert);
3012                     }
3013                     else if (flags & SCF_DO_STCLASS_AND) {
3014                         ssc_intersection(data->start_class, cp_list, invert);
3015                     }
3016 
3017                     SvREFCNT_dec_NN(cp_list);
3018                     break;
3019                   }
3020 
3021                 case ANYOFR:
3022                 case ANYOFRb:
3023                   {
3024                     SV* cp_list = NULL;
3025 
3026                     cp_list = _add_range_to_invlist(cp_list,
3027                                         ANYOFRbase(scan),
3028                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
3029 
3030                     if (flags & SCF_DO_STCLASS_OR) {
3031                         ssc_union(data->start_class, cp_list, invert);
3032                     }
3033                     else if (flags & SCF_DO_STCLASS_AND) {
3034                         ssc_intersection(data->start_class, cp_list, invert);
3035                     }
3036 
3037                     SvREFCNT_dec_NN(cp_list);
3038                     break;
3039                   }
3040 
3041                 case NPOSIXL:
3042                     invert = 1;
3043                     /* FALLTHROUGH */
3044 
3045                 case POSIXL:
3046                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3047                     if (flags & SCF_DO_STCLASS_AND) {
3048                         bool was_there = cBOOL(
3049                                           ANYOF_POSIXL_TEST(data->start_class,
3050                                                                  namedclass));
3051                         ANYOF_POSIXL_ZERO(data->start_class);
3052                         if (was_there) {    /* Do an AND */
3053                             ANYOF_POSIXL_SET(data->start_class, namedclass);
3054                         }
3055                         /* No individual code points can now match */
3056                         data->start_class->invlist
3057                                                 = sv_2mortal(_new_invlist(0));
3058                     }
3059                     else {
3060                         int complement = namedclass + ((invert) ? -1 : 1);
3061 
3062                         assert(flags & SCF_DO_STCLASS_OR);
3063 
3064                         /* If the complement of this class was already there,
3065                          * the result is that they match all code points,
3066                          * (\d + \D == everything).  Remove the classes from
3067                          * future consideration.  Locale is not relevant in
3068                          * this case */
3069                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3070                             ssc_match_all_cp(data->start_class);
3071                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3072                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
3073                         }
3074                         else {  /* The usual case; just add this class to the
3075                                    existing set */
3076                             ANYOF_POSIXL_SET(data->start_class, namedclass);
3077                         }
3078                     }
3079                     break;
3080 
3081                 case NPOSIXA:   /* For these, we always know the exact set of
3082                                    what's matched */
3083                     invert = 1;
3084                     /* FALLTHROUGH */
3085                 case POSIXA:
3086                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3087                     goto join_posix_and_ascii;
3088 
3089                 case NPOSIXD:
3090                 case NPOSIXU:
3091                     invert = 1;
3092                     /* FALLTHROUGH */
3093                 case POSIXD:
3094                 case POSIXU:
3095                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3096 
3097                     /* NPOSIXD matches all upper Latin1 code points unless the
3098                      * target string being matched is UTF-8, which is
3099                      * unknowable until match time.  Since we are going to
3100                      * invert, we want to get rid of all of them so that the
3101                      * inversion will match all */
3102                     if (OP(scan) == NPOSIXD) {
3103                         _invlist_subtract(my_invlist, PL_UpperLatin1,
3104                                           &my_invlist);
3105                     }
3106 
3107                   join_posix_and_ascii:
3108 
3109                     if (flags & SCF_DO_STCLASS_AND) {
3110                         ssc_intersection(data->start_class, my_invlist, invert);
3111                         ssc_clear_locale(data->start_class);
3112                     }
3113                     else {
3114                         assert(flags & SCF_DO_STCLASS_OR);
3115                         ssc_union(data->start_class, my_invlist, invert);
3116                     }
3117                     SvREFCNT_dec(my_invlist);
3118                 }
3119                 if (flags & SCF_DO_STCLASS_OR)
3120                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3121                 flags &= ~SCF_DO_STCLASS;
3122             }
3123         }
3124         else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3125             data->flags |= (OP(scan) == MEOL
3126                             ? SF_BEFORE_MEOL
3127                             : SF_BEFORE_SEOL);
3128             scan_commit(pRExC_state, data, minlenp, is_inf);
3129 
3130         }
3131         else if (  REGNODE_TYPE(OP(scan)) == BRANCHJ
3132                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3133                    && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS))
3134                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3135         {
3136             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3137                 || OP(scan) == UNLESSM )
3138             {
3139                 /* Negative Lookahead/lookbehind
3140                    In this case we can't do fixed string optimisation.
3141                 */
3142 
3143                 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3144                 SSize_t deltanext, minnext;
3145                 SSize_t fake_last_close = 0;
3146                 regnode *fake_last_close_op = NULL;
3147                 regnode *cur_last_close_op;
3148                 regnode *nscan;
3149                 regnode_ssc intrnl;
3150                 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3151 
3152                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3153                 if (data) {
3154                     data_fake.whilem_c = data->whilem_c;
3155                     data_fake.last_closep = data->last_closep;
3156                     data_fake.last_close_opp = data->last_close_opp;
3157                 }
3158                 else {
3159                     data_fake.last_closep = &fake_last_close;
3160                     data_fake.last_close_opp = &fake_last_close_op;
3161                 }
3162 
3163                 /* remember the last_close_op we saw so we can see if
3164                  * we are dealing with variable length lookbehind that
3165                  * contains capturing buffers, which are considered
3166                  * experimental */
3167                 cur_last_close_op= *(data_fake.last_close_opp);
3168 
3169                 data_fake.pos_delta = delta;
3170                 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3171                      && OP(scan) == IFMATCH ) { /* Lookahead */
3172                     ssc_init(pRExC_state, &intrnl);
3173                     data_fake.start_class = &intrnl;
3174                     f |= SCF_DO_STCLASS_AND;
3175                 }
3176                 if (flags & SCF_WHILEM_VISITED_POS)
3177                     f |= SCF_WHILEM_VISITED_POS;
3178                 next = regnext(scan);
3179                 nscan = REGNODE_AFTER(scan);
3180 
3181                 /* recurse study_chunk() for lookahead body */
3182                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3183                                       last, &data_fake, stopparen,
3184                                       recursed_depth, NULL, f, depth+1,
3185                                       mutate_ok);
3186 
3187                 if (FLAGS(scan)) {
3188                     if (   deltanext < 0
3189                         || deltanext > (I32) U8_MAX
3190                         || minnext > (I32)U8_MAX
3191                         || minnext + deltanext > (I32)U8_MAX)
3192                     {
3193                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
3194                               (UV)U8_MAX);
3195                     }
3196 
3197                     /* The 'next_off' field has been repurposed to count the
3198                      * additional starting positions to try beyond the initial
3199                      * one.  (This leaves it at 0 for non-variable length
3200                      * matches to avoid breakage for those not using this
3201                      * extension) */
3202                     if (deltanext)  {
3203                         NEXT_OFF(scan) = deltanext;
3204                         if (
3205                             /* See a CLOSE op inside this lookbehind? */
3206                             cur_last_close_op != *(data_fake.last_close_opp)
3207                             /* and not doing restudy. see: restudied */
3208                             && !(flags & SCF_TRIE_DOING_RESTUDY)
3209                         ) {
3210                             /* this is positive variable length lookbehind with
3211                              * capture buffers inside of it */
3212                             ckWARNexperimental_with_arg(RExC_parse,
3213                                 WARN_EXPERIMENTAL__VLB,
3214                                 "Variable length %s lookbehind with capturing is experimental",
3215                                 is_positive ? "positive" : "negative");
3216                         }
3217                     }
3218                     FLAGS(scan) = (U8)minnext + deltanext;
3219                 }
3220                 if (data) {
3221                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3222                         pars++;
3223                     if (data_fake.flags & SF_HAS_EVAL)
3224                         data->flags |= SF_HAS_EVAL;
3225                     data->whilem_c = data_fake.whilem_c;
3226                 }
3227                 if (f & SCF_DO_STCLASS_AND) {
3228                     if (flags & SCF_DO_STCLASS_OR) {
3229                         /* OR before, AND after: ideally we would recurse with
3230                          * data_fake to get the AND applied by study of the
3231                          * remainder of the pattern, and then derecurse;
3232                          * *** HACK *** for now just treat as "no information".
3233                          * See [perl #56690].
3234                          */
3235                         ssc_init(pRExC_state, data->start_class);
3236                     }  else {
3237                         /* AND before and after: combine and continue.  These
3238                          * assertions are zero-length, so can match an EMPTY
3239                          * string */
3240                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3241                         ANYOF_FLAGS(data->start_class)
3242                                                    |= SSC_MATCHES_EMPTY_STRING;
3243                     }
3244                 }
3245                 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3246             }
3247 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3248             else {
3249                 /* Positive Lookahead/lookbehind
3250                    In this case we can do fixed string optimisation,
3251                    but we must be careful about it. Note in the case of
3252                    lookbehind the positions will be offset by the minimum
3253                    length of the pattern, something we won't know about
3254                    until after the recurse.
3255                 */
3256                 SSize_t deltanext, fake_last_close = 0;
3257                 regnode *last_close_op = NULL;
3258                 regnode *nscan;
3259                 regnode_ssc intrnl;
3260                 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3261                 /* We use SAVEFREEPV so that when the full compile
3262                     is finished perl will clean up the allocated
3263                     minlens when it's all done. This way we don't
3264                     have to worry about freeing them when we know
3265                     they wont be used, which would be a pain.
3266                  */
3267                 SSize_t *minnextp;
3268                 Newx( minnextp, 1, SSize_t );
3269                 SAVEFREEPV(minnextp);
3270 
3271                 if (data) {
3272                     StructCopy(data, &data_fake, scan_data_t);
3273                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3274                         f |= SCF_DO_SUBSTR;
3275                         if (FLAGS(scan))
3276                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3277                         data_fake.last_found=newSVsv(data->last_found);
3278                     }
3279                 }
3280                 else {
3281                     data_fake.last_closep = &fake_last_close;
3282                     data_fake.last_close_opp = &fake_last_close_opp;
3283                 }
3284                 data_fake.flags = 0;
3285                 data_fake.substrs[0].flags = 0;
3286                 data_fake.substrs[1].flags = 0;
3287                 data_fake.pos_delta = delta;
3288                 if (is_inf)
3289                     data_fake.flags |= SF_IS_INF;
3290                 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3291                      && OP(scan) == IFMATCH ) { /* Lookahead */
3292                     ssc_init(pRExC_state, &intrnl);
3293                     data_fake.start_class = &intrnl;
3294                     f |= SCF_DO_STCLASS_AND;
3295                 }
3296                 if (flags & SCF_WHILEM_VISITED_POS)
3297                     f |= SCF_WHILEM_VISITED_POS;
3298                 next = regnext(scan);
3299                 nscan = REGNODE_AFTER(scan);
3300 
3301                 /* positive lookahead study_chunk() recursion */
3302                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3303                                         &deltanext, last, &data_fake,
3304                                         stopparen, recursed_depth, NULL,
3305                                         f, depth+1, mutate_ok);
3306                 if (FLAGS(scan)) {
3307                     assert(0);  /* This code has never been tested since this
3308                                    is normally not compiled */
3309                     if (   deltanext < 0
3310                         || deltanext > (I32) U8_MAX
3311                         || *minnextp > (I32)U8_MAX
3312                         || *minnextp + deltanext > (I32)U8_MAX)
3313                     {
3314                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
3315                               (UV)U8_MAX);
3316                     }
3317 
3318                     if (deltanext) {
3319                         NEXT_OFF(scan) = deltanext;
3320                     }
3321                     FLAGS(scan) = (U8)*minnextp + deltanext;
3322                 }
3323 
3324                 *minnextp += min;
3325 
3326                 if (f & SCF_DO_STCLASS_AND) {
3327                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3328                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3329                 }
3330                 if (data) {
3331                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3332                         pars++;
3333                     if (data_fake.flags & SF_HAS_EVAL)
3334                         data->flags |= SF_HAS_EVAL;
3335                     data->whilem_c = data_fake.whilem_c;
3336                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3337                         int i;
3338                         if (RExC_rx->minlen < *minnextp)
3339                             RExC_rx->minlen = *minnextp;
3340                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3341                         SvREFCNT_dec_NN(data_fake.last_found);
3342 
3343                         for (i = 0; i < 2; i++) {
3344                             if (data_fake.substrs[i].minlenp != minlenp) {
3345                                 data->substrs[i].min_offset =
3346                                             data_fake.substrs[i].min_offset;
3347                                 data->substrs[i].max_offset =
3348                                             data_fake.substrs[i].max_offset;
3349                                 data->substrs[i].minlenp =
3350                                             data_fake.substrs[i].minlenp;
3351                                 data->substrs[i].lookbehind += FLAGS(scan);
3352                             }
3353                         }
3354                     }
3355                 }
3356             }
3357 #endif
3358         }
3359         else if (OP(scan) == OPEN) {
3360             if (stopparen != (I32)PARNO(scan))
3361                 pars++;
3362         }
3363         else if (OP(scan) == CLOSE) {
3364             if (stopparen == (I32)PARNO(scan)) {
3365                 break;
3366             }
3367             if ((I32)PARNO(scan) == is_par) {
3368                 next = regnext(scan);
3369 
3370                 if ( next && (OP(next) != WHILEM) && next < last)
3371                     is_par = 0;         /* Disable optimization */
3372             }
3373             if (data) {
3374                 *(data->last_closep) = PARNO(scan);
3375                 *(data->last_close_opp) = scan;
3376             }
3377         }
3378         else if (OP(scan) == EVAL) {
3379             if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) )
3380                 data->flags |= SF_HAS_EVAL;
3381         }
3382         else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3383             if (flags & SCF_DO_SUBSTR) {
3384                 scan_commit(pRExC_state, data, minlenp, is_inf);
3385                 flags &= ~SCF_DO_SUBSTR;
3386             }
3387             if (OP(scan)==ACCEPT) {
3388                 /* m{(*ACCEPT)x} does not have to start with 'x' */
3389                 flags &= ~SCF_DO_STCLASS;
3390                 if (data)
3391                     data->flags |= SCF_SEEN_ACCEPT;
3392                 if (stopmin > min)
3393                     stopmin = min;
3394             }
3395         }
3396         else if (OP(scan) == COMMIT) {
3397             /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3398              * must not end up with "abcxyz" as a fixed substring else we'll
3399              * skip straight to attempting to match at offset 4.
3400              */
3401             if (flags & SCF_DO_SUBSTR) {
3402                 scan_commit(pRExC_state, data, minlenp, is_inf);
3403                 flags &= ~SCF_DO_SUBSTR;
3404             }
3405         }
3406         else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */
3407         {
3408                 if (flags & SCF_DO_SUBSTR) {
3409                     scan_commit(pRExC_state, data, minlenp, is_inf);
3410                     data->cur_is_floating = 1; /* float */
3411                 }
3412                 is_inf = is_inf_internal = 1;
3413                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3414                     ssc_anything(data->start_class);
3415                 flags &= ~SCF_DO_STCLASS;
3416         }
3417         else if (OP(scan) == GPOS) {
3418             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3419                 !(delta || is_inf || (data && data->pos_delta)))
3420             {
3421                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3422                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
3423                 if (RExC_rx->gofs < (STRLEN)min)
3424                     RExC_rx->gofs = min;
3425             } else {
3426                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3427                 RExC_rx->gofs = 0;
3428             }
3429         }
3430 #ifdef TRIE_STUDY_OPT
3431 #ifdef FULL_TRIE_STUDY
3432         else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3433             /* NOTE - There is similar code to this block above for handling
3434                BRANCH nodes on the initial study.  If you change stuff here
3435                check there too. */
3436             regnode *trie_node= scan;
3437             regnode *tail= regnext(scan);
3438             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3439             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3440             regnode_ssc accum;
3441 
3442             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3443                 /* Cannot merge strings after this. */
3444                 scan_commit(pRExC_state, data, minlenp, is_inf);
3445             }
3446             if (flags & SCF_DO_STCLASS)
3447                 ssc_init_zero(pRExC_state, &accum);
3448 
3449             if (!trie->jump) {
3450                 min1= trie->minlen;
3451                 max1= trie->maxlen;
3452             } else {
3453                 const regnode *nextbranch= NULL;
3454                 U32 word;
3455 
3456                 for ( word=1 ; word <= trie->wordcount ; word++)
3457                 {
3458                     SSize_t deltanext = 0, minnext = 0;
3459                     U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3460                     SSize_t fake_last_close = 0;
3461                     regnode *fake_last_close_op = NULL;
3462                     regnode_ssc this_class;
3463 
3464                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3465                     if (data) {
3466                         data_fake.whilem_c = data->whilem_c;
3467                         data_fake.last_closep = data->last_closep;
3468                         data_fake.last_close_opp = data->last_close_opp;
3469                     }
3470                     else {
3471                         data_fake.last_closep = &fake_last_close;
3472                         data_fake.last_close_opp = &fake_last_close_op;
3473                     }
3474                     data_fake.pos_delta = delta;
3475                     if (flags & SCF_DO_STCLASS) {
3476                         ssc_init(pRExC_state, &this_class);
3477                         data_fake.start_class = &this_class;
3478                         f |= SCF_DO_STCLASS_AND;
3479                     }
3480                     if (flags & SCF_WHILEM_VISITED_POS)
3481                         f |= SCF_WHILEM_VISITED_POS;
3482 
3483                     if (trie->jump[word]) {
3484                         if (!nextbranch)
3485                             nextbranch = trie_node + trie->jump[0];
3486                         scan= trie_node + trie->jump[word];
3487                         /* We go from the jump point to the branch that follows
3488                            it. Note this means we need the vestigal unused
3489                            branches even though they arent otherwise used. */
3490                         /* optimise study_chunk() for TRIE */
3491                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3492                             &deltanext, (regnode *)nextbranch, &data_fake,
3493                             stopparen, recursed_depth, NULL, f, depth+1,
3494                             mutate_ok);
3495                     }
3496                     if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3497                         nextbranch= regnext((regnode*)nextbranch);
3498 
3499                     if (min1 > (SSize_t)(minnext + trie->minlen))
3500                         min1 = minnext + trie->minlen;
3501                     if (deltanext == OPTIMIZE_INFTY) {
3502                         is_inf = is_inf_internal = 1;
3503                         max1 = OPTIMIZE_INFTY;
3504                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3505                         max1 = minnext + deltanext + trie->maxlen;
3506 
3507                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3508                         pars++;
3509                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3510                         if ( stopmin > min + min1)
3511                             stopmin = min + min1;
3512                         flags &= ~SCF_DO_SUBSTR;
3513                         if (data)
3514                             data->flags |= SCF_SEEN_ACCEPT;
3515                     }
3516                     if (data) {
3517                         if (data_fake.flags & SF_HAS_EVAL)
3518                             data->flags |= SF_HAS_EVAL;
3519                         data->whilem_c = data_fake.whilem_c;
3520                     }
3521                     if (flags & SCF_DO_STCLASS)
3522                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3523                 }
3524                 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3525             }
3526             if (flags & SCF_DO_SUBSTR) {
3527                 data->pos_min += min1;
3528                 data->pos_delta += max1 - min1;
3529                 if (max1 != min1 || is_inf)
3530                     data->cur_is_floating = 1; /* float */
3531             }
3532             min += min1;
3533             if (delta != OPTIMIZE_INFTY) {
3534                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3535                     delta += max1 - min1;
3536                 else
3537                     delta = OPTIMIZE_INFTY;
3538             }
3539             if (flags & SCF_DO_STCLASS_OR) {
3540                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3541                 if (min1) {
3542                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3543                     flags &= ~SCF_DO_STCLASS;
3544                 }
3545             }
3546             else if (flags & SCF_DO_STCLASS_AND) {
3547                 if (min1) {
3548                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3549                     flags &= ~SCF_DO_STCLASS;
3550                 }
3551                 else {
3552                     /* Switch to OR mode: cache the old value of
3553                      * data->start_class */
3554                     INIT_AND_WITHP;
3555                     StructCopy(data->start_class, and_withp, regnode_ssc);
3556                     flags &= ~SCF_DO_STCLASS_AND;
3557                     StructCopy(&accum, data->start_class, regnode_ssc);
3558                     flags |= SCF_DO_STCLASS_OR;
3559                 }
3560             }
3561             scan= tail;
3562             DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3563             continue;
3564         }
3565 #else
3566         else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3567             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3568             U8*bang=NULL;
3569 
3570             min += trie->minlen;
3571             delta += (trie->maxlen - trie->minlen);
3572             flags &= ~SCF_DO_STCLASS; /* xxx */
3573             if (flags & SCF_DO_SUBSTR) {
3574                 /* Cannot expect anything... */
3575                 scan_commit(pRExC_state, data, minlenp, is_inf);
3576                 data->pos_min += trie->minlen;
3577                 data->pos_delta += (trie->maxlen - trie->minlen);
3578                 if (trie->maxlen != trie->minlen)
3579                     data->cur_is_floating = 1; /* float */
3580             }
3581             if (trie->jump) /* no more substrings -- for now /grr*/
3582                flags &= ~SCF_DO_SUBSTR;
3583         }
3584 
3585 #endif /* old or new */
3586 #endif /* TRIE_STUDY_OPT */
3587 
3588         else if (OP(scan) == REGEX_SET) {
3589             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
3590                              " before optimization", REGNODE_NAME(REGEX_SET));
3591         }
3592 
3593         /* Else: zero-length, ignore. */
3594         scan = regnext(scan);
3595     }
3596 
3597   finish:
3598     if (frame) {
3599         /* we need to unwind recursion. */
3600         depth = depth - 1;
3601 
3602         DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3603         DEBUG_PEEP("fend", scan, depth, flags);
3604 
3605         /* restore previous context */
3606         last = frame->last_regnode;
3607         scan = frame->next_regnode;
3608         stopparen = frame->stopparen;
3609         recursed_depth = frame->prev_recursed_depth;
3610 
3611         RExC_frame_last = frame->prev_frame;
3612         frame = frame->this_prev_frame;
3613         goto fake_study_recurse;
3614     }
3615 
3616     assert(!frame);
3617     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3618 
3619     /* is this pattern infinite? Eg, consider /(a|b+)/ */
3620     if (is_inf_internal)
3621         delta = OPTIMIZE_INFTY;
3622 
3623     /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3624     if (min > stopmin) {
3625         /*
3626         At this point 'min' represents the minimum length string we can
3627         match while *ignoring* the implication of ACCEPT, and 'delta'
3628         represents the difference between the minimum length and maximum
3629         length, and if the pattern matches an infinitely long string
3630         (consider the + and * quantifiers) then we use the special delta
3631         value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3632         minimum length that can be matched *and* accepted.
3633 
3634         A pattern is accepted when matching was successful *and*
3635         complete, and thus there is no further matching needing to be
3636         done, no backtracking to occur, etc. Prior to the introduction
3637         of ACCEPT the only opcode that signaled acceptance was the END
3638         opcode, which is always the very last opcode in a regex program.
3639         ACCEPT is thus conceptually an early successful return out of
3640         the matching process. stopmin starts out as OPTIMIZE_INFTY to
3641         represent "the entire pattern", and is ratched down to the
3642         "current min" if necessary when an ACCEPT opcode is encountered.
3643 
3644         Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3645         and we now need to account for it in both min and delta.
3646         Consider that in a pattern /AB/ normally the min length it can
3647         match can be computed as min(A)+min(B). But (*ACCEPT) means
3648         that it might be something else, not even neccesarily min(A) at
3649         all. Consider
3650 
3651              A  = /(foo(*ACCEPT)|x+)/
3652              B  = /whop/
3653              AB = /(foo(*ACCEPT)|x+)whop/
3654 
3655         The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3656         for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3657         "whop", and the delta of 0 as the pattern is of fixed length, the
3658         stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3659         When handling AB we expect to see a min of 5 for "xwhop", and a
3660         delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3661         for "foo". This should result in a final min of 3 for "foo", and
3662         a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3663 
3664         In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3665         min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3666         stop min would be 4 for "dude". This should result in a final
3667         min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3668 
3669         When min is smaller than stopmin then we can ignore it. In the
3670         fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3671         and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3672         the ACCEPT doesn't reduce the minimum length of the string that
3673         might be matched, nor affect the maximum length.
3674 
3675         In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3676         for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3677         "foo". We currently turn this into a min of 3 for "foo" and a
3678         delta of 3 for "foobar" even though technically "foobar" isn't
3679         possible. ACCEPT affects some aspects of the optimizer, like
3680         length computations and mandatory substring optimizations, but
3681         there are other optimzations this routine perfoms that are not
3682         affected and this compromise simplifies implementation.
3683 
3684         It might be helpful to consider that this C function is called
3685         recursively on the pattern in a bottom up fashion, and that the
3686         min returned by a nested call may be marked as coming from an
3687         ACCEPT, causing its callers to treat the returned min as a
3688         stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3689         multiple calls into this function in different ways.
3690         */
3691 
3692         if (OPTIMIZE_INFTY - delta >= min - stopmin)
3693             delta += min - stopmin;
3694         else
3695             delta = OPTIMIZE_INFTY;
3696         min = stopmin;
3697     }
3698 
3699     *scanp = scan;
3700     *deltap = delta;
3701 
3702     if (flags & SCF_DO_SUBSTR && is_inf)
3703         data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3704     if (is_par > (I32)U8_MAX)
3705         is_par = 0;
3706     if (is_par && pars==1 && data) {
3707         data->flags |= SF_IN_PAR;
3708         data->flags &= ~SF_HAS_PAR;
3709     }
3710     else if (pars && data) {
3711         data->flags |= SF_HAS_PAR;
3712         data->flags &= ~SF_IN_PAR;
3713     }
3714     if (flags & SCF_DO_STCLASS_OR)
3715         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3716     if (flags & SCF_TRIE_RESTUDY)
3717         data->flags |=  SCF_TRIE_RESTUDY;
3718 
3719 
3720     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3721         if (min > OPTIMIZE_INFTY - delta)
3722             RExC_maxlen = OPTIMIZE_INFTY;
3723         else if (RExC_maxlen < min + delta)
3724             RExC_maxlen = min + delta;
3725     }
3726     DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
3727     return min;
3728 }
3729