xref: /openbsd/gnu/usr.bin/perl/regcomp.c (revision 3d61058a)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 /*
35  * pregcomp and pregexec -- regsub and regerror are not used in perl
36  *
37  *	Copyright (c) 1986 by University of Toronto.
38  *	Written by Henry Spencer.  Not derived from licensed software.
39  *
40  *	Permission is granted to anyone to use this software for any
41  *	purpose on any computer system, and to redistribute it freely,
42  *	subject to the following restrictions:
43  *
44  *	1. The author is not responsible for the consequences of use of
45  *		this software, no matter how awful, even if they arise
46  *		from defects in it.
47  *
48  *	2. The origin of this software must not be misrepresented, either
49  *		by explicit claim or by omission.
50  *
51  *	3. Altered versions must be plainly marked as such, and must not
52  *		be misrepresented as being the original software.
53  *
54  *
55  ****    Alterations to Henry's code are...
56  ****
57  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59  ****    by Larry Wall and others
60  ****
61  ****    You may distribute under the terms of either the GNU General Public
62  ****    License or the Artistic License, as specified in the README file.
63 
64  *
65  * Beware that some of this code is subtly aware of the way operator
66  * precedence is structured in regular expressions.  Serious changes in
67  * regular-expression syntax might require a total rethink.
68  */
69 
70 /* Note on debug output:
71  *
72  * This is set up so that -Dr turns on debugging like all other flags that are
73  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
74  * all regular expressions encountered in a program, and gives a huge amount of
75  * output for all but the shortest programs.
76  *
77  * The ability to output pattern debugging information lexically, and with much
78  * finer grained control was added, with 'use re qw(Debug ....);' available even
79  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
80  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81  * Those files are compiled and linked into the perl executable, and they are
82  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83  * to re.pm.
84  *
85  * That would normally mean linking errors when two functions of the same name
86  * are attempted to be placed into the same executable.  That is solved in one
87  * of four ways:
88  *  1)  Static functions aren't known outside the file they are in, so for the
89  *      many functions of that type in this file, it just isn't a problem.
90  *  2)  Most externally known functions are enclosed in
91  *          #ifndef PERL_IN_XSUB_RE
92  *          ...
93  *          #endif
94  *      blocks, so there is only one definition for them in the whole
95  *      executable, the one in regcomp.c (or regexec.c).  The implication of
96  *      that is any debugging info that comes from them is controlled only by
97  *      -Dr.  Further, any static function they call will also be the version
98  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
100  *      have different names, so that what gets loaded in the executable is
101  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
103  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104  *      versions and their callees are under control of re.pm.   The catch is
105  *      that references to all these go through the regexp_engine structure,
106  *      which is initialized in regcomp.h to the Perl_foo versions, and
107  *      substituted out in lexical scopes where 'use re' is in effect to the
108  *      'my_foo' ones.   That structure is public API, so it would be a hard
109  *      sell to add any additional members.
110  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
111  *      respectively, regexec.c and re_exec.c, they can have two different
112  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113  *      embed.fnc.
114  *
115  * The bottom line is that if you add code to one of the public functions
116  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
117  * a new function that needs to do debugging or there is a chain of calls from
118  * it that need to do debugging, all functions in the chain should use options
119  * 2) or 4) above.
120  *
121  * A function may have to be split so that debugging stuff is static, but it
122  * calls out to some other function that only gets compiled in regcomp.c to
123  * access data that we don't want to duplicate.
124  */
125 
126 #ifdef PERL_EXT_RE_BUILD
127 #include "re_top.h"
128 #endif
129 
130 #include "EXTERN.h"
131 #define PERL_IN_REGEX_ENGINE
132 #define PERL_IN_REGCOMP_ANY
133 #define PERL_IN_REGCOMP_C
134 #include "perl.h"
135 
136 #ifdef PERL_IN_XSUB_RE
137 #  include "re_comp.h"
138 EXTERN_C const struct regexp_engine my_reg_engine;
139 EXTERN_C const struct regexp_engine wild_reg_engine;
140 #else
141 #  include "regcomp.h"
142 #endif
143 
144 #include "invlist_inline.h"
145 #include "unicode_constants.h"
146 #include "regcomp_internal.h"
147 
148 /* =========================================================
149  * BEGIN edit_distance stuff.
150  *
151  * This calculates how many single character changes of any type are needed to
152  * transform a string into another one.  It is taken from version 3.1 of
153  *
154  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155  */
156 
157 /* Our unsorted dictionary linked list.   */
158 /* Note we use UVs, not chars. */
159 
160 struct dictionary{
161   UV key;
162   UV value;
163   struct dictionary* next;
164 };
165 typedef struct dictionary item;
166 
167 
168 PERL_STATIC_INLINE item*
push(UV key,item * curr)169 push(UV key, item* curr)
170 {
171     item* head;
172     Newx(head, 1, item);
173     head->key = key;
174     head->value = 0;
175     head->next = curr;
176     return head;
177 }
178 
179 
180 PERL_STATIC_INLINE item*
find(item * head,UV key)181 find(item* head, UV key)
182 {
183     item* iterator = head;
184     while (iterator){
185         if (iterator->key == key){
186             return iterator;
187         }
188         iterator = iterator->next;
189     }
190 
191     return NULL;
192 }
193 
194 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)195 uniquePush(item* head, UV key)
196 {
197     item* iterator = head;
198 
199     while (iterator){
200         if (iterator->key == key) {
201             return head;
202         }
203         iterator = iterator->next;
204     }
205 
206     return push(key, head);
207 }
208 
209 PERL_STATIC_INLINE void
dict_free(item * head)210 dict_free(item* head)
211 {
212     item* iterator = head;
213 
214     while (iterator) {
215         item* temp = iterator;
216         iterator = iterator->next;
217         Safefree(temp);
218     }
219 
220     head = NULL;
221 }
222 
223 /* End of Dictionary Stuff */
224 
225 /* All calculations/work are done here */
226 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)227 S_edit_distance(const UV* src,
228                 const UV* tgt,
229                 const STRLEN x,             /* length of src[] */
230                 const STRLEN y,             /* length of tgt[] */
231                 const SSize_t maxDistance
232 )
233 {
234     item *head = NULL;
235     UV swapCount, swapScore, targetCharCount, i, j;
236     UV *scores;
237     UV score_ceil = x + y;
238 
239     PERL_ARGS_ASSERT_EDIT_DISTANCE;
240 
241     /* initialize matrix start values */
242     Newx(scores, ( (x + 2) * (y + 2)), UV);
243     scores[0] = score_ceil;
244     scores[1 * (y + 2) + 0] = score_ceil;
245     scores[0 * (y + 2) + 1] = score_ceil;
246     scores[1 * (y + 2) + 1] = 0;
247     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248 
249     /* work loops    */
250     /* i = src index */
251     /* j = tgt index */
252     for (i=1;i<=x;i++) {
253         if (i < x)
254             head = uniquePush(head, src[i]);
255         scores[(i+1) * (y + 2) + 1] = i;
256         scores[(i+1) * (y + 2) + 0] = score_ceil;
257         swapCount = 0;
258 
259         for (j=1;j<=y;j++) {
260             if (i == 1) {
261                 if(j < y)
262                 head = uniquePush(head, tgt[j]);
263                 scores[1 * (y + 2) + (j + 1)] = j;
264                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
265             }
266 
267             targetCharCount = find(head, tgt[j-1])->value;
268             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269 
270             if (src[i-1] != tgt[j-1]){
271                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272             }
273             else {
274                 swapCount = j;
275                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276             }
277         }
278 
279         find(head, src[i-1])->value = i;
280     }
281 
282     {
283         IV score = scores[(x+1) * (y + 2) + (y + 1)];
284         dict_free(head);
285         Safefree(scores);
286         return (maxDistance != 0 && maxDistance < score)?(-1):score;
287     }
288 }
289 
290 /* END of edit_distance() stuff
291  * ========================================================= */
292 
293 #ifdef PERL_RE_BUILD_AUX
294 /* add a data member to the struct reg_data attached to this regex, it should
295  * always return a non-zero return. the 's' argument is the type of the items
296  * being added and the n is the number of items. The length of 's' should match
297  * the number of items. */
298 U32
Perl_reg_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)299 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
300 {
301     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
302 
303     PERL_ARGS_ASSERT_REG_ADD_DATA;
304 
305     /* in the below expression we have (count + n - 1), the minus one is there
306      * because the struct that we allocate already contains a slot for 1 data
307      * item, so we do not need to allocate it the first time. IOW, the
308      * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
309      * to allocate. See struct reg_data in regcomp.h
310      */
311     Renewc(RExC_rxi->data,
312            sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
313            char, struct reg_data);
314     /* however in the data->what expression we use (count + n) and do not
315      * subtract one from the result because the data structure contains a
316      * pointer to an array, and does not allocate the first element as part of
317      * the data struct. */
318     if (count > 1)
319         Renew(RExC_rxi->data->what, (count + n), U8);
320     else {
321         /* when count == 1 it means we have not initialized anything.
322          * we always fill the 0 slot of the data array with a '%' entry, which
323          * means "zero" (all the other types are letters) which exists purely
324          * so the return from reg_add_data is ALWAYS true, so we can tell it apart
325          * from a "no value" idx=0 in places where we would return an index
326          * into reg_add_data.  This is particularly important with the new "single
327          * pass, usually, but not always" strategy that we use, where the code
328          * will use a 0 to represent "not able to compute this yet".
329          */
330         Newx(RExC_rxi->data->what, n+1, U8);
331         /* fill in the placeholder slot of 0 with a what of '%', we use
332          * this because it sorta looks like a zero (0/0) and it is not a letter
333          * like any of the other "whats", this type should never be created
334          * any other way but here. '%' happens to also not appear in this
335          * file for any other reason (at the time of writing this comment)*/
336         RExC_rxi->data->what[0]= '%';
337         RExC_rxi->data->data[0]= NULL;
338     }
339     RExC_rxi->data->count = count + n;
340     Copy(s, RExC_rxi->data->what + count, n, U8);
341     assert(count>0);
342     return count;
343 }
344 #endif /* PERL_RE_BUILD_AUX */
345 
346 /*XXX: todo make this not included in a non debugging perl, but appears to be
347  * used anyway there, in 'use re' */
348 #ifndef PERL_IN_XSUB_RE
349 void
Perl_reginitcolors(pTHX)350 Perl_reginitcolors(pTHX)
351 {
352     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
353     if (s) {
354         char *t = savepv(s);
355         int i = 0;
356         PL_colors[0] = t;
357         while (++i < 6) {
358             t = strchr(t, '\t');
359             if (t) {
360                 *t = '\0';
361                 PL_colors[i] = ++t;
362             }
363             else
364                 PL_colors[i] = t = (char *)"";
365         }
366     } else {
367         int i = 0;
368         while (i < 6)
369             PL_colors[i++] = (char *)"";
370     }
371     PL_colorset = 1;
372 }
373 #endif
374 
375 
376 #ifdef TRIE_STUDY_OPT
377 /* search for "restudy" in this file for a detailed explanation */
378 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
379     STMT_START {                                            \
380         if (                                                \
381               (data.flags & SCF_TRIE_RESTUDY)               \
382               && ! restudied++                              \
383         ) {                                                 \
384             dOsomething;                                    \
385             goto reStudy;                                   \
386         }                                                   \
387     } STMT_END
388 #else
389 #define CHECK_RESTUDY_GOTO_butfirst
390 #endif
391 
392 #ifndef PERL_IN_XSUB_RE
393 
394 /* return the currently in-scope regex engine (or the default if none)  */
395 regexp_engine const *
Perl_current_re_engine(pTHX)396 Perl_current_re_engine(pTHX)
397 {
398     if (IN_PERL_COMPILETIME) {
399         HV * const table = GvHV(PL_hintgv);
400         SV **ptr;
401 
402         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
403             return &PL_core_reg_engine;
404         ptr = hv_fetchs(table, "regcomp", FALSE);
405         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
406             return &PL_core_reg_engine;
407         return INT2PTR(regexp_engine*, SvIV(*ptr));
408     }
409     else {
410         SV *ptr;
411         if (!PL_curcop->cop_hints_hash)
412             return &PL_core_reg_engine;
413         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
414         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
415             return &PL_core_reg_engine;
416         return INT2PTR(regexp_engine*, SvIV(ptr));
417     }
418 }
419 
420 
421 /*
422  * pregcomp - compile a regular expression into internal code
423  *
424  * Decides which engine's compiler to call based on the hint currently in
425  * scope
426  */
427 
428 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)429 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
430 {
431     regexp_engine const *eng = current_re_engine();
432     DECLARE_AND_GET_RE_DEBUG_FLAGS;
433 
434     PERL_ARGS_ASSERT_PREGCOMP;
435 
436     /* Dispatch a request to compile a regexp to correct regexp engine. */
437     DEBUG_COMPILE_r({
438         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
439                         PTR2UV(eng));
440     });
441     return CALLREGCOMP_ENG(eng, pattern, flags);
442 }
443 #endif
444 
445 /*
446 =for apidoc re_compile
447 
448 Compile the regular expression pattern C<pattern>, returning a pointer to the
449 compiled object for later matching with the internal regex engine.
450 
451 This function is typically used by a custom regexp engine C<.comp()> function
452 to hand off to the core regexp engine those patterns it doesn't want to handle
453 itself (typically passing through the same flags it was called with).  In
454 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
455 to compile using the currently active regexp engine.
456 
457 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
458 pointer to the input.  Otherwise the PV is extracted and treated like a string
459 representing a pattern.  See L<perlre>.
460 
461 The possible flags for C<rx_flags> are documented in L<perlreapi>.  Their names
462 all begin with C<RXf_>.
463 
464 =cut
465 
466  * public entry point for the perl core's own regex compiling code.
467  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
468  * pattern rather than a list of OPs, and uses the internal engine rather
469  * than the current one */
470 
471 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)472 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
473 {
474     SV *pat = pattern; /* defeat constness! */
475 
476     PERL_ARGS_ASSERT_RE_COMPILE;
477 
478     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
479 #ifdef PERL_IN_XSUB_RE
480                                 &my_reg_engine,
481 #else
482                                 &PL_core_reg_engine,
483 #endif
484                                 NULL, NULL, rx_flags, 0);
485 }
486 
487 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)488 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
489 {
490     int n;
491 
492     if (--cbs->refcnt > 0)
493         return;
494     for (n = 0; n < cbs->count; n++) {
495         REGEXP *rx = cbs->cb[n].src_regex;
496         if (rx) {
497             cbs->cb[n].src_regex = NULL;
498             SvREFCNT_dec_NN(rx);
499         }
500     }
501     Safefree(cbs->cb);
502     Safefree(cbs);
503 }
504 
505 
506 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)507 S_alloc_code_blocks(pTHX_  int ncode)
508 {
509      struct reg_code_blocks *cbs;
510     Newx(cbs, 1, struct reg_code_blocks);
511     cbs->count = ncode;
512     cbs->refcnt = 1;
513     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
514     if (ncode)
515         Newx(cbs->cb, ncode, struct reg_code_block);
516     else
517         cbs->cb = NULL;
518     return cbs;
519 }
520 
521 
522 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
523  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
524  * point to the realloced string and length.
525  *
526  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
527  * stuff added */
528 
529 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)530 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
531                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
532 {
533     U8 *const src = (U8*)*pat_p;
534     U8 *dst, *d;
535     int n=0;
536     STRLEN s = 0;
537     bool do_end = 0;
538     DECLARE_AND_GET_RE_DEBUG_FLAGS;
539 
540     DEBUG_PARSE_r(Perl_re_printf( aTHX_
541         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
542 
543     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
544     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
545     d = dst;
546 
547     while (s < *plen_p) {
548         append_utf8_from_native_byte(src[s], &d);
549 
550         if (n < num_code_blocks) {
551             assert(pRExC_state->code_blocks);
552             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
553                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
554                 assert(*(d - 1) == '(');
555                 do_end = 1;
556             }
557             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
558                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
559                 assert(*(d - 1) == ')');
560                 do_end = 0;
561                 n++;
562             }
563         }
564         s++;
565     }
566     *d = '\0';
567     *plen_p = d - dst;
568     *pat_p = (char*) dst;
569     SAVEFREEPV(*pat_p);
570     RExC_orig_utf8 = RExC_utf8 = 1;
571 }
572 
573 
574 
575 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
576  * while recording any code block indices, and handling overloading,
577  * nested qr// objects etc.  If pat is null, it will allocate a new
578  * string, or just return the first arg, if there's only one.
579  *
580  * Returns the malloced/updated pat.
581  * patternp and pat_count is the array of SVs to be concatted;
582  * oplist is the optional list of ops that generated the SVs;
583  * recompile_p is a pointer to a boolean that will be set if
584  *   the regex will need to be recompiled.
585  * delim, if non-null is an SV that will be inserted between each element
586  */
587 
588 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)589 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
590                 SV *pat, SV ** const patternp, int pat_count,
591                 OP *oplist, bool *recompile_p, SV *delim)
592 {
593     SV **svp;
594     int n = 0;
595     bool use_delim = FALSE;
596     bool alloced = FALSE;
597 
598     /* if we know we have at least two args, create an empty string,
599      * then concatenate args to that. For no args, return an empty string */
600     if (!pat && pat_count != 1) {
601         pat = newSVpvs("");
602         SAVEFREESV(pat);
603         alloced = TRUE;
604     }
605 
606     for (svp = patternp; svp < patternp + pat_count; svp++) {
607         SV *sv;
608         SV *rx  = NULL;
609         STRLEN orig_patlen = 0;
610         bool code = 0;
611         SV *msv = use_delim ? delim : *svp;
612         if (!msv) msv = &PL_sv_undef;
613 
614         /* if we've got a delimiter, we go round the loop twice for each
615          * svp slot (except the last), using the delimiter the second
616          * time round */
617         if (use_delim) {
618             svp--;
619             use_delim = FALSE;
620         }
621         else if (delim)
622             use_delim = TRUE;
623 
624         if (SvTYPE(msv) == SVt_PVAV) {
625             /* we've encountered an interpolated array within
626              * the pattern, e.g. /...@a..../. Expand the list of elements,
627              * then recursively append elements.
628              * The code in this block is based on S_pushav() */
629 
630             AV *const av = (AV*)msv;
631             const SSize_t maxarg = AvFILL(av) + 1;
632             SV **array;
633 
634             if (oplist) {
635                 assert(oplist->op_type == OP_PADAV
636                     || oplist->op_type == OP_RV2AV);
637                 oplist = OpSIBLING(oplist);
638             }
639 
640             if (SvRMAGICAL(av)) {
641                 SSize_t i;
642 
643                 Newx(array, maxarg, SV*);
644                 SAVEFREEPV(array);
645                 for (i=0; i < maxarg; i++) {
646                     SV ** const svp = av_fetch(av, i, FALSE);
647                     array[i] = svp ? *svp : &PL_sv_undef;
648                 }
649             }
650             else
651                 array = AvARRAY(av);
652 
653             if (maxarg > 0) {
654                 pat = S_concat_pat(aTHX_ pRExC_state, pat,
655                                    array, maxarg, NULL, recompile_p,
656                                    /* $" */
657                                    GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
658             }
659             else if (!pat) {
660                 pat = newSVpvs_flags("", SVs_TEMP);
661             }
662 
663             continue;
664         }
665 
666 
667         /* we make the assumption here that each op in the list of
668          * op_siblings maps to one SV pushed onto the stack,
669          * except for code blocks, with have both an OP_NULL and
670          * an OP_CONST.
671          * This allows us to match up the list of SVs against the
672          * list of OPs to find the next code block.
673          *
674          * Note that       PUSHMARK PADSV PADSV ..
675          * is optimised to
676          *                 PADRANGE PADSV  PADSV  ..
677          * so the alignment still works. */
678 
679         if (oplist) {
680             if (oplist->op_type == OP_NULL
681                 && (oplist->op_flags & OPf_SPECIAL))
682             {
683                 assert(n < pRExC_state->code_blocks->count);
684                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
685                 pRExC_state->code_blocks->cb[n].block = oplist;
686                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
687                 n++;
688                 code = 1;
689                 oplist = OpSIBLING(oplist); /* skip CONST */
690                 assert(oplist);
691             }
692             oplist = OpSIBLING(oplist);
693         }
694 
695         /* apply magic and QR overloading to arg */
696 
697         SvGETMAGIC(msv);
698         if (SvROK(msv) && SvAMAGIC(msv)) {
699             SV *sv = AMG_CALLunary(msv, regexp_amg);
700             if (sv) {
701                 if (SvROK(sv))
702                     sv = SvRV(sv);
703                 if (SvTYPE(sv) != SVt_REGEXP)
704                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
705                 msv = sv;
706             }
707         }
708 
709         /* try concatenation overload ... */
710         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
711                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
712         {
713             sv_setsv(pat, sv);
714             /* overloading involved: all bets are off over literal
715              * code. Pretend we haven't seen it */
716             if (n)
717                 pRExC_state->code_blocks->count -= n;
718             n = 0;
719         }
720         else {
721             /* ... or failing that, try "" overload */
722             while (SvAMAGIC(msv)
723                     && (sv = AMG_CALLunary(msv, string_amg))
724                     && sv != msv
725                     &&  !(   SvROK(msv)
726                           && SvROK(sv)
727                           && SvRV(msv) == SvRV(sv))
728             ) {
729                 msv = sv;
730                 SvGETMAGIC(msv);
731             }
732             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
733                 msv = SvRV(msv);
734 
735             if (pat) {
736                 /* this is a partially unrolled
737                  *     sv_catsv_nomg(pat, msv);
738                  * that allows us to adjust code block indices if
739                  * needed */
740                 STRLEN dlen;
741                 char *dst = SvPV_force_nomg(pat, dlen);
742                 orig_patlen = dlen;
743                 if (SvUTF8(msv) && !SvUTF8(pat)) {
744                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
745                     sv_setpvn(pat, dst, dlen);
746                     SvUTF8_on(pat);
747                 }
748                 sv_catsv_nomg(pat, msv);
749                 rx = msv;
750             }
751             else {
752                 /* We have only one SV to process, but we need to verify
753                  * it is properly null terminated or we will fail asserts
754                  * later. In theory we probably shouldn't get such SV's,
755                  * but if we do we should handle it gracefully. */
756                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
757                     /* not a string, or a string with a trailing null */
758                     pat = msv;
759                 } else {
760                     /* a string with no trailing null, we need to copy it
761                      * so it has a trailing null */
762                     pat = sv_2mortal(newSVsv(msv));
763                 }
764             }
765 
766             if (code)
767                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
768         }
769 
770         /* extract any code blocks within any embedded qr//'s */
771         if (rx && SvTYPE(rx) == SVt_REGEXP
772             && RX_ENGINE((REGEXP*)rx)->op_comp)
773         {
774 
775             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
776             if (ri->code_blocks && ri->code_blocks->count) {
777                 int i;
778                 /* the presence of an embedded qr// with code means
779                  * we should always recompile: the text of the
780                  * qr// may not have changed, but it may be a
781                  * different closure than last time */
782                 *recompile_p = 1;
783                 if (pRExC_state->code_blocks) {
784                     int new_count = pRExC_state->code_blocks->count
785                             + ri->code_blocks->count;
786                     Renew(pRExC_state->code_blocks->cb,
787                             new_count, struct reg_code_block);
788                     pRExC_state->code_blocks->count = new_count;
789                 }
790                 else
791                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
792                                                     ri->code_blocks->count);
793 
794                 for (i=0; i < ri->code_blocks->count; i++) {
795                     struct reg_code_block *src, *dst;
796                     STRLEN offset =  orig_patlen
797                         + ReANY((REGEXP *)rx)->pre_prefix;
798                     assert(n < pRExC_state->code_blocks->count);
799                     src = &ri->code_blocks->cb[i];
800                     dst = &pRExC_state->code_blocks->cb[n];
801                     dst->start	    = src->start + offset;
802                     dst->end	    = src->end   + offset;
803                     dst->block	    = src->block;
804                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
805                                             src->src_regex
806                                                 ? src->src_regex
807                                                 : (REGEXP*)rx);
808                     n++;
809                 }
810             }
811         }
812     }
813     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
814     if (alloced)
815         SvSETMAGIC(pat);
816 
817     return pat;
818 }
819 
820 
821 
822 /* see if there are any run-time code blocks in the pattern.
823  * False positives are allowed */
824 
825 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)826 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
827                     char *pat, STRLEN plen)
828 {
829     int n = 0;
830     STRLEN s;
831 
832     PERL_UNUSED_CONTEXT;
833 
834     for (s = 0; s < plen; s++) {
835         if (   pRExC_state->code_blocks
836             && n < pRExC_state->code_blocks->count
837             && s == pRExC_state->code_blocks->cb[n].start)
838         {
839             s = pRExC_state->code_blocks->cb[n].end;
840             n++;
841             continue;
842         }
843         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
844          * positives here */
845         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
846             (pat[s+2] == '{'
847                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
848         )
849             return 1;
850     }
851     return 0;
852 }
853 
854 /* Handle run-time code blocks. We will already have compiled any direct
855  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
856  * copy of it, but with any literal code blocks blanked out and
857  * appropriate chars escaped; then feed it into
858  *
859  *    eval "qr'modified_pattern'"
860  *
861  * For example,
862  *
863  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
864  *
865  * becomes
866  *
867  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
868  *
869  * After eval_sv()-ing that, grab any new code blocks from the returned qr
870  * and merge them with any code blocks of the original regexp.
871  *
872  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
873  * instead, just save the qr and return FALSE; this tells our caller that
874  * the original pattern needs upgrading to utf8.
875  */
876 
877 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)878 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
879     char *pat, STRLEN plen)
880 {
881     SV *qr;
882 
883     DECLARE_AND_GET_RE_DEBUG_FLAGS;
884 
885     if (pRExC_state->runtime_code_qr) {
886         /* this is the second time we've been called; this should
887          * only happen if the main pattern got upgraded to utf8
888          * during compilation; re-use the qr we compiled first time
889          * round (which should be utf8 too)
890          */
891         qr = pRExC_state->runtime_code_qr;
892         pRExC_state->runtime_code_qr = NULL;
893         assert(RExC_utf8 && SvUTF8(qr));
894     }
895     else {
896         int n = 0;
897         STRLEN s;
898         char *p, *newpat;
899         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
900         SV *sv, *qr_ref;
901         dSP;
902 
903         /* determine how many extra chars we need for ' and \ escaping */
904         for (s = 0; s < plen; s++) {
905             if (pat[s] == '\'' || pat[s] == '\\')
906                 newlen++;
907         }
908 
909         Newx(newpat, newlen, char);
910         p = newpat;
911         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
912 
913         for (s = 0; s < plen; s++) {
914             if (   pRExC_state->code_blocks
915                 && n < pRExC_state->code_blocks->count
916                 && s == pRExC_state->code_blocks->cb[n].start)
917             {
918                 /* blank out literal code block so that they aren't
919                  * recompiled: eg change from/to:
920                  *     /(?{xyz})/
921                  *     /(?=====)/
922                  * and
923                  *     /(??{xyz})/
924                  *     /(?======)/
925                  * and
926                  *     /(?(?{xyz}))/
927                  *     /(?(?=====))/
928                 */
929                 assert(pat[s]   == '(');
930                 assert(pat[s+1] == '?');
931                 *p++ = '(';
932                 *p++ = '?';
933                 s += 2;
934                 while (s < pRExC_state->code_blocks->cb[n].end) {
935                     *p++ = '=';
936                     s++;
937                 }
938                 *p++ = ')';
939                 n++;
940                 continue;
941             }
942             if (pat[s] == '\'' || pat[s] == '\\')
943                 *p++ = '\\';
944             *p++ = pat[s];
945         }
946         *p++ = '\'';
947         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
948             *p++ = 'x';
949             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
950                 *p++ = 'x';
951             }
952         }
953         *p++ = '\0';
954         DEBUG_COMPILE_r({
955             Perl_re_printf( aTHX_
956                 "%sre-parsing pattern for runtime code:%s %s\n",
957                 PL_colors[4], PL_colors[5], newpat);
958         });
959 
960         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
961         Safefree(newpat);
962 
963         ENTER;
964         SAVETMPS;
965         save_re_context();
966         PUSHSTACKi(PERLSI_REQUIRE);
967         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
968          * parsing qr''; normally only q'' does this. It also alters
969          * hints handling */
970         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
971         SvREFCNT_dec_NN(sv);
972         SPAGAIN;
973         qr_ref = POPs;
974         PUTBACK;
975         {
976             SV * const errsv = ERRSV;
977             if (SvTRUE_NN(errsv))
978                 /* use croak_sv ? */
979                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
980         }
981         assert(SvROK(qr_ref));
982         qr = SvRV(qr_ref);
983         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
984         /* the leaving below frees the tmp qr_ref.
985          * Give qr a life of its own */
986         SvREFCNT_inc(qr);
987         POPSTACK;
988         FREETMPS;
989         LEAVE;
990 
991     }
992 
993     if (!RExC_utf8 && SvUTF8(qr)) {
994         /* first time through; the pattern got upgraded; save the
995          * qr for the next time through */
996         assert(!pRExC_state->runtime_code_qr);
997         pRExC_state->runtime_code_qr = qr;
998         return 0;
999     }
1000 
1001 
1002     /* extract any code blocks within the returned qr//  */
1003 
1004 
1005     /* merge the main (r1) and run-time (r2) code blocks into one */
1006     {
1007         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1008         struct reg_code_block *new_block, *dst;
1009         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1010         int i1 = 0, i2 = 0;
1011         int r1c, r2c;
1012 
1013         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1014         {
1015             SvREFCNT_dec_NN(qr);
1016             return 1;
1017         }
1018 
1019         if (!r1->code_blocks)
1020             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1021 
1022         r1c = r1->code_blocks->count;
1023         r2c = r2->code_blocks->count;
1024 
1025         Newx(new_block, r1c + r2c, struct reg_code_block);
1026 
1027         dst = new_block;
1028 
1029         while (i1 < r1c || i2 < r2c) {
1030             struct reg_code_block *src;
1031             bool is_qr = 0;
1032 
1033             if (i1 == r1c) {
1034                 src = &r2->code_blocks->cb[i2++];
1035                 is_qr = 1;
1036             }
1037             else if (i2 == r2c)
1038                 src = &r1->code_blocks->cb[i1++];
1039             else if (  r1->code_blocks->cb[i1].start
1040                      < r2->code_blocks->cb[i2].start)
1041             {
1042                 src = &r1->code_blocks->cb[i1++];
1043                 assert(src->end < r2->code_blocks->cb[i2].start);
1044             }
1045             else {
1046                 assert(  r1->code_blocks->cb[i1].start
1047                        > r2->code_blocks->cb[i2].start);
1048                 src = &r2->code_blocks->cb[i2++];
1049                 is_qr = 1;
1050                 assert(src->end < r1->code_blocks->cb[i1].start);
1051             }
1052 
1053             assert(pat[src->start] == '(');
1054             assert(pat[src->end]   == ')');
1055             dst->start	    = src->start;
1056             dst->end	    = src->end;
1057             dst->block	    = src->block;
1058             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1059                                     : src->src_regex;
1060             dst++;
1061         }
1062         r1->code_blocks->count += r2c;
1063         Safefree(r1->code_blocks->cb);
1064         r1->code_blocks->cb = new_block;
1065     }
1066 
1067     SvREFCNT_dec_NN(qr);
1068     return 1;
1069 }
1070 
1071 
1072 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)1073 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1074                       struct reg_substr_datum  *rsd,
1075                       struct scan_data_substrs *sub,
1076                       STRLEN longest_length)
1077 {
1078     /* This is the common code for setting up the floating and fixed length
1079      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
1080      * as to whether succeeded or not */
1081 
1082     I32 t;
1083     SSize_t ml;
1084     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
1085     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1086 
1087     if (! (longest_length
1088            || (eol /* Can't have SEOL and MULTI */
1089                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1090           )
1091             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1092         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1093     {
1094         return FALSE;
1095     }
1096 
1097     /* copy the information about the longest from the reg_scan_data
1098         over to the program. */
1099     if (SvUTF8(sub->str)) {
1100         rsd->substr      = NULL;
1101         rsd->utf8_substr = sub->str;
1102     } else {
1103         rsd->substr      = sub->str;
1104         rsd->utf8_substr = NULL;
1105     }
1106     /* end_shift is how many chars that must be matched that
1107         follow this item. We calculate it ahead of time as once the
1108         lookbehind offset is added in we lose the ability to correctly
1109         calculate it.*/
1110     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1111     rsd->end_shift = ml - sub->min_offset
1112         - longest_length
1113             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1114              * intead? - DAPM
1115             + (SvTAIL(sub->str) != 0)
1116             */
1117         + sub->lookbehind;
1118 
1119     t = (eol/* Can't have SEOL and MULTI */
1120          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1121     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1122 
1123     return TRUE;
1124 }
1125 
1126 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)1127 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1128 {
1129     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1130      * properly wrapped with the right modifiers */
1131 
1132     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1133     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1134                                                 != REGEX_DEPENDS_CHARSET);
1135 
1136     /* The caret is output if there are any defaults: if not all the STD
1137         * flags are set, or if no character set specifier is needed */
1138     bool has_default =
1139                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1140                 || ! has_charset);
1141     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1142                                                 == REG_RUN_ON_COMMENT_SEEN);
1143     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1144                         >> RXf_PMf_STD_PMMOD_SHIFT);
1145     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
1146     char *p;
1147     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1148 
1149     /* We output all the necessary flags; we never output a minus, as all
1150         * those are defaults, so are
1151         * covered by the caret */
1152     const STRLEN wraplen = pat_len + has_p + has_runon
1153         + has_default       /* If needs a caret */
1154         + PL_bitcount[reganch] /* 1 char for each set standard flag */
1155 
1156             /* If needs a character set specifier */
1157         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1158         + (sizeof("(?:)") - 1);
1159 
1160     PERL_ARGS_ASSERT_SET_REGEX_PV;
1161 
1162     /* make sure PL_bitcount bounds not exceeded */
1163     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1164 
1165     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1166     SvPOK_on(Rx);
1167     if (RExC_utf8)
1168         SvFLAGS(Rx) |= SVf_UTF8;
1169     *p++='('; *p++='?';
1170 
1171     /* If a default, cover it using the caret */
1172     if (has_default) {
1173         *p++= DEFAULT_PAT_MOD;
1174     }
1175     if (has_charset) {
1176         STRLEN len;
1177         const char* name;
1178 
1179         name = get_regex_charset_name(RExC_rx->extflags, &len);
1180         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
1181             assert(RExC_utf8);
1182             name = UNICODE_PAT_MODS;
1183             len = sizeof(UNICODE_PAT_MODS) - 1;
1184         }
1185         Copy(name, p, len, char);
1186         p += len;
1187     }
1188     if (has_p)
1189         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1190     {
1191         char ch;
1192         while((ch = *fptr++)) {
1193             if(reganch & 1)
1194                 *p++ = ch;
1195             reganch >>= 1;
1196         }
1197     }
1198 
1199     *p++ = ':';
1200     Copy(RExC_precomp, p, pat_len, char);
1201     assert ((RX_WRAPPED(Rx) - p) < 16);
1202     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1203     p += pat_len;
1204 
1205     /* Adding a trailing \n causes this to compile properly:
1206             my $R = qr / A B C # D E/x; /($R)/
1207         Otherwise the parens are considered part of the comment */
1208     if (has_runon)
1209         *p++ = '\n';
1210     *p++ = ')';
1211     *p = 0;
1212     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1213 }
1214 
1215 STATIC void
S_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)1216 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1217 {
1218     /* The inversion list in the SSC is marked mortal; now we need a more
1219      * permanent copy, which is stored the same way that is done in a regular
1220      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1221      * map */
1222 
1223     SV* invlist = invlist_clone(ssc->invlist, NULL);
1224 
1225     PERL_ARGS_ASSERT_SSC_FINALIZE;
1226 
1227     assert(is_ANYOF_SYNTHETIC(ssc));
1228 
1229     /* The code in this file assumes that all but these flags aren't relevant
1230      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1231      * by the time we reach here */
1232     assert(! (ANYOF_FLAGS(ssc)
1233         & ~( ANYOF_COMMON_FLAGS
1234             |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
1235             |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
1236 
1237     populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
1238 
1239     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
1240     SvREFCNT_dec(invlist);
1241 
1242     /* Make sure is clone-safe */
1243     ssc->invlist = NULL;
1244 
1245     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1246         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1247         OP(ssc) = ANYOFPOSIXL;
1248     }
1249     else if (RExC_contains_locale) {
1250         OP(ssc) = ANYOFL;
1251     }
1252 
1253     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1254 }
1255 
1256 STATIC bool
S_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)1257 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1258 {
1259     /* The synthetic start class is used to hopefully quickly winnow down
1260      * places where a pattern could start a match in the target string.  If it
1261      * doesn't really narrow things down that much, there isn't much point to
1262      * having the overhead of using it.  This function uses some very crude
1263      * heuristics to decide if to use the ssc or not.
1264      *
1265      * It returns TRUE if 'ssc' rules out more than half what it considers to
1266      * be the "likely" possible matches, but of course it doesn't know what the
1267      * actual things being matched are going to be; these are only guesses
1268      *
1269      * For /l matches, it assumes that the only likely matches are going to be
1270      *      in the 0-255 range, uniformly distributed, so half of that is 127
1271      * For /a and /d matches, it assumes that the likely matches will be just
1272      *      the ASCII range, so half of that is 63
1273      * For /u and there isn't anything matching above the Latin1 range, it
1274      *      assumes that that is the only range likely to be matched, and uses
1275      *      half that as the cut-off: 127.  If anything matches above Latin1,
1276      *      it assumes that all of Unicode could match (uniformly), except for
1277      *      non-Unicode code points and things in the General Category "Other"
1278      *      (unassigned, private use, surrogates, controls and formats).  This
1279      *      is a much large number. */
1280 
1281     U32 count = 0;      /* Running total of number of code points matched by
1282                            'ssc' */
1283     UV start, end;      /* Start and end points of current range in inversion
1284                            XXX outdated.  UTF-8 locales are common, what about invert? list */
1285     const U32 max_code_points = (LOC)
1286                                 ?  256
1287                                 : ((  ! UNI_SEMANTICS
1288                                     ||  invlist_highest(ssc->invlist) < 256)
1289                                   ? 128
1290                                   : NON_OTHER_COUNT);
1291     const U32 max_match = max_code_points / 2;
1292 
1293     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1294 
1295     invlist_iterinit(ssc->invlist);
1296     while (invlist_iternext(ssc->invlist, &start, &end)) {
1297         if (start >= max_code_points) {
1298             break;
1299         }
1300         end = MIN(end, max_code_points - 1);
1301         count += end - start + 1;
1302         if (count >= max_match) {
1303             invlist_iterfinish(ssc->invlist);
1304             return FALSE;
1305         }
1306     }
1307 
1308     return TRUE;
1309 }
1310 
1311 static void
release_RExC_state(pTHX_ void * vstate)1312 release_RExC_state(pTHX_ void *vstate) {
1313     RExC_state_t *pRExC_state = (RExC_state_t *)vstate;
1314 
1315     /* Any or all of these might be NULL.
1316 
1317        There's no point in setting them to NULL after the free, since
1318        pRExC_state is about to be released.
1319      */
1320     SvREFCNT_dec(RExC_rx_sv);
1321     Safefree(RExC_open_parens);
1322     Safefree(RExC_close_parens);
1323     Safefree(RExC_logical_to_parno);
1324     Safefree(RExC_parno_to_logical);
1325 
1326     Safefree(pRExC_state);
1327 }
1328 
1329 /*
1330  * Perl_re_op_compile - the perl internal RE engine's function to compile a
1331  * regular expression into internal code.
1332  * The pattern may be passed either as:
1333  *    a list of SVs (patternp plus pat_count)
1334  *    a list of OPs (expr)
1335  * If both are passed, the SV list is used, but the OP list indicates
1336  * which SVs are actually pre-compiled code blocks
1337  *
1338  * The SVs in the list have magic and qr overloading applied to them (and
1339  * the list may be modified in-place with replacement SVs in the latter
1340  * case).
1341  *
1342  * If the pattern hasn't changed from old_re, then old_re will be
1343  * returned.
1344  *
1345  * eng is the current engine. If that engine has an op_comp method, then
1346  * handle directly (i.e. we assume that op_comp was us); otherwise, just
1347  * do the initial concatenation of arguments and pass on to the external
1348  * engine.
1349  *
1350  * If is_bare_re is not null, set it to a boolean indicating whether the
1351  * arg list reduced (after overloading) to a single bare regex which has
1352  * been returned (i.e. /$qr/).
1353  *
1354  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1355  *
1356  * pm_flags contains the PMf_* flags, typically based on those from the
1357  * pm_flags field of the related PMOP. Currently we're only interested in
1358  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1359  *
1360  * For many years this code had an initial sizing pass that calculated
1361  * (sometimes incorrectly, leading to security holes) the size needed for the
1362  * compiled pattern.  That was changed by commit
1363  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1364  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
1365  * references to this sizing pass.
1366  *
1367  * Now, an initial crude guess as to the size needed is made, based on the
1368  * length of the pattern.  Patches welcome to improve that guess.  That amount
1369  * of space is malloc'd and then immediately freed, and then clawed back node
1370  * by node.  This design is to minimize, to the extent possible, memory churn
1371  * when doing the reallocs.
1372  *
1373  * A separate parentheses counting pass may be needed in some cases.
1374  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
1375  * of these cases.
1376  *
1377  * The existence of a sizing pass necessitated design decisions that are no
1378  * longer needed.  There are potential areas of simplification.
1379  *
1380  * Beware that the optimization-preparation code in here knows about some
1381  * of the structure of the compiled regexp.  [I'll say.]
1382  */
1383 
1384 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)1385 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1386                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
1387                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1388 {
1389     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
1390     STRLEN plen;
1391     char *exp;
1392     regnode *scan;
1393     I32 flags;
1394     SSize_t minlen = 0;
1395     U32 rx_flags;
1396     SV *pat;
1397     SV** new_patternp = patternp;
1398 
1399     /* these are all flags - maybe they should be turned
1400      * into a single int with different bit masks */
1401     I32 sawlookahead = 0;
1402     I32 sawplus = 0;
1403     I32 sawopen = 0;
1404     I32 sawminmod = 0;
1405 
1406     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1407     bool recompile = 0;
1408     bool runtime_code = 0;
1409     scan_data_t data;
1410 
1411 #ifdef TRIE_STUDY_OPT
1412     /* search for "restudy" in this file for a detailed explanation */
1413     int restudied = 0;
1414     RExC_state_t copyRExC_state;
1415 #endif
1416     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1417 
1418     PERL_ARGS_ASSERT_RE_OP_COMPILE;
1419 
1420     DEBUG_r(if (!PL_colorset) reginitcolors());
1421 
1422     RExC_state_t *pRExC_state = NULL;
1423     /* Ensure that all members of the pRExC_state is initialized to 0
1424      * at the start of regex compilation. Historically we have had issues
1425      * with people remembering to zero specific members or zeroing them
1426      * too late, etc. Doing it in one place is saner and avoid oversight
1427      * or error. */
1428     Newxz(pRExC_state, 1, RExC_state_t);
1429 
1430     SAVEDESTRUCTOR_X(release_RExC_state, pRExC_state);
1431 
1432     DEBUG_r({
1433         /* and then initialize RExC_mysv1 and RExC_mysv2 early so if
1434          * something calls regprop we don't have issues. These variables
1435          * not being set up properly motivated the use of Newxz() to initalize
1436          * the pRExC_state structure, as there were codepaths under -Uusedl
1437          * that left these unitialized, and non-null as well. */
1438         RExC_mysv1 = sv_newmortal();
1439         RExC_mysv2 = sv_newmortal();
1440     });
1441 
1442     if (is_bare_re)
1443         *is_bare_re = FALSE;
1444 
1445     if (expr && (expr->op_type == OP_LIST ||
1446                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1447         /* allocate code_blocks if needed */
1448         OP *o;
1449         int ncode = 0;
1450 
1451         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1452             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1453                 ncode++; /* count of DO blocks */
1454 
1455         if (ncode)
1456             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1457     }
1458 
1459     if (!pat_count) {
1460         /* compile-time pattern with just OP_CONSTs and DO blocks */
1461 
1462         int n;
1463         OP *o;
1464 
1465         /* find how many CONSTs there are */
1466         assert(expr);
1467         n = 0;
1468         if (expr->op_type == OP_CONST)
1469             n = 1;
1470         else
1471             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1472                 if (o->op_type == OP_CONST)
1473                     n++;
1474             }
1475 
1476         /* fake up an SV array */
1477 
1478         assert(!new_patternp);
1479         Newx(new_patternp, n, SV*);
1480         SAVEFREEPV(new_patternp);
1481         pat_count = n;
1482 
1483         n = 0;
1484         if (expr->op_type == OP_CONST)
1485             new_patternp[n] = cSVOPx_sv(expr);
1486         else
1487             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1488                 if (o->op_type == OP_CONST)
1489                     new_patternp[n++] = cSVOPo_sv;
1490             }
1491 
1492     }
1493 
1494     DEBUG_PARSE_r(Perl_re_printf( aTHX_
1495         "Assembling pattern from %d elements%s\n", pat_count,
1496             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1497 
1498     /* set expr to the first arg op */
1499 
1500     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1501          && expr->op_type != OP_CONST)
1502     {
1503             expr = cLISTOPx(expr)->op_first;
1504             assert(   expr->op_type == OP_PUSHMARK
1505                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1506                    || expr->op_type == OP_PADRANGE);
1507             expr = OpSIBLING(expr);
1508     }
1509 
1510     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1511                         expr, &recompile, NULL);
1512 
1513     /* handle bare (possibly after overloading) regex: foo =~ $re */
1514     {
1515         SV *re = pat;
1516         if (SvROK(re))
1517             re = SvRV(re);
1518         if (SvTYPE(re) == SVt_REGEXP) {
1519             if (is_bare_re)
1520                 *is_bare_re = TRUE;
1521             SvREFCNT_inc(re);
1522             DEBUG_PARSE_r(Perl_re_printf( aTHX_
1523                 "Precompiled pattern%s\n",
1524                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1525 
1526             return (REGEXP*)re;
1527         }
1528     }
1529 
1530     exp = SvPV_nomg(pat, plen);
1531 
1532     if (!eng->op_comp) {
1533         if ((SvUTF8(pat) && IN_BYTES)
1534                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1535         {
1536             /* make a temporary copy; either to convert to bytes,
1537              * or to avoid repeating get-magic / overloaded stringify */
1538             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1539                                         (IN_BYTES ? 0 : SvUTF8(pat)));
1540         }
1541         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1542     }
1543 
1544     /* ignore the utf8ness if the pattern is 0 length */
1545     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1546     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1547 
1548 
1549     DEBUG_COMPILE_r({
1550             RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len);
1551             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
1552                           PL_colors[4], PL_colors[5], s);
1553         });
1554 
1555     /* we jump here if we have to recompile, e.g., from upgrading the pattern
1556      * to utf8 */
1557 
1558     if ((pm_flags & PMf_USE_RE_EVAL)
1559                 /* this second condition covers the non-regex literal case,
1560                  * i.e.  $foo =~ '(?{})'. */
1561                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1562     )
1563         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1564 
1565   redo_parse:
1566     /* return old regex if pattern hasn't changed */
1567     /* XXX: note in the below we have to check the flags as well as the
1568      * pattern.
1569      *
1570      * Things get a touch tricky as we have to compare the utf8 flag
1571      * independently from the compile flags.
1572      *
1573      * ALSO NOTE: After this point we may need to zero members of pRExC_state
1574      * explicitly. Prior to this point they should all be zeroed as part of
1575      * a struct wide Zero instruction.
1576      */
1577 
1578     if (   old_re
1579         && !recompile
1580         && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1581         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1582         && RX_PRELEN(old_re) == plen
1583         && memEQ(RX_PRECOMP(old_re), exp, plen)
1584         && !runtime_code /* with runtime code, always recompile */ )
1585     {
1586         DEBUG_COMPILE_r({
1587             RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len);
1588             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
1589                           PL_colors[4], PL_colors[5], s);
1590         });
1591         return old_re;
1592     }
1593 
1594     /* Allocate the pattern's SV */
1595     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1596     RExC_rx = ReANY(Rx);
1597     if ( RExC_rx == NULL )
1598         FAIL("Regexp out of space");
1599 
1600     rx_flags = orig_rx_flags;
1601     if (rx_flags & RXf_SPLIT)
1602         rx_flags &= ~(RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE);
1603 
1604     if (   toUSE_UNI_CHARSET_NOT_DEPENDS
1605         && initial_charset == REGEX_DEPENDS_CHARSET)
1606     {
1607 
1608         /* Set to use unicode semantics if the pattern is in utf8 and has the
1609          * 'depends' charset specified, as it means unicode when utf8  */
1610         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1611         RExC_uni_semantics = 1;
1612     }
1613 
1614     RExC_pm_flags = pm_flags;
1615 
1616     if (runtime_code) {
1617         assert(TAINTING_get || !TAINT_get);
1618         if (TAINT_get)
1619             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1620 
1621         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1622             /* whoops, we have a non-utf8 pattern, whilst run-time code
1623              * got compiled as utf8. Try again with a utf8 pattern */
1624             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1625                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1626             goto redo_parse;
1627         }
1628     }
1629     assert(!pRExC_state->runtime_code_qr);
1630 
1631     RExC_sawback = 0;
1632 
1633     RExC_seen = 0;
1634     RExC_maxlen = 0;
1635     RExC_in_lookaround = 0;
1636     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1637     RExC_recode_x_to_native = 0;
1638     RExC_in_multi_char_class = 0;
1639 
1640     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1641     RExC_precomp_end = RExC_end = exp + plen;
1642     RExC_nestroot = 0;
1643     RExC_whilem_seen = 0;
1644     RExC_end_op = NULL;
1645     RExC_recurse = NULL;
1646     RExC_study_chunk_recursed = NULL;
1647     RExC_study_chunk_recursed_bytes= 0;
1648     RExC_recurse_count = 0;
1649     RExC_sets_depth = 0;
1650     pRExC_state->code_index = 0;
1651 
1652     /* Initialize the string in the compiled pattern.  This is so that there is
1653      * something to output if necessary */
1654     set_regex_pv(pRExC_state, Rx);
1655 
1656     DEBUG_PARSE_r({
1657         Perl_re_printf( aTHX_
1658             "Starting parse and generation\n");
1659         RExC_lastnum=0;
1660         RExC_lastparse=NULL;
1661     });
1662 
1663     /* Allocate space and zero-initialize. Note, the two step process
1664        of zeroing when in debug mode, thus anything assigned has to
1665        happen after that */
1666     if (!  RExC_size) {
1667 
1668         /* On the first pass of the parse, we guess how big this will be.  Then
1669          * we grow in one operation to that amount and then give it back.  As
1670          * we go along, we re-allocate what we need.
1671          *
1672          * XXX Currently the guess is essentially that the pattern will be an
1673          * EXACT node with one byte input, one byte output.  This is crude, and
1674          * better heuristics are welcome.
1675          *
1676          * On any subsequent passes, we guess what we actually computed in the
1677          * latest earlier pass.  Such a pass probably didn't complete so is
1678          * missing stuff.  We could improve those guesses by knowing where the
1679          * parse stopped, and use the length so far plus apply the above
1680          * assumption to what's left. */
1681         RExC_size = STR_SZ(RExC_end - RExC_start);
1682     }
1683 
1684     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1685     if ( RExC_rxi == NULL )
1686         FAIL("Regexp out of space");
1687 
1688     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1689     RXi_SET( RExC_rx, RExC_rxi );
1690 
1691     /* We start from 0 (over from 0 in the case this is a reparse.  The first
1692      * node parsed will give back any excess memory we have allocated so far).
1693      * */
1694     RExC_size = 0;
1695 
1696     /* non-zero initialization begins here */
1697     RExC_rx->engine= eng;
1698     RExC_rx->extflags = rx_flags;
1699     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1700 
1701     if (pm_flags & PMf_IS_QR) {
1702         RExC_rxi->code_blocks = pRExC_state->code_blocks;
1703         if (RExC_rxi->code_blocks) {
1704             RExC_rxi->code_blocks->refcnt++;
1705         }
1706     }
1707 
1708     RExC_rx->intflags = 0;
1709 
1710     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
1711     RExC_parse_set(exp);
1712 
1713     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1714      * code makes sure the final byte is an uncounted NUL.  But should this
1715      * ever not be the case, lots of things could read beyond the end of the
1716      * buffer: loops like
1717      *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1718      *      strchr(RExC_parse, "foo");
1719      * etc.  So it is worth noting. */
1720     assert(*RExC_end == '\0');
1721 
1722     RExC_naughty = 0;
1723     RExC_npar = 1;
1724     RExC_logical_npar = 1;
1725     RExC_parens_buf_size = 0;
1726     RExC_emit_start = RExC_rxi->program;
1727     pRExC_state->code_index = 0;
1728 
1729     *((char*) RExC_emit_start) = (char) REG_MAGIC;
1730     RExC_emit = NODE_STEP_REGNODE;
1731 
1732     /* Do the parse */
1733     if (reg(pRExC_state, 0, &flags, 1)) {
1734 
1735         /* Success!, But we may need to redo the parse knowing how many parens
1736          * there actually are */
1737         if (IN_PARENS_PASS) {
1738             flags |= RESTART_PARSE;
1739         }
1740 
1741         /* We have that number in RExC_npar */
1742         RExC_total_parens = RExC_npar;
1743         RExC_logical_total_parens = RExC_logical_npar;
1744     }
1745     else if (! MUST_RESTART(flags)) {
1746         ReREFCNT_dec(Rx);
1747         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1748     }
1749 
1750     /* Here, we either have success, or we have to redo the parse for some reason */
1751     if (MUST_RESTART(flags)) {
1752 
1753         /* It's possible to write a regexp in ascii that represents Unicode
1754         codepoints outside of the byte range, such as via \x{100}. If we
1755         detect such a sequence we have to convert the entire pattern to utf8
1756         and then recompile, as our sizing calculation will have been based
1757         on 1 byte == 1 character, but we will need to use utf8 to encode
1758         at least some part of the pattern, and therefore must convert the whole
1759         thing.
1760         -- dmq */
1761         if (flags & NEED_UTF8) {
1762 
1763             /* We have stored the offset of the final warning output so far.
1764              * That must be adjusted.  Any variant characters between the start
1765              * of the pattern and this warning count for 2 bytes in the final,
1766              * so just add them again */
1767             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1768                 RExC_latest_warn_offset +=
1769                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
1770                                                 + RExC_latest_warn_offset);
1771             }
1772             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1773             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1774             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1775         }
1776         else {
1777             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1778         }
1779 
1780         if (ALL_PARENS_COUNTED) {
1781             /* Make enough room for all the known parens, and zero it */
1782             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1783             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1784             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
1785 
1786             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1787             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1788             /* we do NOT reinitialize  RExC_logical_to_parno and
1789              * RExC_parno_to_logical here. We need their data on the second
1790              * pass */
1791         }
1792         else { /* Parse did not complete.  Reinitialize the parentheses
1793                   structures */
1794             RExC_total_parens = 0;
1795             if (RExC_open_parens) {
1796                 Safefree(RExC_open_parens);
1797                 RExC_open_parens = NULL;
1798             }
1799             if (RExC_close_parens) {
1800                 Safefree(RExC_close_parens);
1801                 RExC_close_parens = NULL;
1802             }
1803             if (RExC_logical_to_parno) {
1804                 Safefree(RExC_logical_to_parno);
1805                 RExC_logical_to_parno = NULL;
1806             }
1807             if (RExC_parno_to_logical) {
1808                 Safefree(RExC_parno_to_logical);
1809                 RExC_parno_to_logical = NULL;
1810             }
1811         }
1812 
1813         /* Clean up what we did in this parse */
1814         SvREFCNT_dec_NN(RExC_rx_sv);
1815         RExC_rx_sv = NULL;
1816 
1817         goto redo_parse;
1818     }
1819 
1820     /* Here, we have successfully parsed and generated the pattern's program
1821      * for the regex engine.  We are ready to finish things up and look for
1822      * optimizations. */
1823 
1824     /* Update the string to compile, with correct modifiers, etc */
1825     set_regex_pv(pRExC_state, Rx);
1826 
1827     RExC_rx->nparens = RExC_total_parens - 1;
1828     RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1829 
1830     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1831     if (RExC_whilem_seen > 15)
1832         RExC_whilem_seen = 15;
1833 
1834     DEBUG_PARSE_r({
1835         Perl_re_printf( aTHX_
1836             "Required size %" IVdf " nodes\n", (IV)RExC_size);
1837         RExC_lastnum=0;
1838         RExC_lastparse=NULL;
1839     });
1840 
1841     SetProgLen(RExC_rxi,RExC_size);
1842 
1843     DEBUG_DUMP_PRE_OPTIMIZE_r({
1844         SV * const sv = sv_newmortal(); /* can this use RExC_mysv? */
1845         RXi_GET_DECL(RExC_rx, ri);
1846         DEBUG_RExC_seen();
1847         Perl_re_printf( aTHX_ "Program before optimization:\n");
1848 
1849         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1850                         sv, 0, 0);
1851     });
1852 
1853     DEBUG_OPTIMISE_r(
1854         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
1855     );
1856 
1857     /* XXXX To minimize changes to RE engine we always allocate
1858        3-units-long substrs field. */
1859     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1860     if (RExC_recurse_count) {
1861         Newx(RExC_recurse, RExC_recurse_count, regnode *);
1862         SAVEFREEPV(RExC_recurse);
1863     }
1864 
1865     if (RExC_seen & REG_RECURSE_SEEN) {
1866         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1867          * So its 1 if there are no parens. */
1868         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1869                                          ((RExC_total_parens & 0x07) != 0);
1870         Newx(RExC_study_chunk_recursed,
1871              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1872         SAVEFREEPV(RExC_study_chunk_recursed);
1873     }
1874 
1875   reStudy:
1876     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1877     DEBUG_r(
1878         RExC_study_chunk_recursed_count= 0;
1879     );
1880     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1881     if (RExC_study_chunk_recursed) {
1882         Zero(RExC_study_chunk_recursed,
1883              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1884     }
1885 
1886 
1887 #ifdef TRIE_STUDY_OPT
1888     /* search for "restudy" in this file for a detailed explanation */
1889     if (!restudied) {
1890         StructCopy(&zero_scan_data, &data, scan_data_t);
1891         copyRExC_state = *pRExC_state;
1892     } else {
1893         U32 seen=RExC_seen;
1894         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1895 
1896         *pRExC_state = copyRExC_state;
1897         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1898             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1899         else
1900             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1901         StructCopy(&zero_scan_data, &data, scan_data_t);
1902     }
1903 #else
1904     StructCopy(&zero_scan_data, &data, scan_data_t);
1905 #endif
1906 
1907     /* Dig out information for optimizations. */
1908     RExC_rx->extflags = RExC_flags; /* was pm_op */
1909     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1910 
1911     if (UTF)
1912         SvUTF8_on(Rx);	/* Unicode in it? */
1913     RExC_rxi->regstclass = NULL;
1914     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
1915         RExC_rx->intflags |= PREGf_NAUGHTY;
1916     scan = RExC_rxi->program + 1;		/* First BRANCH. */
1917 
1918     /* testing for BRANCH here tells us whether there is "must appear"
1919        data in the pattern. If there is then we can use it for optimisations */
1920     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
1921                                                   */
1922         SSize_t fake_deltap;
1923         STRLEN longest_length[2];
1924         regnode_ssc ch_class; /* pointed to by data */
1925         int stclass_flag;
1926         SSize_t last_close = 0; /* pointed to by data */
1927         regnode *first= scan;
1928         regnode *first_next= regnext(first);
1929         regnode *last_close_op= NULL;
1930         int i;
1931 
1932         /*
1933          * Skip introductions and multiplicators >= 1
1934          * so that we can extract the 'meat' of the pattern that must
1935          * match in the large if() sequence following.
1936          * NOTE that EXACT is NOT covered here, as it is normally
1937          * picked up by the optimiser separately.
1938          *
1939          * This is unfortunate as the optimiser isnt handling lookahead
1940          * properly currently.
1941          *
1942          */
1943         while (1)
1944         {
1945             if (OP(first) == OPEN)
1946                 sawopen = 1;
1947             else
1948             if (OP(first) == IFMATCH && !FLAGS(first))
1949                 /* for now we can't handle lookbehind IFMATCH */
1950                 sawlookahead = 1;
1951             else
1952             if (OP(first) == PLUS)
1953                 sawplus = 1;
1954             else
1955             if (OP(first) == MINMOD)
1956                 sawminmod = 1;
1957             else
1958             if (!(
1959                 /* An OR of *one* alternative - should not happen now. */
1960                 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1961                 /* An {n,m} with n>0 */
1962                 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1963                 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1964             )){
1965                 break;
1966             }
1967 
1968             first = REGNODE_AFTER(first);
1969             first_next= regnext(first);
1970         }
1971 
1972         /* Starting-point info. */
1973       again:
1974         DEBUG_PEEP("first:", first, 0, 0);
1975         /* Ignore EXACT as we deal with it later. */
1976         if (REGNODE_TYPE(OP(first)) == EXACT) {
1977             if (! isEXACTFish(OP(first))) {
1978                 NOOP;	/* Empty, get anchored substr later. */
1979             }
1980             else
1981                 RExC_rxi->regstclass = first;
1982         }
1983 #ifdef TRIE_STCLASS
1984         else if (REGNODE_TYPE(OP(first)) == TRIE &&
1985                 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1986         {
1987             /* this can happen only on restudy
1988              * Search for "restudy" in this file to find
1989              * a comment with details. */
1990             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1991         }
1992 #endif
1993         else if (REGNODE_SIMPLE(OP(first)))
1994             RExC_rxi->regstclass = first;
1995         else if (REGNODE_TYPE(OP(first)) == BOUND ||
1996                  REGNODE_TYPE(OP(first)) == NBOUND)
1997             RExC_rxi->regstclass = first;
1998         else if (REGNODE_TYPE(OP(first)) == BOL) {
1999             RExC_rx->intflags |= (OP(first) == MBOL
2000                            ? PREGf_ANCH_MBOL
2001                            : PREGf_ANCH_SBOL);
2002             first = REGNODE_AFTER(first);
2003             goto again;
2004         }
2005         else if (OP(first) == GPOS) {
2006             RExC_rx->intflags |= PREGf_ANCH_GPOS;
2007             first = REGNODE_AFTER_type(first,tregnode_GPOS);
2008             goto again;
2009         }
2010         else if ((!sawopen || !RExC_sawback) &&
2011             !sawlookahead &&
2012             (OP(first) == STAR &&
2013             REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
2014             !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
2015         {
2016             /* turn .* into ^.* with an implied $*=1 */
2017             const int type =
2018                 (OP(REGNODE_AFTER(first)) == REG_ANY)
2019                     ? PREGf_ANCH_MBOL
2020                     : PREGf_ANCH_SBOL;
2021             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
2022             first = REGNODE_AFTER(first);
2023             goto again;
2024         }
2025         if (sawplus && !sawminmod && !sawlookahead
2026             && (!sawopen || !RExC_sawback)
2027             && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
2028             /* x+ must match at the 1st pos of run of x's */
2029             RExC_rx->intflags |= PREGf_SKIP;
2030 
2031         /* Scan is after the zeroth branch, first is atomic matcher. */
2032 #ifdef TRIE_STUDY_OPT
2033         /* search for "restudy" in this file for a detailed explanation */
2034         DEBUG_PARSE_r(
2035             if (!restudied)
2036                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
2037                               (IV)(first - scan + 1))
2038         );
2039 #else
2040         DEBUG_PARSE_r(
2041             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
2042                 (IV)(first - scan + 1))
2043         );
2044 #endif
2045 
2046 
2047         /*
2048         * If there's something expensive in the r.e., find the
2049         * longest literal string that must appear and make it the
2050         * regmust.  Resolve ties in favor of later strings, since
2051         * the regstart check works with the beginning of the r.e.
2052         * and avoiding duplication strengthens checking.  Not a
2053         * strong reason, but sufficient in the absence of others.
2054         * [Now we resolve ties in favor of the earlier string if
2055         * it happens that c_offset_min has been invalidated, since the
2056         * earlier string may buy us something the later one won't.]
2057         */
2058 
2059         data.substrs[0].str = newSVpvs("");
2060         data.substrs[1].str = newSVpvs("");
2061         data.last_found = newSVpvs("");
2062         data.cur_is_floating = 0; /* initially any found substring is fixed */
2063         ENTER_with_name("study_chunk");
2064         SAVEFREESV(data.substrs[0].str);
2065         SAVEFREESV(data.substrs[1].str);
2066         SAVEFREESV(data.last_found);
2067         first = scan;
2068         if (!RExC_rxi->regstclass) {
2069             ssc_init(pRExC_state, &ch_class);
2070             data.start_class = &ch_class;
2071             stclass_flag = SCF_DO_STCLASS_AND;
2072         } else				/* XXXX Check for BOUND? */
2073             stclass_flag = 0;
2074         data.last_closep = &last_close;
2075         data.last_close_opp = &last_close_op;
2076 
2077         DEBUG_RExC_seen();
2078         /*
2079          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
2080          * (NO top level branches)
2081          */
2082         minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
2083                              scan + RExC_size, /* Up to end */
2084             &data, -1, 0, NULL,
2085             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
2086                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
2087             0, TRUE);
2088         /* search for "restudy" in this file for a detailed explanation
2089          * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2090 
2091 
2092         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
2093 
2094 
2095         if ( RExC_total_parens == 1 && !data.cur_is_floating
2096              && data.last_start_min == 0 && data.last_end > 0
2097              && !RExC_seen_zerolen
2098              && !(RExC_seen & REG_VERBARG_SEEN)
2099              && !(RExC_seen & REG_GPOS_SEEN)
2100         ){
2101             RExC_rx->extflags |= RXf_CHECK_ALL;
2102         }
2103         scan_commit(pRExC_state, &data,&minlen, 0);
2104 
2105 
2106         /* XXX this is done in reverse order because that's the way the
2107          * code was before it was parameterised. Don't know whether it
2108          * actually needs doing in reverse order. DAPM */
2109         for (i = 1; i >= 0; i--) {
2110             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2111 
2112             if (   !(   i
2113                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
2114                      &&    data.substrs[0].min_offset
2115                         == data.substrs[1].min_offset
2116                      &&    SvCUR(data.substrs[0].str)
2117                         == SvCUR(data.substrs[1].str)
2118                     )
2119                 && S_setup_longest (aTHX_ pRExC_state,
2120                                         &(RExC_rx->substrs->data[i]),
2121                                         &(data.substrs[i]),
2122                                         longest_length[i]))
2123             {
2124                 RExC_rx->substrs->data[i].min_offset =
2125                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
2126 
2127                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2128                 /* Don't offset infinity */
2129                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2130                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2131                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2132             }
2133             else {
2134                 RExC_rx->substrs->data[i].substr      = NULL;
2135                 RExC_rx->substrs->data[i].utf8_substr = NULL;
2136                 longest_length[i] = 0;
2137             }
2138         }
2139 
2140         LEAVE_with_name("study_chunk");
2141 
2142         if (RExC_rxi->regstclass
2143             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2144             RExC_rxi->regstclass = NULL;
2145 
2146         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2147               || RExC_rx->substrs->data[0].min_offset)
2148             && stclass_flag
2149             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2150             && is_ssc_worth_it(pRExC_state, data.start_class))
2151         {
2152             const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2153 
2154             ssc_finalize(pRExC_state, data.start_class);
2155 
2156             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2157             StructCopy(data.start_class,
2158                        (regnode_ssc*)RExC_rxi->data->data[n],
2159                        regnode_ssc);
2160             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2161             RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2162             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2163                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2164                       Perl_re_printf( aTHX_
2165                                     "synthetic stclass \"%s\".\n",
2166                                     SvPVX_const(sv));});
2167             data.start_class = NULL;
2168         }
2169 
2170         /* A temporary algorithm prefers floated substr to fixed one of
2171          * same length to dig more info. */
2172         i = (longest_length[0] <= longest_length[1]);
2173         RExC_rx->substrs->check_ix = i;
2174         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
2175         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
2176         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
2177         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2178         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2179         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2180             RExC_rx->intflags |= PREGf_NOSCAN;
2181 
2182         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2183             RExC_rx->extflags |= RXf_USE_INTUIT;
2184             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2185                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2186         }
2187 
2188         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2189         if ( (STRLEN)minlen < longest_length[1] )
2190             minlen= longest_length[1];
2191         if ( (STRLEN)minlen < longest_length[0] )
2192             minlen= longest_length[0];
2193         */
2194     }
2195     else {
2196         /* Several toplevels. Best we can is to set minlen. */
2197         SSize_t fake_deltap;
2198         regnode_ssc ch_class;
2199         SSize_t last_close = 0;
2200         regnode *last_close_op = NULL;
2201 
2202         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
2203 
2204         scan = RExC_rxi->program + 1;
2205         ssc_init(pRExC_state, &ch_class);
2206         data.start_class = &ch_class;
2207         data.last_closep = &last_close;
2208         data.last_close_opp = &last_close_op;
2209 
2210         DEBUG_RExC_seen();
2211         /*
2212          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2213          * (patterns WITH top level branches)
2214          */
2215         minlen = study_chunk(pRExC_state,
2216             &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2217             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2218                                                       ? SCF_TRIE_DOING_RESTUDY
2219                                                       : 0),
2220             0, TRUE);
2221         /* search for "restudy" in this file for a detailed explanation
2222          * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2223 
2224         CHECK_RESTUDY_GOTO_butfirst(NOOP);
2225 
2226         RExC_rx->check_substr = NULL;
2227         RExC_rx->check_utf8 = NULL;
2228         RExC_rx->substrs->data[0].substr      = NULL;
2229         RExC_rx->substrs->data[0].utf8_substr = NULL;
2230         RExC_rx->substrs->data[1].substr      = NULL;
2231         RExC_rx->substrs->data[1].utf8_substr = NULL;
2232 
2233         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2234             && is_ssc_worth_it(pRExC_state, data.start_class))
2235         {
2236             const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2237 
2238             ssc_finalize(pRExC_state, data.start_class);
2239 
2240             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2241             StructCopy(data.start_class,
2242                        (regnode_ssc*)RExC_rxi->data->data[n],
2243                        regnode_ssc);
2244             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2245             RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2246             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2247                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2248                       Perl_re_printf( aTHX_
2249                                     "synthetic stclass \"%s\".\n",
2250                                     SvPVX_const(sv));});
2251             data.start_class = NULL;
2252         }
2253     }
2254 
2255     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2256         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2257         RExC_rx->maxlen = REG_INFTY;
2258     }
2259     else {
2260         RExC_rx->maxlen = RExC_maxlen;
2261     }
2262 
2263     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2264        the "real" pattern. */
2265     DEBUG_OPTIMISE_r({
2266         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2267                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2268     });
2269     RExC_rx->minlenret = minlen;
2270     if (RExC_rx->minlen < minlen)
2271         RExC_rx->minlen = minlen;
2272 
2273     if (RExC_seen & REG_RECURSE_SEEN ) {
2274         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2275         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2276     }
2277     if (RExC_seen & REG_GPOS_SEEN)
2278         RExC_rx->intflags |= PREGf_GPOS_SEEN;
2279 
2280     if (RExC_seen & REG_PESSIMIZE_SEEN)
2281         RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2282 
2283     if (RExC_seen & REG_LOOKBEHIND_SEEN)
2284         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2285                                                 lookbehind */
2286     if (pRExC_state->code_blocks)
2287         RExC_rx->extflags |= RXf_EVAL_SEEN;
2288 
2289     if (RExC_seen & REG_VERBARG_SEEN) {
2290         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2291         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2292     }
2293 
2294     if (RExC_seen & REG_CUTGROUP_SEEN)
2295         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2296 
2297     if (pm_flags & PMf_USE_RE_EVAL)
2298         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2299 
2300     if (RExC_paren_names)
2301         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2302     else
2303         RXp_PAREN_NAMES(RExC_rx) = NULL;
2304 
2305     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2306      * so it can be used in pp.c */
2307     if (RExC_rx->intflags & PREGf_ANCH)
2308         RExC_rx->extflags |= RXf_IS_ANCHORED;
2309 
2310 
2311     {
2312         /* this is used to identify "special" patterns that might result
2313          * in Perl NOT calling the regex engine and instead doing the match "itself",
2314          * particularly special cases in split//. By having the regex compiler
2315          * do this pattern matching at a regop level (instead of by inspecting the pattern)
2316          * we avoid weird issues with equivalent patterns resulting in different behavior,
2317          * AND we allow non Perl engines to get the same optimizations by the setting the
2318          * flags appropriately - Yves */
2319         regnode *first = RExC_rxi->program + 1;
2320         U8 fop = OP(first);
2321         regnode *next = NULL;
2322         U8 nop = 0;
2323         if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2324             next = REGNODE_AFTER(first);
2325             nop = OP(next);
2326         }
2327         /* It's safe to read through *next only if OP(first) is a regop of
2328          * the right type (not EXACT, for example).
2329          */
2330         if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2331             RExC_rx->extflags |= RXf_NULL;
2332         else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2333             /* when fop is SBOL first->flags will be true only when it was
2334              * produced by parsing /\A/, and not when parsing /^/. This is
2335              * very important for the split code as there we want to
2336              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2337              * See rt #122761 for more details. -- Yves */
2338             RExC_rx->extflags |= RXf_START_ONLY;
2339         else if (fop == PLUS
2340                  && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2341                  && OP(regnext(first)) == END)
2342             RExC_rx->extflags |= RXf_WHITE;
2343         else if ( RExC_rx->extflags & RXf_SPLIT
2344                   && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2345                   && STR_LEN(first) == 1
2346                   && *(STRING(first)) == ' '
2347                   && OP(regnext(first)) == END )
2348             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2349 
2350     }
2351 
2352     if (RExC_contains_locale) {
2353         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2354     }
2355 
2356 #ifdef DEBUGGING
2357     if (RExC_paren_names) {
2358         RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2359         RExC_rxi->data->data[RExC_rxi->name_list_idx]
2360                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
2361     } else
2362 #endif
2363     RExC_rxi->name_list_idx = 0;
2364 
2365     while ( RExC_recurse_count > 0 ) {
2366         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2367         /*
2368          * This data structure is set up in study_chunk() and is used
2369          * to calculate the distance between a GOSUB regopcode and
2370          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2371          * it refers to.
2372          *
2373          * If for some reason someone writes code that optimises
2374          * away a GOSUB opcode then the assert should be changed to
2375          * an if(scan) to guard the ARG2i_SET() - Yves
2376          *
2377          */
2378         assert(scan && OP(scan) == GOSUB);
2379         ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2380     }
2381     if (RExC_logical_total_parens != RExC_total_parens) {
2382         Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2383         /* we rebuild this below */
2384         Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2385         for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2386             int logical_parno= RExC_parno_to_logical[parno];
2387             assert(logical_parno);
2388             RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2389             RExC_logical_to_parno[logical_parno] = parno;
2390         }
2391         RExC_rx->logical_to_parno = RExC_logical_to_parno;
2392         RExC_rx->parno_to_logical = RExC_parno_to_logical;
2393         RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2394         RExC_logical_to_parno = NULL;
2395         RExC_parno_to_logical = NULL;
2396         RExC_parno_to_logical_next = NULL;
2397     } else {
2398         RExC_rx->logical_to_parno = NULL;
2399         RExC_rx->parno_to_logical = NULL;
2400         RExC_rx->parno_to_logical_next = NULL;
2401     }
2402 
2403     Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2404     /* assume we don't need to swap parens around before we match */
2405     DEBUG_TEST_r({
2406         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2407             (unsigned long)RExC_study_chunk_recursed_count);
2408     });
2409     DEBUG_DUMP_r({
2410         DEBUG_RExC_seen();
2411         Perl_re_printf( aTHX_ "Final program:\n");
2412         regdump(RExC_rx);
2413     });
2414 
2415     /* we're returning ownership of the SV to the caller, ensure the cleanup
2416      * doesn't release it
2417      */
2418     RExC_rx_sv = NULL;
2419 
2420 #ifdef USE_ITHREADS
2421     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2422      * by setting the regexp SV to readonly-only instead. If the
2423      * pattern's been recompiled, the USEDness should remain. */
2424     if (old_re && SvREADONLY(old_re))
2425         SvREADONLY_on(Rx);
2426 #endif
2427     return Rx;
2428 }
2429 
2430 
2431 
2432 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)2433 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2434 {
2435     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2436         PERL_UNUSED_ARG(rx);
2437         if (0)
2438             return NULL;
2439         else
2440             return newSVpvs("Regexp");
2441 }
2442 
2443 /* Scans the name of a named buffer from the pattern.
2444  * If flags is REG_RSN_RETURN_NULL returns null.
2445  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2446  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2447  * to the parsed name as looked up in the RExC_paren_names hash.
2448  * If there is an error throws a vFAIL().. type exception.
2449  */
2450 
2451 #define REG_RSN_RETURN_NULL    0
2452 #define REG_RSN_RETURN_NAME    1
2453 #define REG_RSN_RETURN_DATA    2
2454 
2455 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)2456 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2457 {
2458     char *name_start = RExC_parse;
2459     SV* sv_name;
2460 
2461     PERL_ARGS_ASSERT_REG_SCAN_NAME;
2462 
2463     assert (RExC_parse <= RExC_end);
2464     if (RExC_parse == RExC_end) NOOP;
2465     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2466          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
2467           * using do...while */
2468         if (UTF)
2469             do {
2470                 RExC_parse_inc_utf8();
2471             } while (   RExC_parse < RExC_end
2472                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2473         else
2474             do {
2475                 RExC_parse_inc_by(1);
2476             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2477     } else {
2478         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2479                          character */
2480         vFAIL("Group name must start with a non-digit word character");
2481     }
2482     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2483                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2484     if ( flags == REG_RSN_RETURN_NAME)
2485         return sv_name;
2486     else if (flags==REG_RSN_RETURN_DATA) {
2487         HE *he_str = NULL;
2488         SV *sv_dat = NULL;
2489         if ( ! sv_name )      /* should not happen*/
2490             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2491         if (RExC_paren_names)
2492             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2493         if ( he_str )
2494             sv_dat = HeVAL(he_str);
2495         if ( ! sv_dat ) {   /* Didn't find group */
2496 
2497             /* It might be a forward reference; we can't fail until we
2498                 * know, by completing the parse to get all the groups, and
2499                 * then reparsing */
2500             if (ALL_PARENS_COUNTED)  {
2501                 vFAIL("Reference to nonexistent named group");
2502             }
2503             else {
2504                 REQUIRE_PARENS_PASS;
2505             }
2506         }
2507         return sv_dat;
2508     }
2509 
2510     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2511                      (unsigned long) flags);
2512 }
2513 
2514 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
2515     if (RExC_lastparse!=RExC_parse) {                           \
2516         Perl_re_printf( aTHX_  "%s",                            \
2517             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
2518                 RExC_end - RExC_parse, 16,                      \
2519                 "", "",                                         \
2520                 PERL_PV_ESCAPE_UNI_DETECT |                     \
2521                 PERL_PV_PRETTY_ELLIPSES   |                     \
2522                 PERL_PV_PRETTY_LTGT       |                     \
2523                 PERL_PV_ESCAPE_RE         |                     \
2524                 PERL_PV_PRETTY_EXACTSIZE                        \
2525             )                                                   \
2526         );                                                      \
2527     } else                                                      \
2528         Perl_re_printf( aTHX_ "%16s","");                       \
2529                                                                 \
2530     if (RExC_lastnum!=RExC_emit)                                \
2531        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
2532     else                                                        \
2533        Perl_re_printf( aTHX_ "|%4s","");                        \
2534     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
2535         (int)((depth*2)), "",                                   \
2536         (funcname)                                              \
2537     );                                                          \
2538     RExC_lastnum=RExC_emit;                                     \
2539     RExC_lastparse=RExC_parse;                                  \
2540 })
2541 
2542 
2543 
2544 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
2545     DEBUG_PARSE_MSG((funcname));                            \
2546     Perl_re_printf( aTHX_ "%4s","\n");                                  \
2547 })
2548 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
2549     DEBUG_PARSE_MSG((funcname));                            \
2550     Perl_re_printf( aTHX_ fmt "\n",args);                               \
2551 })
2552 
2553 
2554 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)2555 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2556 {
2557     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2558      * constructs, and updates RExC_flags with them.  On input, RExC_parse
2559      * should point to the first flag; it is updated on output to point to the
2560      * final ')' or ':'.  There needs to be at least one flag, or this will
2561      * abort */
2562 
2563     /* for (?g), (?gc), and (?o) warnings; warning
2564        about (?c) will warn about (?g) -- japhy    */
2565 
2566 #define WASTED_O  0x01
2567 #define WASTED_G  0x02
2568 #define WASTED_C  0x04
2569 #define WASTED_GC (WASTED_G|WASTED_C)
2570     I32 wastedflags = 0x00;
2571     U32 posflags = 0, negflags = 0;
2572     U32 *flagsp = &posflags;
2573     char has_charset_modifier = '\0';
2574     regex_charset cs;
2575     bool has_use_defaults = FALSE;
2576     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2577     int x_mod_count = 0;
2578 
2579     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2580 
2581     /* '^' as an initial flag sets certain defaults */
2582     if (UCHARAT(RExC_parse) == '^') {
2583         RExC_parse_inc_by(1);
2584         has_use_defaults = TRUE;
2585         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2586         cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2587              ? REGEX_UNICODE_CHARSET
2588              : REGEX_DEPENDS_CHARSET;
2589         set_regex_charset(&RExC_flags, cs);
2590     }
2591     else {
2592         cs = get_regex_charset(RExC_flags);
2593         if (   cs == REGEX_DEPENDS_CHARSET
2594             && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2595         {
2596             cs = REGEX_UNICODE_CHARSET;
2597         }
2598     }
2599 
2600     while (RExC_parse < RExC_end) {
2601         /* && memCHRs("iogcmsx", *RExC_parse) */
2602         /* (?g), (?gc) and (?o) are useless here
2603            and must be globally applied -- japhy */
2604         if ((RExC_pm_flags & PMf_WILDCARD)) {
2605             if (flagsp == & negflags) {
2606                 if (*RExC_parse == 'm') {
2607                     RExC_parse_inc_by(1);
2608                     /* diag_listed_as: Use of %s is not allowed in Unicode
2609                        property wildcard subpatterns in regex; marked by <--
2610                        HERE in m/%s/ */
2611                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
2612                           " property wildcard subpatterns");
2613                 }
2614             }
2615             else {
2616                 if (*RExC_parse == 's') {
2617                     goto modifier_illegal_in_wildcard;
2618                 }
2619             }
2620         }
2621 
2622         switch (*RExC_parse) {
2623 
2624             /* Code for the imsxn flags */
2625             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2626 
2627             case LOCALE_PAT_MOD:
2628                 if (has_charset_modifier) {
2629                     goto excess_modifier;
2630                 }
2631                 else if (flagsp == &negflags) {
2632                     goto neg_modifier;
2633                 }
2634                 cs = REGEX_LOCALE_CHARSET;
2635                 has_charset_modifier = LOCALE_PAT_MOD;
2636                 break;
2637             case UNICODE_PAT_MOD:
2638                 if (has_charset_modifier) {
2639                     goto excess_modifier;
2640                 }
2641                 else if (flagsp == &negflags) {
2642                     goto neg_modifier;
2643                 }
2644                 cs = REGEX_UNICODE_CHARSET;
2645                 has_charset_modifier = UNICODE_PAT_MOD;
2646                 break;
2647             case ASCII_RESTRICT_PAT_MOD:
2648                 if (flagsp == &negflags) {
2649                     goto neg_modifier;
2650                 }
2651                 if (has_charset_modifier) {
2652                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2653                         goto excess_modifier;
2654                     }
2655                     /* Doubled modifier implies more restricted */
2656                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2657                 }
2658                 else {
2659                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
2660                 }
2661                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2662                 break;
2663             case DEPENDS_PAT_MOD:
2664                 if (has_use_defaults) {
2665                     goto fail_modifiers;
2666                 }
2667                 else if (flagsp == &negflags) {
2668                     goto neg_modifier;
2669                 }
2670                 else if (has_charset_modifier) {
2671                     goto excess_modifier;
2672                 }
2673 
2674                 /* The dual charset means unicode semantics if the
2675                  * pattern (or target, not known until runtime) are
2676                  * utf8, or something in the pattern indicates unicode
2677                  * semantics */
2678                 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2679                      ? REGEX_UNICODE_CHARSET
2680                      : REGEX_DEPENDS_CHARSET;
2681                 has_charset_modifier = DEPENDS_PAT_MOD;
2682                 break;
2683               excess_modifier:
2684                 RExC_parse_inc_by(1);
2685                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2686                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2687                 }
2688                 else if (has_charset_modifier == *(RExC_parse - 1)) {
2689                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
2690                                         *(RExC_parse - 1));
2691                 }
2692                 else {
2693                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2694                 }
2695                 NOT_REACHED; /*NOTREACHED*/
2696               neg_modifier:
2697                 RExC_parse_inc_by(1);
2698                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2699                                     *(RExC_parse - 1));
2700                 NOT_REACHED; /*NOTREACHED*/
2701             case GLOBAL_PAT_MOD: /* 'g' */
2702                 if (RExC_pm_flags & PMf_WILDCARD) {
2703                     goto modifier_illegal_in_wildcard;
2704                 }
2705                 /*FALLTHROUGH*/
2706             case ONCE_PAT_MOD: /* 'o' */
2707                 if (ckWARN(WARN_REGEXP)) {
2708                     const I32 wflagbit = *RExC_parse == 'o'
2709                                          ? WASTED_O
2710                                          : WASTED_G;
2711                     if (! (wastedflags & wflagbit) ) {
2712                         wastedflags |= wflagbit;
2713                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2714                         vWARN5(
2715                             RExC_parse + 1,
2716                             "Useless (%s%c) - %suse /%c modifier",
2717                             flagsp == &negflags ? "?-" : "?",
2718                             *RExC_parse,
2719                             flagsp == &negflags ? "don't " : "",
2720                             *RExC_parse
2721                         );
2722                     }
2723                 }
2724                 break;
2725 
2726             case CONTINUE_PAT_MOD: /* 'c' */
2727                 if (RExC_pm_flags & PMf_WILDCARD) {
2728                     goto modifier_illegal_in_wildcard;
2729                 }
2730                 if (ckWARN(WARN_REGEXP)) {
2731                     if (! (wastedflags & WASTED_C) ) {
2732                         wastedflags |= WASTED_GC;
2733                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2734                         vWARN3(
2735                             RExC_parse + 1,
2736                             "Useless (%sc) - %suse /gc modifier",
2737                             flagsp == &negflags ? "?-" : "?",
2738                             flagsp == &negflags ? "don't " : ""
2739                         );
2740                     }
2741                 }
2742                 break;
2743             case KEEPCOPY_PAT_MOD: /* 'p' */
2744                 if (RExC_pm_flags & PMf_WILDCARD) {
2745                     goto modifier_illegal_in_wildcard;
2746                 }
2747                 if (flagsp == &negflags) {
2748                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2749                 } else {
2750                     *flagsp |= RXf_PMf_KEEPCOPY;
2751                 }
2752                 break;
2753             case '-':
2754                 /* A flag is a default iff it is following a minus, so
2755                  * if there is a minus, it means will be trying to
2756                  * re-specify a default which is an error */
2757                 if (has_use_defaults || flagsp == &negflags) {
2758                     goto fail_modifiers;
2759                 }
2760                 flagsp = &negflags;
2761                 wastedflags = 0;  /* reset so (?g-c) warns twice */
2762                 x_mod_count = 0;
2763                 break;
2764             case ':':
2765             case ')':
2766 
2767                 if (  (RExC_pm_flags & PMf_WILDCARD)
2768                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2769                 {
2770                     RExC_parse_inc_by(1);
2771                     /* diag_listed_as: Use of %s is not allowed in Unicode
2772                        property wildcard subpatterns in regex; marked by <--
2773                        HERE in m/%s/ */
2774                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2775                            " property wildcard subpatterns",
2776                            has_charset_modifier);
2777                 }
2778 
2779                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2780                     negflags |= RXf_PMf_EXTENDED_MORE;
2781                 }
2782                 RExC_flags |= posflags;
2783 
2784                 if (negflags & RXf_PMf_EXTENDED) {
2785                     negflags |= RXf_PMf_EXTENDED_MORE;
2786                 }
2787                 RExC_flags &= ~negflags;
2788                 set_regex_charset(&RExC_flags, cs);
2789 
2790                 return;
2791             default:
2792               fail_modifiers:
2793                 RExC_parse_inc_if_char();
2794                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2795                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2796                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2797                 NOT_REACHED; /*NOTREACHED*/
2798         }
2799 
2800         RExC_parse_inc();
2801     }
2802 
2803     vFAIL("Sequence (?... not terminated");
2804 
2805   modifier_illegal_in_wildcard:
2806     RExC_parse_inc_by(1);
2807     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2808        subpatterns in regex; marked by <-- HERE in m/%s/ */
2809     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2810            " subpatterns", *(RExC_parse - 1));
2811 }
2812 
2813 /*
2814  - reg - regular expression, i.e. main body or parenthesized thing
2815  *
2816  * Caller must absorb opening parenthesis.
2817  *
2818  * Combining parenthesis handling with the base level of regular expression
2819  * is a trifle forced, but the need to tie the tails of the branches to what
2820  * follows makes it hard to avoid.
2821  */
2822 
2823 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * backref_parse_start,char ch)2824 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2825                              I32 *flagp,
2826                              char * backref_parse_start,
2827                              char ch
2828                       )
2829 {
2830     regnode_offset ret;
2831     char* name_start = RExC_parse;
2832     U32 num = 0;
2833     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2834     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2835 
2836     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2837 
2838     if (RExC_parse != name_start && ch == '}') {
2839         while (isBLANK(*RExC_parse)) {
2840             RExC_parse_inc_by(1);
2841         }
2842     }
2843     if (RExC_parse == name_start || *RExC_parse != ch) {
2844         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2845         vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2846     }
2847 
2848     if (sv_dat) {
2849         num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2850         RExC_rxi->data->data[num]=(void*)sv_dat;
2851         SvREFCNT_inc_simple_void_NN(sv_dat);
2852     }
2853     RExC_sawback = 1;
2854     ret = reg2node(pRExC_state,
2855                    ((! FOLD)
2856                      ? REFN
2857                      : (ASCII_FOLD_RESTRICTED)
2858                        ? REFFAN
2859                        : (AT_LEAST_UNI_SEMANTICS)
2860                          ? REFFUN
2861                          : (LOC)
2862                            ? REFFLN
2863                            : REFFN),
2864                     num, RExC_nestroot);
2865     if (RExC_nestroot && num >= (U32)RExC_nestroot)
2866         FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2867     *flagp |= HASWIDTH;
2868 
2869     nextchar(pRExC_state);
2870     return ret;
2871 }
2872 
2873 /* reg_la_NOTHING()
2874  *
2875  * Maybe parse a parenthesized lookaround construct that is equivalent to a
2876  * NOTHING regop when the construct is empty.
2877  *
2878  * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2879  *
2880  * Checks for unterminated constructs and throws a "not terminated" error
2881  * with the appropriate type if necessary
2882  *
2883  * Assuming it does not throw an exception increments RExC_seen_zerolen.
2884  *
2885  * If the construct is empty generates a NOTHING op and returns its
2886  * regnode_offset, which the caller would then return to its caller.
2887  *
2888  * If the construct is not empty increments RExC_in_lookaround, and turns
2889  * on any flags provided in RExC_seen, and then returns 0 to signify
2890  * that parsing should continue.
2891  *
2892  * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2893  * any use of it would have had to be broken onto multiple lines, hence
2894  * the abbreviation.
2895  */
2896 STATIC regnode_offset
S_reg_la_NOTHING(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2897 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2898     const char *type)
2899 {
2900 
2901     PERL_ARGS_ASSERT_REG_LA_NOTHING;
2902 
2903     /* false below so we do not force /x */
2904     skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2905 
2906     if (RExC_parse >= RExC_end)
2907         vFAIL2("Sequence (%s... not terminated", type);
2908 
2909     /* Always increment as NOTHING regops are zerolen */
2910     RExC_seen_zerolen++;
2911 
2912     if (*RExC_parse == ')') {
2913         regnode_offset ret= reg_node(pRExC_state, NOTHING);
2914         nextchar(pRExC_state);
2915         return ret;
2916     }
2917 
2918     RExC_seen |= flags;
2919     RExC_in_lookaround++;
2920     return 0; /* keep parsing! */
2921 }
2922 
2923 /* reg_la_OPFAIL()
2924  *
2925  * Maybe parse a parenthesized lookaround construct that is equivalent to a
2926  * OPFAIL regop when the construct is empty.
2927  *
2928  * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2929  *
2930  * Checks for unterminated constructs and throws a "not terminated" error
2931  * if necessary.
2932  *
2933  * If the construct is empty generates an OPFAIL op and returns its
2934  * regnode_offset which the caller should then return to its caller.
2935  *
2936  * If the construct is not empty increments RExC_in_lookaround, and also
2937  * increments RExC_seen_zerolen, and turns on the flags provided in
2938  * RExC_seen, and then returns 0 to signify that parsing should continue.
2939  *
2940  * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2941  * any use of it would have had to be broken onto multiple lines, hence
2942  * the abbreviation.
2943  */
2944 
2945 STATIC regnode_offset
S_reg_la_OPFAIL(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2946 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2947     const char *type)
2948 {
2949 
2950     PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2951 
2952     /* FALSE so we don't force to /x below */;
2953     skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2954 
2955     if (RExC_parse >= RExC_end)
2956         vFAIL2("Sequence (%s... not terminated", type);
2957 
2958     if (*RExC_parse == ')') {
2959         regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2960         nextchar(pRExC_state);
2961         return ret; /* return produced regop */
2962     }
2963 
2964     /* only increment zerolen *after* we check if we produce an OPFAIL
2965      * as an OPFAIL does not match a zero length construct, as it
2966      * does not match ever. */
2967     RExC_seen_zerolen++;
2968     RExC_seen |= flags;
2969     RExC_in_lookaround++;
2970     return 0; /* keep parsing! */
2971 }
2972 
2973 /* Below are the main parsing routines.
2974  *
2975  * S_reg()      parses a whole pattern or subpattern.  It itself handles things
2976  *              like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2977  *              alternation '|' in the '...' pattern.
2978  * S_regbranch() effectively implements the concatenation operator, handling
2979  *              one alternative of '|', repeatedly calling S_regpiece on each
2980  *              segment of the input.
2981  * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2982  *              and then adds any quantifier for that chunk.
2983  * S_regatom()  parses the next chunk of the input, returning when it
2984  *              determines it has found a complete atomic chunk.  The chunk may
2985  *              be a nested subpattern, in which case S_reg is called
2986  *              recursively
2987  *
2988  * The functions generate regnodes as they go along, appending each to the
2989  * pattern data structure so far.  They return the offset of the current final
2990  * node into that structure, or 0 on failure.
2991  *
2992  * There are three parameters common to all of them:
2993  *   pRExC_state    is a structure with much information about the current
2994  *                  state of the parse.  It's easy to add new elements to
2995  *                  convey new information, but beware that an error return may
2996  *                  require clearing the element.
2997  *   flagp          is a pointer to bit flags set in a lower level to pass up
2998  *                  to higher levels information, such as the cause of a
2999  *                  failure, or some characteristic about the generated node
3000  *   depth          is roughly the recursion depth, mostly unused except for
3001  *                  pretty printing debugging info.
3002  *
3003  * There are ancillary functions that these may farm work out to, using the
3004  * same parameters.
3005  *
3006  * The protocol for handling flags is that each function will, before
3007  * returning, add into *flagp the flags it needs to pass up.  Each function has
3008  * a second flags variable, typically named 'flags', which it sets and clears
3009  * at will.  Flag bits in it are used in that function, and it calls the next
3010  * layer down with its 'flagp' parameter set to '&flags'.  Thus, upon return,
3011  * 'flags' will contain whatever it had before the call, plus whatever that
3012  * function passed up.  If it wants to pass any of these up to its caller, it
3013  * has to add them to its *flagp.  This means that it takes extra steps to keep
3014  * passing a flag upwards, and otherwise the flag bit is cleared for higher
3015  * functions.
3016  */
3017 
3018 /* On success, returns the offset at which any next node should be placed into
3019  * the regex engine program being compiled.
3020  *
3021  * Returns 0 otherwise, with *flagp set to indicate why:
3022  *  TRYAGAIN        at the end of (?) that only sets flags.
3023  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
3024  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
3025  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
3026  *  happen.  */
3027 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)3028 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
3029     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
3030      * 2 is like 1, but indicates that nextchar() has been called to advance
3031      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
3032      * this flag alerts us to the need to check for that */
3033 {
3034     regnode_offset ret = 0;    /* Will be the head of the group. */
3035     regnode_offset br;
3036     regnode_offset lastbr;
3037     regnode_offset ender = 0;
3038     I32 logical_parno = 0;
3039     I32 parno = 0;
3040     I32 flags;
3041     U32 oregflags = RExC_flags;
3042     bool have_branch = 0;
3043     bool is_open = 0;
3044     I32 freeze_paren = 0;
3045     I32 after_freeze = 0;
3046     I32 num; /* numeric backreferences */
3047     SV * max_open;  /* Max number of unclosed parens */
3048     I32 was_in_lookaround = RExC_in_lookaround;
3049     I32 fake_eval = 0; /* matches paren */
3050 
3051     /* The difference between the following variables can be seen with  *
3052      * the broken pattern /(?:foo/ where segment_parse_start will point *
3053      * at the 'f', and reg_parse_start will point at the '('            */
3054 
3055     /* the following is used for unmatched '(' errors */
3056     char * const reg_parse_start = RExC_parse;
3057 
3058     /* the following is used to track where various segments of
3059      * the pattern that we parse out started. */
3060     char * segment_parse_start = RExC_parse;
3061 
3062     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3063 
3064     PERL_ARGS_ASSERT_REG;
3065     DEBUG_PARSE("reg ");
3066 
3067     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
3068     assert(max_open);
3069     if (!SvIOK(max_open)) {
3070         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
3071     }
3072     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
3073                                               open paren */
3074         vFAIL("Too many nested open parens");
3075     }
3076 
3077     *flagp = 0;				/* Initialize. */
3078 
3079     /* Having this true makes it feasible to have a lot fewer tests for the
3080      * parse pointer being in scope.  For example, we can write
3081      *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
3082      * instead of
3083      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
3084      */
3085     assert(*RExC_end == '\0');
3086 
3087     /* Make an OPEN node, if parenthesized. */
3088     if (paren) {
3089 
3090         /* Under /x, space and comments can be gobbled up between the '(' and
3091          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
3092          * intervening space, as the sequence is a token, and a token should be
3093          * indivisible */
3094         bool has_intervening_patws = (paren == 2)
3095                                   && *(RExC_parse - 1) != '(';
3096 
3097         if (RExC_parse >= RExC_end) {
3098             vFAIL("Unmatched (");
3099         }
3100 
3101         if (paren == 'r') {     /* Atomic script run */
3102             paren = '>';
3103             goto parse_rest;
3104         }
3105         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3106             if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3107                 fake_eval = '{';
3108                 goto handle_qmark;
3109             }
3110 
3111             char *start_verb = RExC_parse + 1;
3112             STRLEN verb_len;
3113             char *start_arg = NULL;
3114             unsigned char op = 0;
3115             int arg_required = 0;
3116             int internal_argval = -1; /* if > -1 no argument allowed */
3117             bool has_upper = FALSE;
3118             U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3119 
3120             if (has_intervening_patws) {
3121                 RExC_parse_inc_by(1);   /* past the '*' */
3122 
3123                 /* For strict backwards compatibility, don't change the message
3124                  * now that we also have lowercase operands */
3125                 if (isUPPER(*RExC_parse)) {
3126                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3127                 }
3128                 else {
3129                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3130                 }
3131             }
3132             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3133                 if ( *RExC_parse == ':' ) {
3134                     start_arg = RExC_parse + 1;
3135                     break;
3136                 }
3137                 else if (! UTF) {
3138                     if (isUPPER(*RExC_parse)) {
3139                         has_upper = TRUE;
3140                     }
3141                     RExC_parse_inc_by(1);
3142                 }
3143                 else {
3144                     RExC_parse_inc_utf8();
3145                 }
3146             }
3147             verb_len = RExC_parse - start_verb;
3148             if ( start_arg ) {
3149                 if (RExC_parse >= RExC_end) {
3150                     goto unterminated_verb_pattern;
3151                 }
3152 
3153                 RExC_parse_inc();
3154                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3155                     RExC_parse_inc();
3156                 }
3157                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3158                   unterminated_verb_pattern:
3159                     if (has_upper) {
3160                         vFAIL("Unterminated verb pattern argument");
3161                     }
3162                     else {
3163                         vFAIL("Unterminated '(*...' argument");
3164                     }
3165                 }
3166             } else {
3167                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3168                     if (has_upper) {
3169                         vFAIL("Unterminated verb pattern");
3170                     }
3171                     else {
3172                         vFAIL("Unterminated '(*...' construct");
3173                     }
3174                 }
3175             }
3176 
3177             /* Here, we know that RExC_parse < RExC_end */
3178 
3179             switch ( *start_verb ) {
3180             case 'A':  /* (*ACCEPT) */
3181                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3182                     op = ACCEPT;
3183                     internal_argval = RExC_nestroot;
3184                 }
3185                 break;
3186             case 'C':  /* (*COMMIT) */
3187                 if ( memEQs(start_verb, verb_len,"COMMIT") )
3188                     op = COMMIT;
3189                 break;
3190             case 'F':  /* (*FAIL) */
3191                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3192                     op = OPFAIL;
3193                 }
3194                 break;
3195             case ':':  /* (*:NAME) */
3196             case 'M':  /* (*MARK:NAME) */
3197                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3198                     op = MARKPOINT;
3199                     arg_required = 1;
3200                 }
3201                 break;
3202             case 'P':  /* (*PRUNE) */
3203                 if ( memEQs(start_verb, verb_len,"PRUNE") )
3204                     op = PRUNE;
3205                 break;
3206             case 'S':   /* (*SKIP) */
3207                 if ( memEQs(start_verb, verb_len,"SKIP") )
3208                     op = SKIP;
3209                 break;
3210             case 'T':  /* (*THEN) */
3211                 /* [19:06] <TimToady> :: is then */
3212                 if ( memEQs(start_verb, verb_len,"THEN") ) {
3213                     op = CUTGROUP;
3214                     RExC_seen |= REG_CUTGROUP_SEEN;
3215                 }
3216                 break;
3217             case 'a':
3218                 if (   memEQs(start_verb, verb_len, "asr")
3219                     || memEQs(start_verb, verb_len, "atomic_script_run"))
3220                 {
3221                     paren = 'r';        /* Mnemonic: recursed run */
3222                     goto script_run;
3223                 }
3224                 else if (memEQs(start_verb, verb_len, "atomic")) {
3225                     paren = 't';    /* AtOMIC */
3226                     goto alpha_assertions;
3227                 }
3228                 break;
3229             case 'p':
3230                 if (   memEQs(start_verb, verb_len, "plb")
3231                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
3232                 {
3233                     paren = 'b';
3234                     goto lookbehind_alpha_assertions;
3235                 }
3236                 else if (   memEQs(start_verb, verb_len, "pla")
3237                          || memEQs(start_verb, verb_len, "positive_lookahead"))
3238                 {
3239                     paren = 'a';
3240                     goto alpha_assertions;
3241                 }
3242                 break;
3243             case 'n':
3244                 if (   memEQs(start_verb, verb_len, "nlb")
3245                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
3246                 {
3247                     paren = 'B';
3248                     goto lookbehind_alpha_assertions;
3249                 }
3250                 else if (   memEQs(start_verb, verb_len, "nla")
3251                          || memEQs(start_verb, verb_len, "negative_lookahead"))
3252                 {
3253                     paren = 'A';
3254                     goto alpha_assertions;
3255                 }
3256                 break;
3257             case 's':
3258                 if (   memEQs(start_verb, verb_len, "sr")
3259                     || memEQs(start_verb, verb_len, "script_run"))
3260                 {
3261                     regnode_offset atomic;
3262 
3263                     paren = 's';
3264 
3265                    script_run:
3266 
3267                     /* This indicates Unicode rules. */
3268                     REQUIRE_UNI_RULES(flagp, 0);
3269 
3270                     if (! start_arg) {
3271                         goto no_colon;
3272                     }
3273 
3274                     RExC_parse_set(start_arg);
3275 
3276                     if (RExC_in_script_run) {
3277 
3278                         /*  Nested script runs are treated as no-ops, because
3279                          *  if the nested one fails, the outer one must as
3280                          *  well.  It could fail sooner, and avoid (??{} with
3281                          *  side effects, but that is explicitly documented as
3282                          *  undefined behavior. */
3283 
3284                         ret = 0;
3285 
3286                         if (paren == 's') {
3287                             paren = ':';
3288                             goto parse_rest;
3289                         }
3290 
3291                         /* But, the atomic part of a nested atomic script run
3292                          * isn't a no-op, but can be treated just like a '(?>'
3293                          * */
3294                         paren = '>';
3295                         goto parse_rest;
3296                     }
3297 
3298                     if (paren == 's') {
3299                         /* Here, we're starting a new regular script run */
3300                         ret = reg_node(pRExC_state, SROPEN);
3301                         RExC_in_script_run = 1;
3302                         is_open = 1;
3303                         goto parse_rest;
3304                     }
3305 
3306                     /* Here, we are starting an atomic script run.  This is
3307                      * handled by recursing to deal with the atomic portion
3308                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
3309 
3310                     ret = reg_node(pRExC_state, SROPEN);
3311 
3312                     RExC_in_script_run = 1;
3313 
3314                     atomic = reg(pRExC_state, 'r', &flags, depth);
3315                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
3316                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3317                         return 0;
3318                     }
3319 
3320                     if (! REGTAIL(pRExC_state, ret, atomic)) {
3321                         REQUIRE_BRANCHJ(flagp, 0);
3322                     }
3323 
3324                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3325                                                                 SRCLOSE)))
3326                     {
3327                         REQUIRE_BRANCHJ(flagp, 0);
3328                     }
3329 
3330                     RExC_in_script_run = 0;
3331                     return ret;
3332                 }
3333 
3334                 break;
3335 
3336             lookbehind_alpha_assertions:
3337                 seen_flag_set = REG_LOOKBEHIND_SEEN;
3338                 /*FALLTHROUGH*/
3339 
3340             alpha_assertions:
3341 
3342                 if ( !start_arg ) {
3343                     goto no_colon;
3344                 }
3345 
3346                 if ( RExC_parse == start_arg ) {
3347                     if ( paren == 'A' || paren == 'B' ) {
3348                         /* An empty negative lookaround assertion is failure.
3349                          * See also: S_reg_la_OPFAIL() */
3350 
3351                         /* Note: OPFAIL is *not* zerolen. */
3352                         ret = reg1node(pRExC_state, OPFAIL, 0);
3353                         nextchar(pRExC_state);
3354                         return ret;
3355                     }
3356                     else
3357                     if ( paren == 'a' || paren == 'b' ) {
3358                         /* An empty positive lookaround assertion is success.
3359                          * See also: S_reg_la_NOTHING() */
3360 
3361                         /* Note: NOTHING is zerolen, so increment here */
3362                         RExC_seen_zerolen++;
3363                         ret = reg_node(pRExC_state, NOTHING);
3364                         nextchar(pRExC_state);
3365                         return ret;
3366                     }
3367                 }
3368 
3369                 RExC_seen_zerolen++;
3370                 RExC_in_lookaround++;
3371                 RExC_seen |= seen_flag_set;
3372 
3373                 RExC_parse_set(start_arg);
3374                 goto parse_rest;
3375 
3376               no_colon:
3377                 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3378                     UTF8fARG(UTF, verb_len, start_verb));
3379                 NOT_REACHED; /*NOTREACHED*/
3380 
3381             } /* End of switch */
3382             if ( ! op ) {
3383                 RExC_parse_inc_safe();
3384                 if (has_upper || verb_len == 0) {
3385                     vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3386                         UTF8fARG(UTF, verb_len, start_verb));
3387                 }
3388                 else {
3389                     vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3390                         UTF8fARG(UTF, verb_len, start_verb));
3391                 }
3392             }
3393             if ( RExC_parse == start_arg ) {
3394                 start_arg = NULL;
3395             }
3396             if ( arg_required && !start_arg ) {
3397                 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3398                     (int) verb_len, start_verb);
3399             }
3400             if (internal_argval == -1) {
3401                 ret = reg1node(pRExC_state, op, 0);
3402             } else {
3403                 ret = reg2node(pRExC_state, op, 0, internal_argval);
3404             }
3405             RExC_seen |= REG_VERBARG_SEEN;
3406             if (start_arg) {
3407                 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3408                 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3409                                         STR_WITH_LEN("S"));
3410                 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3411                 FLAGS(REGNODE_p(ret)) = 1;
3412             } else {
3413                 FLAGS(REGNODE_p(ret)) = 0;
3414             }
3415             if ( internal_argval != -1 )
3416                 ARG2i_SET(REGNODE_p(ret), internal_argval);
3417             nextchar(pRExC_state);
3418             return ret;
3419         }
3420         else if (*RExC_parse == '?') { /* (?...) */
3421           handle_qmark:
3422             ; /* make sure the label has a statement associated with it*/
3423             bool is_logical = 0, is_optimistic = 0;
3424             const char * const seqstart = RExC_parse;
3425             const char * endptr;
3426             const char non_existent_group_msg[]
3427                                             = "Reference to nonexistent group";
3428             const char impossible_group[] = "Invalid reference to group";
3429 
3430             if (has_intervening_patws) {
3431                 RExC_parse_inc_by(1);
3432                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3433             }
3434 
3435             RExC_parse_inc_by(1);   /* past the '?' */
3436             if (!fake_eval) {
3437                 paren = *RExC_parse;    /* might be a trailing NUL, if not
3438                                            well-formed */
3439                 is_optimistic = 0;
3440             } else {
3441                 is_optimistic = 1;
3442                 paren = fake_eval;
3443             }
3444             RExC_parse_inc();
3445             if (RExC_parse > RExC_end) {
3446                 paren = '\0';
3447             }
3448             ret = 0;			/* For look-ahead/behind. */
3449             switch (paren) {
3450 
3451             case 'P':	/* (?P...) variants for those used to PCRE/Python */
3452                 paren = *RExC_parse;
3453                 if ( paren == '<') {    /* (?P<...>) named capture */
3454                     RExC_parse_inc_by(1);
3455                     if (RExC_parse >= RExC_end) {
3456                         vFAIL("Sequence (?P<... not terminated");
3457                     }
3458                     goto named_capture;
3459                 }
3460                 else if (paren == '>') {   /* (?P>name) named recursion */
3461                     RExC_parse_inc_by(1);
3462                     if (RExC_parse >= RExC_end) {
3463                         vFAIL("Sequence (?P>... not terminated");
3464                     }
3465                     goto named_recursion;
3466                 }
3467                 else if (paren == '=') {   /* (?P=...)  named backref */
3468                     RExC_parse_inc_by(1);
3469                     return handle_named_backref(pRExC_state, flagp,
3470                                                 segment_parse_start, ')');
3471                 }
3472                 RExC_parse_inc_if_char();
3473                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3474                 vFAIL3("Sequence (%.*s...) not recognized",
3475                                 (int) (RExC_parse - seqstart), seqstart);
3476                 NOT_REACHED; /*NOTREACHED*/
3477             case '<':           /* (?<...) */
3478                 /* If you want to support (?<*...), first reconcile with GH #17363 */
3479                 if (*RExC_parse == '!') {
3480                     paren = ','; /* negative lookbehind (?<! ... ) */
3481                     RExC_parse_inc_by(1);
3482                     if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3483                         return ret;
3484                     break;
3485                 }
3486                 else
3487                 if (*RExC_parse == '=') {
3488                     /* paren = '<' - negative lookahead (?<= ... ) */
3489                     RExC_parse_inc_by(1);
3490                     if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3491                         return ret;
3492                     break;
3493                 }
3494                 else
3495               named_capture:
3496                 {               /* (?<...>) */
3497                     char *name_start;
3498                     SV *svname;
3499                     paren= '>';
3500                 /* FALLTHROUGH */
3501             case '\'':          /* (?'...') */
3502                     name_start = RExC_parse;
3503                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3504                     if (   RExC_parse == name_start
3505                         || RExC_parse >= RExC_end
3506                         || *RExC_parse != paren)
3507                     {
3508                         vFAIL2("Sequence (?%c... not terminated",
3509                             paren=='>' ? '<' : (char) paren);
3510                     }
3511                     {
3512                         HE *he_str;
3513                         SV *sv_dat = NULL;
3514                         if (!svname) /* shouldn't happen */
3515                             Perl_croak(aTHX_
3516                                 "panic: reg_scan_name returned NULL");
3517                         if (!RExC_paren_names) {
3518                             RExC_paren_names= newHV();
3519                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
3520 #ifdef DEBUGGING
3521                             RExC_paren_name_list= newAV();
3522                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3523 #endif
3524                         }
3525                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3526                         if ( he_str )
3527                             sv_dat = HeVAL(he_str);
3528                         if ( ! sv_dat ) {
3529                             /* croak baby croak */
3530                             Perl_croak(aTHX_
3531                                 "panic: paren_name hash element allocation failed");
3532                         } else if ( SvPOK(sv_dat) ) {
3533                             /* (?|...) can mean we have dupes so scan to check
3534                                its already been stored. Maybe a flag indicating
3535                                we are inside such a construct would be useful,
3536                                but the arrays are likely to be quite small, so
3537                                for now we punt -- dmq */
3538                             IV count = SvIV(sv_dat);
3539                             I32 *pv = (I32*)SvPVX(sv_dat);
3540                             IV i;
3541                             for ( i = 0 ; i < count ; i++ ) {
3542                                 if ( pv[i] == RExC_npar ) {
3543                                     count = 0;
3544                                     break;
3545                                 }
3546                             }
3547                             if ( count ) {
3548                                 pv = (I32*)SvGROW(sv_dat,
3549                                                 SvCUR(sv_dat) + sizeof(I32)+1);
3550                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3551                                 pv[count] = RExC_npar;
3552                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3553                             }
3554                         } else {
3555                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
3556                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
3557                                                                 sizeof(I32));
3558                             SvIOK_on(sv_dat);
3559                             SvIV_set(sv_dat, 1);
3560                         }
3561 #ifdef DEBUGGING
3562                         /* No, this does not cause a memory leak under
3563                          * debugging. RExC_paren_name_list is freed later
3564                          * on in the dump process. - Yves
3565                          */
3566                         if (!av_store(RExC_paren_name_list,
3567                                       RExC_npar, SvREFCNT_inc_NN(svname)))
3568                             SvREFCNT_dec_NN(svname);
3569 #endif
3570 
3571                     }
3572                     nextchar(pRExC_state);
3573                     paren = 1;
3574                     goto capturing_parens;
3575                 }
3576                 NOT_REACHED; /*NOTREACHED*/
3577             case '=':           /* (?=...) */
3578                 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3579                     return ret;
3580                 break;
3581             case '!':           /* (?!...) */
3582                 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3583                     return ret;
3584                 break;
3585             case '|':           /* (?|...) */
3586                 /* branch reset, behave like a (?:...) except that
3587                    buffers in alternations share the same numbers */
3588                 paren = ':';
3589                 after_freeze = freeze_paren = RExC_logical_npar;
3590 
3591                 /* XXX This construct currently requires an extra pass.
3592                  * Investigation would be required to see if that could be
3593                  * changed */
3594                 REQUIRE_PARENS_PASS;
3595                 break;
3596             case ':':           /* (?:...) */
3597             case '>':           /* (?>...) */
3598                 break;
3599             case '$':           /* (?$...) */
3600             case '@':           /* (?@...) */
3601                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3602                 break;
3603             case '0' :           /* (?0) */
3604             case 'R' :           /* (?R) */
3605                 if (RExC_parse == RExC_end || *RExC_parse != ')')
3606                     FAIL("Sequence (?R) not terminated");
3607                 num = 0;
3608                 RExC_seen |= REG_RECURSE_SEEN;
3609 
3610                 /* XXX These constructs currently require an extra pass.
3611                  * It probably could be changed */
3612                 REQUIRE_PARENS_PASS;
3613 
3614                 *flagp |= POSTPONED;
3615                 goto gen_recurse_regop;
3616                 /*notreached*/
3617             /* named and numeric backreferences */
3618             case '&':            /* (?&NAME) */
3619                 segment_parse_start = RExC_parse - 1;
3620               named_recursion:
3621                 {
3622                     SV *sv_dat = reg_scan_name(pRExC_state,
3623                                                REG_RSN_RETURN_DATA);
3624                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3625                 }
3626                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3627                     vFAIL("Sequence (?&... not terminated");
3628                 goto gen_recurse_regop;
3629                 /* NOTREACHED */
3630             case '+':
3631                 if (! inRANGE(RExC_parse[0], '1', '9')) {
3632                     RExC_parse_inc_by(1);
3633                     vFAIL("Illegal pattern");
3634                 }
3635                 goto parse_recursion;
3636                 /* NOTREACHED*/
3637             case '-': /* (?-1) */
3638                 if (! inRANGE(RExC_parse[0], '1', '9')) {
3639                     RExC_parse--; /* rewind to let it be handled later */
3640                     goto parse_flags;
3641                 }
3642                 /* FALLTHROUGH */
3643             case '1': case '2': case '3': case '4': /* (?1) */
3644             case '5': case '6': case '7': case '8': case '9':
3645                 RExC_parse_set((char *) seqstart + 1);  /* Point to the digit */
3646               parse_recursion:
3647                 {
3648                     bool is_neg = FALSE;
3649                     UV unum;
3650                     segment_parse_start = RExC_parse - 1;
3651                     if (*RExC_parse == '-') {
3652                         RExC_parse_inc_by(1);
3653                         is_neg = TRUE;
3654                     }
3655                     endptr = RExC_end;
3656                     if (grok_atoUV(RExC_parse, &unum, &endptr)
3657                         && unum <= I32_MAX
3658                     ) {
3659                         num = (I32)unum;
3660                         RExC_parse_set((char*)endptr);
3661                     }
3662                     else {  /* Overflow, or something like that.  Position
3663                                beyond all digits for the message */
3664                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
3665                             RExC_parse_inc_by(1);
3666                         }
3667                         vFAIL(impossible_group);
3668                     }
3669                     if (is_neg) {
3670                         /* -num is always representable on 1 and 2's complement
3671                          * machines */
3672                         num = -num;
3673                     }
3674                 }
3675                 if (*RExC_parse!=')')
3676                     vFAIL("Expecting close bracket");
3677 
3678                 if (paren == '-' || paren == '+') {
3679 
3680                     /* Don't overflow */
3681                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3682                         RExC_parse_inc_by(1);
3683                         vFAIL(impossible_group);
3684                     }
3685 
3686                     /*
3687                     Diagram of capture buffer numbering.
3688                     Top line is the normal capture buffer numbers
3689                     Bottom line is the negative indexing as from
3690                     the X (the (?-2))
3691 
3692                         1 2    3 4 5 X   Y      6 7
3693                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3694                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3695                     -   5 4    3 2 1 X   Y      x x
3696 
3697                     Resolve to absolute group.  Recall that RExC_npar is +1 of
3698                     the actual parenthesis group number.  For lookahead, we
3699                     have to compensate for that.  Using the above example, when
3700                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
3701                     want 7 for +2, and 4 for -2.
3702                     */
3703                     if ( paren == '+' ) {
3704                         num--;
3705                     }
3706 
3707                     num += RExC_npar;
3708 
3709                     if (paren == '-' && num < 1) {
3710                         RExC_parse_inc_by(1);
3711                         vFAIL(non_existent_group_msg);
3712                     }
3713                 }
3714                 else
3715                 if (num && num < RExC_logical_npar) {
3716                     num = RExC_logical_to_parno[num];
3717                 }
3718                 else
3719                 if (ALL_PARENS_COUNTED) {
3720                     if (num < RExC_logical_total_parens) {
3721                         num = RExC_logical_to_parno[num];
3722                     }
3723                     else {
3724                         RExC_parse_inc_by(1);
3725                         vFAIL(non_existent_group_msg);
3726                     }
3727                 }
3728                 else {
3729                     REQUIRE_PARENS_PASS;
3730                 }
3731 
3732 
3733               gen_recurse_regop:
3734                 if (num >= RExC_npar) {
3735 
3736                     /* It might be a forward reference; we can't fail until we
3737                      * know, by completing the parse to get all the groups, and
3738                      * then reparsing */
3739                     if (ALL_PARENS_COUNTED)  {
3740                         if (num >= RExC_total_parens) {
3741                             RExC_parse_inc_by(1);
3742                             vFAIL(non_existent_group_msg);
3743                         }
3744                     }
3745                     else {
3746                         REQUIRE_PARENS_PASS;
3747                     }
3748                 }
3749 
3750                 /* We keep track how many GOSUB items we have produced.
3751                    To start off the ARG2i() of the GOSUB holds its "id",
3752                    which is used later in conjunction with RExC_recurse
3753                    to calculate the offset we need to jump for the GOSUB,
3754                    which it will store in the final representation.
3755                    We have to defer the actual calculation until much later
3756                    as the regop may move.
3757                  */
3758                 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3759                 RExC_recurse_count++;
3760                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3761                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3762                             22, "|    |", (int)(depth * 2 + 1), "",
3763                             (UV)ARG1u(REGNODE_p(ret)),
3764                             (IV)ARG2i(REGNODE_p(ret))));
3765                 RExC_seen |= REG_RECURSE_SEEN;
3766 
3767                 *flagp |= POSTPONED;
3768                 assert(*RExC_parse == ')');
3769                 nextchar(pRExC_state);
3770                 return ret;
3771 
3772             /* NOTREACHED */
3773 
3774             case '?':           /* (??...) */
3775                 is_logical = 1;
3776                 if (*RExC_parse != '{') {
3777                     RExC_parse_inc_if_char();
3778                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3779                     vFAIL2utf8f(
3780                         "Sequence (%" UTF8f "...) not recognized",
3781                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3782                     NOT_REACHED; /*NOTREACHED*/
3783                 }
3784                 *flagp |= POSTPONED;
3785                 paren = '{';
3786                 RExC_parse_inc_by(1);
3787                 /* FALLTHROUGH */
3788             case '{':           /* (?{...}) */
3789             {
3790                 U32 n = 0;
3791                 struct reg_code_block *cb;
3792                 OP * o;
3793 
3794                 RExC_seen_zerolen++;
3795 
3796                 if (   !pRExC_state->code_blocks
3797                     || pRExC_state->code_index
3798                                         >= pRExC_state->code_blocks->count
3799                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3800                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3801                             - RExC_start)
3802                 ) {
3803                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
3804                         FAIL("panic: Sequence (?{...}): no code block found\n");
3805                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
3806                 }
3807                 /* this is a pre-compiled code block (?{...}) */
3808                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3809                 RExC_parse_set(RExC_start + cb->end);
3810                 o = cb->block;
3811                 if (cb->src_regex) {
3812                     n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3813                     RExC_rxi->data->data[n] =
3814                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
3815                     RExC_rxi->data->data[n+1] = (void*)o;
3816                 }
3817                 else {
3818                     n = reg_add_data(pRExC_state,
3819                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3820                     RExC_rxi->data->data[n] = (void*)o;
3821                 }
3822                 pRExC_state->code_index++;
3823                 nextchar(pRExC_state);
3824                 if (!is_optimistic)
3825                     RExC_seen |= REG_PESSIMIZE_SEEN;
3826 
3827                 if (is_logical) {
3828                     regnode_offset eval;
3829                     ret = reg_node(pRExC_state, LOGICAL);
3830                     FLAGS(REGNODE_p(ret)) = 2;
3831 
3832                     eval = reg2node(pRExC_state, EVAL,
3833                                        n,
3834 
3835                                        /* for later propagation into (??{})
3836                                         * return value */
3837                                        RExC_flags & RXf_PMf_COMPILETIME
3838                                       );
3839                     FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3840                     if (! REGTAIL(pRExC_state, ret, eval)) {
3841                         REQUIRE_BRANCHJ(flagp, 0);
3842                     }
3843                     return ret;
3844                 }
3845                 ret = reg2node(pRExC_state, EVAL, n, 0);
3846                 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3847 
3848                 return ret;
3849             }
3850             case '(':           /* (?(?{...})...) and (?(?=...)...) */
3851             {
3852                 int is_define= 0;
3853                 const int DEFINE_len = sizeof("DEFINE") - 1;
3854                 if (    RExC_parse < RExC_end - 1
3855                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
3856                             && (   RExC_parse[1] == '='
3857                                 || RExC_parse[1] == '!'
3858                                 || RExC_parse[1] == '<'
3859                                 || RExC_parse[1] == '{'))
3860                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
3861                             && (   RExC_parse[1] == '{'
3862                             || (   memBEGINs(RExC_parse + 1,
3863                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3864                                          "pla:")
3865                                 || memBEGINs(RExC_parse + 1,
3866                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3867                                          "plb:")
3868                                 || memBEGINs(RExC_parse + 1,
3869                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3870                                          "nla:")
3871                                 || memBEGINs(RExC_parse + 1,
3872                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3873                                          "nlb:")
3874                                 || memBEGINs(RExC_parse + 1,
3875                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3876                                          "positive_lookahead:")
3877                                 || memBEGINs(RExC_parse + 1,
3878                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3879                                          "positive_lookbehind:")
3880                                 || memBEGINs(RExC_parse + 1,
3881                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3882                                          "negative_lookahead:")
3883                                 || memBEGINs(RExC_parse + 1,
3884                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3885                                          "negative_lookbehind:")))))
3886                 ) { /* Lookahead or eval. */
3887                     I32 flag;
3888                     regnode_offset tail;
3889 
3890                     ret = reg_node(pRExC_state, LOGICAL);
3891                     FLAGS(REGNODE_p(ret)) = 1;
3892 
3893                     tail = reg(pRExC_state, 1, &flag, depth+1);
3894                     RETURN_FAIL_ON_RESTART(flag, flagp);
3895                     if (! REGTAIL(pRExC_state, ret, tail)) {
3896                         REQUIRE_BRANCHJ(flagp, 0);
3897                     }
3898                     goto insert_if;
3899                 }
3900                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
3901                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3902                 {
3903                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
3904                     char *name_start= RExC_parse;
3905                     RExC_parse_inc_by(1);
3906                     U32 num = 0;
3907                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3908                     if (   RExC_parse == name_start
3909                         || RExC_parse >= RExC_end
3910                         || *RExC_parse != ch)
3911                     {
3912                         vFAIL2("Sequence (?(%c... not terminated",
3913                             (ch == '>' ? '<' : ch));
3914                     }
3915                     RExC_parse_inc_by(1);
3916                     if (sv_dat) {
3917                         num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3918                         RExC_rxi->data->data[num]=(void*)sv_dat;
3919                         SvREFCNT_inc_simple_void_NN(sv_dat);
3920                     }
3921                     ret = reg1node(pRExC_state, GROUPPN, num);
3922                     goto insert_if_check_paren;
3923                 }
3924                 else if (memBEGINs(RExC_parse,
3925                                    (STRLEN) (RExC_end - RExC_parse),
3926                                    "DEFINE"))
3927                 {
3928                     ret = reg1node(pRExC_state, DEFINEP, 0);
3929                     RExC_parse_inc_by(DEFINE_len);
3930                     is_define = 1;
3931                     goto insert_if_check_paren;
3932                 }
3933                 else if (RExC_parse[0] == 'R') {
3934                     RExC_parse_inc_by(1);
3935                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
3936                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3937                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3938                      */
3939                     parno = 0;
3940                     if (RExC_parse[0] == '0') {
3941                         parno = 1;
3942                         RExC_parse_inc_by(1);
3943                     }
3944                     else if (inRANGE(RExC_parse[0], '1', '9')) {
3945                         UV uv;
3946                         endptr = RExC_end;
3947                         if (grok_atoUV(RExC_parse, &uv, &endptr)
3948                             && uv <= I32_MAX
3949                         ) {
3950                             parno = (I32)uv + 1;
3951                             RExC_parse_set((char*)endptr);
3952                         }
3953                         /* else "Switch condition not recognized" below */
3954                     } else if (RExC_parse[0] == '&') {
3955                         SV *sv_dat;
3956                         RExC_parse_inc_by(1);
3957                         sv_dat = reg_scan_name(pRExC_state,
3958                                                REG_RSN_RETURN_DATA);
3959                         if (sv_dat)
3960                             parno = 1 + *((I32 *)SvPVX(sv_dat));
3961                     }
3962                     ret = reg1node(pRExC_state, INSUBP, parno);
3963                     goto insert_if_check_paren;
3964                 }
3965                 else if (inRANGE(RExC_parse[0], '1', '9')) {
3966                     /* (?(1)...) */
3967                     char c;
3968                     UV uv;
3969                     endptr = RExC_end;
3970                     if (grok_atoUV(RExC_parse, &uv, &endptr)
3971                         && uv <= I32_MAX
3972                     ) {
3973                         parno = (I32)uv;
3974                         RExC_parse_set((char*)endptr);
3975                     }
3976                     else {
3977                         vFAIL("panic: grok_atoUV returned FALSE");
3978                     }
3979                     ret = reg1node(pRExC_state, GROUPP, parno);
3980 
3981                  insert_if_check_paren:
3982                     if (UCHARAT(RExC_parse) != ')') {
3983                         RExC_parse_inc_safe();
3984                         vFAIL("Switch condition not recognized");
3985                     }
3986                     nextchar(pRExC_state);
3987                   insert_if:
3988                     if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3989                                                              IFTHEN, 0)))
3990                     {
3991                         REQUIRE_BRANCHJ(flagp, 0);
3992                     }
3993                     br = regbranch(pRExC_state, &flags, 1, depth+1);
3994                     if (br == 0) {
3995                         RETURN_FAIL_ON_RESTART(flags,flagp);
3996                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3997                               (UV) flags);
3998                     } else
3999                     if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
4000                                                              LONGJMP, 0)))
4001                     {
4002                         REQUIRE_BRANCHJ(flagp, 0);
4003                     }
4004                     c = UCHARAT(RExC_parse);
4005                     nextchar(pRExC_state);
4006                     if (flags&HASWIDTH)
4007                         *flagp |= HASWIDTH;
4008                     if (c == '|') {
4009                         if (is_define)
4010                             vFAIL("(?(DEFINE)....) does not allow branches");
4011 
4012                         /* Fake one for optimizer.  */
4013                         lastbr = reg1node(pRExC_state, IFTHEN, 0);
4014 
4015                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
4016                             RETURN_FAIL_ON_RESTART(flags, flagp);
4017                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
4018                                   (UV) flags);
4019                         }
4020                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
4021                             REQUIRE_BRANCHJ(flagp, 0);
4022                         }
4023                         if (flags&HASWIDTH)
4024                             *flagp |= HASWIDTH;
4025                         c = UCHARAT(RExC_parse);
4026                         nextchar(pRExC_state);
4027                     }
4028                     else
4029                         lastbr = 0;
4030                     if (c != ')') {
4031                         if (RExC_parse >= RExC_end)
4032                             vFAIL("Switch (?(condition)... not terminated");
4033                         else
4034                             vFAIL("Switch (?(condition)... contains too many branches");
4035                     }
4036                     ender = reg_node(pRExC_state, TAIL);
4037                     if (! REGTAIL(pRExC_state, br, ender)) {
4038                         REQUIRE_BRANCHJ(flagp, 0);
4039                     }
4040                     if (lastbr) {
4041                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
4042                             REQUIRE_BRANCHJ(flagp, 0);
4043                         }
4044                         if (! REGTAIL(pRExC_state,
4045                                       REGNODE_OFFSET(
4046                                         REGNODE_AFTER(REGNODE_p(lastbr))),
4047                                       ender))
4048                         {
4049                             REQUIRE_BRANCHJ(flagp, 0);
4050                         }
4051                     }
4052                     else
4053                         if (! REGTAIL(pRExC_state, ret, ender)) {
4054                             REQUIRE_BRANCHJ(flagp, 0);
4055                         }
4056 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
4057                     RExC_size++; /* XXX WHY do we need this?!!
4058                                     For large programs it seems to be required
4059                                     but I can't figure out why. -- dmq*/
4060 #endif
4061                     return ret;
4062                 }
4063                 RExC_parse_inc_safe();
4064                 vFAIL("Unknown switch condition (?(...))");
4065             }
4066             case '[':           /* (?[ ... ]) */
4067                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
4068             case 0: /* A NUL */
4069                 RExC_parse--; /* for vFAIL to print correctly */
4070                 vFAIL("Sequence (? incomplete");
4071                 break;
4072 
4073             case ')':
4074                 if (RExC_strict) {  /* [perl #132851] */
4075                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
4076                 }
4077                 /* FALLTHROUGH */
4078             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
4079             /* FALLTHROUGH */
4080             default: /* e.g., (?i) */
4081                 RExC_parse_set((char *) seqstart + 1);
4082               parse_flags:
4083                 parse_lparen_question_flags(pRExC_state);
4084                 if (UCHARAT(RExC_parse) != ':') {
4085                     if (RExC_parse < RExC_end)
4086                         nextchar(pRExC_state);
4087                     *flagp = TRYAGAIN;
4088                     return 0;
4089                 }
4090                 paren = ':';
4091                 nextchar(pRExC_state);
4092                 ret = 0;
4093                 goto parse_rest;
4094             } /* end switch */
4095         }
4096         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
4097           capturing_parens:
4098             parno = RExC_npar;
4099             RExC_npar++;
4100             if (RExC_npar >= U16_MAX)
4101                 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4102 
4103             logical_parno = RExC_logical_npar;
4104             RExC_logical_npar++;
4105             if (! ALL_PARENS_COUNTED) {
4106                 /* If we are in our first pass through (and maybe only pass),
4107                  * we  need to allocate memory for the capturing parentheses
4108                  * data structures.
4109                  */
4110 
4111                 if (!RExC_parens_buf_size) {
4112                     /* first guess at number of parens we might encounter */
4113                     RExC_parens_buf_size = 10;
4114 
4115                     /* setup RExC_open_parens, which holds the address of each
4116                      * OPEN tag, and to make things simpler for the 0 index the
4117                      * start of the program - this is used later for offsets */
4118                     Newxz(RExC_open_parens, RExC_parens_buf_size,
4119                             regnode_offset);
4120                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
4121 
4122                     /* setup RExC_close_parens, which holds the address of each
4123                      * CLOSE tag, and to make things simpler for the 0 index
4124                      * the end of the program - this is used later for offsets
4125                      * */
4126                     Newxz(RExC_close_parens, RExC_parens_buf_size,
4127                             regnode_offset);
4128                     /* we don't know where end op starts yet, so we don't need to
4129                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
4130                      * above */
4131 
4132                     Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4133                     Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4134                 }
4135                 else if (RExC_npar > RExC_parens_buf_size) {
4136                     I32 old_size = RExC_parens_buf_size;
4137 
4138                     RExC_parens_buf_size *= 2;
4139 
4140                     Renew(RExC_open_parens, RExC_parens_buf_size,
4141                             regnode_offset);
4142                     Zero(RExC_open_parens + old_size,
4143                             RExC_parens_buf_size - old_size, regnode_offset);
4144 
4145                     Renew(RExC_close_parens, RExC_parens_buf_size,
4146                             regnode_offset);
4147                     Zero(RExC_close_parens + old_size,
4148                             RExC_parens_buf_size - old_size, regnode_offset);
4149 
4150                     Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4151                     Zero(RExC_logical_to_parno + old_size,
4152                          RExC_parens_buf_size - old_size, I32);
4153 
4154                     Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4155                     Zero(RExC_parno_to_logical + old_size,
4156                          RExC_parens_buf_size - old_size, I32);
4157                 }
4158             }
4159 
4160             ret = reg1node(pRExC_state, OPEN, parno);
4161             if (!RExC_nestroot)
4162                 RExC_nestroot = parno;
4163             if (RExC_open_parens && !RExC_open_parens[parno])
4164             {
4165                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4166                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4167                     22, "|    |", (int)(depth * 2 + 1), "",
4168                     (IV)parno, ret));
4169                 RExC_open_parens[parno]= ret;
4170             }
4171             if (RExC_parno_to_logical) {
4172                 RExC_parno_to_logical[parno] = logical_parno;
4173                 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4174                     RExC_logical_to_parno[logical_parno] = parno;
4175             }
4176             is_open = 1;
4177         } else {
4178             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4179             paren = ':';
4180             ret = 0;
4181         }
4182     }
4183     else                        /* ! paren */
4184         ret = 0;
4185 
4186    parse_rest:
4187     /* Pick up the branches, linking them together. */
4188     segment_parse_start = RExC_parse;
4189     I32 npar_before_regbranch = RExC_npar - 1;
4190     br = regbranch(pRExC_state, &flags, 1, depth+1);
4191 
4192     /*     branch_len = (paren != 0); */
4193 
4194     if (br == 0) {
4195         RETURN_FAIL_ON_RESTART(flags, flagp);
4196         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4197     }
4198     if (*RExC_parse == '|') {
4199         if (RExC_use_BRANCHJ) {
4200             reginsert(pRExC_state, BRANCHJ, br, depth+1);
4201             ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4202             ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4203         }
4204         else {
4205             reginsert(pRExC_state, BRANCH, br, depth+1);
4206             ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4207             ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4208         }
4209         have_branch = 1;
4210     }
4211     else if (paren == ':') {
4212         *flagp |= flags&SIMPLE;
4213     }
4214     if (is_open) {				/* Starts with OPEN. */
4215         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
4216             REQUIRE_BRANCHJ(flagp, 0);
4217         }
4218     }
4219     else if (paren != '?')		/* Not Conditional */
4220         ret = br;
4221     *flagp |= flags & (HASWIDTH | POSTPONED);
4222     lastbr = br;
4223     while (*RExC_parse == '|') {
4224         if (RExC_use_BRANCHJ) {
4225             bool shut_gcc_up;
4226 
4227             ender = reg1node(pRExC_state, LONGJMP, 0);
4228 
4229             /* Append to the previous. */
4230             shut_gcc_up = REGTAIL(pRExC_state,
4231                          REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4232                          ender);
4233             PERL_UNUSED_VAR(shut_gcc_up);
4234         }
4235         nextchar(pRExC_state);
4236         if (freeze_paren) {
4237             if (RExC_logical_npar > after_freeze)
4238                 after_freeze = RExC_logical_npar;
4239             RExC_logical_npar = freeze_paren;
4240         }
4241         br = regbranch(pRExC_state, &flags, 0, depth+1);
4242 
4243         if (br == 0) {
4244             RETURN_FAIL_ON_RESTART(flags, flagp);
4245             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4246         }
4247         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
4248             REQUIRE_BRANCHJ(flagp, 0);
4249         }
4250         assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4251         assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4252         if (OP(REGNODE_p(br)) == BRANCH) {
4253             if (OP(REGNODE_p(lastbr)) == BRANCH)
4254                 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4255             else
4256                 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4257         }
4258         else
4259         if (OP(REGNODE_p(br)) == BRANCHJ) {
4260             if (OP(REGNODE_p(lastbr)) == BRANCH)
4261                 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4262             else
4263                 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4264         }
4265 
4266         lastbr = br;
4267         *flagp |= flags & (HASWIDTH | POSTPONED);
4268     }
4269 
4270     if (have_branch || paren != ':') {
4271         regnode * br;
4272 
4273         /* Make a closing node, and hook it on the end. */
4274         switch (paren) {
4275         case ':':
4276             ender = reg_node(pRExC_state, TAIL);
4277             break;
4278         case 1: case 2:
4279             ender = reg1node(pRExC_state, CLOSE, parno);
4280             if ( RExC_close_parens ) {
4281                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4282                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4283                         22, "|    |", (int)(depth * 2 + 1), "",
4284                         (IV)parno, ender));
4285                 RExC_close_parens[parno]= ender;
4286                 if (RExC_nestroot == parno)
4287                     RExC_nestroot = 0;
4288             }
4289             break;
4290         case 's':
4291             ender = reg_node(pRExC_state, SRCLOSE);
4292             RExC_in_script_run = 0;
4293             break;
4294         /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4295         case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4296         case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4297         case '<': /* (?<= ... ) */
4298         case ',': /* (?<! ... ) */
4299             *flagp &= ~HASWIDTH;
4300             ender = reg_node(pRExC_state, LOOKBEHIND_END);
4301             break;
4302         /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4303         case 'a':
4304         case 'A':
4305         case '=':
4306         case '!':
4307             *flagp &= ~HASWIDTH;
4308             /* FALLTHROUGH */
4309         case 't':   /* aTomic */
4310         case '>':
4311             ender = reg_node(pRExC_state, SUCCEED);
4312             break;
4313         case 0:
4314             ender = reg_node(pRExC_state, END);
4315             assert(!RExC_end_op); /* there can only be one! */
4316             RExC_end_op = REGNODE_p(ender);
4317             if (RExC_close_parens) {
4318                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4319                     "%*s%*s Setting close paren #0 (END) to %zu\n",
4320                     22, "|    |", (int)(depth * 2 + 1), "",
4321                     ender));
4322 
4323                 RExC_close_parens[0]= ender;
4324             }
4325             break;
4326         }
4327         DEBUG_PARSE_r({
4328             DEBUG_PARSE_MSG("lsbr");
4329             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4330             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4331             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4332                           SvPV_nolen_const(RExC_mysv1),
4333                           (IV)lastbr,
4334                           SvPV_nolen_const(RExC_mysv2),
4335                           (IV)ender,
4336                           (IV)(ender - lastbr)
4337             );
4338         });
4339         if (OP(REGNODE_p(lastbr)) == BRANCH) {
4340             ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4341         }
4342         else
4343         if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4344             ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4345         }
4346 
4347         if (! REGTAIL(pRExC_state, lastbr, ender)) {
4348             REQUIRE_BRANCHJ(flagp, 0);
4349         }
4350 
4351         if (have_branch) {
4352             char is_nothing= 1;
4353             if (depth==1)
4354                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4355 
4356             /* Hook the tails of the branches to the closing node. */
4357             for (br = REGNODE_p(ret); br; br = regnext(br)) {
4358                 const U8 op = REGNODE_TYPE(OP(br));
4359                 regnode *nextoper = REGNODE_AFTER(br);
4360                 if (op == BRANCH) {
4361                     if (! REGTAIL_STUDY(pRExC_state,
4362                                         REGNODE_OFFSET(nextoper),
4363                                         ender))
4364                     {
4365                         REQUIRE_BRANCHJ(flagp, 0);
4366                     }
4367                     if ( OP(nextoper) != NOTHING
4368                          || regnext(nextoper) != REGNODE_p(ender))
4369                         is_nothing= 0;
4370                 }
4371                 else if (op == BRANCHJ) {
4372                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4373                                         REGNODE_OFFSET(nextoper),
4374                                         ender);
4375                     PERL_UNUSED_VAR(shut_gcc_up);
4376                     /* for now we always disable this optimisation * /
4377                     regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4378                     if ( OP(nopr) != NOTHING
4379                          || regnext(nopr) != REGNODE_p(ender))
4380                     */
4381                         is_nothing= 0;
4382                 }
4383             }
4384             if (is_nothing) {
4385                 regnode * ret_as_regnode = REGNODE_p(ret);
4386                 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4387                                ? regnext(ret_as_regnode)
4388                                : ret_as_regnode;
4389                 DEBUG_PARSE_r({
4390                     DEBUG_PARSE_MSG("NADA");
4391                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4392                                      NULL, pRExC_state);
4393                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4394                                      NULL, pRExC_state);
4395                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4396                                   SvPV_nolen_const(RExC_mysv1),
4397                                   (IV)REG_NODE_NUM(ret_as_regnode),
4398                                   SvPV_nolen_const(RExC_mysv2),
4399                                   (IV)ender,
4400                                   (IV)(ender - ret)
4401                     );
4402                 });
4403                 OP(br)= NOTHING;
4404                 if (OP(REGNODE_p(ender)) == TAIL) {
4405                     NEXT_OFF(br)= 0;
4406                     RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4407                 } else {
4408                     regnode *opt;
4409                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4410                         OP(opt)= OPTIMIZED;
4411                     NEXT_OFF(br)= REGNODE_p(ender) - br;
4412                 }
4413             }
4414         }
4415     }
4416 
4417     {
4418         const char *p;
4419          /* Even/odd or x=don't care: 010101x10x */
4420         static const char parens[] = "=!aA<,>Bbt";
4421          /* flag below is set to 0 up through 'A'; 1 for larger */
4422 
4423         if (paren && (p = strchr(parens, paren))) {
4424             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4425             int flag = (p - parens) > 3;
4426 
4427             if (paren == '>' || paren == 't') {
4428                 node = SUSPEND, flag = 0;
4429             }
4430 
4431             reginsert(pRExC_state, node, ret, depth+1);
4432             FLAGS(REGNODE_p(ret)) = flag;
4433             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4434             {
4435                 REQUIRE_BRANCHJ(flagp, 0);
4436             }
4437         }
4438     }
4439 
4440     /* Check for proper termination. */
4441     if (paren) {
4442         /* restore original flags, but keep (?p) and, if we've encountered
4443          * something in the parse that changes /d rules into /u, keep the /u */
4444         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4445         if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4446             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4447         }
4448         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4449             RExC_parse_set(reg_parse_start);
4450             vFAIL("Unmatched (");
4451         }
4452         nextchar(pRExC_state);
4453     }
4454     else if (!paren && RExC_parse < RExC_end) {
4455         if (*RExC_parse == ')') {
4456             RExC_parse_inc_by(1);
4457             vFAIL("Unmatched )");
4458         }
4459         else
4460             FAIL("Junk on end of regexp");	/* "Can't happen". */
4461         NOT_REACHED; /* NOTREACHED */
4462     }
4463 
4464     if (after_freeze > RExC_logical_npar)
4465         RExC_logical_npar = after_freeze;
4466 
4467     RExC_in_lookaround = was_in_lookaround;
4468 
4469     return(ret);
4470 }
4471 
4472 /*
4473  - regbranch - one alternative of an | operator
4474  *
4475  * Implements the concatenation operator.
4476  *
4477  * On success, returns the offset at which any next node should be placed into
4478  * the regex engine program being compiled.
4479  *
4480  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4481  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4482  * UTF-8
4483  */
4484 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)4485 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4486 {
4487     regnode_offset ret;
4488     regnode_offset chain = 0;
4489     regnode_offset latest;
4490     regnode *branch_node = NULL;
4491     I32 flags = 0, c = 0;
4492     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4493 
4494     PERL_ARGS_ASSERT_REGBRANCH;
4495 
4496     DEBUG_PARSE("brnc");
4497 
4498     if (first)
4499         ret = 0;
4500     else {
4501         if (RExC_use_BRANCHJ) {
4502             ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4503             branch_node = REGNODE_p(ret);
4504             ARG2a_SET(branch_node, (U16)RExC_npar-1);
4505         } else {
4506             ret = reg1node(pRExC_state, BRANCH, 0);
4507             branch_node = REGNODE_p(ret);
4508             ARG1a_SET(branch_node, (U16)RExC_npar-1);
4509         }
4510     }
4511 
4512     *flagp = 0;			/* Initialize. */
4513 
4514     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4515                             FALSE /* Don't force to /x */ );
4516     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4517         flags &= ~TRYAGAIN;
4518         latest = regpiece(pRExC_state, &flags, depth+1);
4519         if (latest == 0) {
4520             if (flags & TRYAGAIN)
4521                 continue;
4522             RETURN_FAIL_ON_RESTART(flags, flagp);
4523             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4524         }
4525         else if (ret == 0)
4526             ret = latest;
4527         *flagp |= flags&(HASWIDTH|POSTPONED);
4528         if (chain != 0) {
4529             /* FIXME adding one for every branch after the first is probably
4530              * excessive now we have TRIE support. (hv) */
4531             MARK_NAUGHTY(1);
4532             if (! REGTAIL(pRExC_state, chain, latest)) {
4533                 /* XXX We could just redo this branch, but figuring out what
4534                  * bookkeeping needs to be reset is a pain, and it's likely
4535                  * that other branches that goto END will also be too large */
4536                 REQUIRE_BRANCHJ(flagp, 0);
4537             }
4538         }
4539         chain = latest;
4540         c++;
4541     }
4542     if (chain == 0) {	/* Loop ran zero times. */
4543         chain = reg_node(pRExC_state, NOTHING);
4544         if (ret == 0)
4545             ret = chain;
4546     }
4547     if (c == 1) {
4548         *flagp |= flags & SIMPLE;
4549     }
4550     return ret;
4551 }
4552 
4553 #define RBRACE  0
4554 #define MIN_S   1
4555 #define MIN_E   2
4556 #define MAX_S   3
4557 #define MAX_E   4
4558 
4559 #ifndef PERL_IN_XSUB_RE
4560 bool
Perl_regcurly(const char * s,const char * e,const char * result[5])4561 Perl_regcurly(const char *s, const char *e, const char * result[5])
4562 {
4563     /* This function matches a {m,n} quantifier.  When called with a NULL final
4564      * argument, it simply parses the input from 's' up through 'e-1', and
4565      * returns a boolean as to whether or not this input is syntactically a
4566      * {m,n} quantifier.
4567      *
4568      * When called with a non-NULL final parameter, and when the function
4569      * returns TRUE, it additionally stores information into the array
4570      * specified by that parameter about what it found in the parse.  The
4571      * parameter must be a pointer into a 5 element array of 'const char *'
4572      * elements.  The returned information is as follows:
4573      *   result[RBRACE]  points to the closing brace
4574      *   result[MIN_S]   points to the first byte of the lower bound
4575      *   result[MIN_E]   points to one beyond the final byte of the lower bound
4576      *   result[MAX_S]   points to the first byte of the upper bound
4577      *   result[MAX_E]   points to one beyond the final byte of the upper bound
4578      *
4579      * If the quantifier is of the form {m,} (meaning an infinite upper
4580      * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4581      * to is irrelevant, just that it's the same place
4582      *
4583      * If instead the quantifier is of the form {m} there is actually only
4584      * one bound, and both the upper and lower result[] elements are set to
4585      * point to it.
4586      *
4587      * This function checks only for syntactic validity; it leaves checking for
4588      * semantic validity and raising any diagnostics to the caller.  This
4589      * function is called in multiple places to check for syntax, but only from
4590      * one for semantics.  It makes it as simple as possible for the
4591      * syntax-only callers, while furnishing just enough information for the
4592      * semantic caller.
4593      */
4594 
4595     const char * min_start = NULL;
4596     const char * max_start = NULL;
4597     const char * min_end = NULL;
4598     const char * max_end = NULL;
4599 
4600     bool has_comma = FALSE;
4601 
4602     PERL_ARGS_ASSERT_REGCURLY;
4603 
4604     if (s >= e || *s++ != '{')
4605         return FALSE;
4606 
4607     while (s < e && isBLANK(*s)) {
4608         s++;
4609     }
4610 
4611     if isDIGIT(*s) {
4612         min_start = s;
4613         do {
4614             s++;
4615         } while (s < e && isDIGIT(*s));
4616         min_end = s;
4617     }
4618 
4619     while (s < e && isBLANK(*s)) {
4620         s++;
4621     }
4622 
4623     if (*s == ',') {
4624         has_comma = TRUE;
4625         s++;
4626 
4627         while (s < e && isBLANK(*s)) {
4628             s++;
4629         }
4630 
4631         if isDIGIT(*s) {
4632             max_start = s;
4633             do {
4634                 s++;
4635             } while (s < e && isDIGIT(*s));
4636             max_end = s;
4637         }
4638     }
4639 
4640     while (s < e && isBLANK(*s)) {
4641         s++;
4642     }
4643                                /* Need at least one number */
4644     if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4645         return FALSE;
4646     }
4647 
4648     if (result) {
4649 
4650         result[RBRACE] = s;
4651 
4652         result[MIN_S] = min_start;
4653         result[MIN_E] = min_end;
4654         if (has_comma) {
4655             if (max_start) {
4656                 result[MAX_S] = max_start;
4657                 result[MAX_E] = max_end;
4658             }
4659             else {
4660                 /* Having no value after the comma is signalled by setting
4661                  * start and end to the same value.  What that value is isn't
4662                  * relevant; NULL is chosen simply because it will fail if the
4663                  * caller mistakenly uses it */
4664                 result[MAX_S] = result[MAX_E] = NULL;
4665             }
4666         }
4667         else {  /* No comma means lower and upper bounds are the same */
4668             result[MAX_S] = min_start;
4669             result[MAX_E] = min_end;
4670         }
4671     }
4672 
4673     return TRUE;
4674 }
4675 #endif
4676 
4677 U32
S_get_quantifier_value(pTHX_ RExC_state_t * pRExC_state,const char * start,const char * end)4678 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4679                        const char * start, const char * end)
4680 {
4681     /* This is a helper function for regpiece() to compute, given the
4682      * quantifier {m,n}, the value of either m or n, based on the starting
4683      * position 'start' in the string, through the byte 'end-1', returning it
4684      * if valid, and failing appropriately if not.  It knows the restrictions
4685      * imposed on quantifier values */
4686 
4687     UV uv;
4688     STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4689 
4690     PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4691 
4692     if (grok_atoUV(start, &uv, &end)) {
4693         if (uv < REG_INFTY) {   /* A valid, small-enough number */
4694             return (U32) uv;
4695         }
4696     }
4697     else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4698                                  leading zeros or overflow */
4699         RExC_parse_set((char * ) end);
4700 
4701         /* Perhaps too generic a msg for what is only failure from having
4702          * leading zeros, but this is how it's always behaved. */
4703         vFAIL("Invalid quantifier in {,}");
4704         NOT_REACHED; /*NOTREACHED*/
4705     }
4706 
4707     /* Here, found a quantifier, but was too large; either it overflowed or was
4708      * too big a legal number */
4709     RExC_parse_set((char * ) end);
4710     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4711 
4712     NOT_REACHED; /*NOTREACHED*/
4713     return U32_MAX; /* Perhaps some compilers will be expecting a return */
4714 }
4715 
4716 /*
4717  - regpiece - something followed by possible quantifier * + ? {n,m}
4718  *
4719  * Note that the branching code sequences used for ? and the general cases
4720  * of * and + are somewhat optimized:  they use the same NOTHING node as
4721  * both the endmarker for their branch list and the body of the last branch.
4722  * It might seem that this node could be dispensed with entirely, but the
4723  * endmarker role is not redundant.
4724  *
4725  * On success, returns the offset at which any next node should be placed into
4726  * the regex engine program being compiled.
4727  *
4728  * Returns 0 otherwise, with *flagp set to indicate why:
4729  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
4730  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
4731  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4732  */
4733 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)4734 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4735 {
4736     regnode_offset ret;
4737     char op;
4738     I32 flags;
4739     const char * const origparse = RExC_parse;
4740     I32 min;
4741     I32 max = REG_INFTY;
4742     I32 npar_before = RExC_npar-1;
4743 
4744     /* Save the original in case we change the emitted regop to a FAIL. */
4745     const regnode_offset orig_emit = RExC_emit;
4746 
4747     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4748 
4749     PERL_ARGS_ASSERT_REGPIECE;
4750 
4751     DEBUG_PARSE("piec");
4752 
4753     ret = regatom(pRExC_state, &flags, depth+1);
4754     if (ret == 0) {
4755         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4756         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4757     }
4758     I32 npar_after = RExC_npar-1;
4759 
4760     op = *RExC_parse;
4761     switch (op) {
4762         const char * regcurly_return[5];
4763 
4764       case '*':
4765         nextchar(pRExC_state);
4766         min = 0;
4767         break;
4768 
4769       case '+':
4770         nextchar(pRExC_state);
4771         min = 1;
4772         break;
4773 
4774       case '?':
4775         nextchar(pRExC_state);
4776         min = 0; max = 1;
4777         break;
4778 
4779       case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
4780                     to determine which */
4781         if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4782             const char * min_start = regcurly_return[MIN_S];
4783             const char * min_end   = regcurly_return[MIN_E];
4784             const char * max_start = regcurly_return[MAX_S];
4785             const char * max_end   = regcurly_return[MAX_E];
4786 
4787             if (min_start) {
4788                 min = get_quantifier_value(pRExC_state, min_start, min_end);
4789             }
4790             else {
4791                 min = 0;
4792             }
4793 
4794             if (max_start == max_end) {     /* Was of the form {m,} */
4795                 max = REG_INFTY;
4796             }
4797             else if (max_start == min_start) {  /* Was of the form {m} */
4798                 max = min;
4799             }
4800             else {  /* Was of the form {m,n} */
4801                 assert(max_end >= max_start);
4802 
4803                 max = get_quantifier_value(pRExC_state, max_start, max_end);
4804             }
4805 
4806             RExC_parse_set((char *) regcurly_return[RBRACE]);
4807             nextchar(pRExC_state);
4808 
4809             if (max < min) {    /* If can't match, warn and optimize to fail
4810                                    unconditionally */
4811                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4812                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4813                 NEXT_OFF(REGNODE_p(orig_emit)) =
4814                                     REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4815                 return ret;
4816             }
4817             else if (min == max && *RExC_parse == '?') {
4818                 ckWARN2reg(RExC_parse + 1,
4819                            "Useless use of greediness modifier '%c'",
4820                            *RExC_parse);
4821             }
4822 
4823             break;
4824         } /* End of is {m,n} */
4825 
4826         /* Here was a '{', but what followed it didn't form a quantifier. */
4827         /* FALLTHROUGH */
4828 
4829       default:
4830         *flagp = flags;
4831         return(ret);
4832         NOT_REACHED; /*NOTREACHED*/
4833     }
4834 
4835     /* Here we have a quantifier, and have calculated 'min' and 'max'.
4836      *
4837      * Check and possibly adjust a zero width operand */
4838     if (! (flags & (HASWIDTH|POSTPONED))) {
4839         if (max > REG_INFTY/3) {
4840             ckWARN2reg(RExC_parse,
4841                        "%" UTF8f " matches null string many times",
4842                        UTF8fARG(UTF, (RExC_parse >= origparse
4843                                      ? RExC_parse - origparse
4844                                      : 0),
4845                        origparse));
4846         }
4847 
4848         /* There's no point in trying to match something 0 length more than
4849          * once except for extra side effects, which we don't have here since
4850          * not POSTPONED */
4851         if (max > 1) {
4852             max = 1;
4853             if (min > max) {
4854                 min = max;
4855             }
4856         }
4857     }
4858 
4859     /* If this is a code block pass it up */
4860     *flagp |= (flags & POSTPONED);
4861 
4862     if (max > 0) {
4863         *flagp |= (flags & HASWIDTH);
4864         if (max == REG_INFTY)
4865             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4866     }
4867 
4868     /* 'SIMPLE' operands don't require full generality */
4869     if ((flags&SIMPLE)) {
4870         if (max == REG_INFTY) {
4871             if (min == 0) {
4872                 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4873                     goto min0_maxINF_wildcard_forbidden;
4874                 }
4875 
4876                 reginsert(pRExC_state, STAR, ret, depth+1);
4877                 MARK_NAUGHTY(4);
4878                 goto done_main_op;
4879             }
4880             else if (min == 1) {
4881                 reginsert(pRExC_state, PLUS, ret, depth+1);
4882                 MARK_NAUGHTY(3);
4883                 goto done_main_op;
4884             }
4885         }
4886 
4887         /* Here, SIMPLE, but not the '*' and '+' special cases */
4888 
4889         MARK_NAUGHTY_EXP(2, 2);
4890         reginsert(pRExC_state, CURLY, ret, depth+1);
4891     }
4892     else {  /* not SIMPLE */
4893         const regnode_offset w = reg_node(pRExC_state, WHILEM);
4894 
4895         FLAGS(REGNODE_p(w)) = 0;
4896         if (!  REGTAIL(pRExC_state, ret, w)) {
4897             REQUIRE_BRANCHJ(flagp, 0);
4898         }
4899         if (RExC_use_BRANCHJ) {
4900             reginsert(pRExC_state, LONGJMP, ret, depth+1);
4901             reginsert(pRExC_state, NOTHING, ret, depth+1);
4902             REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4903         }
4904         reginsert(pRExC_state, CURLYX, ret, depth+1);
4905         if (RExC_use_BRANCHJ)
4906             /* Go over NOTHING to LONGJMP. */
4907             REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4908 
4909         if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4910                                                   NOTHING)))
4911         {
4912             REQUIRE_BRANCHJ(flagp, 0);
4913         }
4914         RExC_whilem_seen++;
4915         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
4916     }
4917 
4918     /* Finish up the CURLY/CURLYX case */
4919     FLAGS(REGNODE_p(ret)) = 0;
4920 
4921     ARG1i_SET(REGNODE_p(ret), min);
4922     ARG2i_SET(REGNODE_p(ret), max);
4923 
4924     /* if we had a npar_after then we need to increment npar_before,
4925      * we want to track the range of parens we need to reset each iteration
4926      */
4927     if (npar_after!=npar_before) {
4928         ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4929         ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4930     } else {
4931         ARG3a_SET(REGNODE_p(ret), 0);
4932         ARG3b_SET(REGNODE_p(ret), 0);
4933     }
4934 
4935   done_main_op:
4936 
4937     /* Process any greediness modifiers */
4938     if (*RExC_parse == '?') {
4939         nextchar(pRExC_state);
4940         reginsert(pRExC_state, MINMOD, ret, depth+1);
4941         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4942             REQUIRE_BRANCHJ(flagp, 0);
4943         }
4944     }
4945     else if (*RExC_parse == '+') {
4946         regnode_offset ender;
4947         nextchar(pRExC_state);
4948         ender = reg_node(pRExC_state, SUCCEED);
4949         if (! REGTAIL(pRExC_state, ret, ender)) {
4950             REQUIRE_BRANCHJ(flagp, 0);
4951         }
4952         reginsert(pRExC_state, SUSPEND, ret, depth+1);
4953         ender = reg_node(pRExC_state, TAIL);
4954         if (! REGTAIL(pRExC_state, ret, ender)) {
4955             REQUIRE_BRANCHJ(flagp, 0);
4956         }
4957     }
4958 
4959     /* Forbid extra quantifiers */
4960     if (isQUANTIFIER(RExC_parse, RExC_end)) {
4961         RExC_parse_inc_by(1);
4962         vFAIL("Nested quantifiers");
4963     }
4964 
4965     return(ret);
4966 
4967   min0_maxINF_wildcard_forbidden:
4968 
4969     /* Here we are in a wildcard match, and the minimum match length is 0, and
4970      * the max could be infinity.  This is currently forbidden.  The only
4971      * reason is to make it harder to write patterns that take a long long time
4972      * to halt, and because the use of this construct isn't necessary in
4973      * matching Unicode property values */
4974     RExC_parse_inc_by(1);
4975     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4976        subpatterns in regex; marked by <-- HERE in m/%s/
4977      */
4978     vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4979           " subpatterns");
4980 
4981     /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4982      * legal at all in wildcards, so can't get this far */
4983 
4984     NOT_REACHED; /*NOTREACHED*/
4985 }
4986 
4987 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)4988 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4989                 regnode_offset * node_p,
4990                 UV * code_point_p,
4991                 int * cp_count,
4992                 I32 * flagp,
4993                 const bool strict,
4994                 const U32 depth
4995     )
4996 {
4997  /* This routine teases apart the various meanings of \N and returns
4998   * accordingly.  The input parameters constrain which meaning(s) is/are valid
4999   * in the current context.
5000   *
5001   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
5002   *
5003   * If <code_point_p> is not NULL, the context is expecting the result to be a
5004   * single code point.  If this \N instance turns out to a single code point,
5005   * the function returns TRUE and sets *code_point_p to that code point.
5006   *
5007   * If <node_p> is not NULL, the context is expecting the result to be one of
5008   * the things representable by a regnode.  If this \N instance turns out to be
5009   * one such, the function generates the regnode, returns TRUE and sets *node_p
5010   * to point to the offset of that regnode into the regex engine program being
5011   * compiled.
5012   *
5013   * If this instance of \N isn't legal in any context, this function will
5014   * generate a fatal error and not return.
5015   *
5016   * On input, RExC_parse should point to the first char following the \N at the
5017   * time of the call.  On successful return, RExC_parse will have been updated
5018   * to point to just after the sequence identified by this routine.  Also
5019   * *flagp has been updated as needed.
5020   *
5021   * When there is some problem with the current context and this \N instance,
5022   * the function returns FALSE, without advancing RExC_parse, nor setting
5023   * *node_p, nor *code_point_p, nor *flagp.
5024   *
5025   * If <cp_count> is not NULL, the caller wants to know the length (in code
5026   * points) that this \N sequence matches.  This is set, and the input is
5027   * parsed for errors, even if the function returns FALSE, as detailed below.
5028   *
5029   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
5030   *
5031   * Probably the most common case is for the \N to specify a single code point.
5032   * *cp_count will be set to 1, and *code_point_p will be set to that code
5033   * point.
5034   *
5035   * Another possibility is for the input to be an empty \N{}.  This is no
5036   * longer accepted, and will generate a fatal error.
5037   *
5038   * Another possibility is for a custom charnames handler to be in effect which
5039   * translates the input name to an empty string.  *cp_count will be set to 0.
5040   * *node_p will be set to a generated NOTHING node.
5041   *
5042   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
5043   * set to 0. *node_p will be set to a generated REG_ANY node.
5044   *
5045   * The fifth possibility is that \N resolves to a sequence of more than one
5046   * code points.  *cp_count will be set to the number of code points in the
5047   * sequence. *node_p will be set to a generated node returned by this
5048   * function calling S_reg().
5049   *
5050   * The sixth and final possibility is that it is premature to be calling this
5051   * function; the parse needs to be restarted.  This can happen when this
5052   * changes from /d to /u rules, or when the pattern needs to be upgraded to
5053   * UTF-8.  The latter occurs only when the fifth possibility would otherwise
5054   * be in effect, and is because one of those code points requires the pattern
5055   * to be recompiled as UTF-8.  The function returns FALSE, and sets the
5056   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
5057   * happens, the caller needs to desist from continuing parsing, and return
5058   * this information to its caller.  This is not set for when there is only one
5059   * code point, as this can be called as part of an ANYOF node, and they can
5060   * store above-Latin1 code points without the pattern having to be in UTF-8.
5061   *
5062   * For non-single-quoted regexes, the tokenizer has resolved character and
5063   * sequence names inside \N{...} into their Unicode values, normalizing the
5064   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
5065   * hex-represented code points in the sequence.  This is done there because
5066   * the names can vary based on what charnames pragma is in scope at the time,
5067   * so we need a way to take a snapshot of what they resolve to at the time of
5068   * the original parse. [perl #56444].
5069   *
5070   * That parsing is skipped for single-quoted regexes, so here we may get
5071   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
5072   * like '\N{U+41}', that code point is Unicode, and has to be translated into
5073   * the native character set for non-ASCII platforms.  The other possibilities
5074   * are already native, so no translation is done. */
5075 
5076     char * endbrace;    /* points to '}' following the name */
5077     char * e;           /* points to final non-blank before endbrace */
5078     char* p = RExC_parse; /* Temporary */
5079 
5080     SV * substitute_parse = NULL;
5081     char *orig_end;
5082     char *save_start;
5083     I32 flags;
5084 
5085     DECLARE_AND_GET_RE_DEBUG_FLAGS;
5086 
5087     PERL_ARGS_ASSERT_GROK_BSLASH_N;
5088 
5089     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
5090     assert(! (node_p && cp_count));               /* At most 1 should be set */
5091 
5092     if (cp_count) {     /* Initialize return for the most common case */
5093         *cp_count = 1;
5094     }
5095 
5096     /* The [^\n] meaning of \N ignores spaces and comments under the /x
5097      * modifier.  The other meanings do not (except blanks adjacent to and
5098      * within the braces), so use a temporary until we find out which we are
5099      * being called with */
5100     skip_to_be_ignored_text(pRExC_state, &p,
5101                             FALSE /* Don't force to /x */ );
5102 
5103     /* Disambiguate between \N meaning a named character versus \N meaning
5104      * [^\n].  The latter is assumed when the {...} following the \N is a legal
5105      * quantifier, or if there is no '{' at all */
5106     if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5107         RExC_parse_set(p);
5108         if (cp_count) {
5109             *cp_count = -1;
5110         }
5111 
5112         if (! node_p) {
5113             return FALSE;
5114         }
5115 
5116         *node_p = reg_node(pRExC_state, REG_ANY);
5117         *flagp |= HASWIDTH|SIMPLE;
5118         MARK_NAUGHTY(1);
5119         return TRUE;
5120     }
5121 
5122     /* The test above made sure that the next real character is a '{', but
5123      * under the /x modifier, it could be separated by space (or a comment and
5124      * \n) and this is not allowed (for consistency with \x{...} and the
5125      * tokenizer handling of \N{NAME}). */
5126     if (*RExC_parse != '{') {
5127         vFAIL("Missing braces on \\N{}");
5128     }
5129 
5130     RExC_parse_inc_by(1);       /* Skip past the '{' */
5131 
5132     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5133     if (! endbrace) { /* no trailing brace */
5134         vFAIL2("Missing right brace on \\%c{}", 'N');
5135     }
5136 
5137     /* Here, we have decided it should be a named character or sequence.  These
5138      * imply Unicode semantics */
5139     REQUIRE_UNI_RULES(flagp, FALSE);
5140 
5141     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5142      * nothing at all (not allowed under strict) */
5143     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5144         RExC_parse_set(endbrace);
5145         if (strict) {
5146             RExC_parse_inc_by(1);   /* Position after the "}" */
5147             vFAIL("Zero length \\N{}");
5148         }
5149 
5150         if (cp_count) {
5151             *cp_count = 0;
5152         }
5153         nextchar(pRExC_state);
5154         if (! node_p) {
5155             return FALSE;
5156         }
5157 
5158         *node_p = reg_node(pRExC_state, NOTHING);
5159         return TRUE;
5160     }
5161 
5162     while (isBLANK(*RExC_parse)) {
5163         RExC_parse_inc_by(1);
5164     }
5165 
5166     e = endbrace;
5167     while (RExC_parse < e && isBLANK(*(e-1))) {
5168         e--;
5169     }
5170 
5171     if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5172 
5173         /* Here, the name isn't of the form  U+....  This can happen if the
5174          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
5175          * is the time to find out what the name means */
5176 
5177         const STRLEN name_len = e - RExC_parse;
5178         SV *  value_sv;     /* What does this name evaluate to */
5179         SV ** value_svp;
5180         const U8 * value;   /* string of name's value */
5181         STRLEN value_len;   /* and its length */
5182 
5183         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
5184          *  toke.c, and their values. Make sure is initialized */
5185         if (! RExC_unlexed_names) {
5186             RExC_unlexed_names = newHV();
5187         }
5188 
5189         /* If we have already seen this name in this pattern, use that.  This
5190          * allows us to only call the charnames handler once per name per
5191          * pattern.  A broken or malicious handler could return something
5192          * different each time, which could cause the results to vary depending
5193          * on if something gets added or subtracted from the pattern that
5194          * causes the number of passes to change, for example */
5195         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5196                                                       name_len, 0)))
5197         {
5198             value_sv = *value_svp;
5199         }
5200         else { /* Otherwise we have to go out and get the name */
5201             const char * error_msg = NULL;
5202             value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5203                                                       UTF,
5204                                                       &error_msg);
5205             if (error_msg) {
5206                 RExC_parse_set(endbrace);
5207                 vFAIL(error_msg);
5208             }
5209 
5210             /* If no error message, should have gotten a valid return */
5211             assert (value_sv);
5212 
5213             /* Save the name's meaning for later use */
5214             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5215                            value_sv, 0))
5216             {
5217                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5218             }
5219         }
5220 
5221         /* Here, we have the value the name evaluates to in 'value_sv' */
5222         value = (U8 *) SvPV(value_sv, value_len);
5223 
5224         /* See if the result is one code point vs 0 or multiple */
5225         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5226                                   ? UTF8SKIP(value)
5227                                   : 1)))
5228         {
5229             /* Here, exactly one code point.  If that isn't what is wanted,
5230              * fail */
5231             if (! code_point_p) {
5232                 RExC_parse_set(p);
5233                 return FALSE;
5234             }
5235 
5236             /* Convert from string to numeric code point */
5237             *code_point_p = (SvUTF8(value_sv))
5238                             ? valid_utf8_to_uvchr(value, NULL)
5239                             : *value;
5240 
5241             /* Have parsed this entire single code point \N{...}.  *cp_count
5242              * has already been set to 1, so don't do it again. */
5243             RExC_parse_set(endbrace);
5244             nextchar(pRExC_state);
5245             return TRUE;
5246         } /* End of is a single code point */
5247 
5248         /* Count the code points, if caller desires.  The API says to do this
5249          * even if we will later return FALSE */
5250         if (cp_count) {
5251             *cp_count = 0;
5252 
5253             *cp_count = (SvUTF8(value_sv))
5254                         ? utf8_length(value, value + value_len)
5255                         : value_len;
5256         }
5257 
5258         /* Fail if caller doesn't want to handle a multi-code-point sequence.
5259          * But don't back the pointer up if the caller wants to know how many
5260          * code points there are (they need to handle it themselves in this
5261          * case).  */
5262         if (! node_p) {
5263             if (! cp_count) {
5264                 RExC_parse_set(p);
5265             }
5266             return FALSE;
5267         }
5268 
5269         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5270          * reg recursively to parse it.  That way, it retains its atomicness,
5271          * while not having to worry about any special handling that some code
5272          * points may have. */
5273 
5274         substitute_parse = newSVpvs("?:");
5275         sv_catsv(substitute_parse, value_sv);
5276         sv_catpv(substitute_parse, ")");
5277 
5278         /* The value should already be native, so no need to convert on EBCDIC
5279          * platforms.*/
5280         assert(! RExC_recode_x_to_native);
5281 
5282     }
5283     else {   /* \N{U+...} */
5284         Size_t count = 0;   /* code point count kept internally */
5285 
5286         /* We can get to here when the input is \N{U+...} or when toke.c has
5287          * converted a name to the \N{U+...} form.  This include changing a
5288          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5289 
5290         RExC_parse_inc_by(2);    /* Skip past the 'U+' */
5291 
5292         /* Code points are separated by dots.  The '}' terminates the whole
5293          * thing. */
5294 
5295         do {    /* Loop until the ending brace */
5296             I32 flags = PERL_SCAN_SILENT_OVERFLOW
5297                       | PERL_SCAN_SILENT_ILLDIGIT
5298                       | PERL_SCAN_NOTIFY_ILLDIGIT
5299                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5300                       | PERL_SCAN_DISALLOW_PREFIX;
5301             STRLEN len = e - RExC_parse;
5302             NV overflow_value;
5303             char * start_digit = RExC_parse;
5304             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5305 
5306             if (len == 0) {
5307                 RExC_parse_inc_by(1);
5308               bad_NU:
5309                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5310             }
5311 
5312             RExC_parse_inc_by(len);
5313 
5314             if (cp > MAX_LEGAL_CP) {
5315                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5316             }
5317 
5318             if (RExC_parse >= e) { /* Got to the closing '}' */
5319                 if (count) {
5320                     goto do_concat;
5321                 }
5322 
5323                 /* Here, is a single code point; fail if doesn't want that */
5324                 if (! code_point_p) {
5325                     RExC_parse_set(p);
5326                     return FALSE;
5327                 }
5328 
5329                 /* A single code point is easy to handle; just return it */
5330                 *code_point_p = UNI_TO_NATIVE(cp);
5331                 RExC_parse_set(endbrace);
5332                 nextchar(pRExC_state);
5333                 return TRUE;
5334             }
5335 
5336             /* Here, the parse stopped bfore the ending brace.  This is legal
5337              * only if that character is a dot separating code points, like a
5338              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5339              * So the next character must be a dot (and the one after that
5340              * can't be the ending brace, or we'd have something like
5341              * \N{U+100.} )
5342              * */
5343             if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5344                 /*point to after 1st invalid */
5345                 RExC_parse_incf(RExC_orig_utf8);
5346                 /*Guard against malformed utf8*/
5347                 RExC_parse_set(MIN(e, RExC_parse));
5348                 goto bad_NU;
5349             }
5350 
5351             /* Here, looks like its really a multiple character sequence.  Fail
5352              * if that's not what the caller wants.  But continue with counting
5353              * and error checking if they still want a count */
5354             if (! node_p && ! cp_count) {
5355                 return FALSE;
5356             }
5357 
5358             /* What is done here is to convert this to a sub-pattern of the
5359              * form \x{char1}\x{char2}...  and then call reg recursively to
5360              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
5361              * atomicness, while not having to worry about special handling
5362              * that some code points may have.  We don't create a subpattern,
5363              * but go through the motions of code point counting and error
5364              * checking, if the caller doesn't want a node returned. */
5365 
5366             if (node_p && ! substitute_parse) {
5367                 substitute_parse = newSVpvs("?:");
5368             }
5369 
5370           do_concat:
5371 
5372             if (node_p) {
5373                 /* Convert to notation the rest of the code understands */
5374                 sv_catpvs(substitute_parse, "\\x{");
5375                 sv_catpvn(substitute_parse, start_digit,
5376                                             RExC_parse - start_digit);
5377                 sv_catpvs(substitute_parse, "}");
5378             }
5379 
5380             /* Move to after the dot (or ending brace the final time through.)
5381              * */
5382             RExC_parse_inc_by(1);
5383             count++;
5384 
5385         } while (RExC_parse < e);
5386 
5387         if (! node_p) { /* Doesn't want the node */
5388             assert (cp_count);
5389 
5390             *cp_count = count;
5391             return FALSE;
5392         }
5393 
5394         sv_catpvs(substitute_parse, ")");
5395 
5396         /* The values are Unicode, and therefore have to be converted to native
5397          * on a non-Unicode (meaning non-ASCII) platform. */
5398         SET_recode_x_to_native(1);
5399     }
5400 
5401     /* Here, we have the string the name evaluates to, ready to be parsed,
5402      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5403      * constructs.  This can be called from within a substitute parse already.
5404      * The error reporting mechanism doesn't work for 2 levels of this, but the
5405      * code above has validated this new construct, so there should be no
5406      * errors generated by the below.  And this isn't an exact copy, so the
5407      * mechanism to seamlessly deal with this won't work, so turn off warnings
5408      * during it */
5409     save_start = RExC_start;
5410     orig_end = RExC_end;
5411 
5412     RExC_start = SvPVX(substitute_parse);
5413     RExC_parse_set(RExC_start);
5414     RExC_end = RExC_parse + SvCUR(substitute_parse);
5415     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5416 
5417     *node_p = reg(pRExC_state, 1, &flags, depth+1);
5418 
5419     /* Restore the saved values */
5420     RESTORE_WARNINGS;
5421     RExC_start = save_start;
5422     RExC_parse_set(endbrace);
5423     RExC_end = orig_end;
5424     SET_recode_x_to_native(0);
5425 
5426     SvREFCNT_dec_NN(substitute_parse);
5427 
5428     if (! *node_p) {
5429         RETURN_FAIL_ON_RESTART(flags, flagp);
5430         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5431             (UV) flags);
5432     }
5433     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5434 
5435     nextchar(pRExC_state);
5436 
5437     return TRUE;
5438 }
5439 
5440 
5441 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)5442 S_compute_EXACTish(RExC_state_t *pRExC_state)
5443 {
5444     U8 op;
5445 
5446     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5447 
5448     if (! FOLD) {
5449         return (LOC)
5450                 ? EXACTL
5451                 : EXACT;
5452     }
5453 
5454     op = get_regex_charset(RExC_flags);
5455     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5456         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5457                  been, so there is no hole */
5458     }
5459 
5460     return op + EXACTF;
5461 }
5462 
5463 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
5464  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5465 
5466 static I32
S_backref_value(char * p,char * e)5467 S_backref_value(char *p, char *e)
5468 {
5469     const char* endptr = e;
5470     UV val;
5471     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5472         return (I32)val;
5473     return I32_MAX;
5474 }
5475 
5476 
5477 /*
5478  - regatom - the lowest level
5479 
5480    Try to identify anything special at the start of the current parse position.
5481    If there is, then handle it as required. This may involve generating a
5482    single regop, such as for an assertion; or it may involve recursing, such as
5483    to handle a () structure.
5484 
5485    If the string doesn't start with something special then we gobble up
5486    as much literal text as we can.  If we encounter a quantifier, we have to
5487    back off the final literal character, as that quantifier applies to just it
5488    and not to the whole string of literals.
5489 
5490    Once we have been able to handle whatever type of thing started the
5491    sequence, we return the offset into the regex engine program being compiled
5492    at which any  next regnode should be placed.
5493 
5494    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5495    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5496    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5497    Otherwise does not return 0.
5498 
5499    Note: we have to be careful with escapes, as they can be both literal
5500    and special, and in the case of \10 and friends, context determines which.
5501 
5502    A summary of the code structure is:
5503 
5504    switch (first_byte) {
5505         cases for each special:
5506             handle this special;
5507             break;
5508         case '\\':
5509             switch (2nd byte) {
5510                 cases for each unambiguous special:
5511                     handle this special;
5512                     break;
5513                 cases for each ambiguous special/literal:
5514                     disambiguate;
5515                     if (special)  handle here
5516                     else goto defchar;
5517                 default: // unambiguously literal:
5518                     goto defchar;
5519             }
5520         default:  // is a literal char
5521             // FALL THROUGH
5522         defchar:
5523             create EXACTish node for literal;
5524             while (more input and node isn't full) {
5525                 switch (input_byte) {
5526                    cases for each special;
5527                        make sure parse pointer is set so that the next call to
5528                            regatom will see this special first
5529                        goto loopdone; // EXACTish node terminated by prev. char
5530                    default:
5531                        append char to EXACTISH node;
5532                 }
5533                 get next input byte;
5534             }
5535         loopdone:
5536    }
5537    return the generated node;
5538 
5539    Specifically there are two separate switches for handling
5540    escape sequences, with the one for handling literal escapes requiring
5541    a dummy entry for all of the special escapes that are actually handled
5542    by the other.
5543 
5544 */
5545 
5546 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)5547 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5548 {
5549     regnode_offset ret = 0;
5550     I32 flags = 0;
5551     char *atom_parse_start;
5552     U8 op;
5553     int invert = 0;
5554 
5555     DECLARE_AND_GET_RE_DEBUG_FLAGS;
5556 
5557     *flagp = 0;		/* Initialize. */
5558 
5559     DEBUG_PARSE("atom");
5560 
5561     PERL_ARGS_ASSERT_REGATOM;
5562 
5563   tryagain:
5564     atom_parse_start = RExC_parse;
5565     assert(RExC_parse < RExC_end);
5566     switch ((U8)*RExC_parse) {
5567     case '^':
5568         RExC_seen_zerolen++;
5569         nextchar(pRExC_state);
5570         if (RExC_flags & RXf_PMf_MULTILINE)
5571             ret = reg_node(pRExC_state, MBOL);
5572         else
5573             ret = reg_node(pRExC_state, SBOL);
5574         break;
5575     case '$':
5576         nextchar(pRExC_state);
5577         if (*RExC_parse)
5578             RExC_seen_zerolen++;
5579         if (RExC_flags & RXf_PMf_MULTILINE)
5580             ret = reg_node(pRExC_state, MEOL);
5581         else
5582             ret = reg_node(pRExC_state, SEOL);
5583         break;
5584     case '.':
5585         nextchar(pRExC_state);
5586         if (RExC_flags & RXf_PMf_SINGLELINE)
5587             ret = reg_node(pRExC_state, SANY);
5588         else
5589             ret = reg_node(pRExC_state, REG_ANY);
5590         *flagp |= HASWIDTH|SIMPLE;
5591         MARK_NAUGHTY(1);
5592         break;
5593     case '[':
5594     {
5595         char * const cc_parse_start = ++RExC_parse;
5596         ret = regclass(pRExC_state, flagp, depth+1,
5597                        FALSE, /* means parse the whole char class */
5598                        TRUE, /* allow multi-char folds */
5599                        FALSE, /* don't silence non-portable warnings. */
5600                        (bool) RExC_strict,
5601                        TRUE, /* Allow an optimized regnode result */
5602                        NULL);
5603         if (ret == 0) {
5604             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5605             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5606                   (UV) *flagp);
5607         }
5608         if (*RExC_parse != ']') {
5609             RExC_parse_set(cc_parse_start);
5610             vFAIL("Unmatched [");
5611         }
5612         nextchar(pRExC_state);
5613         break;
5614     }
5615     case '(':
5616         nextchar(pRExC_state);
5617         ret = reg(pRExC_state, 2, &flags, depth+1);
5618         if (ret == 0) {
5619                 if (flags & TRYAGAIN) {
5620                     if (RExC_parse >= RExC_end) {
5621                          /* Make parent create an empty node if needed. */
5622                         *flagp |= TRYAGAIN;
5623                         return(0);
5624                     }
5625                     goto tryagain;
5626                 }
5627                 RETURN_FAIL_ON_RESTART(flags, flagp);
5628                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5629                                                                  (UV) flags);
5630         }
5631         *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5632         break;
5633     case '|':
5634     case ')':
5635         if (flags & TRYAGAIN) {
5636             *flagp |= TRYAGAIN;
5637             return 0;
5638         }
5639         vFAIL("Internal urp");
5640                                 /* Supposed to be caught earlier. */
5641         break;
5642     case '?':
5643     case '+':
5644     case '*':
5645         RExC_parse_inc_by(1);
5646         vFAIL("Quantifier follows nothing");
5647         break;
5648     case '\\':
5649         /* Special Escapes
5650 
5651            This switch handles escape sequences that resolve to some kind
5652            of special regop and not to literal text. Escape sequences that
5653            resolve to literal text are handled below in the switch marked
5654            "Literal Escapes".
5655 
5656            Every entry in this switch *must* have a corresponding entry
5657            in the literal escape switch. However, the opposite is not
5658            required, as the default for this switch is to jump to the
5659            literal text handling code.
5660         */
5661         RExC_parse_inc_by(1);
5662         switch ((U8)*RExC_parse) {
5663         /* Special Escapes */
5664         case 'A':
5665             RExC_seen_zerolen++;
5666             /* Under wildcards, this is changed to match \n; should be
5667              * invisible to the user, as they have to compile under /m */
5668             if (RExC_pm_flags & PMf_WILDCARD) {
5669                 ret = reg_node(pRExC_state, MBOL);
5670             }
5671             else {
5672                 ret = reg_node(pRExC_state, SBOL);
5673                 /* SBOL is shared with /^/ so we set the flags so we can tell
5674                  * /\A/ from /^/ in split. */
5675                 FLAGS(REGNODE_p(ret)) = 1;
5676             }
5677             goto finish_meta_pat;
5678         case 'G':
5679             if (RExC_pm_flags & PMf_WILDCARD) {
5680                 RExC_parse_inc_by(1);
5681                 /* diag_listed_as: Use of %s is not allowed in Unicode property
5682                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5683                  */
5684                 vFAIL("Use of '\\G' is not allowed in Unicode property"
5685                       " wildcard subpatterns");
5686             }
5687             ret = reg_node(pRExC_state, GPOS);
5688             RExC_seen |= REG_GPOS_SEEN;
5689             goto finish_meta_pat;
5690         case 'K':
5691             if (!RExC_in_lookaround) {
5692                 RExC_seen_zerolen++;
5693                 ret = reg_node(pRExC_state, KEEPS);
5694                 /* XXX:dmq : disabling in-place substitution seems to
5695                  * be necessary here to avoid cases of memory corruption, as
5696                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5697                  */
5698                 RExC_seen |= REG_LOOKBEHIND_SEEN;
5699                 goto finish_meta_pat;
5700             }
5701             else {
5702                 ++RExC_parse; /* advance past the 'K' */
5703                 vFAIL("\\K not permitted in lookahead/lookbehind");
5704             }
5705         case 'Z':
5706             if (RExC_pm_flags & PMf_WILDCARD) {
5707                 /* See comment under \A above */
5708                 ret = reg_node(pRExC_state, MEOL);
5709             }
5710             else {
5711                 ret = reg_node(pRExC_state, SEOL);
5712             }
5713             RExC_seen_zerolen++;		/* Do not optimize RE away */
5714             goto finish_meta_pat;
5715         case 'z':
5716             if (RExC_pm_flags & PMf_WILDCARD) {
5717                 /* See comment under \A above */
5718                 ret = reg_node(pRExC_state, MEOL);
5719             }
5720             else {
5721                 ret = reg_node(pRExC_state, EOS);
5722             }
5723             RExC_seen_zerolen++;		/* Do not optimize RE away */
5724             goto finish_meta_pat;
5725         case 'C':
5726             vFAIL("\\C no longer supported");
5727         case 'X':
5728             ret = reg_node(pRExC_state, CLUMP);
5729             *flagp |= HASWIDTH;
5730             goto finish_meta_pat;
5731 
5732         case 'B':
5733             invert = 1;
5734             /* FALLTHROUGH */
5735         case 'b':
5736           {
5737             U8 flags = 0;
5738             regex_charset charset = get_regex_charset(RExC_flags);
5739 
5740             RExC_seen_zerolen++;
5741             RExC_seen |= REG_LOOKBEHIND_SEEN;
5742             op = BOUND + charset;
5743 
5744             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5745                 flags = TRADITIONAL_BOUND;
5746                 if (op > BOUNDA) {  /* /aa is same as /a */
5747                     op = BOUNDA;
5748                 }
5749             }
5750             else {
5751                 STRLEN length;
5752                 char name = *RExC_parse;
5753                 char * endbrace =  (char *) memchr(RExC_parse, '}',
5754                                                    RExC_end - RExC_parse);
5755                 char * e = endbrace;
5756 
5757                 RExC_parse_inc_by(2);
5758 
5759                 if (! endbrace) {
5760                     vFAIL2("Missing right brace on \\%c{}", name);
5761                 }
5762 
5763                 while (isBLANK(*RExC_parse)) {
5764                     RExC_parse_inc_by(1);
5765                 }
5766 
5767                 while (RExC_parse < e && isBLANK(*(e - 1))) {
5768                     e--;
5769                 }
5770 
5771                 if (e == RExC_parse) {
5772                     RExC_parse_set(endbrace + 1);  /* After the '}' */
5773                     vFAIL2("Empty \\%c{}", name);
5774                 }
5775 
5776                 length = e - RExC_parse;
5777 
5778                 switch (*RExC_parse) {
5779                     case 'g':
5780                         if (    length != 1
5781                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
5782                         {
5783                             goto bad_bound_type;
5784                         }
5785                         flags = GCB_BOUND;
5786                         break;
5787                     case 'l':
5788                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5789                             goto bad_bound_type;
5790                         }
5791                         flags = LB_BOUND;
5792                         break;
5793                     case 's':
5794                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5795                             goto bad_bound_type;
5796                         }
5797                         flags = SB_BOUND;
5798                         break;
5799                     case 'w':
5800                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5801                             goto bad_bound_type;
5802                         }
5803                         flags = WB_BOUND;
5804                         break;
5805                     default:
5806                       bad_bound_type:
5807                         RExC_parse_set(e);
5808                         vFAIL2utf8f(
5809                             "'%" UTF8f "' is an unknown bound type",
5810                             UTF8fARG(UTF, length, e - length));
5811                         NOT_REACHED; /*NOTREACHED*/
5812                 }
5813                 RExC_parse_set(endbrace);
5814                 REQUIRE_UNI_RULES(flagp, 0);
5815 
5816                 if (op == BOUND) {
5817                     op = BOUNDU;
5818                 }
5819                 else if (op >= BOUNDA) {  /* /aa is same as /a */
5820                     op = BOUNDU;
5821                     length += 4;
5822 
5823                     /* Don't have to worry about UTF-8, in this message because
5824                      * to get here the contents of the \b must be ASCII */
5825                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
5826                               "Using /u for '%.*s' instead of /%s",
5827                               (unsigned) length,
5828                               endbrace - length + 1,
5829                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5830                               ? ASCII_RESTRICT_PAT_MODS
5831                               : ASCII_MORE_RESTRICT_PAT_MODS);
5832                 }
5833             }
5834 
5835             if (op == BOUND) {
5836                 RExC_seen_d_op = TRUE;
5837             }
5838             else if (op == BOUNDL) {
5839                 RExC_contains_locale = 1;
5840             }
5841 
5842             if (invert) {
5843                 op += NBOUND - BOUND;
5844             }
5845 
5846             ret = reg_node(pRExC_state, op);
5847             FLAGS(REGNODE_p(ret)) = flags;
5848 
5849             goto finish_meta_pat;
5850           }
5851 
5852         case 'R':
5853             ret = reg_node(pRExC_state, LNBREAK);
5854             *flagp |= HASWIDTH|SIMPLE;
5855             goto finish_meta_pat;
5856 
5857         case 'd':
5858         case 'D':
5859         case 'h':
5860         case 'H':
5861         case 'p':
5862         case 'P':
5863         case 's':
5864         case 'S':
5865         case 'v':
5866         case 'V':
5867         case 'w':
5868         case 'W':
5869             /* These all have the same meaning inside [brackets], and it knows
5870              * how to do the best optimizations for them.  So, pretend we found
5871              * these within brackets, and let it do the work */
5872             RExC_parse--;
5873 
5874             ret = regclass(pRExC_state, flagp, depth+1,
5875                            TRUE, /* means just parse this element */
5876                            FALSE, /* don't allow multi-char folds */
5877                            FALSE, /* don't silence non-portable warnings.  It
5878                                      would be a bug if these returned
5879                                      non-portables */
5880                            (bool) RExC_strict,
5881                            TRUE, /* Allow an optimized regnode result */
5882                            NULL);
5883             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5884             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5885              * multi-char folds are allowed.  */
5886             if (!ret)
5887                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5888                       (UV) *flagp);
5889 
5890             RExC_parse--;   /* regclass() leaves this one too far ahead */
5891 
5892           finish_meta_pat:
5893                    /* The escapes above that don't take a parameter can't be
5894                     * followed by a '{'.  But 'pX', 'p{foo}' and
5895                     * correspondingly 'P' can be */
5896             if (   RExC_parse - atom_parse_start == 1
5897                 && UCHARAT(RExC_parse + 1) == '{'
5898                 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5899             {
5900                 RExC_parse_inc_by(2);
5901                 vFAIL("Unescaped left brace in regex is illegal here");
5902             }
5903             nextchar(pRExC_state);
5904             break;
5905         case 'N':
5906             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5907              * \N{...} evaluates to a sequence of more than one code points).
5908              * The function call below returns a regnode, which is our result.
5909              * The parameters cause it to fail if the \N{} evaluates to a
5910              * single code point; we handle those like any other literal.  The
5911              * reason that the multicharacter case is handled here and not as
5912              * part of the EXACtish code is because of quantifiers.  In
5913              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5914              * this way makes that Just Happen. dmq.
5915              * join_exact() will join this up with adjacent EXACTish nodes
5916              * later on, if appropriate. */
5917             ++RExC_parse;
5918             if (grok_bslash_N(pRExC_state,
5919                               &ret,     /* Want a regnode returned */
5920                               NULL,     /* Fail if evaluates to a single code
5921                                            point */
5922                               NULL,     /* Don't need a count of how many code
5923                                            points */
5924                               flagp,
5925                               RExC_strict,
5926                               depth)
5927             ) {
5928                 break;
5929             }
5930 
5931             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5932 
5933             /* Here, evaluates to a single code point.  Go get that */
5934             RExC_parse_set(atom_parse_start);
5935             goto defchar;
5936 
5937         case 'k':    /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5938       parse_named_seq:  /* Also handle non-numeric \g{...} */
5939         {
5940             char ch;
5941             if (   RExC_parse >= RExC_end - 1
5942                 || ((   ch = RExC_parse[1]) != '<'
5943                                       && ch != '\''
5944                                       && ch != '{'))
5945             {
5946                 RExC_parse_inc_by(1);
5947                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5948                 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5949             } else {
5950                 RExC_parse_inc_by(2);
5951                 if (ch == '{') {
5952                     while (isBLANK(*RExC_parse)) {
5953                         RExC_parse_inc_by(1);
5954                     }
5955                 }
5956                 ret = handle_named_backref(pRExC_state,
5957                                            flagp,
5958                                            atom_parse_start,
5959                                            (ch == '<')
5960                                            ? '>'
5961                                            : (ch == '{')
5962                                              ? '}'
5963                                              : '\'');
5964             }
5965             break;
5966         }
5967         case 'g':
5968         case '1': case '2': case '3': case '4':
5969         case '5': case '6': case '7': case '8': case '9':
5970             {
5971                 I32 num;
5972                 char * endbrace = NULL;
5973                 char * s = RExC_parse;
5974                 char * e = RExC_end;
5975 
5976                 if (*s == 'g') {
5977                     bool isrel = 0;
5978 
5979                     s++;
5980                     if (*s == '{') {
5981                         endbrace = (char *) memchr(s, '}', RExC_end - s);
5982                         if (! endbrace ) {
5983 
5984                             /* Missing '}'.  Position after the number to give
5985                              * a better indication to the user of where the
5986                              * problem is. */
5987                             s++;
5988                             if (*s == '-') {
5989                                 s++;
5990                             }
5991 
5992                             /* If it looks to be a name and not a number, go
5993                              * handle it there */
5994                             if (! isDIGIT(*s)) {
5995                                 goto parse_named_seq;
5996                             }
5997 
5998                             do {
5999                                 s++;
6000                             } while isDIGIT(*s);
6001 
6002                             RExC_parse_set(s);
6003                             vFAIL("Unterminated \\g{...} pattern");
6004                         }
6005 
6006                         s++;    /* Past the '{' */
6007 
6008                         while (isBLANK(*s)) {
6009                             s++;
6010                         }
6011 
6012                         /* Ignore trailing blanks */
6013                         e = endbrace;
6014                         while (s < e && isBLANK(*(e - 1))) {
6015                             e--;
6016                         }
6017                     }
6018 
6019                     /* Here, have isolated the meat of the construct from any
6020                      * surrounding braces */
6021 
6022                     if (*s == '-') {
6023                         isrel = 1;
6024                         s++;
6025                     }
6026 
6027                     if (endbrace && !isDIGIT(*s)) {
6028                         goto parse_named_seq;
6029                     }
6030 
6031                     RExC_parse_set(s);
6032                     num = S_backref_value(RExC_parse, RExC_end);
6033                     if (num == 0)
6034                         vFAIL("Reference to invalid group 0");
6035                     else if (num == I32_MAX) {
6036                          if (isDIGIT(*RExC_parse))
6037                             vFAIL("Reference to nonexistent group");
6038                         else
6039                             vFAIL("Unterminated \\g... pattern");
6040                     }
6041 
6042                     if (isrel) {
6043                         num = RExC_npar - num;
6044                         if (num < 1)
6045                             vFAIL("Reference to nonexistent or unclosed group");
6046                     }
6047                     else
6048                     if (num < RExC_logical_npar) {
6049                         num = RExC_logical_to_parno[num];
6050                     }
6051                     else
6052                     if (ALL_PARENS_COUNTED)  {
6053                         if (num < RExC_logical_total_parens)
6054                             num = RExC_logical_to_parno[num];
6055                         else {
6056                             num = -1;
6057                         }
6058                     }
6059                     else{
6060                         REQUIRE_PARENS_PASS;
6061                     }
6062                 }
6063                 else {
6064                     num = S_backref_value(RExC_parse, RExC_end);
6065                     /* bare \NNN might be backref or octal - if it is larger
6066                      * than or equal RExC_npar then it is assumed to be an
6067                      * octal escape. Note RExC_npar is +1 from the actual
6068                      * number of parens. */
6069                     /* Note we do NOT check if num == I32_MAX here, as that is
6070                      * handled by the RExC_npar check */
6071 
6072                     if (    /* any numeric escape < 10 is always a backref */
6073                            num > 9
6074                             /* any numeric escape < RExC_npar is a backref */
6075                         && num >= RExC_logical_npar
6076                             /* cannot be an octal escape if it starts with [89]
6077                              * */
6078                         && ! inRANGE(*RExC_parse, '8', '9')
6079                     ) {
6080                         /* Probably not meant to be a backref, instead likely
6081                          * to be an octal character escape, e.g. \35 or \777.
6082                          * The above logic should make it obvious why using
6083                          * octal escapes in patterns is problematic. - Yves */
6084                         RExC_parse_set(atom_parse_start);
6085                         goto defchar;
6086                     }
6087                     if (num < RExC_logical_npar) {
6088                         num = RExC_logical_to_parno[num];
6089                     }
6090                     else
6091                     if (ALL_PARENS_COUNTED) {
6092                         if (num < RExC_logical_total_parens) {
6093                             num = RExC_logical_to_parno[num];
6094                         } else {
6095                             num = -1;
6096                         }
6097                     } else {
6098                         REQUIRE_PARENS_PASS;
6099                     }
6100                 }
6101 
6102                 /* At this point RExC_parse points at a numeric escape like
6103                  * \12 or \88 or the digits in \g{34} or \g34 or something
6104                  * similar, which we should NOT treat as an octal escape. It
6105                  * may or may not be a valid backref escape. For instance
6106                  * \88888888 is unlikely to be a valid backref.
6107                  *
6108                  * We've already figured out what value the digits represent.
6109                  * Now, move the parse to beyond them. */
6110                 if (endbrace) {
6111                     RExC_parse_set(endbrace + 1);
6112                 }
6113                 else while (isDIGIT(*RExC_parse)) {
6114                     RExC_parse_inc_by(1);
6115                 }
6116                 if (num < 0)
6117                     vFAIL("Reference to nonexistent group");
6118 
6119                 if (num >= (I32)RExC_npar) {
6120                     /* It might be a forward reference; we can't fail until we
6121                      * know, by completing the parse to get all the groups, and
6122                      * then reparsing */
6123                     if (ALL_PARENS_COUNTED)  {
6124                         if (num >= RExC_total_parens)  {
6125                             vFAIL("Reference to nonexistent group");
6126                         }
6127                     }
6128                     else {
6129                         REQUIRE_PARENS_PASS;
6130                     }
6131                 }
6132                 RExC_sawback = 1;
6133                 ret = reg2node(pRExC_state,
6134                                ((! FOLD)
6135                                  ? REF
6136                                  : (ASCII_FOLD_RESTRICTED)
6137                                    ? REFFA
6138                                    : (AT_LEAST_UNI_SEMANTICS)
6139                                      ? REFFU
6140                                      : (LOC)
6141                                        ? REFFL
6142                                        : REFF),
6143                                 num, RExC_nestroot);
6144                 if (RExC_nestroot && num >= RExC_nestroot)
6145                     FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6146                 if (OP(REGNODE_p(ret)) == REFF) {
6147                     RExC_seen_d_op = TRUE;
6148                 }
6149                 *flagp |= HASWIDTH;
6150 
6151                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6152                                         FALSE /* Don't force to /x */ );
6153             }
6154             break;
6155         case '\0':
6156             if (RExC_parse >= RExC_end)
6157                 FAIL("Trailing \\");
6158             /* FALLTHROUGH */
6159         default:
6160             /* Do not generate "unrecognized" warnings here, we fall
6161                back into the quick-grab loop below */
6162             RExC_parse_set(atom_parse_start);
6163             goto defchar;
6164         } /* end of switch on a \foo sequence */
6165         break;
6166 
6167     case '#':
6168 
6169         /* '#' comments should have been spaced over before this function was
6170          * called */
6171         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6172         /*
6173         if (RExC_flags & RXf_PMf_EXTENDED) {
6174             RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6175             if (RExC_parse < RExC_end)
6176                 goto tryagain;
6177         }
6178         */
6179 
6180         /* FALLTHROUGH */
6181 
6182     default:
6183           defchar: {
6184 
6185             /* Here, we have determined that the next thing is probably a
6186              * literal character.  RExC_parse points to the first byte of its
6187              * definition.  (It still may be an escape sequence that evaluates
6188              * to a single character) */
6189 
6190             STRLEN len = 0;
6191             UV ender = 0;
6192             char *p;
6193             char *s, *old_s = NULL, *old_old_s = NULL;
6194             char *s0;
6195             U32 max_string_len = 255;
6196 
6197             /* We may have to reparse the node, artificially stopping filling
6198              * it early, based on info gleaned in the first parse.  This
6199              * variable gives where we stop.  Make it above the normal stopping
6200              * place first time through; otherwise it would stop too early */
6201             U32 upper_fill = max_string_len + 1;
6202 
6203             /* We start out as an EXACT node, even if under /i, until we find a
6204              * character which is in a fold.  The algorithm now segregates into
6205              * separate nodes, characters that fold from those that don't under
6206              * /i.  (This hopefully will create nodes that are fixed strings
6207              * even under /i, giving the optimizer something to grab on to.)
6208              * So, if a node has something in it and the next character is in
6209              * the opposite category, that node is closed up, and the function
6210              * returns.  Then regatom is called again, and a new node is
6211              * created for the new category. */
6212             U8 node_type = EXACT;
6213 
6214             /* Assume the node will be fully used; the excess is given back at
6215              * the end.  Under /i, we may need to temporarily add the fold of
6216              * an extra character or two at the end to check for splitting
6217              * multi-char folds, so allocate extra space for that.   We can't
6218              * make any other length assumptions, as a byte input sequence
6219              * could shrink down. */
6220             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6221                                                  + ((! FOLD)
6222                                                     ? 0
6223                                                     : 2 * ((UTF)
6224                                                            ? UTF8_MAXBYTES_CASE
6225                         /* Max non-UTF-8 expansion is 2 */ : 2)));
6226 
6227             bool next_is_quantifier;
6228             char * oldp = NULL;
6229 
6230             /* We can convert EXACTF nodes to EXACTFU if they contain only
6231              * characters that match identically regardless of the target
6232              * string's UTF8ness.  The reason to do this is that EXACTF is not
6233              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6234              * runtime.
6235              *
6236              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6237              * contain only above-Latin1 characters (hence must be in UTF8),
6238              * which don't participate in folds with Latin1-range characters,
6239              * as the latter's folds aren't known until runtime. */
6240             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6241 
6242             /* Single-character EXACTish nodes are almost always SIMPLE.  This
6243              * allows us to override this as encountered */
6244             U8 maybe_SIMPLE = SIMPLE;
6245 
6246             /* Does this node contain something that can't match unless the
6247              * target string is (also) in UTF-8 */
6248             bool requires_utf8_target = FALSE;
6249 
6250             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6251             bool has_ss = FALSE;
6252 
6253             /* So is the MICRO SIGN */
6254             bool has_micro_sign = FALSE;
6255 
6256             /* Set when we fill up the current node and there is still more
6257              * text to process */
6258             bool overflowed;
6259 
6260             /* Allocate an EXACT node.  The node_type may change below to
6261              * another EXACTish node, but since the size of the node doesn't
6262              * change, it works */
6263             ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6264             FILL_NODE(ret, node_type);
6265             RExC_emit += NODE_STEP_REGNODE;
6266 
6267             s = STRING(REGNODE_p(ret));
6268 
6269             s0 = s;
6270 
6271           reparse:
6272 
6273             p = RExC_parse;
6274             len = 0;
6275             s = s0;
6276             node_type = EXACT;
6277             oldp = NULL;
6278             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6279             maybe_SIMPLE = SIMPLE;
6280             requires_utf8_target = FALSE;
6281             has_ss = FALSE;
6282             has_micro_sign = FALSE;
6283 
6284           continue_parse:
6285 
6286             /* This breaks under rare circumstances.  If folding, we do not
6287              * want to split a node at a character that is a non-final in a
6288              * multi-char fold, as an input string could just happen to want to
6289              * match across the node boundary.  The code at the end of the loop
6290              * looks for this, and backs off until it finds not such a
6291              * character, but it is possible (though extremely, extremely
6292              * unlikely) for all characters in the node to be non-final fold
6293              * ones, in which case we just leave the node fully filled, and
6294              * hope that it doesn't match the string in just the wrong place */
6295 
6296             assert( ! UTF     /* Is at the beginning of a character */
6297                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6298                    || UTF8_IS_START(UCHARAT(RExC_parse)));
6299 
6300             overflowed = FALSE;
6301 
6302             /* Here, we have a literal character.  Find the maximal string of
6303              * them in the input that we can fit into a single EXACTish node.
6304              * We quit at the first non-literal or when the node gets full, or
6305              * under /i the categorization of folding/non-folding character
6306              * changes */
6307             while (p < RExC_end && len < upper_fill) {
6308 
6309                 /* In most cases each iteration adds one byte to the output.
6310                  * The exceptions override this */
6311                 Size_t added_len = 1;
6312 
6313                 oldp = p;
6314                 old_old_s = old_s;
6315                 old_s = s;
6316 
6317                 /* White space has already been ignored */
6318                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
6319                        || ! is_PATWS_safe((p), RExC_end, UTF));
6320 
6321                 switch ((U8)*p) {
6322                   const char* message;
6323                   U32 packed_warn;
6324                   U8 grok_c_char;
6325 
6326                 case '^':
6327                 case '$':
6328                 case '.':
6329                 case '[':
6330                 case '(':
6331                 case ')':
6332                 case '|':
6333                     goto loopdone;
6334                 case '\\':
6335                     /* Literal Escapes Switch
6336 
6337                        This switch is meant to handle escape sequences that
6338                        resolve to a literal character.
6339 
6340                        Every escape sequence that represents something
6341                        else, like an assertion or a char class, is handled
6342                        in the switch marked 'Special Escapes' above in this
6343                        routine, but also has an entry here as anything that
6344                        isn't explicitly mentioned here will be treated as
6345                        an unescaped equivalent literal.
6346                     */
6347 
6348                     switch ((U8)*++p) {
6349 
6350                     /* These are all the special escapes. */
6351                     case 'A':             /* Start assertion */
6352                     case 'b': case 'B':   /* Word-boundary assertion*/
6353                     case 'C':             /* Single char !DANGEROUS! */
6354                     case 'd': case 'D':   /* digit class */
6355                     case 'g': case 'G':   /* generic-backref, pos assertion */
6356                     case 'h': case 'H':   /* HORIZWS */
6357                     case 'k': case 'K':   /* named backref, keep marker */
6358                     case 'p': case 'P':   /* Unicode property */
6359                               case 'R':   /* LNBREAK */
6360                     case 's': case 'S':   /* space class */
6361                     case 'v': case 'V':   /* VERTWS */
6362                     case 'w': case 'W':   /* word class */
6363                     case 'X':             /* eXtended Unicode "combining
6364                                              character sequence" */
6365                     case 'z': case 'Z':   /* End of line/string assertion */
6366                         --p;
6367                         goto loopdone;
6368 
6369                     /* Anything after here is an escape that resolves to a
6370                        literal. (Except digits, which may or may not)
6371                      */
6372                     case 'n':
6373                         ender = '\n';
6374                         p++;
6375                         break;
6376                     case 'N': /* Handle a single-code point named character. */
6377                         RExC_parse_set( p + 1 );
6378                         if (! grok_bslash_N(pRExC_state,
6379                                             NULL,   /* Fail if evaluates to
6380                                                        anything other than a
6381                                                        single code point */
6382                                             &ender, /* The returned single code
6383                                                        point */
6384                                             NULL,   /* Don't need a count of
6385                                                        how many code points */
6386                                             flagp,
6387                                             RExC_strict,
6388                                             depth)
6389                         ) {
6390                             if (*flagp & NEED_UTF8)
6391                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
6392                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6393 
6394                             /* Here, it wasn't a single code point.  Go close
6395                              * up this EXACTish node.  The switch() prior to
6396                              * this switch handles the other cases */
6397                             p = oldp;
6398                             RExC_parse_set(p);
6399                             goto loopdone;
6400                         }
6401                         p = RExC_parse;
6402                         RExC_parse_set(atom_parse_start);
6403 
6404                         /* The \N{} means the pattern, if previously /d,
6405                          * becomes /u.  That means it can't be an EXACTF node,
6406                          * but an EXACTFU */
6407                         if (node_type == EXACTF) {
6408                             node_type = EXACTFU;
6409 
6410                             /* If the node already contains something that
6411                              * differs between EXACTF and EXACTFU, reparse it
6412                              * as EXACTFU */
6413                             if (! maybe_exactfu) {
6414                                 len = 0;
6415                                 s = s0;
6416                                 goto reparse;
6417                             }
6418                         }
6419 
6420                         break;
6421                     case 'r':
6422                         ender = '\r';
6423                         p++;
6424                         break;
6425                     case 't':
6426                         ender = '\t';
6427                         p++;
6428                         break;
6429                     case 'f':
6430                         ender = '\f';
6431                         p++;
6432                         break;
6433                     case 'e':
6434                         ender = ESC_NATIVE;
6435                         p++;
6436                         break;
6437                     case 'a':
6438                         ender = '\a';
6439                         p++;
6440                         break;
6441                     case 'o':
6442                         if (! grok_bslash_o(&p,
6443                                             RExC_end,
6444                                             &ender,
6445                                             &message,
6446                                             &packed_warn,
6447                                             (bool) RExC_strict,
6448                                             FALSE, /* No illegal cp's */
6449                                             UTF))
6450                         {
6451                             RExC_parse_set(p); /* going to die anyway; point to
6452                                                exact spot of failure */
6453                             vFAIL(message);
6454                         }
6455 
6456                         if (message && TO_OUTPUT_WARNINGS(p)) {
6457                             warn_non_literal_string(p, packed_warn, message);
6458                         }
6459                         break;
6460                     case 'x':
6461                         if (! grok_bslash_x(&p,
6462                                             RExC_end,
6463                                             &ender,
6464                                             &message,
6465                                             &packed_warn,
6466                                             (bool) RExC_strict,
6467                                             FALSE, /* No illegal cp's */
6468                                             UTF))
6469                         {
6470                             RExC_parse_set(p);        /* going to die anyway; point
6471                                                    to exact spot of failure */
6472                             vFAIL(message);
6473                         }
6474 
6475                         if (message && TO_OUTPUT_WARNINGS(p)) {
6476                             warn_non_literal_string(p, packed_warn, message);
6477                         }
6478 
6479 #ifdef EBCDIC
6480                         if (ender < 0x100) {
6481                             if (RExC_recode_x_to_native) {
6482                                 ender = LATIN1_TO_NATIVE(ender);
6483                             }
6484                         }
6485 #endif
6486                         break;
6487                     case 'c':
6488                         p++;
6489                         if (! grok_bslash_c(*p, &grok_c_char,
6490                                             &message, &packed_warn))
6491                         {
6492                             /* going to die anyway; point to exact spot of
6493                              * failure */
6494                             char *new_p= p + ((UTF)
6495                                               ? UTF8_SAFE_SKIP(p, RExC_end)
6496                                               : 1);
6497                             RExC_parse_set(new_p);
6498                             vFAIL(message);
6499                         }
6500 
6501                         ender = grok_c_char;
6502                         p++;
6503                         if (message && TO_OUTPUT_WARNINGS(p)) {
6504                             warn_non_literal_string(p, packed_warn, message);
6505                         }
6506 
6507                         break;
6508                     case '8': case '9': /* must be a backreference */
6509                         --p;
6510                         /* we have an escape like \8 which cannot be an octal escape
6511                          * so we exit the loop, and let the outer loop handle this
6512                          * escape which may or may not be a legitimate backref. */
6513                         goto loopdone;
6514                     case '1': case '2': case '3':case '4':
6515                     case '5': case '6': case '7':
6516 
6517                         /* When we parse backslash escapes there is ambiguity
6518                          * between backreferences and octal escapes. Any escape
6519                          * from \1 - \9 is a backreference, any multi-digit
6520                          * escape which does not start with 0 and which when
6521                          * evaluated as decimal could refer to an already
6522                          * parsed capture buffer is a back reference. Anything
6523                          * else is octal.
6524                          *
6525                          * Note this implies that \118 could be interpreted as
6526                          * 118 OR as "\11" . "8" depending on whether there
6527                          * were 118 capture buffers defined already in the
6528                          * pattern.  */
6529 
6530                         /* NOTE, RExC_npar is 1 more than the actual number of
6531                          * parens we have seen so far, hence the "<" as opposed
6532                          * to "<=" */
6533                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6534                         {  /* Not to be treated as an octal constant, go
6535                                    find backref */
6536                             p = oldp;
6537                             goto loopdone;
6538                         }
6539                         /* FALLTHROUGH */
6540                     case '0':
6541                         {
6542                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6543                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
6544                             STRLEN numlen = 3;
6545                             ender = grok_oct(p, &numlen, &flags, NULL);
6546                             p += numlen;
6547                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6548                                 && isDIGIT(*p)  /* like \08, \178 */
6549                                 && ckWARN(WARN_REGEXP))
6550                             {
6551                                 reg_warn_non_literal_string(
6552                                      p + 1,
6553                                      form_alien_digit_msg(8, numlen, p,
6554                                                         RExC_end, UTF, FALSE));
6555                             }
6556                         }
6557                         break;
6558                     case '\0':
6559                         if (p >= RExC_end)
6560                             FAIL("Trailing \\");
6561                         /* FALLTHROUGH */
6562                     default:
6563                         if (isALPHANUMERIC(*p)) {
6564                             /* An alpha followed by '{' is going to fail next
6565                              * iteration, so don't output this warning in that
6566                              * case */
6567                             if (! isALPHA(*p) || *(p + 1) != '{') {
6568                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6569                                                   " passed through", p);
6570                             }
6571                         }
6572                         goto normal_default;
6573                     } /* End of switch on '\' */
6574                     break;
6575                 case '{':
6576                     /* Trying to gain new uses for '{' without breaking too
6577                      * much existing code is hard.  The solution currently
6578                      * adopted is:
6579                      *  1)  If there is no ambiguity that a '{' should always
6580                      *      be taken literally, at the start of a construct, we
6581                      *      just do so.
6582                      *  2)  If the literal '{' conflicts with our desired use
6583                      *      of it as a metacharacter, we die.  The deprecation
6584                      *      cycles for this have come and gone.
6585                      *  3)  If there is ambiguity, we raise a simple warning.
6586                      *      This could happen, for example, if the user
6587                      *      intended it to introduce a quantifier, but slightly
6588                      *      misspelled the quantifier.  Without this warning,
6589                      *      the quantifier would silently be taken as a literal
6590                      *      string of characters instead of a meta construct */
6591                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6592                         if (      RExC_strict
6593                             || (  p > atom_parse_start + 1
6594                                 && isALPHA_A(*(p - 1))
6595                                 && *(p - 2) == '\\'))
6596                         {
6597                             RExC_parse_set(p + 1);
6598                             vFAIL("Unescaped left brace in regex is "
6599                                   "illegal here");
6600                         }
6601                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
6602                                          " passed through");
6603                     }
6604                     goto normal_default;
6605                 case '}':
6606                 case ']':
6607                     if (p > RExC_parse && RExC_strict) {
6608                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6609                     }
6610                     /*FALLTHROUGH*/
6611                 default:    /* A literal character */
6612                   normal_default:
6613                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
6614                         STRLEN numlen;
6615                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6616                                                &numlen, UTF8_ALLOW_DEFAULT);
6617                         p += numlen;
6618                     }
6619                     else
6620                         ender = (U8) *p++;
6621                     break;
6622                 } /* End of switch on the literal */
6623 
6624                 /* Here, have looked at the literal character, and <ender>
6625                  * contains its ordinal; <p> points to the character after it.
6626                  * */
6627 
6628                 if (ender > 255) {
6629                     REQUIRE_UTF8(flagp);
6630                     if (   UNICODE_IS_PERL_EXTENDED(ender)
6631                         && TO_OUTPUT_WARNINGS(p))
6632                     {
6633                         ckWARN2_non_literal_string(p,
6634                                                    packWARN(WARN_PORTABLE),
6635                                                    PL_extended_cp_format,
6636                                                    ender);
6637                     }
6638                 }
6639 
6640                 /* We need to check if the next non-ignored thing is a
6641                  * quantifier.  Move <p> to after anything that should be
6642                  * ignored, which, as a side effect, positions <p> for the next
6643                  * loop iteration */
6644                 skip_to_be_ignored_text(pRExC_state, &p,
6645                                         FALSE /* Don't force to /x */ );
6646 
6647                 /* If the next thing is a quantifier, it applies to this
6648                  * character only, which means that this character has to be in
6649                  * its own node and can't just be appended to the string in an
6650                  * existing node, so if there are already other characters in
6651                  * the node, close the node with just them, and set up to do
6652                  * this character again next time through, when it will be the
6653                  * only thing in its new node */
6654 
6655                 next_is_quantifier =    LIKELY(p < RExC_end)
6656                                      && UNLIKELY(isQUANTIFIER(p, RExC_end));
6657 
6658                 if (next_is_quantifier && LIKELY(len)) {
6659                     p = oldp;
6660                     goto loopdone;
6661                 }
6662 
6663                 /* Ready to add 'ender' to the node */
6664 
6665                 if (! FOLD) {  /* The simple case, just append the literal */
6666                   not_fold_common:
6667 
6668                     /* Don't output if it would overflow */
6669                     if (UNLIKELY(len > max_string_len - ((UTF)
6670                                                       ? UVCHR_SKIP(ender)
6671                                                       : 1)))
6672                     {
6673                         overflowed = TRUE;
6674                         break;
6675                     }
6676 
6677                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6678                         *(s++) = (char) ender;
6679                     }
6680                     else {
6681                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6682                         added_len = (char *) new_s - s;
6683                         s = (char *) new_s;
6684 
6685                         if (ender > 255)  {
6686                             requires_utf8_target = TRUE;
6687                         }
6688                     }
6689                 }
6690                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6691 
6692                     /* Here are folding under /l, and the code point is
6693                      * problematic.  If this is the first character in the
6694                      * node, change the node type to folding.   Otherwise, if
6695                      * this is the first problematic character, close up the
6696                      * existing node, so can start a new node with this one */
6697                     if (! len) {
6698                         node_type = EXACTFL;
6699                         RExC_contains_locale = 1;
6700                     }
6701                     else if (node_type == EXACT) {
6702                         p = oldp;
6703                         goto loopdone;
6704                     }
6705 
6706                     /* This problematic code point means we can't simplify
6707                      * things */
6708                     maybe_exactfu = FALSE;
6709 
6710                     /* Although these two characters have folds that are
6711                      * locale-problematic, they also have folds to above Latin1
6712                      * that aren't a problem.  Doing these now helps at
6713                      * runtime. */
6714                     if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
6715                                  || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6716                     {
6717                         goto fold_anyway;
6718                     }
6719 
6720                     /* Here, we are adding a problematic fold character.
6721                      * "Problematic" in this context means that its fold isn't
6722                      * known until runtime.  (The non-problematic code points
6723                      * are the above-Latin1 ones that fold to also all
6724                      * above-Latin1.  Their folds don't vary no matter what the
6725                      * locale is.) But here we have characters whose fold
6726                      * depends on the locale.  We just add in the unfolded
6727                      * character, and wait until runtime to fold it */
6728                     goto not_fold_common;
6729                 }
6730                 else /* regular fold; see if actually is in a fold */
6731                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6732                          || (ender > 255
6733                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6734                 {
6735                     /* Here, folding, but the character isn't in a fold.
6736                      *
6737                      * Start a new node if previous characters in the node were
6738                      * folded */
6739                     if (len && node_type != EXACT) {
6740                         p = oldp;
6741                         goto loopdone;
6742                     }
6743 
6744                     /* Here, continuing a node with non-folded characters.  Add
6745                      * this one */
6746                     goto not_fold_common;
6747                 }
6748                 else {  /* Here, does participate in some fold */
6749 
6750                     /* If this is the first character in the node, change its
6751                      * type to folding.  Otherwise, if this is the first
6752                      * folding character in the node, close up the existing
6753                      * node, so can start a new node with this one.  */
6754                     if (! len) {
6755                         node_type = compute_EXACTish(pRExC_state);
6756                     }
6757                     else if (node_type == EXACT) {
6758                         p = oldp;
6759                         goto loopdone;
6760                     }
6761 
6762                     if (UTF) {  /* Alway use the folded value for UTF-8
6763                                    patterns */
6764                         if (UVCHR_IS_INVARIANT(ender)) {
6765                             if (UNLIKELY(len + 1 > max_string_len)) {
6766                                 overflowed = TRUE;
6767                                 break;
6768                             }
6769 
6770                             *(s)++ = (U8) toFOLD(ender);
6771                         }
6772                         else {
6773                             UV folded;
6774 
6775                           fold_anyway:
6776                             folded = _to_uni_fold_flags(
6777                                     ender,
6778                                     (U8 *) s,  /* We have allocated extra space
6779                                                   in 's' so can't run off the
6780                                                   end */
6781                                     &added_len,
6782                                     FOLD_FLAGS_FULL
6783                                   | ((   ASCII_FOLD_RESTRICTED
6784                                       || node_type == EXACTFL)
6785                                     ? FOLD_FLAGS_NOMIX_ASCII
6786                                     : 0));
6787                             if (UNLIKELY(len + added_len > max_string_len)) {
6788                                 overflowed = TRUE;
6789                                 break;
6790                             }
6791 
6792                             s += added_len;
6793 
6794                             if (   folded > 255
6795                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6796                             {
6797                                 /* U+B5 folds to the MU, so its possible for a
6798                                  * non-UTF-8 target to match it */
6799                                 requires_utf8_target = TRUE;
6800                             }
6801                         }
6802                     }
6803                     else { /* Here is non-UTF8. */
6804 
6805                         /* The fold will be one or (rarely) two characters.
6806                          * Check that there's room for at least a single one
6807                          * before setting any flags, etc.  Because otherwise an
6808                          * overflowing character could cause a flag to be set
6809                          * even though it doesn't end up in this node.  (For
6810                          * the two character fold, we check again, before
6811                          * setting any flags) */
6812                         if (UNLIKELY(len + 1 > max_string_len)) {
6813                             overflowed = TRUE;
6814                             break;
6815                         }
6816 
6817 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
6818    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
6819                                       || UNICODE_DOT_DOT_VERSION > 0)
6820 
6821                         /* On non-ancient Unicodes, check for the only possible
6822                          * multi-char fold  */
6823                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6824 
6825                             /* This potential multi-char fold means the node
6826                              * can't be simple (because it could match more
6827                              * than a single char).  And in some cases it will
6828                              * match 'ss', so set that flag */
6829                             maybe_SIMPLE = 0;
6830                             has_ss = TRUE;
6831 
6832                             /* It can't change to be an EXACTFU (unless already
6833                              * is one).  We fold it iff under /u rules. */
6834                             if (node_type != EXACTFU) {
6835                                 maybe_exactfu = FALSE;
6836                             }
6837                             else {
6838                                 if (UNLIKELY(len + 2 > max_string_len)) {
6839                                     overflowed = TRUE;
6840                                     break;
6841                                 }
6842 
6843                                 *(s++) = 's';
6844                                 *(s++) = 's';
6845                                 added_len = 2;
6846 
6847                                 goto done_with_this_char;
6848                             }
6849                         }
6850                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6851                                  && LIKELY(len > 0)
6852                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6853                         {
6854                             /* Also, the sequence 'ss' is special when not
6855                              * under /u.  If the target string is UTF-8, it
6856                              * should match SHARP S; otherwise it won't.  So,
6857                              * here we have to exclude the possibility of this
6858                              * node moving to /u.*/
6859                             has_ss = TRUE;
6860                             maybe_exactfu = FALSE;
6861                         }
6862 #endif
6863                         /* Here, the fold will be a single character */
6864 
6865                         if (UNLIKELY(ender == MICRO_SIGN)) {
6866                             has_micro_sign = TRUE;
6867                         }
6868                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6869 
6870                             /* If the character's fold differs between /d and
6871                              * /u, this can't change to be an EXACTFU node */
6872                             maybe_exactfu = FALSE;
6873                         }
6874 
6875                         *(s++) = (DEPENDS_SEMANTICS)
6876                                  ? (char) toFOLD(ender)
6877 
6878                                    /* Under /u, the fold of any character in
6879                                     * the 0-255 range happens to be its
6880                                     * lowercase equivalent, except for LATIN
6881                                     * SMALL LETTER SHARP S, which was handled
6882                                     * above, and the MICRO SIGN, whose fold
6883                                     * requires UTF-8 to represent.  */
6884                                  : (char) toLOWER_L1(ender);
6885                     }
6886                 } /* End of adding current character to the node */
6887 
6888               done_with_this_char:
6889 
6890                 len += added_len;
6891 
6892                 if (next_is_quantifier) {
6893 
6894                     /* Here, the next input is a quantifier, and to get here,
6895                      * the current character is the only one in the node. */
6896                     goto loopdone;
6897                 }
6898 
6899             } /* End of loop through literal characters */
6900 
6901             /* Here we have either exhausted the input or run out of room in
6902              * the node.  If the former, we are done.  (If we encountered a
6903              * character that can't be in the node, transfer is made directly
6904              * to <loopdone>, and so we wouldn't have fallen off the end of the
6905              * loop.)  */
6906             if (LIKELY(! overflowed)) {
6907                 goto loopdone;
6908             }
6909 
6910             /* Here we have run out of room.  We can grow plain EXACT and
6911              * LEXACT nodes.  If the pattern is gigantic enough, though,
6912              * eventually we'll have to artificially chunk the pattern into
6913              * multiple nodes. */
6914             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6915                 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6916                 Size_t overhead_expansion = 0;
6917                 char temp[256];
6918                 Size_t max_nodes_for_string;
6919                 Size_t achievable;
6920                 SSize_t delta;
6921 
6922                 /* Here we couldn't fit the final character in the current
6923                  * node, so it will have to be reparsed, no matter what else we
6924                  * do */
6925                 p = oldp;
6926 
6927                 /* If would have overflowed a regular EXACT node, switch
6928                  * instead to an LEXACT.  The code below is structured so that
6929                  * the actual growing code is common to changing from an EXACT
6930                  * or just increasing the LEXACT size.  This means that we have
6931                  * to save the string in the EXACT case before growing, and
6932                  * then copy it afterwards to its new location */
6933                 if (node_type == EXACT) {
6934                     overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6935                     RExC_emit += overhead_expansion;
6936                     Copy(s0, temp, len, char);
6937                 }
6938 
6939                 /* Ready to grow.  If it was a plain EXACT, the string was
6940                  * saved, and the first few bytes of it overwritten by adding
6941                  * an argument field.  We assume, as we do elsewhere in this
6942                  * file, that one byte of remaining input will translate into
6943                  * one byte of output, and if that's too small, we grow again,
6944                  * if too large the excess memory is freed at the end */
6945 
6946                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6947                 achievable = MIN(max_nodes_for_string,
6948                                  current_string_nodes + STR_SZ(RExC_end - p));
6949                 delta = achievable - current_string_nodes;
6950 
6951                 /* If there is just no more room, go finish up this chunk of
6952                  * the pattern. */
6953                 if (delta <= 0) {
6954                     goto loopdone;
6955                 }
6956 
6957                 change_engine_size(pRExC_state, delta + overhead_expansion);
6958                 current_string_nodes += delta;
6959                 max_string_len
6960                            = sizeof(struct regnode) * current_string_nodes;
6961                 upper_fill = max_string_len + 1;
6962 
6963                 /* If the length was small, we know this was originally an
6964                  * EXACT node now converted to LEXACT, and the string has to be
6965                  * restored.  Otherwise the string was untouched.  260 is just
6966                  * a number safely above 255 so don't have to worry about
6967                  * getting it precise */
6968                 if (len < 260) {
6969                     node_type = LEXACT;
6970                     FILL_NODE(ret, node_type);
6971                     s0 = STRING(REGNODE_p(ret));
6972                     Copy(temp, s0, len, char);
6973                     s = s0 + len;
6974                 }
6975 
6976                 goto continue_parse;
6977             }
6978             else if (FOLD) {
6979                 bool splittable = FALSE;
6980                 bool backed_up = FALSE;
6981                 char * e;       /* should this be U8? */
6982                 char * s_start; /* should this be U8? */
6983 
6984                 /* Here is /i.  Running out of room creates a problem if we are
6985                  * folding, and the split happens in the middle of a
6986                  * multi-character fold, as a match that should have occurred,
6987                  * won't, due to the way nodes are matched, and our artificial
6988                  * boundary.  So back off until we aren't splitting such a
6989                  * fold.  If there is no such place to back off to, we end up
6990                  * taking the entire node as-is.  This can happen if the node
6991                  * consists entirely of 'f' or entirely of 's' characters (or
6992                  * things that fold to them) as 'ff' and 'ss' are
6993                  * multi-character folds.
6994                  *
6995                  * The Unicode standard says that multi character folds consist
6996                  * of either two or three characters.  That means we would be
6997                  * splitting one if the final character in the node is at the
6998                  * beginning of either type, or is the second of a three
6999                  * character fold.
7000                  *
7001                  * At this point:
7002                  *  ender     is the code point of the character that won't fit
7003                  *            in the node
7004                  *  s         points to just beyond the final byte in the node.
7005                  *            It's where we would place ender if there were
7006                  *            room, and where in fact we do place ender's fold
7007                  *            in the code below, as we've over-allocated space
7008                  *            for s0 (hence s) to allow for this
7009                  *  e         starts at 's' and advances as we append things.
7010                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
7011                  *            have been advanced to beyond it).
7012                  *  old_old_s points to the beginning byte of the final
7013                  *            character in the node
7014                  *  p         points to the beginning byte in the input of the
7015                  *            character beyond 'ender'.
7016                  *  oldp      points to the beginning byte in the input of
7017                  *            'ender'.
7018                  *
7019                  * In the case of /il, we haven't folded anything that could be
7020                  * affected by the locale.  That means only above-Latin1
7021                  * characters that fold to other above-latin1 characters get
7022                  * folded at compile time.  To check where a good place to
7023                  * split nodes is, everything in it will have to be folded.
7024                  * The boolean 'maybe_exactfu' keeps track in /il if there are
7025                  * any unfolded characters in the node. */
7026                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
7027 
7028                 /* If we do need to fold the node, we need a place to store the
7029                  * folded copy, and a way to map back to the unfolded original
7030                  * */
7031                 char * locfold_buf = NULL;
7032                 Size_t * loc_correspondence = NULL;
7033 
7034                 if (! need_to_fold_loc) {   /* The normal case.  Just
7035                                                initialize to the actual node */
7036                     e = s;
7037                     s_start = s0;
7038                     s = old_old_s;  /* Point to the beginning of the final char
7039                                        that fits in the node */
7040                 }
7041                 else {
7042 
7043                     /* Here, we have filled a /il node, and there are unfolded
7044                      * characters in it.  If the runtime locale turns out to be
7045                      * UTF-8, there are possible multi-character folds, just
7046                      * like when not under /l.  The node hence can't terminate
7047                      * in the middle of such a fold.  To determine this, we
7048                      * have to create a folded copy of this node.  That means
7049                      * reparsing the node, folding everything assuming a UTF-8
7050                      * locale.  (If at runtime it isn't such a locale, the
7051                      * actions here wouldn't have been necessary, but we have
7052                      * to assume the worst case.)  If we find we need to back
7053                      * off the folded string, we do so, and then map that
7054                      * position back to the original unfolded node, which then
7055                      * gets output, truncated at that spot */
7056 
7057                     char * redo_p = RExC_parse;
7058                     char * redo_e;
7059                     char * old_redo_e;
7060 
7061                     /* Allow enough space assuming a single byte input folds to
7062                      * a single byte output, plus assume that the two unparsed
7063                      * characters (that we may need) fold to the largest number
7064                      * of bytes possible, plus extra for one more worst case
7065                      * scenario.  In the loop below, if we start eating into
7066                      * that final spare space, we enlarge this initial space */
7067                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
7068 
7069                     Newxz(locfold_buf, size, char);
7070                     Newxz(loc_correspondence, size, Size_t);
7071 
7072                     /* Redo this node's parse, folding into 'locfold_buf' */
7073                     redo_p = RExC_parse;
7074                     old_redo_e = redo_e = locfold_buf;
7075                     while (redo_p <= oldp) {
7076 
7077                         old_redo_e = redo_e;
7078                         loc_correspondence[redo_e - locfold_buf]
7079                                                         = redo_p - RExC_parse;
7080 
7081                         if (UTF) {
7082                             Size_t added_len;
7083 
7084                             (void) _to_utf8_fold_flags((U8 *) redo_p,
7085                                                        (U8 *) RExC_end,
7086                                                        (U8 *) redo_e,
7087                                                        &added_len,
7088                                                        FOLD_FLAGS_FULL);
7089                             redo_e += added_len;
7090                             redo_p += UTF8SKIP(redo_p);
7091                         }
7092                         else {
7093 
7094                             /* Note that if this code is run on some ancient
7095                              * Unicode versions, SHARP S doesn't fold to 'ss',
7096                              * but rather than clutter the code with #ifdef's,
7097                              * as is done above, we ignore that possibility.
7098                              * This is ok because this code doesn't affect what
7099                              * gets matched, but merely where the node gets
7100                              * split */
7101                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7102                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7103                             }
7104                             else {
7105                                 *redo_e++ = 's';
7106                                 *redo_e++ = 's';
7107                             }
7108                             redo_p++;
7109                         }
7110 
7111 
7112                         /* If we're getting so close to the end that a
7113                          * worst-case fold in the next character would cause us
7114                          * to overflow, increase, assuming one byte output byte
7115                          * per one byte input one, plus room for another worst
7116                          * case fold */
7117                         if (   redo_p <= oldp
7118                             && redo_e > locfold_buf + size
7119                                                     - (UTF8_MAXBYTES_CASE + 1))
7120                         {
7121                             Size_t new_size = size
7122                                             + (oldp - redo_p)
7123                                             + UTF8_MAXBYTES_CASE + 1;
7124                             Ptrdiff_t e_offset = redo_e - locfold_buf;
7125 
7126                             Renew(locfold_buf, new_size, char);
7127                             Renew(loc_correspondence, new_size, Size_t);
7128                             size = new_size;
7129 
7130                             redo_e = locfold_buf + e_offset;
7131                         }
7132                     }
7133 
7134                     /* Set so that things are in terms of the folded, temporary
7135                      * string */
7136                     s = old_redo_e;
7137                     s_start = locfold_buf;
7138                     e = redo_e;
7139 
7140                 }
7141 
7142                 /* Here, we have 's', 's_start' and 'e' set up to point to the
7143                  * input that goes into the node, folded.
7144                  *
7145                  * If the final character of the node and the fold of ender
7146                  * form the first two characters of a three character fold, we
7147                  * need to peek ahead at the next (unparsed) character in the
7148                  * input to determine if the three actually do form such a
7149                  * fold.  Just looking at that character is not generally
7150                  * sufficient, as it could be, for example, an escape sequence
7151                  * that evaluates to something else, and it needs to be folded.
7152                  *
7153                  * khw originally thought to just go through the parse loop one
7154                  * extra time, but that doesn't work easily as that iteration
7155                  * could cause things to think that the parse is over and to
7156                  * goto loopdone.  The character could be a '$' for example, or
7157                  * the character beyond could be a quantifier, and other
7158                  * glitches as well.
7159                  *
7160                  * The solution used here for peeking ahead is to look at that
7161                  * next character.  If it isn't ASCII punctuation, then it will
7162                  * be something that would continue on in an EXACTish node if
7163                  * there were space.  We append the fold of it to s, having
7164                  * reserved enough room in s0 for the purpose.  If we can't
7165                  * reasonably peek ahead, we instead assume the worst case:
7166                  * that it is something that would form the completion of a
7167                  * multi-char fold.
7168                  *
7169                  * If we can't split between s and ender, we work backwards
7170                  * character-by-character down to s0.  At each current point
7171                  * see if we are at the beginning of a multi-char fold.  If so,
7172                  * that means we would be splitting the fold across nodes, and
7173                  * so we back up one and try again.
7174                  *
7175                  * If we're not at the beginning, we still could be at the
7176                  * final two characters of a (rare) three character fold.  We
7177                  * check if the sequence starting at the character before the
7178                  * current position (and including the current and next
7179                  * characters) is a three character fold.  If not, the node can
7180                  * be split here.  If it is, we have to backup two characters
7181                  * and try again.
7182                  *
7183                  * Otherwise, the node can be split at the current position.
7184                  *
7185                  * The same logic is used for UTF-8 patterns and not */
7186                 if (UTF) {
7187                     Size_t added_len;
7188 
7189                     /* Append the fold of ender */
7190                     (void) _to_uni_fold_flags(
7191                         ender,
7192                         (U8 *) e,
7193                         &added_len,
7194                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7195                                         ? FOLD_FLAGS_NOMIX_ASCII
7196                                         : 0));
7197                     e += added_len;
7198 
7199                     /* 's' and the character folded to by ender may be the
7200                      * first two of a three-character fold, in which case the
7201                      * node should not be split here.  That may mean examining
7202                      * the so-far unparsed character starting at 'p'.  But if
7203                      * ender folded to more than one character, we already have
7204                      * three characters to look at.  Also, we first check if
7205                      * the sequence consisting of s and the next character form
7206                      * the first two of some three character fold.  If not,
7207                      * there's no need to peek ahead. */
7208                     if (   added_len <= UTF8SKIP(e - added_len)
7209                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7210                     {
7211                         /* Here, the two do form the beginning of a potential
7212                          * three character fold.  The unexamined character may
7213                          * or may not complete it.  Peek at it.  It might be
7214                          * something that ends the node or an escape sequence,
7215                          * in which case we don't know without a lot of work
7216                          * what it evaluates to, so we have to assume the worst
7217                          * case: that it does complete the fold, and so we
7218                          * can't split here.  All such instances  will have
7219                          * that character be an ASCII punctuation character,
7220                          * like a backslash.  So, for that case, backup one and
7221                          * drop down to try at that position */
7222                         if (isPUNCT(*p)) {
7223                             s = (char *) utf8_hop_back((U8 *) s, -1,
7224                                        (U8 *) s_start);
7225                             backed_up = TRUE;
7226                         }
7227                         else {
7228                             /* Here, since it's not punctuation, it must be a
7229                              * real character, and we can append its fold to
7230                              * 'e' (having deliberately reserved enough space
7231                              * for this eventuality) and drop down to check if
7232                              * the three actually do form a folded sequence */
7233                             (void) _to_utf8_fold_flags(
7234                                 (U8 *) p, (U8 *) RExC_end,
7235                                 (U8 *) e,
7236                                 &added_len,
7237                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7238                                                 ? FOLD_FLAGS_NOMIX_ASCII
7239                                                 : 0));
7240                             e += added_len;
7241                         }
7242                     }
7243 
7244                     /* Here, we either have three characters available in
7245                      * sequence starting at 's', or we have two characters and
7246                      * know that the following one can't possibly be part of a
7247                      * three character fold.  We go through the node backwards
7248                      * until we find a place where we can split it without
7249                      * breaking apart a multi-character fold.  At any given
7250                      * point we have to worry about if such a fold begins at
7251                      * the current 's', and also if a three-character fold
7252                      * begins at s-1, (containing s and s+1).  Splitting in
7253                      * either case would break apart a fold */
7254                     do {
7255                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7256                                                             (U8 *) s_start);
7257 
7258                         /* If is a multi-char fold, can't split here.  Backup
7259                          * one char and try again */
7260                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7261                             s = prev_s;
7262                             backed_up = TRUE;
7263                             continue;
7264                         }
7265 
7266                         /* If the two characters beginning at 's' are part of a
7267                          * three character fold starting at the character
7268                          * before s, we can't split either before or after s.
7269                          * Backup two chars and try again */
7270                         if (   LIKELY(s > s_start)
7271                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7272                         {
7273                             s = prev_s;
7274                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7275                             backed_up = TRUE;
7276                             continue;
7277                         }
7278 
7279                         /* Here there's no multi-char fold between s and the
7280                          * next character following it.  We can split */
7281                         splittable = TRUE;
7282                         break;
7283 
7284                     } while (s > s_start); /* End of loops backing up through the node */
7285 
7286                     /* Here we either couldn't find a place to split the node,
7287                      * or else we broke out of the loop setting 'splittable' to
7288                      * true.  In the latter case, the place to split is between
7289                      * the first and second characters in the sequence starting
7290                      * at 's' */
7291                     if (splittable) {
7292                         s += UTF8SKIP(s);
7293                     }
7294                 }
7295                 else {  /* Pattern not UTF-8 */
7296                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
7297                         || ASCII_FOLD_RESTRICTED)
7298                     {
7299                         assert( toLOWER_L1(ender) < 256 );
7300                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7301                     }
7302                     else {
7303                         *e++ = 's';
7304                         *e++ = 's';
7305                     }
7306 
7307                     if (   e - s  <= 1
7308                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7309                     {
7310                         if (isPUNCT(*p)) {
7311                             s--;
7312                             backed_up = TRUE;
7313                         }
7314                         else {
7315                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7316                                 || ASCII_FOLD_RESTRICTED)
7317                             {
7318                                 assert( toLOWER_L1(ender) < 256 );
7319                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7320                             }
7321                             else {
7322                                 *e++ = 's';
7323                                 *e++ = 's';
7324                             }
7325                         }
7326                     }
7327 
7328                     do {
7329                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7330                             s--;
7331                             backed_up = TRUE;
7332                             continue;
7333                         }
7334 
7335                         if (   LIKELY(s > s_start)
7336                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7337                         {
7338                             s -= 2;
7339                             backed_up = TRUE;
7340                             continue;
7341                         }
7342 
7343                         splittable = TRUE;
7344                         break;
7345 
7346                     } while (s > s_start);
7347 
7348                     if (splittable) {
7349                         s++;
7350                     }
7351                 }
7352 
7353                 /* Here, we are done backing up.  If we didn't backup at all
7354                  * (the likely case), just proceed */
7355                 if (backed_up) {
7356 
7357                    /* If we did find a place to split, reparse the entire node
7358                     * stopping where we have calculated. */
7359                     if (splittable) {
7360 
7361                        /* If we created a temporary folded string under /l, we
7362                         * have to map that back to the original */
7363                         if (need_to_fold_loc) {
7364                             upper_fill = loc_correspondence[s - s_start];
7365                             if (upper_fill == 0) {
7366                                 FAIL2("panic: loc_correspondence[%d] is 0",
7367                                       (int) (s - s_start));
7368                             }
7369                             Safefree(locfold_buf);
7370                             Safefree(loc_correspondence);
7371                         }
7372                         else {
7373                             upper_fill = s - s0;
7374                         }
7375                         goto reparse;
7376                     }
7377 
7378                     /* Here the node consists entirely of non-final multi-char
7379                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
7380                      * decent place to split it, so give up and just take the
7381                      * whole thing */
7382                     len = old_s - s0;
7383                 }
7384 
7385                 if (need_to_fold_loc) {
7386                     Safefree(locfold_buf);
7387                     Safefree(loc_correspondence);
7388                 }
7389             }   /* End of verifying node ends with an appropriate char */
7390 
7391             /* We need to start the next node at the character that didn't fit
7392              * in this one */
7393             p = oldp;
7394 
7395           loopdone:   /* Jumped to when encounters something that shouldn't be
7396                          in the node */
7397 
7398             /* Free up any over-allocated space; cast is to silence bogus
7399              * warning in MS VC */
7400             change_engine_size(pRExC_state,
7401                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7402 
7403             /* I (khw) don't know if you can get here with zero length, but the
7404              * old code handled this situation by creating a zero-length EXACT
7405              * node.  Might as well be NOTHING instead */
7406             if (len == 0) {
7407                 OP(REGNODE_p(ret)) = NOTHING;
7408             }
7409             else {
7410 
7411                 /* If the node type is EXACT here, check to see if it
7412                  * should be EXACTL, or EXACT_REQ8. */
7413                 if (node_type == EXACT) {
7414                     if (LOC) {
7415                         node_type = EXACTL;
7416                     }
7417                     else if (requires_utf8_target) {
7418                         node_type = EXACT_REQ8;
7419                     }
7420                 }
7421                 else if (node_type == LEXACT) {
7422                     if (requires_utf8_target) {
7423                         node_type = LEXACT_REQ8;
7424                     }
7425                 }
7426                 else if (FOLD) {
7427                     if (    UNLIKELY(has_micro_sign || has_ss)
7428                         && (node_type == EXACTFU || (   node_type == EXACTF
7429                                                      && maybe_exactfu)))
7430                     {   /* These two conditions are problematic in non-UTF-8
7431                            EXACTFU nodes. */
7432                         assert(! UTF);
7433                         node_type = EXACTFUP;
7434                     }
7435                     else if (node_type == EXACTFL) {
7436 
7437                         /* 'maybe_exactfu' is deliberately set above to
7438                          * indicate this node type, where all code points in it
7439                          * are above 255 */
7440                         if (maybe_exactfu) {
7441                             node_type = EXACTFLU8;
7442                         }
7443                         else if (UNLIKELY(
7444                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7445                         {
7446                             /* A character that folds to more than one will
7447                              * match multiple characters, so can't be SIMPLE.
7448                              * We don't have to worry about this with EXACTFLU8
7449                              * nodes just above, as they have already been
7450                              * folded (since the fold doesn't vary at run
7451                              * time).  Here, if the final character in the node
7452                              * folds to multiple, it can't be simple.  (This
7453                              * only has an effect if the node has only a single
7454                              * character, hence the final one, as elsewhere we
7455                              * turn off simple for nodes whose length > 1 */
7456                             maybe_SIMPLE = 0;
7457                         }
7458                     }
7459                     else if (node_type == EXACTF) {  /* Means is /di */
7460 
7461                         /* This intermediate variable is needed solely because
7462                          * the asserts in the macro where used exceed Win32's
7463                          * literal string capacity */
7464                         char first_char = * STRING(REGNODE_p(ret));
7465 
7466                         /* If 'maybe_exactfu' is clear, then we need to stay
7467                          * /di.  If it is set, it means there are no code
7468                          * points that match differently depending on UTF8ness
7469                          * of the target string, so it can become an EXACTFU
7470                          * node */
7471                         if (! maybe_exactfu) {
7472                             RExC_seen_d_op = TRUE;
7473                         }
7474                         else if (   isALPHA_FOLD_EQ(first_char, 's')
7475                                  || isALPHA_FOLD_EQ(ender, 's'))
7476                         {
7477                             /* But, if the node begins or ends in an 's' we
7478                              * have to defer changing it into an EXACTFU, as
7479                              * the node could later get joined with another one
7480                              * that ends or begins with 's' creating an 'ss'
7481                              * sequence which would then wrongly match the
7482                              * sharp s without the target being UTF-8.  We
7483                              * create a special node that we resolve later when
7484                              * we join nodes together */
7485 
7486                             node_type = EXACTFU_S_EDGE;
7487                         }
7488                         else {
7489                             node_type = EXACTFU;
7490                         }
7491                     }
7492 
7493                     if (requires_utf8_target && node_type == EXACTFU) {
7494                         node_type = EXACTFU_REQ8;
7495                     }
7496                 }
7497 
7498                 OP(REGNODE_p(ret)) = node_type;
7499                 setSTR_LEN(REGNODE_p(ret), len);
7500                 RExC_emit += STR_SZ(len);
7501 
7502                 /* If the node isn't a single character, it can't be SIMPLE */
7503                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7504                     maybe_SIMPLE = 0;
7505                 }
7506 
7507                 *flagp |= HASWIDTH | maybe_SIMPLE;
7508             }
7509 
7510             RExC_parse_set(p);
7511 
7512             {
7513                 /* len is STRLEN which is unsigned, need to copy to signed */
7514                 IV iv = len;
7515                 if (iv < 0)
7516                     vFAIL("Internal disaster");
7517             }
7518 
7519         } /* End of label 'defchar:' */
7520         break;
7521     } /* End of giant switch on input character */
7522 
7523     /* Position parse to next real character */
7524     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7525                                             FALSE /* Don't force to /x */ );
7526     if (   *RExC_parse == '{'
7527         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7528     {
7529         if (RExC_strict) {
7530             RExC_parse_inc_by(1);
7531             vFAIL("Unescaped left brace in regex is illegal here");
7532         }
7533         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7534                                   " passed through");
7535     }
7536 
7537     return(ret);
7538 }
7539 
7540 
7541 #ifdef PERL_RE_BUILD_AUX
7542 void
Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)7543 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7544 {
7545     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
7546      * sets up the bitmap and any flags, removing those code points from the
7547      * inversion list, setting it to NULL should it become completely empty */
7548 
7549 
7550     PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7551 
7552     /* There is no bitmap for this node type */
7553     if (REGNODE_TYPE(OP(node))  != ANYOF) {
7554         return;
7555     }
7556 
7557     ANYOF_BITMAP_ZERO(node);
7558     if (*invlist_ptr) {
7559 
7560         /* This gets set if we actually need to modify things */
7561         bool change_invlist = FALSE;
7562 
7563         UV start, end;
7564 
7565         /* Start looking through *invlist_ptr */
7566         invlist_iterinit(*invlist_ptr);
7567         while (invlist_iternext(*invlist_ptr, &start, &end)) {
7568             UV high;
7569             int i;
7570 
7571             /* Quit if are above what we should change */
7572             if (start >= NUM_ANYOF_CODE_POINTS) {
7573                 break;
7574             }
7575 
7576             change_invlist = TRUE;
7577 
7578             /* Set all the bits in the range, up to the max that we are doing */
7579             high = (end < NUM_ANYOF_CODE_POINTS - 1)
7580                    ? end
7581                    : NUM_ANYOF_CODE_POINTS - 1;
7582             for (i = start; i <= (int) high; i++) {
7583                 ANYOF_BITMAP_SET(node, i);
7584             }
7585         }
7586         invlist_iterfinish(*invlist_ptr);
7587 
7588         /* Done with loop; remove any code points that are in the bitmap from
7589          * *invlist_ptr */
7590         if (change_invlist) {
7591             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7592         }
7593 
7594         /* If have completely emptied it, remove it completely */
7595         if (_invlist_len(*invlist_ptr) == 0) {
7596             SvREFCNT_dec_NN(*invlist_ptr);
7597             *invlist_ptr = NULL;
7598         }
7599     }
7600 }
7601 #endif /* PERL_RE_BUILD_AUX */
7602 
7603 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7604    Character classes ([:foo:]) can also be negated ([:^foo:]).
7605    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7606    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7607    but trigger failures because they are currently unimplemented. */
7608 
7609 #define POSIXCC_DONE(c)   ((c) == ':')
7610 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7611 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7612 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7613 
7614 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
7615 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
7616 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
7617 
7618 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7619 
7620 /* 'posix_warnings' and 'warn_text' are names of variables in the following
7621  * routine. q.v. */
7622 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
7623         if (posix_warnings) {                                               \
7624             if (! RExC_warn_text ) RExC_warn_text =                         \
7625                                          (AV *) sv_2mortal((SV *) newAV()); \
7626             av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
7627                                              WARNING_PREFIX                 \
7628                                              text                           \
7629                                              REPORT_LOCATION,               \
7630                                              REPORT_LOCATION_ARGS(p)));     \
7631         }                                                                   \
7632     } STMT_END
7633 #define CLEAR_POSIX_WARNINGS()                                              \
7634     STMT_START {                                                            \
7635         if (posix_warnings && RExC_warn_text)                               \
7636             av_clear(RExC_warn_text);                                       \
7637     } STMT_END
7638 
7639 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
7640     STMT_START {                                                            \
7641         CLEAR_POSIX_WARNINGS();                                             \
7642         return ret;                                                         \
7643     } STMT_END
7644 
7645 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)7646 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7647 
7648     const char * const s,      /* Where the putative posix class begins.
7649                                   Normally, this is one past the '['.  This
7650                                   parameter exists so it can be somewhere
7651                                   besides RExC_parse. */
7652     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7653                                   NULL */
7654     AV ** posix_warnings,      /* Where to place any generated warnings, or
7655                                   NULL */
7656     const bool check_only      /* Don't die if error */
7657 )
7658 {
7659     /* This parses what the caller thinks may be one of the three POSIX
7660      * constructs:
7661      *  1) a character class, like [:blank:]
7662      *  2) a collating symbol, like [. .]
7663      *  3) an equivalence class, like [= =]
7664      * In the latter two cases, it croaks if it finds a syntactically legal
7665      * one, as these are not handled by Perl.
7666      *
7667      * The main purpose is to look for a POSIX character class.  It returns:
7668      *  a) the class number
7669      *      if it is a completely syntactically and semantically legal class.
7670      *      'updated_parse_ptr', if not NULL, is set to point to just after the
7671      *      closing ']' of the class
7672      *  b) OOB_NAMEDCLASS
7673      *      if it appears that one of the three POSIX constructs was meant, but
7674      *      its specification was somehow defective.  'updated_parse_ptr', if
7675      *      not NULL, is set to point to the character just after the end
7676      *      character of the class.  See below for handling of warnings.
7677      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7678      *      if it  doesn't appear that a POSIX construct was intended.
7679      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
7680      *      raised.
7681      *
7682      * In b) there may be errors or warnings generated.  If 'check_only' is
7683      * TRUE, then any errors are discarded.  Warnings are returned to the
7684      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
7685      * instead it is NULL, warnings are suppressed.
7686      *
7687      * The reason for this function, and its complexity is that a bracketed
7688      * character class can contain just about anything.  But it's easy to
7689      * mistype the very specific posix class syntax but yielding a valid
7690      * regular bracketed class, so it silently gets compiled into something
7691      * quite unintended.
7692      *
7693      * The solution adopted here maintains backward compatibility except that
7694      * it adds a warning if it looks like a posix class was intended but
7695      * improperly specified.  The warning is not raised unless what is input
7696      * very closely resembles one of the 14 legal posix classes.  To do this,
7697      * it uses fuzzy parsing.  It calculates how many single-character edits it
7698      * would take to transform what was input into a legal posix class.  Only
7699      * if that number is quite small does it think that the intention was a
7700      * posix class.  Obviously these are heuristics, and there will be cases
7701      * where it errs on one side or another, and they can be tweaked as
7702      * experience informs.
7703      *
7704      * The syntax for a legal posix class is:
7705      *
7706      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7707      *
7708      * What this routine considers syntactically to be an intended posix class
7709      * is this (the comments indicate some restrictions that the pattern
7710      * doesn't show):
7711      *
7712      *  qr/(?x: \[?                         # The left bracket, possibly
7713      *                                      # omitted
7714      *          \h*                         # possibly followed by blanks
7715      *          (?: \^ \h* )?               # possibly a misplaced caret
7716      *          [:;]?                       # The opening class character,
7717      *                                      # possibly omitted.  A typo
7718      *                                      # semi-colon can also be used.
7719      *          \h*
7720      *          \^?                         # possibly a correctly placed
7721      *                                      # caret, but not if there was also
7722      *                                      # a misplaced one
7723      *          \h*
7724      *          .{3,15}                     # The class name.  If there are
7725      *                                      # deviations from the legal syntax,
7726      *                                      # its edit distance must be close
7727      *                                      # to a real class name in order
7728      *                                      # for it to be considered to be
7729      *                                      # an intended posix class.
7730      *          \h*
7731      *          [[:punct:]]?                # The closing class character,
7732      *                                      # possibly omitted.  If not a colon
7733      *                                      # nor semi colon, the class name
7734      *                                      # must be even closer to a valid
7735      *                                      # one
7736      *          \h*
7737      *          \]?                         # The right bracket, possibly
7738      *                                      # omitted.
7739      *     )/
7740      *
7741      * In the above, \h must be ASCII-only.
7742      *
7743      * These are heuristics, and can be tweaked as field experience dictates.
7744      * There will be cases when someone didn't intend to specify a posix class
7745      * that this warns as being so.  The goal is to minimize these, while
7746      * maximizing the catching of things intended to be a posix class that
7747      * aren't parsed as such.
7748      */
7749 
7750     const char* p             = s;
7751     const char * const e      = RExC_end;
7752     unsigned complement       = 0;      /* If to complement the class */
7753     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
7754     bool has_opening_bracket  = FALSE;
7755     bool has_opening_colon    = FALSE;
7756     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
7757                                                    valid class */
7758     const char * possible_end = NULL;   /* used for a 2nd parse pass */
7759     const char* name_start;             /* ptr to class name first char */
7760 
7761     /* If the number of single-character typos the input name is away from a
7762      * legal name is no more than this number, it is considered to have meant
7763      * the legal name */
7764     int max_distance          = 2;
7765 
7766     /* to store the name.  The size determines the maximum length before we
7767      * decide that no posix class was intended.  Should be at least
7768      * sizeof("alphanumeric") */
7769     UV input_text[15];
7770     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7771 
7772     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7773 
7774     CLEAR_POSIX_WARNINGS();
7775 
7776     if (p >= e) {
7777         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7778     }
7779 
7780     if (*(p - 1) != '[') {
7781         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7782         found_problem = TRUE;
7783     }
7784     else {
7785         has_opening_bracket = TRUE;
7786     }
7787 
7788     /* They could be confused and think you can put spaces between the
7789      * components */
7790     if (isBLANK(*p)) {
7791         found_problem = TRUE;
7792 
7793         do {
7794             p++;
7795         } while (p < e && isBLANK(*p));
7796 
7797         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7798     }
7799 
7800     /* For [. .] and [= =].  These are quite different internally from [: :],
7801      * so they are handled separately.  */
7802     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7803                                             and 1 for at least one char in it
7804                                           */
7805     {
7806         const char open_char  = *p;
7807         const char * temp_ptr = p + 1;
7808 
7809         /* These two constructs are not handled by perl, and if we find a
7810          * syntactically valid one, we croak.  khw, who wrote this code, finds
7811          * this explanation of them very unclear:
7812          * https://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7813          * And searching the rest of the internet wasn't very helpful either.
7814          * It looks like just about any byte can be in these constructs,
7815          * depending on the locale.  But unless the pattern is being compiled
7816          * under /l, which is very rare, Perl runs under the C or POSIX locale.
7817          * In that case, it looks like [= =] isn't allowed at all, and that
7818          * [. .] could be any single code point, but for longer strings the
7819          * constituent characters would have to be the ASCII alphabetics plus
7820          * the minus-hyphen.  Any sensible locale definition would limit itself
7821          * to these.  And any portable one definitely should.  Trying to parse
7822          * the general case is a nightmare (see [perl #127604]).  So, this code
7823          * looks only for interiors of these constructs that match:
7824          *      qr/.|[-\w]{2,}/
7825          * Using \w relaxes the apparent rules a little, without adding much
7826          * danger of mistaking something else for one of these constructs.
7827          *
7828          * [. .] in some implementations described on the internet is usable to
7829          * escape a character that otherwise is special in bracketed character
7830          * classes.  For example [.].] means a literal right bracket instead of
7831          * the ending of the class
7832          *
7833          * [= =] can legitimately contain a [. .] construct, but we don't
7834          * handle this case, as that [. .] construct will later get parsed
7835          * itself and croak then.  And [= =] is checked for even when not under
7836          * /l, as Perl has long done so.
7837          *
7838          * The code below relies on there being a trailing NUL, so it doesn't
7839          * have to keep checking if the parse ptr < e.
7840          */
7841         if (temp_ptr[1] == open_char) {
7842             temp_ptr++;
7843         }
7844         else while (    temp_ptr < e
7845                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7846         {
7847             temp_ptr++;
7848         }
7849 
7850         if (*temp_ptr == open_char) {
7851             temp_ptr++;
7852             if (*temp_ptr == ']') {
7853                 temp_ptr++;
7854                 if (! found_problem && ! check_only) {
7855                     RExC_parse_set((char *) temp_ptr);
7856                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
7857                             "extensions", open_char, open_char);
7858                 }
7859 
7860                 /* Here, the syntax wasn't completely valid, or else the call
7861                  * is to check-only */
7862                 if (updated_parse_ptr) {
7863                     *updated_parse_ptr = (char *) temp_ptr;
7864                 }
7865 
7866                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7867             }
7868         }
7869 
7870         /* If we find something that started out to look like one of these
7871          * constructs, but isn't, we continue below so that it can be checked
7872          * for being a class name with a typo of '.' or '=' instead of a colon.
7873          * */
7874     }
7875 
7876     /* Here, we think there is a possibility that a [: :] class was meant, and
7877      * we have the first real character.  It could be they think the '^' comes
7878      * first */
7879     if (*p == '^') {
7880         found_problem = TRUE;
7881         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7882         complement = 1;
7883         p++;
7884 
7885         if (isBLANK(*p)) {
7886             found_problem = TRUE;
7887 
7888             do {
7889                 p++;
7890             } while (p < e && isBLANK(*p));
7891 
7892             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7893         }
7894     }
7895 
7896     /* But the first character should be a colon, which they could have easily
7897      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7898      * distinguish from a colon, so treat that as a colon).  */
7899     if (*p == ':') {
7900         p++;
7901         has_opening_colon = TRUE;
7902     }
7903     else if (*p == ';') {
7904         found_problem = TRUE;
7905         p++;
7906         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7907         has_opening_colon = TRUE;
7908     }
7909     else {
7910         found_problem = TRUE;
7911         ADD_POSIX_WARNING(p, "there must be a starting ':'");
7912 
7913         /* Consider an initial punctuation (not one of the recognized ones) to
7914          * be a left terminator */
7915         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7916             p++;
7917         }
7918     }
7919 
7920     /* They may think that you can put spaces between the components */
7921     if (isBLANK(*p)) {
7922         found_problem = TRUE;
7923 
7924         do {
7925             p++;
7926         } while (p < e && isBLANK(*p));
7927 
7928         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7929     }
7930 
7931     if (*p == '^') {
7932 
7933         /* We consider something like [^:^alnum:]] to not have been intended to
7934          * be a posix class, but XXX maybe we should */
7935         if (complement) {
7936             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7937         }
7938 
7939         complement = 1;
7940         p++;
7941     }
7942 
7943     /* Again, they may think that you can put spaces between the components */
7944     if (isBLANK(*p)) {
7945         found_problem = TRUE;
7946 
7947         do {
7948             p++;
7949         } while (p < e && isBLANK(*p));
7950 
7951         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7952     }
7953 
7954     if (*p == ']') {
7955 
7956         /* XXX This ']' may be a typo, and something else was meant.  But
7957          * treating it as such creates enough complications, that that
7958          * possibility isn't currently considered here.  So we assume that the
7959          * ']' is what is intended, and if we've already found an initial '[',
7960          * this leaves this construct looking like [:] or [:^], which almost
7961          * certainly weren't intended to be posix classes */
7962         if (has_opening_bracket) {
7963             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7964         }
7965 
7966         /* But this function can be called when we parse the colon for
7967          * something like qr/[alpha:]]/, so we back up to look for the
7968          * beginning */
7969         p--;
7970 
7971         if (*p == ';') {
7972             found_problem = TRUE;
7973             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7974         }
7975         else if (*p != ':') {
7976 
7977             /* XXX We are currently very restrictive here, so this code doesn't
7978              * consider the possibility that, say, /[alpha.]]/ was intended to
7979              * be a posix class. */
7980             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7981         }
7982 
7983         /* Here we have something like 'foo:]'.  There was no initial colon,
7984          * and we back up over 'foo.  XXX Unlike the going forward case, we
7985          * don't handle typos of non-word chars in the middle */
7986         has_opening_colon = FALSE;
7987         p--;
7988 
7989         while (p > RExC_start && isWORDCHAR(*p)) {
7990             p--;
7991         }
7992         p++;
7993 
7994         /* Here, we have positioned ourselves to where we think the first
7995          * character in the potential class is */
7996     }
7997 
7998     /* Now the interior really starts.  There are certain key characters that
7999      * can end the interior, or these could just be typos.  To catch both
8000      * cases, we may have to do two passes.  In the first pass, we keep on
8001      * going unless we come to a sequence that matches
8002      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
8003      * This means it takes a sequence to end the pass, so two typos in a row if
8004      * that wasn't what was intended.  If the class is perfectly formed, just
8005      * this one pass is needed.  We also stop if there are too many characters
8006      * being accumulated, but this number is deliberately set higher than any
8007      * real class.  It is set high enough so that someone who thinks that
8008      * 'alphanumeric' is a correct name would get warned that it wasn't.
8009      * While doing the pass, we keep track of where the key characters were in
8010      * it.  If we don't find an end to the class, and one of the key characters
8011      * was found, we redo the pass, but stop when we get to that character.
8012      * Thus the key character was considered a typo in the first pass, but a
8013      * terminator in the second.  If two key characters are found, we stop at
8014      * the second one in the first pass.  Again this can miss two typos, but
8015      * catches a single one
8016      *
8017      * In the first pass, 'possible_end' starts as NULL, and then gets set to
8018      * point to the first key character.  For the second pass, it starts as -1.
8019      * */
8020 
8021     name_start = p;
8022   parse_name:
8023     {
8024         bool has_blank               = FALSE;
8025         bool has_upper               = FALSE;
8026         bool has_terminating_colon   = FALSE;
8027         bool has_terminating_bracket = FALSE;
8028         bool has_semi_colon          = FALSE;
8029         unsigned int name_len        = 0;
8030         int punct_count              = 0;
8031 
8032         while (p < e) {
8033 
8034             /* Squeeze out blanks when looking up the class name below */
8035             if (isBLANK(*p) ) {
8036                 has_blank = TRUE;
8037                 found_problem = TRUE;
8038                 p++;
8039                 continue;
8040             }
8041 
8042             /* The name will end with a punctuation */
8043             if (isPUNCT(*p)) {
8044                 const char * peek = p + 1;
8045 
8046                 /* Treat any non-']' punctuation followed by a ']' (possibly
8047                  * with intervening blanks) as trying to terminate the class.
8048                  * ']]' is very likely to mean a class was intended (but
8049                  * missing the colon), but the warning message that gets
8050                  * generated shows the error position better if we exit the
8051                  * loop at the bottom (eventually), so skip it here. */
8052                 if (*p != ']') {
8053                     if (peek < e && isBLANK(*peek)) {
8054                         has_blank = TRUE;
8055                         found_problem = TRUE;
8056                         do {
8057                             peek++;
8058                         } while (peek < e && isBLANK(*peek));
8059                     }
8060 
8061                     if (peek < e && *peek == ']') {
8062                         has_terminating_bracket = TRUE;
8063                         if (*p == ':') {
8064                             has_terminating_colon = TRUE;
8065                         }
8066                         else if (*p == ';') {
8067                             has_semi_colon = TRUE;
8068                             has_terminating_colon = TRUE;
8069                         }
8070                         else {
8071                             found_problem = TRUE;
8072                         }
8073                         p = peek + 1;
8074                         goto try_posix;
8075                     }
8076                 }
8077 
8078                 /* Here we have punctuation we thought didn't end the class.
8079                  * Keep track of the position of the key characters that are
8080                  * more likely to have been class-enders */
8081                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
8082 
8083                     /* Allow just one such possible class-ender not actually
8084                      * ending the class. */
8085                     if (possible_end) {
8086                         break;
8087                     }
8088                     possible_end = p;
8089                 }
8090 
8091                 /* If we have too many punctuation characters, no use in
8092                  * keeping going */
8093                 if (++punct_count > max_distance) {
8094                     break;
8095                 }
8096 
8097                 /* Treat the punctuation as a typo. */
8098                 input_text[name_len++] = *p;
8099                 p++;
8100             }
8101             else if (isUPPER(*p)) { /* Use lowercase for lookup */
8102                 input_text[name_len++] = toLOWER(*p);
8103                 has_upper = TRUE;
8104                 found_problem = TRUE;
8105                 p++;
8106             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8107                 input_text[name_len++] = *p;
8108                 p++;
8109             }
8110             else {
8111                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8112                 p+= UTF8SKIP(p);
8113             }
8114 
8115             /* The declaration of 'input_text' is how long we allow a potential
8116              * class name to be, before saying they didn't mean a class name at
8117              * all */
8118             if (name_len >= C_ARRAY_LENGTH(input_text)) {
8119                 break;
8120             }
8121         }
8122 
8123         /* We get to here when the possible class name hasn't been properly
8124          * terminated before:
8125          *   1) we ran off the end of the pattern; or
8126          *   2) found two characters, each of which might have been intended to
8127          *      be the name's terminator
8128          *   3) found so many punctuation characters in the purported name,
8129          *      that the edit distance to a valid one is exceeded
8130          *   4) we decided it was more characters than anyone could have
8131          *      intended to be one. */
8132 
8133         found_problem = TRUE;
8134 
8135         /* In the final two cases, we know that looking up what we've
8136          * accumulated won't lead to a match, even a fuzzy one. */
8137         if (   name_len >= C_ARRAY_LENGTH(input_text)
8138             || punct_count > max_distance)
8139         {
8140             /* If there was an intermediate key character that could have been
8141              * an intended end, redo the parse, but stop there */
8142             if (possible_end && possible_end != (char *) -1) {
8143                 possible_end = (char *) -1; /* Special signal value to say
8144                                                we've done a first pass */
8145                 p = name_start;
8146                 goto parse_name;
8147             }
8148 
8149             /* Otherwise, it can't have meant to have been a class */
8150             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8151         }
8152 
8153         /* If we ran off the end, and the final character was a punctuation
8154          * one, back up one, to look at that final one just below.  Later, we
8155          * will restore the parse pointer if appropriate */
8156         if (name_len && p == e && isPUNCT(*(p-1))) {
8157             p--;
8158             name_len--;
8159         }
8160 
8161         if (p < e && isPUNCT(*p)) {
8162             if (*p == ']') {
8163                 has_terminating_bracket = TRUE;
8164 
8165                 /* If this is a 2nd ']', and the first one is just below this
8166                  * one, consider that to be the real terminator.  This gives a
8167                  * uniform and better positioning for the warning message  */
8168                 if (   possible_end
8169                     && possible_end != (char *) -1
8170                     && *possible_end == ']'
8171                     && name_len && input_text[name_len - 1] == ']')
8172                 {
8173                     name_len--;
8174                     p = possible_end;
8175 
8176                     /* And this is actually equivalent to having done the 2nd
8177                      * pass now, so set it to not try again */
8178                     possible_end = (char *) -1;
8179                 }
8180             }
8181             else {
8182                 if (*p == ':') {
8183                     has_terminating_colon = TRUE;
8184                 }
8185                 else if (*p == ';') {
8186                     has_semi_colon = TRUE;
8187                     has_terminating_colon = TRUE;
8188                 }
8189                 p++;
8190             }
8191         }
8192 
8193     try_posix:
8194 
8195         /* Here, we have a class name to look up.  We can short circuit the
8196          * stuff below for short names that can't possibly be meant to be a
8197          * class name.  (We can do this on the first pass, as any second pass
8198          * will yield an even shorter name) */
8199         if (name_len < 3) {
8200             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8201         }
8202 
8203         /* Find which class it is.  Initially switch on the length of the name.
8204          * */
8205         switch (name_len) {
8206             case 4:
8207                 if (memEQs(name_start, 4, "word")) {
8208                     /* this is not POSIX, this is the Perl \w */
8209                     class_number = ANYOF_WORDCHAR;
8210                 }
8211                 break;
8212             case 5:
8213                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8214                  *                        graph lower print punct space upper
8215                  * Offset 4 gives the best switch position.  */
8216                 switch (name_start[4]) {
8217                     case 'a':
8218                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
8219                             class_number = ANYOF_ALPHA;
8220                         break;
8221                     case 'e':
8222                         if (memBEGINs(name_start, 5, "spac")) /* space */
8223                             class_number = ANYOF_SPACE;
8224                         break;
8225                     case 'h':
8226                         if (memBEGINs(name_start, 5, "grap")) /* graph */
8227                             class_number = ANYOF_GRAPH;
8228                         break;
8229                     case 'i':
8230                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
8231                             class_number = ANYOF_ASCII;
8232                         break;
8233                     case 'k':
8234                         if (memBEGINs(name_start, 5, "blan")) /* blank */
8235                             class_number = ANYOF_BLANK;
8236                         break;
8237                     case 'l':
8238                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8239                             class_number = ANYOF_CNTRL;
8240                         break;
8241                     case 'm':
8242                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8243                             class_number = ANYOF_ALPHANUMERIC;
8244                         break;
8245                     case 'r':
8246                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
8247                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8248                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8249                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8250                         break;
8251                     case 't':
8252                         if (memBEGINs(name_start, 5, "digi")) /* digit */
8253                             class_number = ANYOF_DIGIT;
8254                         else if (memBEGINs(name_start, 5, "prin")) /* print */
8255                             class_number = ANYOF_PRINT;
8256                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
8257                             class_number = ANYOF_PUNCT;
8258                         break;
8259                 }
8260                 break;
8261             case 6:
8262                 if (memEQs(name_start, 6, "xdigit"))
8263                     class_number = ANYOF_XDIGIT;
8264                 break;
8265         }
8266 
8267         /* If the name exactly matches a posix class name the class number will
8268          * here be set to it, and the input almost certainly was meant to be a
8269          * posix class, so we can skip further checking.  If instead the syntax
8270          * is exactly correct, but the name isn't one of the legal ones, we
8271          * will return that as an error below.  But if neither of these apply,
8272          * it could be that no posix class was intended at all, or that one
8273          * was, but there was a typo.  We tease these apart by doing fuzzy
8274          * matching on the name */
8275         if (class_number == OOB_NAMEDCLASS && found_problem) {
8276             const UV posix_names[][6] = {
8277                                                 { 'a', 'l', 'n', 'u', 'm' },
8278                                                 { 'a', 'l', 'p', 'h', 'a' },
8279                                                 { 'a', 's', 'c', 'i', 'i' },
8280                                                 { 'b', 'l', 'a', 'n', 'k' },
8281                                                 { 'c', 'n', 't', 'r', 'l' },
8282                                                 { 'd', 'i', 'g', 'i', 't' },
8283                                                 { 'g', 'r', 'a', 'p', 'h' },
8284                                                 { 'l', 'o', 'w', 'e', 'r' },
8285                                                 { 'p', 'r', 'i', 'n', 't' },
8286                                                 { 'p', 'u', 'n', 'c', 't' },
8287                                                 { 's', 'p', 'a', 'c', 'e' },
8288                                                 { 'u', 'p', 'p', 'e', 'r' },
8289                                                 { 'w', 'o', 'r', 'd' },
8290                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
8291                                             };
8292             /* The names of the above all have added NULs to make them the same
8293              * size, so we need to also have the real lengths */
8294             const UV posix_name_lengths[] = {
8295                                                 sizeof("alnum") - 1,
8296                                                 sizeof("alpha") - 1,
8297                                                 sizeof("ascii") - 1,
8298                                                 sizeof("blank") - 1,
8299                                                 sizeof("cntrl") - 1,
8300                                                 sizeof("digit") - 1,
8301                                                 sizeof("graph") - 1,
8302                                                 sizeof("lower") - 1,
8303                                                 sizeof("print") - 1,
8304                                                 sizeof("punct") - 1,
8305                                                 sizeof("space") - 1,
8306                                                 sizeof("upper") - 1,
8307                                                 sizeof("word")  - 1,
8308                                                 sizeof("xdigit")- 1
8309                                             };
8310             unsigned int i;
8311             int temp_max = max_distance;    /* Use a temporary, so if we
8312                                                reparse, we haven't changed the
8313                                                outer one */
8314 
8315             /* Use a smaller max edit distance if we are missing one of the
8316              * delimiters */
8317             if (   has_opening_bracket + has_opening_colon < 2
8318                 || has_terminating_bracket + has_terminating_colon < 2)
8319             {
8320                 temp_max--;
8321             }
8322 
8323             /* See if the input name is close to a legal one */
8324             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8325 
8326                 /* Short circuit call if the lengths are too far apart to be
8327                  * able to match */
8328                 if (abs( (int) (name_len - posix_name_lengths[i]))
8329                     > temp_max)
8330                 {
8331                     continue;
8332                 }
8333 
8334                 if (edit_distance(input_text,
8335                                   posix_names[i],
8336                                   name_len,
8337                                   posix_name_lengths[i],
8338                                   temp_max
8339                                  )
8340                     > -1)
8341                 { /* If it is close, it probably was intended to be a class */
8342                     goto probably_meant_to_be;
8343                 }
8344             }
8345 
8346             /* Here the input name is not close enough to a valid class name
8347              * for us to consider it to be intended to be a posix class.  If
8348              * we haven't already done so, and the parse found a character that
8349              * could have been terminators for the name, but which we absorbed
8350              * as typos during the first pass, repeat the parse, signalling it
8351              * to stop at that character */
8352             if (possible_end && possible_end != (char *) -1) {
8353                 possible_end = (char *) -1;
8354                 p = name_start;
8355                 goto parse_name;
8356             }
8357 
8358             /* Here neither pass found a close-enough class name */
8359             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8360         }
8361 
8362     probably_meant_to_be:
8363 
8364         /* Here we think that a posix specification was intended.  Update any
8365          * parse pointer */
8366         if (updated_parse_ptr) {
8367             *updated_parse_ptr = (char *) p;
8368         }
8369 
8370         /* If a posix class name was intended but incorrectly specified, we
8371          * output or return the warnings */
8372         if (found_problem) {
8373 
8374             /* We set flags for these issues in the parse loop above instead of
8375              * adding them to the list of warnings, because we can parse it
8376              * twice, and we only want one warning instance */
8377             if (has_upper) {
8378                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8379             }
8380             if (has_blank) {
8381                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8382             }
8383             if (has_semi_colon) {
8384                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8385             }
8386             else if (! has_terminating_colon) {
8387                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8388             }
8389             if (! has_terminating_bracket) {
8390                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8391             }
8392 
8393             if (   posix_warnings
8394                 && RExC_warn_text
8395                 && av_count(RExC_warn_text) > 0)
8396             {
8397                 *posix_warnings = RExC_warn_text;
8398             }
8399         }
8400         else if (class_number != OOB_NAMEDCLASS) {
8401             /* If it is a known class, return the class.  The class number
8402              * #defines are structured so each complement is +1 to the normal
8403              * one */
8404             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8405         }
8406         else if (! check_only) {
8407 
8408             /* Here, it is an unrecognized class.  This is an error (unless the
8409             * call is to check only, which we've already handled above) */
8410             const char * const complement_string = (complement)
8411                                                    ? "^"
8412                                                    : "";
8413             RExC_parse_set((char *) p);
8414             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8415                         complement_string,
8416                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8417         }
8418     }
8419 
8420     return OOB_NAMEDCLASS;
8421 }
8422 #undef ADD_POSIX_WARNING
8423 
8424 STATIC unsigned  int
S_regex_set_precedence(const U8 my_operator)8425 S_regex_set_precedence(const U8 my_operator) {
8426 
8427     /* Returns the precedence in the (?[...]) construct of the input operator,
8428      * specified by its character representation.  The precedence follows
8429      * general Perl rules, but it extends this so that ')' and ']' have (low)
8430      * precedence even though they aren't really operators */
8431 
8432     switch (my_operator) {
8433         case '!':
8434             return 5;
8435         case '&':
8436             return 4;
8437         case '^':
8438         case '|':
8439         case '+':
8440         case '-':
8441             return 3;
8442         case ')':
8443             return 2;
8444         case ']':
8445             return 1;
8446     }
8447 
8448     NOT_REACHED; /* NOTREACHED */
8449     return 0;   /* Silence compiler warning */
8450 }
8451 
8452 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth)8453 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8454                     I32 *flagp, U32 depth)
8455 {
8456     /* Handle the (?[...]) construct to do set operations */
8457 
8458     U8 curchar;                     /* Current character being parsed */
8459     UV start, end;	            /* End points of code point ranges */
8460     SV* final = NULL;               /* The end result inversion list */
8461     SV* result_string;              /* 'final' stringified */
8462     AV* stack;                      /* stack of operators and operands not yet
8463                                        resolved */
8464     AV* fence_stack = NULL;         /* A stack containing the positions in
8465                                        'stack' of where the undealt-with left
8466                                        parens would be if they were actually
8467                                        put there */
8468     /* The 'volatile' is a workaround for an optimiser bug
8469      * in Solaris Studio 12.3. See RT #127455 */
8470     volatile IV fence = 0;          /* Position of where most recent undealt-
8471                                        with left paren in stack is; -1 if none.
8472                                      */
8473     STRLEN len;                     /* Temporary */
8474     regnode_offset node;            /* Temporary, and final regnode returned by
8475                                        this function */
8476     const bool save_fold = FOLD;    /* Temporary */
8477     char *save_end, *save_parse;    /* Temporaries */
8478     const bool in_locale = LOC;     /* we turn off /l during processing */
8479 
8480     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8481 
8482     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8483 
8484     DEBUG_PARSE("xcls");
8485 
8486     if (in_locale) {
8487         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8488     }
8489 
8490     /* The use of this operator implies /u.  This is required so that the
8491      * compile time values are valid in all runtime cases */
8492     REQUIRE_UNI_RULES(flagp, 0);
8493 
8494     /* Everything in this construct is a metacharacter.  Operands begin with
8495      * either a '\' (for an escape sequence), or a '[' for a bracketed
8496      * character class.  Any other character should be an operator, or
8497      * parenthesis for grouping.  Both types of operands are handled by calling
8498      * regclass() to parse them.  It is called with a parameter to indicate to
8499      * return the computed inversion list.  The parsing here is implemented via
8500      * a stack.  Each entry on the stack is a single character representing one
8501      * of the operators; or else a pointer to an operand inversion list. */
8502 
8503 #define IS_OPERATOR(a) SvIOK(a)
8504 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
8505 
8506     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
8507      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8508      * with pronouncing it called it Reverse Polish instead, but now that YOU
8509      * know how to pronounce it you can use the correct term, thus giving due
8510      * credit to the person who invented it, and impressing your geek friends.
8511      * Wikipedia says that the pronunciation of "Ł" has been changing so that
8512      * it is now more like an English initial W (as in wonk) than an L.)
8513      *
8514      * This means that, for example, 'a | b & c' is stored on the stack as
8515      *
8516      * c  [4]
8517      * b  [3]
8518      * &  [2]
8519      * a  [1]
8520      * |  [0]
8521      *
8522      * where the numbers in brackets give the stack [array] element number.
8523      * In this implementation, parentheses are not stored on the stack.
8524      * Instead a '(' creates a "fence" so that the part of the stack below the
8525      * fence is invisible except to the corresponding ')' (this allows us to
8526      * replace testing for parens, by using instead subtraction of the fence
8527      * position).  As new operands are processed they are pushed onto the stack
8528      * (except as noted in the next paragraph).  New operators of higher
8529      * precedence than the current final one are inserted on the stack before
8530      * the lhs operand (so that when the rhs is pushed next, everything will be
8531      * in the correct positions shown above.  When an operator of equal or
8532      * lower precedence is encountered in parsing, all the stacked operations
8533      * of equal or higher precedence are evaluated, leaving the result as the
8534      * top entry on the stack.  This makes higher precedence operations
8535      * evaluate before lower precedence ones, and causes operations of equal
8536      * precedence to left associate.
8537      *
8538      * The only unary operator '!' is immediately pushed onto the stack when
8539      * encountered.  When an operand is encountered, if the top of the stack is
8540      * a '!", the complement is immediately performed, and the '!' popped.  The
8541      * resulting value is treated as a new operand, and the logic in the
8542      * previous paragraph is executed.  Thus in the expression
8543      *      [a] + ! [b]
8544      * the stack looks like
8545      *
8546      * !
8547      * a
8548      * +
8549      *
8550      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8551      * becomes
8552      *
8553      * !b
8554      * a
8555      * +
8556      *
8557      * A ')' is treated as an operator with lower precedence than all the
8558      * aforementioned ones, which causes all operations on the stack above the
8559      * corresponding '(' to be evaluated down to a single resultant operand.
8560      * Then the fence for the '(' is removed, and the operand goes through the
8561      * algorithm above, without the fence.
8562      *
8563      * A separate stack is kept of the fence positions, so that the position of
8564      * the latest so-far unbalanced '(' is at the top of it.
8565      *
8566      * The ']' ending the construct is treated as the lowest operator of all,
8567      * so that everything gets evaluated down to a single operand, which is the
8568      * result */
8569 
8570     stack = (AV*)newSV_type_mortal(SVt_PVAV);
8571     fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8572 
8573     while (RExC_parse < RExC_end) {
8574         I32 top_index;              /* Index of top-most element in 'stack' */
8575         SV** top_ptr;               /* Pointer to top 'stack' element */
8576         SV* current = NULL;         /* To contain the current inversion list
8577                                        operand */
8578         SV* only_to_avoid_leaks;
8579 
8580         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8581                                 TRUE /* Force /x */ );
8582         if (RExC_parse >= RExC_end) {   /* Fail */
8583             break;
8584         }
8585 
8586         curchar = UCHARAT(RExC_parse);
8587 
8588 redo_curchar:
8589 
8590 #ifdef ENABLE_REGEX_SETS_DEBUGGING
8591                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8592         DEBUG_U(dump_regex_sets_structures(pRExC_state,
8593                                            stack, fence, fence_stack));
8594 #endif
8595 
8596         top_index = av_tindex_skip_len_mg(stack);
8597 
8598         switch (curchar) {
8599             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
8600             char stacked_operator;  /* The topmost operator on the 'stack'. */
8601             SV* lhs;                /* Operand to the left of the operator */
8602             SV* rhs;                /* Operand to the right of the operator */
8603             SV* fence_ptr;          /* Pointer to top element of the fence
8604                                        stack */
8605             case '(':
8606 
8607                 if (   RExC_parse < RExC_end - 2
8608                     && UCHARAT(RExC_parse + 1) == '?'
8609                     && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8610                 {
8611                     const regnode_offset orig_emit = RExC_emit;
8612                     SV * resultant_invlist;
8613 
8614                     /* Here it could be an embedded '(?flags:(?[...])'.
8615                      * This happens when we have some thing like
8616                      *
8617                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8618                      *   ...
8619                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
8620                      *
8621                      * Here we would be handling the interpolated
8622                      * '$thai_or_lao'.  We handle this by a recursive call to
8623                      * reg which returns the inversion list the
8624                      * interpolated expression evaluates to.  Actually, the
8625                      * return is a special regnode containing a pointer to that
8626                      * inversion list.  If the return isn't that regnode alone,
8627                      * we know that this wasn't such an interpolation, which is
8628                      * an error: we need to get a single inversion list back
8629                      * from the recursion */
8630 
8631                     RExC_parse_inc_by(1);
8632                     RExC_sets_depth++;
8633 
8634                     node = reg(pRExC_state, 2, flagp, depth+1);
8635                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8636 
8637                     if (   OP(REGNODE_p(node)) != REGEX_SET
8638                            /* If more than a single node returned, the nested
8639                             * parens evaluated to more than just a (?[...]),
8640                             * which isn't legal */
8641                         || RExC_emit != orig_emit
8642                                       + NODE_STEP_REGNODE
8643                                       + REGNODE_ARG_LEN(REGEX_SET))
8644                     {
8645                         vFAIL("Expecting interpolated extended charclass");
8646                     }
8647                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8648                     current = invlist_clone(resultant_invlist, NULL);
8649                     SvREFCNT_dec(resultant_invlist);
8650 
8651                     RExC_sets_depth--;
8652                     RExC_emit = orig_emit;
8653                     goto handle_operand;
8654                 }
8655 
8656                 /* A regular '('.  Look behind for illegal syntax */
8657                 if (top_index - fence >= 0) {
8658                     /* If the top entry on the stack is an operator, it had
8659                      * better be a '!', otherwise the entry below the top
8660                      * operand should be an operator */
8661                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
8662                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8663                         || (   IS_OPERAND(*top_ptr)
8664                             && (   top_index - fence < 1
8665                                 || ! (stacked_ptr = av_fetch(stack,
8666                                                              top_index - 1,
8667                                                              FALSE))
8668                                 || ! IS_OPERATOR(*stacked_ptr))))
8669                     {
8670                         RExC_parse_inc_by(1);
8671                         vFAIL("Unexpected '(' with no preceding operator");
8672                     }
8673                 }
8674 
8675                 /* Stack the position of this undealt-with left paren */
8676                 av_push_simple(fence_stack, newSViv(fence));
8677                 fence = top_index + 1;
8678                 break;
8679 
8680             case '\\':
8681                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8682                  * multi-char folds are allowed.  */
8683                 if (!regclass(pRExC_state, flagp, depth+1,
8684                               TRUE, /* means parse just the next thing */
8685                               FALSE, /* don't allow multi-char folds */
8686                               FALSE, /* don't silence non-portable warnings.  */
8687                               TRUE,  /* strict */
8688                               FALSE, /* Require return to be an ANYOF */
8689                               &current))
8690                 {
8691                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8692                     goto regclass_failed;
8693                 }
8694 
8695                 assert(current);
8696 
8697                 /* regclass() will return with parsing just the \ sequence,
8698                  * leaving the parse pointer at the next thing to parse */
8699                 RExC_parse--;
8700                 goto handle_operand;
8701 
8702             case '[':   /* Is a bracketed character class */
8703             {
8704                 /* See if this is a [:posix:] class. */
8705                 bool is_posix_class = (OOB_NAMEDCLASS
8706                             < handle_possible_posix(pRExC_state,
8707                                                 RExC_parse + 1,
8708                                                 NULL,
8709                                                 NULL,
8710                                                 TRUE /* checking only */));
8711                 /* If it is a posix class, leave the parse pointer at the '['
8712                  * to fool regclass() into thinking it is part of a
8713                  * '[[:posix:]]'. */
8714                 if (! is_posix_class) {
8715                     RExC_parse_inc_by(1);
8716                 }
8717 
8718                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8719                  * multi-char folds are allowed.  */
8720                 if (!regclass(pRExC_state, flagp, depth+1,
8721                                 is_posix_class, /* parse the whole char
8722                                                     class only if not a
8723                                                     posix class */
8724                                 FALSE, /* don't allow multi-char folds */
8725                                 TRUE, /* silence non-portable warnings. */
8726                                 TRUE, /* strict */
8727                                 FALSE, /* Require return to be an ANYOF */
8728                                 &current))
8729                 {
8730                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8731                     goto regclass_failed;
8732                 }
8733 
8734                 assert(current);
8735 
8736                 /* function call leaves parse pointing to the ']', except if we
8737                  * faked it */
8738                 if (is_posix_class) {
8739                     RExC_parse--;
8740                 }
8741 
8742                 goto handle_operand;
8743             }
8744 
8745             case ']':
8746                 if (top_index >= 1) {
8747                     goto join_operators;
8748                 }
8749 
8750                 /* Only a single operand on the stack: are done */
8751                 goto done;
8752 
8753             case ')':
8754                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8755                     if (UCHARAT(RExC_parse - 1) == ']')  {
8756                         break;
8757                     }
8758                     RExC_parse_inc_by(1);
8759                     vFAIL("Unexpected ')'");
8760                 }
8761 
8762                 /* If nothing after the fence, is missing an operand */
8763                 if (top_index - fence < 0) {
8764                     RExC_parse_inc_by(1);
8765                     goto bad_syntax;
8766                 }
8767                 /* If at least two things on the stack, treat this as an
8768                   * operator */
8769                 if (top_index - fence >= 1) {
8770                     goto join_operators;
8771                 }
8772 
8773                 /* Here only a single thing on the fenced stack, and there is a
8774                  * fence.  Get rid of it */
8775                 fence_ptr = av_pop(fence_stack);
8776                 assert(fence_ptr);
8777                 fence = SvIV(fence_ptr);
8778                 SvREFCNT_dec_NN(fence_ptr);
8779                 fence_ptr = NULL;
8780 
8781                 if (fence < 0) {
8782                     fence = 0;
8783                 }
8784 
8785                 /* Having gotten rid of the fence, we pop the operand at the
8786                  * stack top and process it as a newly encountered operand */
8787                 current = av_pop(stack);
8788                 if (IS_OPERAND(current)) {
8789                     goto handle_operand;
8790                 }
8791 
8792                 RExC_parse_inc_by(1);
8793                 goto bad_syntax;
8794 
8795             case '&':
8796             case '|':
8797             case '+':
8798             case '-':
8799             case '^':
8800 
8801                 /* These binary operators should have a left operand already
8802                  * parsed */
8803                 if (   top_index - fence < 0
8804                     || top_index - fence == 1
8805                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8806                     || ! IS_OPERAND(*top_ptr))
8807                 {
8808                     goto unexpected_binary;
8809                 }
8810 
8811                 /* If only the one operand is on the part of the stack visible
8812                  * to us, we just place this operator in the proper position */
8813                 if (top_index - fence < 2) {
8814 
8815                     /* Place the operator before the operand */
8816 
8817                     SV* lhs = av_pop(stack);
8818                     av_push_simple(stack, newSVuv(curchar));
8819                     av_push_simple(stack, lhs);
8820                     break;
8821                 }
8822 
8823                 /* But if there is something else on the stack, we need to
8824                  * process it before this new operator if and only if the
8825                  * stacked operation has equal or higher precedence than the
8826                  * new one */
8827 
8828              join_operators:
8829 
8830                 /* The operator on the stack is supposed to be below both its
8831                  * operands */
8832                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8833                     || IS_OPERAND(*stacked_ptr))
8834                 {
8835                     /* But if not, it's legal and indicates we are completely
8836                      * done if and only if we're currently processing a ']',
8837                      * which should be the final thing in the expression */
8838                     if (curchar == ']') {
8839                         goto done;
8840                     }
8841 
8842                   unexpected_binary:
8843                     RExC_parse_inc_by(1);
8844                     vFAIL2("Unexpected binary operator '%c' with no "
8845                            "preceding operand", curchar);
8846                 }
8847                 stacked_operator = (char) SvUV(*stacked_ptr);
8848 
8849                 if (regex_set_precedence(curchar)
8850                     > regex_set_precedence(stacked_operator))
8851                 {
8852                     /* Here, the new operator has higher precedence than the
8853                      * stacked one.  This means we need to add the new one to
8854                      * the stack to await its rhs operand (and maybe more
8855                      * stuff).  We put it before the lhs operand, leaving
8856                      * untouched the stacked operator and everything below it
8857                      * */
8858                     lhs = av_pop(stack);
8859                     assert(IS_OPERAND(lhs));
8860                     av_push_simple(stack, newSVuv(curchar));
8861                     av_push_simple(stack, lhs);
8862                     break;
8863                 }
8864 
8865                 /* Here, the new operator has equal or lower precedence than
8866                  * what's already there.  This means the operation already
8867                  * there should be performed now, before the new one. */
8868 
8869                 rhs = av_pop(stack);
8870                 if (! IS_OPERAND(rhs)) {
8871 
8872                     /* This can happen when a ! is not followed by an operand,
8873                      * like in /(?[\t &!])/ */
8874                     goto bad_syntax;
8875                 }
8876 
8877                 lhs = av_pop(stack);
8878 
8879                 if (! IS_OPERAND(lhs)) {
8880 
8881                     /* This can happen when there is an empty (), like in
8882                      * /(?[[0]+()+])/ */
8883                     goto bad_syntax;
8884                 }
8885 
8886                 switch (stacked_operator) {
8887                     case '&':
8888                         _invlist_intersection(lhs, rhs, &rhs);
8889                         break;
8890 
8891                     case '|':
8892                     case '+':
8893                         _invlist_union(lhs, rhs, &rhs);
8894                         break;
8895 
8896                     case '-':
8897                         _invlist_subtract(lhs, rhs, &rhs);
8898                         break;
8899 
8900                     case '^':   /* The union minus the intersection */
8901                     {
8902                         SV* i = NULL;
8903                         SV* u = NULL;
8904 
8905                         _invlist_union(lhs, rhs, &u);
8906                         _invlist_intersection(lhs, rhs, &i);
8907                         _invlist_subtract(u, i, &rhs);
8908                         SvREFCNT_dec_NN(i);
8909                         SvREFCNT_dec_NN(u);
8910                         break;
8911                     }
8912                 }
8913                 SvREFCNT_dec(lhs);
8914 
8915                 /* Here, the higher precedence operation has been done, and the
8916                  * result is in 'rhs'.  We overwrite the stacked operator with
8917                  * the result.  Then we redo this code to either push the new
8918                  * operator onto the stack or perform any higher precedence
8919                  * stacked operation */
8920                 only_to_avoid_leaks = av_pop(stack);
8921                 SvREFCNT_dec(only_to_avoid_leaks);
8922                 av_push_simple(stack, rhs);
8923                 goto redo_curchar;
8924 
8925             case '!':   /* Highest priority, right associative */
8926 
8927                 /* If what's already at the top of the stack is another '!",
8928                  * they just cancel each other out */
8929                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
8930                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8931                 {
8932                     only_to_avoid_leaks = av_pop(stack);
8933                     SvREFCNT_dec(only_to_avoid_leaks);
8934                 }
8935                 else { /* Otherwise, since it's right associative, just push
8936                           onto the stack */
8937                     av_push_simple(stack, newSVuv(curchar));
8938                 }
8939                 break;
8940 
8941             default:
8942                 RExC_parse_inc();
8943                 if (RExC_parse >= RExC_end) {
8944                     break;
8945                 }
8946                 vFAIL("Unexpected character");
8947 
8948           handle_operand:
8949 
8950             /* Here 'current' is the operand.  If something is already on the
8951              * stack, we have to check if it is a !.  But first, the code above
8952              * may have altered the stack in the time since we earlier set
8953              * 'top_index'.  */
8954 
8955             top_index = av_tindex_skip_len_mg(stack);
8956             if (top_index - fence >= 0) {
8957                 /* If the top entry on the stack is an operator, it had better
8958                  * be a '!', otherwise the entry below the top operand should
8959                  * be an operator */
8960                 top_ptr = av_fetch(stack, top_index, FALSE);
8961                 assert(top_ptr);
8962                 if (IS_OPERATOR(*top_ptr)) {
8963 
8964                     /* The only permissible operator at the top of the stack is
8965                      * '!', which is applied immediately to this operand. */
8966                     curchar = (char) SvUV(*top_ptr);
8967                     if (curchar != '!') {
8968                         SvREFCNT_dec(current);
8969                         vFAIL2("Unexpected binary operator '%c' with no "
8970                                 "preceding operand", curchar);
8971                     }
8972 
8973                     _invlist_invert(current);
8974 
8975                     only_to_avoid_leaks = av_pop(stack);
8976                     SvREFCNT_dec(only_to_avoid_leaks);
8977 
8978                     /* And we redo with the inverted operand.  This allows
8979                      * handling multiple ! in a row */
8980                     goto handle_operand;
8981                 }
8982                           /* Single operand is ok only for the non-binary ')'
8983                            * operator */
8984                 else if ((top_index - fence == 0 && curchar != ')')
8985                          || (top_index - fence > 0
8986                              && (! (stacked_ptr = av_fetch(stack,
8987                                                            top_index - 1,
8988                                                            FALSE))
8989                                  || IS_OPERAND(*stacked_ptr))))
8990                 {
8991                     SvREFCNT_dec(current);
8992                     vFAIL("Operand with no preceding operator");
8993                 }
8994             }
8995 
8996             /* Here there was nothing on the stack or the top element was
8997              * another operand.  Just add this new one */
8998             av_push_simple(stack, current);
8999 
9000         } /* End of switch on next parse token */
9001 
9002         RExC_parse_inc();
9003     } /* End of loop parsing through the construct */
9004 
9005     vFAIL("Syntax error in (?[...])");
9006 
9007   done:
9008 
9009     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
9010         if (RExC_parse < RExC_end) {
9011             RExC_parse_inc_by(1);
9012         }
9013 
9014         vFAIL("Unexpected ']' with no following ')' in (?[...");
9015     }
9016 
9017     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
9018         vFAIL("Unmatched (");
9019     }
9020 
9021     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
9022         || ((final = av_pop(stack)) == NULL)
9023         || ! IS_OPERAND(final)
9024         || ! is_invlist(final)
9025         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
9026     {
9027       bad_syntax:
9028         SvREFCNT_dec(final);
9029         vFAIL("Incomplete expression within '(?[ ])'");
9030     }
9031 
9032     /* Here, 'final' is the resultant inversion list from evaluating the
9033      * expression.  Return it if so requested */
9034     if (return_invlist) {
9035         *return_invlist = final;
9036         return END;
9037     }
9038 
9039     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
9040                                regnode */
9041         RExC_parse_inc_by(1);
9042         node = regpnode(pRExC_state, REGEX_SET, final);
9043     }
9044     else {
9045 
9046         /* Otherwise generate a resultant node, based on 'final'.  regclass()
9047          * is expecting a string of ranges and individual code points */
9048         invlist_iterinit(final);
9049         result_string = newSVpvs("");
9050         while (invlist_iternext(final, &start, &end)) {
9051             if (start == end) {
9052                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
9053             }
9054             else {
9055                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
9056                                                         UVXf "}", start, end);
9057             }
9058         }
9059 
9060         /* About to generate an ANYOF (or similar) node from the inversion list
9061          * we have calculated */
9062         save_parse = RExC_parse;
9063         RExC_parse_set(SvPV(result_string, len));
9064         save_end = RExC_end;
9065         RExC_end = RExC_parse + len;
9066         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
9067 
9068         /* We turn off folding around the call, as the class we have
9069          * constructed already has all folding taken into consideration, and we
9070          * don't want regclass() to add to that */
9071         RExC_flags &= ~RXf_PMf_FOLD;
9072         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
9073          * folds are allowed.  */
9074         node = regclass(pRExC_state, flagp, depth+1,
9075                         FALSE, /* means parse the whole char class */
9076                         FALSE, /* don't allow multi-char folds */
9077                         TRUE, /* silence non-portable warnings.  The above may
9078                                  very well have generated non-portable code
9079                                  points, but they're valid on this machine */
9080                         FALSE, /* similarly, no need for strict */
9081 
9082                         /* We can optimize into something besides an ANYOF,
9083                          * except under /l, which needs to be ANYOF because of
9084                          * runtime checks for locale sanity, etc */
9085                     ! in_locale,
9086                         NULL
9087                     );
9088 
9089         RESTORE_WARNINGS;
9090         RExC_parse_set(save_parse + 1);
9091         RExC_end = save_end;
9092         SvREFCNT_dec_NN(final);
9093         SvREFCNT_dec_NN(result_string);
9094 
9095         if (save_fold) {
9096             RExC_flags |= RXf_PMf_FOLD;
9097         }
9098 
9099         if (!node) {
9100             RETURN_FAIL_ON_RESTART(*flagp, flagp);
9101             goto regclass_failed;
9102         }
9103 
9104         /* Fix up the node type if we are in locale.  (We have pretended we are
9105          * under /u for the purposes of regclass(), as this construct will only
9106          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
9107          * (so as to cause any warnings about bad locales to be output in
9108          * regexec.c), and add the flag that indicates to check if not in a
9109          * UTF-8 locale.  The reason we above forbid optimization into
9110          * something other than an ANYOF node is simply to minimize the number
9111          * of code changes in regexec.c.  Otherwise we would have to create new
9112          * EXACTish node types and deal with them.  This decision could be
9113          * revisited should this construct become popular.
9114          *
9115          * (One might think we could look at the resulting ANYOF node and
9116          * suppress the flag if everything is above 255, as those would be
9117          * UTF-8 only, but this isn't true, as the components that led to that
9118          * result could have been locale-affected, and just happen to cancel
9119          * each other out under UTF-8 locales.) */
9120         if (in_locale) {
9121             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9122 
9123             assert(OP(REGNODE_p(node)) == ANYOF);
9124 
9125             OP(REGNODE_p(node)) = ANYOFL;
9126             ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9127         }
9128     }
9129 
9130     nextchar(pRExC_state);
9131     return node;
9132 
9133   regclass_failed:
9134     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9135                                                                 (UV) *flagp);
9136 }
9137 
9138 #ifdef ENABLE_REGEX_SETS_DEBUGGING
9139 
9140 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)9141 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9142                              AV * stack, const IV fence, AV * fence_stack)
9143 {   /* Dumps the stacks in handle_regex_sets() */
9144 
9145     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9146     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9147     SSize_t i;
9148 
9149     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9150 
9151     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9152 
9153     if (stack_top < 0) {
9154         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9155     }
9156     else {
9157         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9158         for (i = stack_top; i >= 0; i--) {
9159             SV ** element_ptr = av_fetch(stack, i, FALSE);
9160             if (! element_ptr) {
9161             }
9162 
9163             if (IS_OPERATOR(*element_ptr)) {
9164                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9165                                             (int) i, (int) SvIV(*element_ptr));
9166             }
9167             else {
9168                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9169                 sv_dump(*element_ptr);
9170             }
9171         }
9172     }
9173 
9174     if (fence_stack_top < 0) {
9175         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9176     }
9177     else {
9178         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9179         for (i = fence_stack_top; i >= 0; i--) {
9180             SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9181             if (! element_ptr) {
9182             }
9183 
9184             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9185                                             (int) i, (int) SvIV(*element_ptr));
9186         }
9187     }
9188 }
9189 
9190 #endif
9191 
9192 #undef IS_OPERATOR
9193 #undef IS_OPERAND
9194 
9195 #ifdef PERL_RE_BUILD_AUX
9196 void
Perl_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)9197 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9198 {
9199     /* This adds the Latin1/above-Latin1 folding rules.
9200      *
9201      * This should be called only for a Latin1-range code points, cp, which is
9202      * known to be involved in a simple fold with other code points above
9203      * Latin1.  It would give false results if /aa has been specified.
9204      * Multi-char folds are outside the scope of this, and must be handled
9205      * specially. */
9206 
9207     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9208 
9209     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9210 
9211     /* The rules that are valid for all Unicode versions are hard-coded in */
9212     switch (cp) {
9213         case 'k':
9214         case 'K':
9215           *invlist =
9216              add_cp_to_invlist(*invlist, KELVIN_SIGN);
9217             break;
9218         case 's':
9219         case 'S':
9220           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9221             break;
9222         case MICRO_SIGN:
9223           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9224           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9225             break;
9226         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9227         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9228           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9229             break;
9230         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9231           *invlist = add_cp_to_invlist(*invlist,
9232                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9233             break;
9234 
9235         default:    /* Other code points are checked against the data for the
9236                        current Unicode version */
9237           {
9238             Size_t folds_count;
9239             U32 first_fold;
9240             const U32 * remaining_folds;
9241             UV folded_cp;
9242 
9243             if (isASCII(cp)) {
9244                 folded_cp = toFOLD(cp);
9245             }
9246             else {
9247                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9248                 Size_t dummy_len;
9249                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9250             }
9251 
9252             if (folded_cp > 255) {
9253                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9254             }
9255 
9256             folds_count = _inverse_folds(folded_cp, &first_fold,
9257                                                     &remaining_folds);
9258             if (folds_count == 0) {
9259 
9260                 /* Use deprecated warning to increase the chances of this being
9261                  * output */
9262                 ckWARN2reg_d(RExC_parse,
9263                         "Perl folding rules are not up-to-date for 0x%02X;"
9264                         " please use the perlbug utility to report;", cp);
9265             }
9266             else {
9267                 unsigned int i;
9268 
9269                 if (first_fold > 255) {
9270                     *invlist = add_cp_to_invlist(*invlist, first_fold);
9271                 }
9272                 for (i = 0; i < folds_count - 1; i++) {
9273                     if (remaining_folds[i] > 255) {
9274                         *invlist = add_cp_to_invlist(*invlist,
9275                                                     remaining_folds[i]);
9276                     }
9277                 }
9278             }
9279             break;
9280          }
9281     }
9282 }
9283 #endif /* PERL_RE_BUILD_AUX */
9284 
9285 
9286 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)9287 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9288 {
9289     /* Output the elements of the array given by '*posix_warnings' as REGEXP
9290      * warnings. */
9291 
9292     SV * msg;
9293     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9294 
9295     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9296 
9297     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9298         CLEAR_POSIX_WARNINGS();
9299         return;
9300     }
9301 
9302     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9303         if (first_is_fatal) {           /* Avoid leaking this */
9304             av_undef(posix_warnings);   /* This isn't necessary if the
9305                                             array is mortal, but is a
9306                                             fail-safe */
9307             (void) sv_2mortal(msg);
9308         }
9309         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9310         SvREFCNT_dec_NN(msg);
9311     }
9312 
9313     UPDATE_WARNINGS_LOC(RExC_parse);
9314 }
9315 
9316 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)9317 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9318 {
9319     const U8 * const start = s1;
9320     const U8 * const send = start + max;
9321 
9322     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9323 
9324     while (s1 < send && *s1  == *s2) {
9325         s1++; s2++;
9326     }
9327 
9328     return s1 - start;
9329 }
9330 
9331 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)9332 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9333 {
9334     /* This adds the string scalar <multi_string> to the array
9335      * <multi_char_matches>.  <multi_string> is known to have exactly
9336      * <cp_count> code points in it.  This is used when constructing a
9337      * bracketed character class and we find something that needs to match more
9338      * than a single character.
9339      *
9340      * <multi_char_matches> is actually an array of arrays.  Each top-level
9341      * element is an array that contains all the strings known so far that are
9342      * the same length.  And that length (in number of code points) is the same
9343      * as the index of the top-level array.  Hence, the [2] element is an
9344      * array, each element thereof is a string containing TWO code points;
9345      * while element [3] is for strings of THREE characters, and so on.  Since
9346      * this is for multi-char strings there can never be a [0] nor [1] element.
9347      *
9348      * When we rewrite the character class below, we will do so such that the
9349      * longest strings are written first, so that it prefers the longest
9350      * matching strings first.  This is done even if it turns out that any
9351      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
9352      * Christiansen has agreed that this is ok.  This makes the test for the
9353      * ligature 'ffi' come before the test for 'ff', for example */
9354 
9355     AV* this_array;
9356     AV** this_array_ptr;
9357 
9358     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9359 
9360     if (! multi_char_matches) {
9361         multi_char_matches = newAV();
9362     }
9363 
9364     if (av_exists(multi_char_matches, cp_count)) {
9365         this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9366         this_array = *this_array_ptr;
9367     }
9368     else {
9369         this_array = newAV();
9370         av_store_simple(multi_char_matches, cp_count,
9371                  (SV*) this_array);
9372     }
9373     av_push_simple(this_array, multi_string);
9374 
9375     return multi_char_matches;
9376 }
9377 
9378 /* The names of properties whose definitions are not known at compile time are
9379  * stored in this SV, after a constant heading.  So if the length has been
9380  * changed since initialization, then there is a run-time definition. */
9381 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
9382                                         (SvCUR(listsv) != initial_listsv_len)
9383 
9384 /* There is a restricted set of white space characters that are legal when
9385  * ignoring white space in a bracketed character class.  This generates the
9386  * code to skip them.
9387  *
9388  * There is a line below that uses the same white space criteria but is outside
9389  * this macro.  Both here and there must use the same definition */
9390 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
9391     STMT_START {                                                        \
9392         if (do_skip) {                                                  \
9393             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
9394             {                                                           \
9395                 p++;                                                    \
9396             }                                                           \
9397         }                                                               \
9398     } STMT_END
9399 
9400 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)9401 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9402                  const bool stop_at_1,  /* Just parse the next thing, don't
9403                                            look for a full character class */
9404                  bool allow_mutiple_chars,
9405                  const bool silence_non_portable,   /* Don't output warnings
9406                                                        about too large
9407                                                        characters */
9408                  const bool strict,
9409                  bool optimizable,                  /* ? Allow a non-ANYOF return
9410                                                        node */
9411                  SV** ret_invlist  /* Return an inversion list, not a node */
9412           )
9413 {
9414     /* parse a bracketed class specification.  Most of these will produce an
9415      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9416      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
9417      * under /i with multi-character folds: it will be rewritten following the
9418      * paradigm of this example, where the <multi-fold>s are characters which
9419      * fold to multiple character sequences:
9420      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9421      * gets effectively rewritten as:
9422      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9423      * reg() gets called (recursively) on the rewritten version, and this
9424      * function will return what it constructs.  (Actually the <multi-fold>s
9425      * aren't physically removed from the [abcdefghi], it's just that they are
9426      * ignored in the recursion by means of a flag:
9427      * <RExC_in_multi_char_class>.)
9428      *
9429      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9430      * characters, with the corresponding bit set if that character is in the
9431      * list.  For characters above this, an inversion list is used.  There
9432      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9433      * determinable at compile time
9434      *
9435      * On success, returns the offset at which any next node should be placed
9436      * into the regex engine program being compiled.
9437      *
9438      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9439      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9440      * UTF-8
9441      */
9442 
9443     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9444     IV range = 0;
9445     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9446     regnode_offset ret = -1;    /* Initialized to an illegal value */
9447     STRLEN numlen;
9448     int namedclass = OOB_NAMEDCLASS;
9449     char *rangebegin = NULL;
9450     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
9451                                aren't available at the time this was called */
9452     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9453                                       than just initialized.  */
9454     SV* properties = NULL;    /* Code points that match \p{} \P{} */
9455     SV* posixes = NULL;     /* Code points that match classes like [:word:],
9456                                extended beyond the Latin1 range.  These have to
9457                                be kept separate from other code points for much
9458                                of this function because their handling  is
9459                                different under /i, and for most classes under
9460                                /d as well */
9461     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
9462                                separate for a while from the non-complemented
9463                                versions because of complications with /d
9464                                matching */
9465     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9466                                   treated more simply than the general case,
9467                                   leading to less compilation and execution
9468                                   work */
9469     UV element_count = 0;   /* Number of distinct elements in the class.
9470                                Optimizations may be possible if this is tiny */
9471     AV * multi_char_matches = NULL; /* Code points that fold to more than one
9472                                        character; used under /i */
9473     UV n;
9474     char * stop_ptr = RExC_end;    /* where to stop parsing */
9475 
9476     /* ignore unescaped whitespace? */
9477     const bool skip_white = cBOOL(   ret_invlist
9478                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9479 
9480     /* inversion list of code points this node matches only when the target
9481      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
9482      * /d) */
9483     SV* upper_latin1_only_utf8_matches = NULL;
9484 
9485     /* Inversion list of code points this node matches regardless of things
9486      * like locale, folding, utf8ness of the target string */
9487     SV* cp_list = NULL;
9488 
9489     /* Like cp_list, but code points on this list need to be checked for things
9490      * that fold to/from them under /i */
9491     SV* cp_foldable_list = NULL;
9492 
9493     /* Like cp_list, but code points on this list are valid only when the
9494      * runtime locale is UTF-8 */
9495     SV* only_utf8_locale_list = NULL;
9496 
9497     /* In a range, if one of the endpoints is non-character-set portable,
9498      * meaning that it hard-codes a code point that may mean a different
9499      * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9500      * mnemonic '\t' which each mean the same character no matter which
9501      * character set the platform is on. */
9502     unsigned int non_portable_endpoint = 0;
9503 
9504     /* Is the range unicode? which means on a platform that isn't 1-1 native
9505      * to Unicode (i.e. non-ASCII), each code point in it should be considered
9506      * to be a Unicode value.  */
9507     bool unicode_range = FALSE;
9508     bool invert = FALSE;    /* Is this class to be complemented */
9509 
9510     bool warn_super = ALWAYS_WARN_SUPER;
9511 
9512     const char * orig_parse = RExC_parse;
9513 
9514     /* This variable is used to mark where the end in the input is of something
9515      * that looks like a POSIX construct but isn't.  During the parse, when
9516      * something looks like it could be such a construct is encountered, it is
9517      * checked for being one, but not if we've already checked this area of the
9518      * input.  Only after this position is reached do we check again */
9519     char *not_posix_region_end = RExC_parse - 1;
9520 
9521     AV* posix_warnings = NULL;
9522     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9523     U8 op = ANYOF;    /* The returned node-type, initialized to the expected
9524                          type. */
9525     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
9526     U32 posixl = 0;       /* bit field of posix classes matched under /l */
9527 
9528 
9529 /* Flags as to what things aren't knowable until runtime.  (Note that these are
9530  * mutually exclusive.) */
9531 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
9532                                             haven't been defined as of yet */
9533 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
9534                                             UTF-8 or not */
9535 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
9536                                             what gets folded */
9537     U32 has_runtime_dependency = 0;     /* OR of the above flags */
9538 
9539     DECLARE_AND_GET_RE_DEBUG_FLAGS;
9540 
9541     PERL_ARGS_ASSERT_REGCLASS;
9542 #ifndef DEBUGGING
9543     PERL_UNUSED_ARG(depth);
9544 #endif
9545 
9546     assert(! (ret_invlist && allow_mutiple_chars));
9547 
9548     /* If wants an inversion list returned, we can't optimize to something
9549      * else. */
9550     if (ret_invlist) {
9551         optimizable = FALSE;
9552     }
9553 
9554     DEBUG_PARSE("clas");
9555 
9556 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
9557     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
9558                                    && UNICODE_DOT_DOT_VERSION == 0)
9559     allow_mutiple_chars = FALSE;
9560 #endif
9561 
9562     /* We include the /i status at the beginning of this so that we can
9563      * know it at runtime */
9564     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9565     initial_listsv_len = SvCUR(listsv);
9566     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
9567 
9568     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9569 
9570     assert(RExC_parse <= RExC_end);
9571 
9572     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
9573         RExC_parse_inc_by(1);
9574         invert = TRUE;
9575         allow_mutiple_chars = FALSE;
9576         MARK_NAUGHTY(1);
9577         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9578     }
9579 
9580     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9581     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9582         int maybe_class = handle_possible_posix(pRExC_state,
9583                                                 RExC_parse,
9584                                                 &not_posix_region_end,
9585                                                 NULL,
9586                                                 TRUE /* checking only */);
9587         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9588             ckWARN4reg(not_posix_region_end,
9589                     "POSIX syntax [%c %c] belongs inside character classes%s",
9590                     *RExC_parse, *RExC_parse,
9591                     (maybe_class == OOB_NAMEDCLASS)
9592                     ? ((POSIXCC_NOTYET(*RExC_parse))
9593                         ? " (but this one isn't implemented)"
9594                         : " (but this one isn't fully valid)")
9595                     : ""
9596                     );
9597         }
9598     }
9599 
9600     /* If the caller wants us to just parse a single element, accomplish this
9601      * by faking the loop ending condition */
9602     if (stop_at_1 && RExC_end > RExC_parse) {
9603         stop_ptr = RExC_parse + 1;
9604     }
9605 
9606     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9607     if (UCHARAT(RExC_parse) == ']')
9608         goto charclassloop;
9609 
9610     while (1) {
9611 
9612         if (   posix_warnings
9613             && av_tindex_skip_len_mg(posix_warnings) >= 0
9614             && RExC_parse > not_posix_region_end)
9615         {
9616             /* Warnings about posix class issues are considered tentative until
9617              * we are far enough along in the parse that we can no longer
9618              * change our mind, at which point we output them.  This is done
9619              * each time through the loop so that a later class won't zap them
9620              * before they have been dealt with. */
9621             output_posix_warnings(pRExC_state, posix_warnings);
9622         }
9623 
9624         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9625 
9626         if  (RExC_parse >= stop_ptr) {
9627             break;
9628         }
9629 
9630         if  (UCHARAT(RExC_parse) == ']') {
9631             break;
9632         }
9633 
9634       charclassloop:
9635 
9636         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9637         save_value = value;
9638         save_prevvalue = prevvalue;
9639 
9640         if (!range) {
9641             rangebegin = RExC_parse;
9642             element_count++;
9643             non_portable_endpoint = 0;
9644         }
9645         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9646             value = utf8n_to_uvchr((U8*)RExC_parse,
9647                                    RExC_end - RExC_parse,
9648                                    &numlen, UTF8_ALLOW_DEFAULT);
9649             RExC_parse_inc_by(numlen);
9650         }
9651         else {
9652             value = UCHARAT(RExC_parse);
9653             RExC_parse_inc_by(1);
9654         }
9655 
9656         if (value == '[') {
9657             char * posix_class_end;
9658             namedclass = handle_possible_posix(pRExC_state,
9659                                                RExC_parse,
9660                                                &posix_class_end,
9661                                                do_posix_warnings ? &posix_warnings : NULL,
9662                                                FALSE    /* die if error */);
9663             if (namedclass > OOB_NAMEDCLASS) {
9664 
9665                 /* If there was an earlier attempt to parse this particular
9666                  * posix class, and it failed, it was a false alarm, as this
9667                  * successful one proves */
9668                 if (   posix_warnings
9669                     && av_tindex_skip_len_mg(posix_warnings) >= 0
9670                     && not_posix_region_end >= RExC_parse
9671                     && not_posix_region_end <= posix_class_end)
9672                 {
9673                     av_undef(posix_warnings);
9674                 }
9675 
9676                 RExC_parse_set(posix_class_end);
9677             }
9678             else if (namedclass == OOB_NAMEDCLASS) {
9679                 not_posix_region_end = posix_class_end;
9680             }
9681             else {
9682                 namedclass = OOB_NAMEDCLASS;
9683             }
9684         }
9685         else if (   RExC_parse - 1 > not_posix_region_end
9686                  && MAYBE_POSIXCC(value))
9687         {
9688             (void) handle_possible_posix(
9689                         pRExC_state,
9690                         RExC_parse - 1,  /* -1 because parse has already been
9691                                             advanced */
9692                         &not_posix_region_end,
9693                         do_posix_warnings ? &posix_warnings : NULL,
9694                         TRUE /* checking only */);
9695         }
9696         else if (  strict && ! skip_white
9697                  && (   generic_isCC_(value, CC_VERTSPACE_)
9698                      || is_VERTWS_cp_high(value)))
9699         {
9700             vFAIL("Literal vertical space in [] is illegal except under /x");
9701         }
9702         else if (value == '\\') {
9703             /* Is a backslash; get the code point of the char after it */
9704 
9705             if (RExC_parse >= RExC_end) {
9706                 vFAIL("Unmatched [");
9707             }
9708 
9709             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9710                 value = utf8n_to_uvchr((U8*)RExC_parse,
9711                                    RExC_end - RExC_parse,
9712                                    &numlen, UTF8_ALLOW_DEFAULT);
9713                 RExC_parse_inc_by(numlen);
9714             }
9715             else {
9716                 value = UCHARAT(RExC_parse);
9717                 RExC_parse_inc_by(1);
9718             }
9719 
9720             /* Some compilers cannot handle switching on 64-bit integer
9721              * values, therefore value cannot be an UV.  Yes, this will
9722              * be a problem later if we want switch on Unicode.
9723              * A similar issue a little bit later when switching on
9724              * namedclass. --jhi */
9725 
9726             /* If the \ is escaping white space when white space is being
9727              * skipped, it means that that white space is wanted literally, and
9728              * is already in 'value'.  Otherwise, need to translate the escape
9729              * into what it signifies. */
9730             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9731                 const char * message;
9732                 U32 packed_warn;
9733                 U8 grok_c_char;
9734 
9735             case 'w':	namedclass = ANYOF_WORDCHAR;	break;
9736             case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
9737             case 's':	namedclass = ANYOF_SPACE;	break;
9738             case 'S':	namedclass = ANYOF_NSPACE;	break;
9739             case 'd':	namedclass = ANYOF_DIGIT;	break;
9740             case 'D':	namedclass = ANYOF_NDIGIT;	break;
9741             case 'v':	namedclass = ANYOF_VERTWS;	break;
9742             case 'V':	namedclass = ANYOF_NVERTWS;	break;
9743             case 'h':	namedclass = ANYOF_HORIZWS;	break;
9744             case 'H':	namedclass = ANYOF_NHORIZWS;	break;
9745             case 'N':  /* Handle \N{NAME} in class */
9746                 {
9747                     const char * const backslash_N_beg = RExC_parse - 2;
9748                     int cp_count;
9749 
9750                     if (! grok_bslash_N(pRExC_state,
9751                                         NULL,      /* No regnode */
9752                                         &value,    /* Yes single value */
9753                                         &cp_count, /* Multiple code pt count */
9754                                         flagp,
9755                                         strict,
9756                                         depth)
9757                     ) {
9758 
9759                         if (*flagp & NEED_UTF8)
9760                             FAIL("panic: grok_bslash_N set NEED_UTF8");
9761 
9762                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9763 
9764                         if (cp_count < 0) {
9765                             vFAIL("\\N in a character class must be a named character: \\N{...}");
9766                         }
9767                         else if (cp_count == 0) {
9768                             ckWARNreg(RExC_parse,
9769                               "Ignoring zero length \\N{} in character class");
9770                         }
9771                         else { /* cp_count > 1 */
9772                             assert(cp_count > 1);
9773                             if (! RExC_in_multi_char_class) {
9774                                 if ( ! allow_mutiple_chars
9775                                     || invert
9776                                     || range
9777                                     || *RExC_parse == '-')
9778                                 {
9779                                     if (strict) {
9780                                         RExC_parse--;
9781                                         vFAIL("\\N{} here is restricted to one character");
9782                                     }
9783                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9784                                     break; /* <value> contains the first code
9785                                               point. Drop out of the switch to
9786                                               process it */
9787                                 }
9788                                 else {
9789                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
9790                                                  RExC_parse - backslash_N_beg);
9791                                     multi_char_matches
9792                                         = add_multi_match(multi_char_matches,
9793                                                           multi_char_N,
9794                                                           cp_count);
9795                                 }
9796                             }
9797                         } /* End of cp_count != 1 */
9798 
9799                         /* This element should not be processed further in this
9800                          * class */
9801                         element_count--;
9802                         value = save_value;
9803                         prevvalue = save_prevvalue;
9804                         continue;   /* Back to top of loop to get next char */
9805                     }
9806 
9807                     /* Here, is a single code point, and <value> contains it */
9808                     unicode_range = TRUE;   /* \N{} are Unicode */
9809                 }
9810                 break;
9811             case 'p':
9812             case 'P':
9813                 {
9814                 char *e;
9815 
9816                 if (RExC_pm_flags & PMf_WILDCARD) {
9817                     RExC_parse_inc_by(1);
9818                     /* diag_listed_as: Use of %s is not allowed in Unicode
9819                        property wildcard subpatterns in regex; marked by <--
9820                        HERE in m/%s/ */
9821                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9822                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9823                 }
9824 
9825                 /* \p means they want Unicode semantics */
9826                 REQUIRE_UNI_RULES(flagp, 0);
9827 
9828                 if (RExC_parse >= RExC_end)
9829                     vFAIL2("Empty \\%c", (U8)value);
9830                 if (*RExC_parse == '{') {
9831                     const U8 c = (U8)value;
9832                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9833                     if (!e) {
9834                         RExC_parse_inc_by(1);
9835                         vFAIL2("Missing right brace on \\%c{}", c);
9836                     }
9837 
9838                     RExC_parse_inc_by(1);
9839 
9840                     /* White space is allowed adjacent to the braces and after
9841                      * any '^', even when not under /x */
9842                     while (isSPACE(*RExC_parse)) {
9843                          RExC_parse_inc_by(1);
9844                     }
9845 
9846                     if (UCHARAT(RExC_parse) == '^') {
9847 
9848                         /* toggle.  (The rhs xor gets the single bit that
9849                          * differs between P and p; the other xor inverts just
9850                          * that bit) */
9851                         value ^= 'P' ^ 'p';
9852 
9853                         RExC_parse_inc_by(1);
9854                         while (isSPACE(*RExC_parse)) {
9855                             RExC_parse_inc_by(1);
9856                         }
9857                     }
9858 
9859                     if (e == RExC_parse)
9860                         vFAIL2("Empty \\%c{}", c);
9861 
9862                     n = e - RExC_parse;
9863                     while (isSPACE(*(RExC_parse + n - 1)))
9864                         n--;
9865 
9866                 }   /* The \p isn't immediately followed by a '{' */
9867                 else if (! isALPHA(*RExC_parse)) {
9868                     RExC_parse_inc_safe();
9869                     vFAIL2("Character following \\%c must be '{' or a "
9870                            "single-character Unicode property name",
9871                            (U8) value);
9872                 }
9873                 else {
9874                     e = RExC_parse;
9875                     n = 1;
9876                 }
9877                 {
9878                     char* name = RExC_parse;
9879 
9880                     /* Any message returned about expanding the definition */
9881                     SV* msg = newSVpvs_flags("", SVs_TEMP);
9882 
9883                     /* If set TRUE, the property is user-defined as opposed to
9884                      * official Unicode */
9885                     bool user_defined = FALSE;
9886                     AV * strings = NULL;
9887 
9888                     SV * prop_definition = parse_uniprop_string(
9889                                             name, n, UTF, FOLD,
9890                                             FALSE, /* This is compile-time */
9891 
9892                                             /* We can't defer this defn when
9893                                              * the full result is required in
9894                                              * this call */
9895                                             ! cBOOL(ret_invlist),
9896 
9897                                             &strings,
9898                                             &user_defined,
9899                                             msg,
9900                                             0 /* Base level */
9901                                            );
9902                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
9903                         assert(prop_definition == NULL);
9904                         RExC_parse_set(e + 1);
9905                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
9906                                                thing so, or else the display is
9907                                                mojibake */
9908                             RExC_utf8 = TRUE;
9909                         }
9910                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9911                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9912                                     SvCUR(msg), SvPVX(msg)));
9913                     }
9914 
9915                     assert(prop_definition || strings);
9916 
9917                     if (strings) {
9918                         if (ret_invlist) {
9919                             if (! prop_definition) {
9920                                 RExC_parse_set(e + 1);
9921                                 vFAIL("Unicode string properties are not implemented in (?[...])");
9922                             }
9923                             else {
9924                                 ckWARNreg(e + 1,
9925                                     "Using just the single character results"
9926                                     " returned by \\p{} in (?[...])");
9927                             }
9928                         }
9929                         else if (! RExC_in_multi_char_class) {
9930                             if (invert ^ (value == 'P')) {
9931                                 RExC_parse_set(e + 1);
9932                                 vFAIL("Inverting a character class which contains"
9933                                     " a multi-character sequence is illegal");
9934                             }
9935 
9936                             /* For each multi-character string ... */
9937                             while (av_count(strings) > 0) {
9938                                 /* ... Each entry is itself an array of code
9939                                 * points. */
9940                                 AV * this_string = (AV *) av_shift( strings);
9941                                 STRLEN cp_count = av_count(this_string);
9942                                 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9943                                 SvPVCLEAR_FRESH(final);
9944 
9945                                 /* Create another string of sequences of \x{...} */
9946                                 while (av_count(this_string) > 0) {
9947                                     SV * character = av_shift(this_string);
9948                                     UV cp = SvUV(character);
9949 
9950                                     if (cp > 255) {
9951                                         REQUIRE_UTF8(flagp);
9952                                     }
9953                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9954                                                                         cp);
9955                                     SvREFCNT_dec_NN(character);
9956                                 }
9957                                 SvREFCNT_dec_NN(this_string);
9958 
9959                                 /* And add that to the list of such things */
9960                                 multi_char_matches
9961                                             = add_multi_match(multi_char_matches,
9962                                                             final,
9963                                                             cp_count);
9964                             }
9965                         }
9966                         SvREFCNT_dec_NN(strings);
9967                     }
9968 
9969                     if (! prop_definition) {    /* If we got only a string,
9970                                                    this iteration didn't really
9971                                                    find a character */
9972                         element_count--;
9973                     }
9974                     else if (! is_invlist(prop_definition)) {
9975 
9976                         /* Here, the definition isn't known, so we have gotten
9977                          * returned a string that will be evaluated if and when
9978                          * encountered at runtime.  We add it to the list of
9979                          * such properties, along with whether it should be
9980                          * complemented or not */
9981                         if (value == 'P') {
9982                             sv_catpvs(listsv, "!");
9983                         }
9984                         else {
9985                             sv_catpvs(listsv, "+");
9986                         }
9987                         sv_catsv(listsv, prop_definition);
9988 
9989                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9990 
9991                         /* We don't know yet what this matches, so have to flag
9992                          * it */
9993                         anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9994                     }
9995                     else {
9996                         assert (prop_definition && is_invlist(prop_definition));
9997 
9998                         /* Here we do have the complete property definition
9999                          *
10000                          * Temporary workaround for [GH #16520].  For this
10001                          * precise input that is in the .t that is failing,
10002                          * load utf8.pm, which is what the test wants, so that
10003                          * that .t passes */
10004                         if (     memEQs(RExC_start, e + 1 - RExC_start,
10005                                         "foo\\p{Alnum}")
10006                             && ! hv_common(GvHVn(PL_incgv),
10007                                            NULL,
10008                                            "utf8.pm", sizeof("utf8.pm") - 1,
10009                                            0, HV_FETCH_ISEXISTS, NULL, 0))
10010                         {
10011                             require_pv("utf8.pm");
10012                         }
10013 
10014                         if (! user_defined &&
10015                             /* We warn on matching an above-Unicode code point
10016                              * if the match would return true, except don't
10017                              * warn for \p{All}, which has exactly one element
10018                              * = 0 */
10019                             (_invlist_contains_cp(prop_definition, 0x110000)
10020                                 && (! (_invlist_len(prop_definition) == 1
10021                                        && *invlist_array(prop_definition) == 0))))
10022                         {
10023                             warn_super = TRUE;
10024                         }
10025 
10026                         /* Invert if asking for the complement */
10027                         if (value == 'P') {
10028                             _invlist_union_complement_2nd(properties,
10029                                                           prop_definition,
10030                                                           &properties);
10031                         }
10032                         else {
10033                             _invlist_union(properties, prop_definition, &properties);
10034                         }
10035                     }
10036                 }
10037 
10038                 RExC_parse_set(e + 1);
10039                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
10040                                                 named */
10041                 }
10042                 break;
10043             case 'n':	value = '\n';			break;
10044             case 'r':	value = '\r';			break;
10045             case 't':	value = '\t';			break;
10046             case 'f':	value = '\f';			break;
10047             case 'b':	value = '\b';			break;
10048             case 'e':	value = ESC_NATIVE;             break;
10049             case 'a':	value = '\a';                   break;
10050             case 'o':
10051                 RExC_parse--;	/* function expects to be pointed at the 'o' */
10052                 if (! grok_bslash_o(&RExC_parse,
10053                                             RExC_end,
10054                                             &value,
10055                                             &message,
10056                                             &packed_warn,
10057                                             strict,
10058                                             cBOOL(range), /* MAX_UV allowed for range
10059                                                       upper limit */
10060                                             UTF))
10061                 {
10062                     vFAIL(message);
10063                 }
10064                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10065                     warn_non_literal_string(RExC_parse, packed_warn, message);
10066                 }
10067 
10068                 if (value < 256) {
10069                     non_portable_endpoint++;
10070                 }
10071                 break;
10072             case 'x':
10073                 RExC_parse--;	/* function expects to be pointed at the 'x' */
10074                 if (!  grok_bslash_x(&RExC_parse,
10075                                             RExC_end,
10076                                             &value,
10077                                             &message,
10078                                             &packed_warn,
10079                                             strict,
10080                                             cBOOL(range), /* MAX_UV allowed for range
10081                                                       upper limit */
10082                                             UTF))
10083                 {
10084                     vFAIL(message);
10085                 }
10086                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10087                     warn_non_literal_string(RExC_parse, packed_warn, message);
10088                 }
10089 
10090                 if (value < 256) {
10091                     non_portable_endpoint++;
10092                 }
10093                 break;
10094             case 'c':
10095                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
10096                                                                 &packed_warn))
10097                 {
10098                     /* going to die anyway; point to exact spot of
10099                         * failure */
10100                     RExC_parse_inc_safe();
10101                     vFAIL(message);
10102                 }
10103 
10104                 value = grok_c_char;
10105                 RExC_parse_inc_by(1);
10106                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10107                     warn_non_literal_string(RExC_parse, packed_warn, message);
10108                 }
10109 
10110                 non_portable_endpoint++;
10111                 break;
10112             case '0': case '1': case '2': case '3': case '4':
10113             case '5': case '6': case '7':
10114                 {
10115                     /* Take 1-3 octal digits */
10116                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10117                               | PERL_SCAN_NOTIFY_ILLDIGIT;
10118                     numlen = (strict) ? 4 : 3;
10119                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10120                     RExC_parse_inc_by(numlen);
10121                     if (numlen != 3) {
10122                         if (strict) {
10123                             RExC_parse_inc_safe();
10124                             vFAIL("Need exactly 3 octal digits");
10125                         }
10126                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10127                                  && RExC_parse < RExC_end
10128                                  && isDIGIT(*RExC_parse)
10129                                  && ckWARN(WARN_REGEXP))
10130                         {
10131                             reg_warn_non_literal_string(
10132                                  RExC_parse + 1,
10133                                  form_alien_digit_msg(8, numlen, RExC_parse,
10134                                                         RExC_end, UTF, FALSE));
10135                         }
10136                     }
10137                     if (value < 256) {
10138                         non_portable_endpoint++;
10139                     }
10140                     break;
10141                 }
10142             default:
10143                 /* Allow \_ to not give an error */
10144                 if (isWORDCHAR(value) && value != '_') {
10145                     if (strict) {
10146                         vFAIL2("Unrecognized escape \\%c in character class",
10147                                (int)value);
10148                     }
10149                     else {
10150                         ckWARN2reg(RExC_parse,
10151                             "Unrecognized escape \\%c in character class passed through",
10152                             (int)value);
10153                     }
10154                 }
10155                 break;
10156             }   /* End of switch on char following backslash */
10157         } /* end of handling backslash escape sequences */
10158 
10159         /* Here, we have the current token in 'value' */
10160 
10161         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10162             U8 classnum;
10163 
10164             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10165              * literal, as is the character that began the false range, i.e.
10166              * the 'a' in the examples */
10167             if (range) {
10168                 const int w = (RExC_parse >= rangebegin)
10169                                 ? RExC_parse - rangebegin
10170                                 : 0;
10171                 if (strict) {
10172                     vFAIL2utf8f(
10173                         "False [] range \"%" UTF8f "\"",
10174                         UTF8fARG(UTF, w, rangebegin));
10175                 }
10176                 else {
10177                     ckWARN2reg(RExC_parse,
10178                         "False [] range \"%" UTF8f "\"",
10179                         UTF8fARG(UTF, w, rangebegin));
10180                     cp_list = add_cp_to_invlist(cp_list, '-');
10181                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10182                                                             prevvalue);
10183                 }
10184 
10185                 range = 0; /* this was not a true range */
10186                 element_count += 2; /* So counts for three values */
10187             }
10188 
10189             classnum = namedclass_to_classnum(namedclass);
10190 
10191             if (LOC && namedclass < ANYOF_POSIXL_MAX
10192 #ifndef HAS_ISASCII
10193                 && classnum != CC_ASCII_
10194 #endif
10195             ) {
10196                 SV* scratch_list = NULL;
10197 
10198                 /* What the Posix classes (like \w, [:space:]) match isn't
10199                  * generally knowable under locale until actual match time.  A
10200                  * special node is used for these which has extra space for a
10201                  * bitmap, with a bit reserved for each named class that is to
10202                  * be matched against.  (This isn't needed for \p{} and
10203                  * pseudo-classes, as they are not affected by locale, and
10204                  * hence are dealt with separately.)  However, if a named class
10205                  * and its complement are both present, then it matches
10206                  * everything, and there is no runtime dependency.  Odd numbers
10207                  * are the complements of the next lower number, so xor works.
10208                  * (Note that something like [\w\D] should match everything,
10209                  * because \d should be a proper subset of \w.  But rather than
10210                  * trust that the locale is well behaved, we leave this to
10211                  * runtime to sort out) */
10212                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10213                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10214                     POSIXL_ZERO(posixl);
10215                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10216                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10217                     continue;   /* We could ignore the rest of the class, but
10218                                    best to parse it for any errors */
10219                 }
10220                 else { /* Here, isn't the complement of any already parsed
10221                           class */
10222                     POSIXL_SET(posixl, namedclass);
10223                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10224                     anyof_flags |= ANYOF_MATCHES_POSIXL;
10225 
10226                     /* The above-Latin1 characters are not subject to locale
10227                      * rules.  Just add them to the unconditionally-matched
10228                      * list */
10229 
10230                     /* Get the list of the above-Latin1 code points this
10231                      * matches */
10232                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10233                                             PL_XPosix_ptrs[classnum],
10234 
10235                                             /* Odd numbers are complements,
10236                                              * like NDIGIT, NASCII, ... */
10237                                             namedclass % 2 != 0,
10238                                             &scratch_list);
10239                     /* Checking if 'cp_list' is NULL first saves an extra
10240                      * clone.  Its reference count will be decremented at the
10241                      * next union, etc, or if this is the only instance, at the
10242                      * end of the routine */
10243                     if (! cp_list) {
10244                         cp_list = scratch_list;
10245                     }
10246                     else {
10247                         _invlist_union(cp_list, scratch_list, &cp_list);
10248                         SvREFCNT_dec_NN(scratch_list);
10249                     }
10250                     continue;   /* Go get next character */
10251                 }
10252             }
10253             else {
10254 
10255                 /* Here, is not /l, or is a POSIX class for which /l doesn't
10256                  * matter (or is a Unicode property, which is skipped here). */
10257                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
10258                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10259 
10260                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
10261                          * nor /l make a difference in what these match,
10262                          * therefore we just add what they match to cp_list. */
10263                         if (classnum != CC_VERTSPACE_) {
10264                             assert(   namedclass == ANYOF_HORIZWS
10265                                    || namedclass == ANYOF_NHORIZWS);
10266 
10267                             /* It turns out that \h is just a synonym for
10268                              * XPosixBlank */
10269                             classnum = CC_BLANK_;
10270                         }
10271 
10272                         _invlist_union_maybe_complement_2nd(
10273                                 cp_list,
10274                                 PL_XPosix_ptrs[classnum],
10275                                 namedclass % 2 != 0,    /* Complement if odd
10276                                                           (NHORIZWS, NVERTWS)
10277                                                         */
10278                                 &cp_list);
10279                     }
10280                 }
10281                 else if (   AT_LEAST_UNI_SEMANTICS
10282                          || classnum == CC_ASCII_
10283                          || (DEPENDS_SEMANTICS && (   classnum == CC_DIGIT_
10284                                                    || classnum == CC_XDIGIT_)))
10285                 {
10286                     /* We usually have to worry about /d affecting what POSIX
10287                      * classes match, with special code needed because we won't
10288                      * know until runtime what all matches.  But there is no
10289                      * extra work needed under /u and /a; and [:ascii:] is
10290                      * unaffected by /d; and :digit: and :xdigit: don't have
10291                      * runtime differences under /d.  So we can special case
10292                      * these, and avoid some extra work below, and at runtime.
10293                      * */
10294                     _invlist_union_maybe_complement_2nd(
10295                                                      simple_posixes,
10296                                                       ((AT_LEAST_ASCII_RESTRICTED)
10297                                                        ? PL_Posix_ptrs[classnum]
10298                                                        : PL_XPosix_ptrs[classnum]),
10299                                                      namedclass % 2 != 0,
10300                                                      &simple_posixes);
10301                 }
10302                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
10303                            complement and use nposixes */
10304                     SV** posixes_ptr = namedclass % 2 == 0
10305                                        ? &posixes
10306                                        : &nposixes;
10307                     _invlist_union_maybe_complement_2nd(
10308                                                      *posixes_ptr,
10309                                                      PL_XPosix_ptrs[classnum],
10310                                                      namedclass % 2 != 0,
10311                                                      posixes_ptr);
10312                 }
10313             }
10314         } /* end of namedclass \blah */
10315 
10316         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10317 
10318         /* If 'range' is set, 'value' is the ending of a range--check its
10319          * validity.  (If value isn't a single code point in the case of a
10320          * range, we should have figured that out above in the code that
10321          * catches false ranges).  Later, we will handle each individual code
10322          * point in the range.  If 'range' isn't set, this could be the
10323          * beginning of a range, so check for that by looking ahead to see if
10324          * the next real character to be processed is the range indicator--the
10325          * minus sign */
10326 
10327         if (range) {
10328 #ifdef EBCDIC
10329             /* For unicode ranges, we have to test that the Unicode as opposed
10330              * to the native values are not decreasing.  (Above 255, there is
10331              * no difference between native and Unicode) */
10332             if (unicode_range && prevvalue < 255 && value < 255) {
10333                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10334                     goto backwards_range;
10335                 }
10336             }
10337             else
10338 #endif
10339             if (prevvalue > value) /* b-a */ {
10340                 int w;
10341 #ifdef EBCDIC
10342               backwards_range:
10343 #endif
10344                 w = RExC_parse - rangebegin;
10345                 vFAIL2utf8f(
10346                     "Invalid [] range \"%" UTF8f "\"",
10347                     UTF8fARG(UTF, w, rangebegin));
10348                 NOT_REACHED; /* NOTREACHED */
10349             }
10350         }
10351         else {
10352             prevvalue = value; /* save the beginning of the potential range */
10353             if (! stop_at_1     /* Can't be a range if parsing just one thing */
10354                 && *RExC_parse == '-')
10355             {
10356                 char* next_char_ptr = RExC_parse + 1;
10357 
10358                 /* Get the next real char after the '-' */
10359                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10360 
10361                 /* If the '-' is at the end of the class (just before the ']',
10362                  * it is a literal minus; otherwise it is a range */
10363                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10364                     RExC_parse_set(next_char_ptr);
10365 
10366                     /* a bad range like \w-, [:word:]- ? */
10367                     if (namedclass > OOB_NAMEDCLASS) {
10368                         if (strict || ckWARN(WARN_REGEXP)) {
10369                             const int w = RExC_parse >= rangebegin
10370                                           ?  RExC_parse - rangebegin
10371                                           : 0;
10372                             if (strict) {
10373                                 vFAIL4("False [] range \"%*.*s\"",
10374                                     w, w, rangebegin);
10375                             }
10376                             else {
10377                                 vWARN4(RExC_parse,
10378                                     "False [] range \"%*.*s\"",
10379                                     w, w, rangebegin);
10380                             }
10381                         }
10382                         cp_list = add_cp_to_invlist(cp_list, '-');
10383                         element_count++;
10384                     } else
10385                         range = 1;	/* yeah, it's a range! */
10386                     continue;	/* but do it the next time */
10387                 }
10388             }
10389         }
10390 
10391         if (namedclass > OOB_NAMEDCLASS) {
10392             continue;
10393         }
10394 
10395         /* Here, we have a single value this time through the loop, and
10396          * <prevvalue> is the beginning of the range, if any; or <value> if
10397          * not. */
10398 
10399         /* non-Latin1 code point implies unicode semantics. */
10400         if (value > 255) {
10401             if (value > MAX_LEGAL_CP && (   value != UV_MAX
10402                                          || prevvalue > MAX_LEGAL_CP))
10403             {
10404                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10405             }
10406             REQUIRE_UNI_RULES(flagp, 0);
10407             if (  ! silence_non_portable
10408                 &&  UNICODE_IS_PERL_EXTENDED(value)
10409                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
10410             {
10411                 ckWARN2_non_literal_string(RExC_parse,
10412                                            packWARN(WARN_PORTABLE),
10413                                            PL_extended_cp_format,
10414                                            value);
10415             }
10416         }
10417 
10418         /* Ready to process either the single value, or the completed range.
10419          * For single-valued non-inverted ranges, we consider the possibility
10420          * of multi-char folds.  (We made a conscious decision to not do this
10421          * for the other cases because it can often lead to non-intuitive
10422          * results.  For example, you have the peculiar case that:
10423          *  "s s" =~ /^[^\xDF]+$/i => Y
10424          *  "ss"  =~ /^[^\xDF]+$/i => N
10425          *
10426          * See [perl #89750] */
10427         if (FOLD && allow_mutiple_chars && value == prevvalue) {
10428             if (    value == LATIN_SMALL_LETTER_SHARP_S
10429                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10430                                                         value)))
10431             {
10432                 /* Here <value> is indeed a multi-char fold.  Get what it is */
10433 
10434                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10435                 STRLEN foldlen;
10436 
10437                 UV folded = _to_uni_fold_flags(
10438                                 value,
10439                                 foldbuf,
10440                                 &foldlen,
10441                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10442                                                    ? FOLD_FLAGS_NOMIX_ASCII
10443                                                    : 0)
10444                                 );
10445 
10446                 /* Here, <folded> should be the first character of the
10447                  * multi-char fold of <value>, with <foldbuf> containing the
10448                  * whole thing.  But, if this fold is not allowed (because of
10449                  * the flags), <fold> will be the same as <value>, and should
10450                  * be processed like any other character, so skip the special
10451                  * handling */
10452                 if (folded != value) {
10453 
10454                     /* Skip if we are recursed, currently parsing the class
10455                      * again.  Otherwise add this character to the list of
10456                      * multi-char folds. */
10457                     if (! RExC_in_multi_char_class) {
10458                         STRLEN cp_count = utf8_length(foldbuf,
10459                                                       foldbuf + foldlen);
10460                         SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10461 
10462                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10463 
10464                         multi_char_matches
10465                                         = add_multi_match(multi_char_matches,
10466                                                           multi_fold,
10467                                                           cp_count);
10468 
10469                     }
10470 
10471                     /* This element should not be processed further in this
10472                      * class */
10473                     element_count--;
10474                     value = save_value;
10475                     prevvalue = save_prevvalue;
10476                     continue;
10477                 }
10478             }
10479         }
10480 
10481         if (strict && ckWARN(WARN_REGEXP)) {
10482             if (range) {
10483 
10484                 /* If the range starts above 255, everything is portable and
10485                  * likely to be so for any forseeable character set, so don't
10486                  * warn. */
10487                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10488                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10489                 }
10490                 else if (prevvalue != value) {
10491 
10492                     /* Under strict, ranges that stop and/or end in an ASCII
10493                      * printable should have each end point be a portable value
10494                      * for it (preferably like 'A', but we don't warn if it is
10495                      * a (portable) Unicode name or code point), and the range
10496                      * must be all digits or all letters of the same case.
10497                      * Otherwise, the range is non-portable and unclear as to
10498                      * what it contains */
10499                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
10500                         && (          non_portable_endpoint
10501                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10502                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
10503                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
10504                     ))) {
10505                         vWARN(RExC_parse, "Ranges of ASCII printables should"
10506                                           " be some subset of \"0-9\","
10507                                           " \"A-Z\", or \"a-z\"");
10508                     }
10509                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10510                         SSize_t index_start;
10511                         SSize_t index_final;
10512 
10513                         /* But the nature of Unicode and languages mean we
10514                          * can't do the same checks for above-ASCII ranges,
10515                          * except in the case of digit ones.  These should
10516                          * contain only digits from the same group of 10.  The
10517                          * ASCII case is handled just above.  Hence here, the
10518                          * range could be a range of digits.  First some
10519                          * unlikely special cases.  Grandfather in that a range
10520                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10521                          * if its starting value is one of the 10 digits prior
10522                          * to it.  This is because it is an alternate way of
10523                          * writing 19D1, and some people may expect it to be in
10524                          * that group.  But it is bad, because it won't give
10525                          * the expected results.  In Unicode 5.2 it was
10526                          * considered to be in that group (of 11, hence), but
10527                          * this was fixed in the next version */
10528 
10529                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10530                             goto warn_bad_digit_range;
10531                         }
10532                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
10533                                           &&     value <= 0x1D7FF))
10534                         {
10535                             /* This is the only other case currently in Unicode
10536                              * where the algorithm below fails.  The code
10537                              * points just above are the end points of a single
10538                              * range containing only decimal digits.  It is 5
10539                              * different series of 0-9.  All other ranges of
10540                              * digits currently in Unicode are just a single
10541                              * series.  (And mktables will notify us if a later
10542                              * Unicode version breaks this.)
10543                              *
10544                              * If the range being checked is at most 9 long,
10545                              * and the digit values represented are in
10546                              * numerical order, they are from the same series.
10547                              * */
10548                             if (         value - prevvalue > 9
10549                                 ||    (((    value - 0x1D7CE) % 10)
10550                                      <= (prevvalue - 0x1D7CE) % 10))
10551                             {
10552                                 goto warn_bad_digit_range;
10553                             }
10554                         }
10555                         else {
10556 
10557                             /* For all other ranges of digits in Unicode, the
10558                              * algorithm is just to check if both end points
10559                              * are in the same series, which is the same range.
10560                              * */
10561                             index_start = _invlist_search(
10562                                                     PL_XPosix_ptrs[CC_DIGIT_],
10563                                                     prevvalue);
10564 
10565                             /* Warn if the range starts and ends with a digit,
10566                              * and they are not in the same group of 10. */
10567                             if (   index_start >= 0
10568                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10569                                 && (index_final =
10570                                     _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10571                                                     value)) != index_start
10572                                 && index_final >= 0
10573                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10574                             {
10575                               warn_bad_digit_range:
10576                                 vWARN(RExC_parse, "Ranges of digits should be"
10577                                                   " from the same group of"
10578                                                   " 10");
10579                             }
10580                         }
10581                     }
10582                 }
10583             }
10584             if ((! range || prevvalue == value) && non_portable_endpoint) {
10585                 if (isPRINT_A(value)) {
10586                     char literal[3];
10587                     unsigned d = 0;
10588                     if (isBACKSLASHED_PUNCT(value)) {
10589                         literal[d++] = '\\';
10590                     }
10591                     literal[d++] = (char) value;
10592                     literal[d++] = '\0';
10593 
10594                     vWARN4(RExC_parse,
10595                            "\"%.*s\" is more clearly written simply as \"%s\"",
10596                            (int) (RExC_parse - rangebegin),
10597                            rangebegin,
10598                            literal
10599                         );
10600                 }
10601                 else if (isMNEMONIC_CNTRL(value)) {
10602                     vWARN4(RExC_parse,
10603                            "\"%.*s\" is more clearly written simply as \"%s\"",
10604                            (int) (RExC_parse - rangebegin),
10605                            rangebegin,
10606                            cntrl_to_mnemonic((U8) value)
10607                         );
10608                 }
10609             }
10610         }
10611 
10612         /* Deal with this element of the class */
10613 
10614 #ifndef EBCDIC
10615         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10616                                                     prevvalue, value);
10617 #else
10618         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10619          * that don't require special handling, we can just add the range like
10620          * we do for ASCII platforms */
10621         if ((UNLIKELY(prevvalue == 0) && value >= 255)
10622             || ! (prevvalue < 256
10623                     && (unicode_range
10624                         || (! non_portable_endpoint
10625                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10626                                 || (isUPPER_A(prevvalue)
10627                                     && isUPPER_A(value)))))))
10628         {
10629             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10630                                                         prevvalue, value);
10631         }
10632         else {
10633             /* Here, requires special handling.  This can be because it is a
10634              * range whose code points are considered to be Unicode, and so
10635              * must be individually translated into native, or because its a
10636              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10637              * EBCDIC, but we have defined them to include only the "expected"
10638              * upper or lower case ASCII alphabetics.  Subranges above 255 are
10639              * the same in native and Unicode, so can be added as a range */
10640             U8 start = NATIVE_TO_LATIN1(prevvalue);
10641             unsigned j;
10642             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10643             for (j = start; j <= end; j++) {
10644                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10645             }
10646             if (value > 255) {
10647                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10648                                                             256, value);
10649             }
10650         }
10651 #endif
10652 
10653         range = 0; /* this range (if it was one) is done now */
10654     } /* End of loop through all the text within the brackets */
10655 
10656     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10657         output_posix_warnings(pRExC_state, posix_warnings);
10658     }
10659 
10660     /* If anything in the class expands to more than one character, we have to
10661      * deal with them by building up a substitute parse string, and recursively
10662      * calling reg() on it, instead of proceeding */
10663     if (multi_char_matches) {
10664         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10665         I32 cp_count;
10666         STRLEN len;
10667         char *save_end = RExC_end;
10668         char *save_parse = RExC_parse;
10669         char *save_start = RExC_start;
10670         Size_t constructed_prefix_len = 0; /* This gives the length of the
10671                                               constructed portion of the
10672                                               substitute parse. */
10673         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
10674                                        a "|" */
10675         I32 reg_flags;
10676 
10677         assert(! invert);
10678         /* Only one level of recursion allowed */
10679         assert(RExC_copy_start_in_constructed == RExC_precomp);
10680 
10681 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
10682            because too confusing */
10683         if (invert) {
10684             sv_catpvs(substitute_parse, "(?:");
10685         }
10686 #endif
10687 
10688         /* Look at the longest strings first */
10689         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10690                         cp_count > 0;
10691                         cp_count--)
10692         {
10693 
10694             if (av_exists(multi_char_matches, cp_count)) {
10695                 AV** this_array_ptr;
10696                 SV* this_sequence;
10697 
10698                 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10699                                                  cp_count, FALSE);
10700                 while ((this_sequence = av_pop(*this_array_ptr)) !=
10701                                                                 &PL_sv_undef)
10702                 {
10703                     if (! first_time) {
10704                         sv_catpvs(substitute_parse, "|");
10705                     }
10706                     first_time = FALSE;
10707 
10708                     sv_catpv(substitute_parse, SvPVX(this_sequence));
10709                 }
10710             }
10711         }
10712 
10713         /* If the character class contains anything else besides these
10714          * multi-character strings, have to include it in recursive parsing */
10715         if (element_count) {
10716             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10717 
10718             sv_catpvs(substitute_parse, "|");
10719             if (has_l_bracket) {    /* Add an [ if the original had one */
10720                 sv_catpvs(substitute_parse, "[");
10721             }
10722             constructed_prefix_len = SvCUR(substitute_parse);
10723             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10724 
10725             /* Put in a closing ']' to match any opening one, but not if going
10726              * off the end, as otherwise we are adding something that really
10727              * isn't there */
10728             if (has_l_bracket && RExC_parse < RExC_end) {
10729                 sv_catpvs(substitute_parse, "]");
10730             }
10731         }
10732 
10733         sv_catpvs(substitute_parse, ")");
10734 #if 0
10735         if (invert) {
10736             /* This is a way to get the parse to skip forward a whole named
10737              * sequence instead of matching the 2nd character when it fails the
10738              * first */
10739             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10740         }
10741 #endif
10742 
10743         /* Set up the data structure so that any errors will be properly
10744          * reported.  See the comments at the definition of
10745          * REPORT_LOCATION_ARGS for details */
10746         RExC_copy_start_in_input = (char *) orig_parse;
10747         RExC_start = SvPV(substitute_parse, len);
10748         RExC_parse_set( RExC_start );
10749         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10750         RExC_end = RExC_parse + len;
10751         RExC_in_multi_char_class = 1;
10752 
10753         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
10754 
10755         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10756 
10757         /* And restore so can parse the rest of the pattern */
10758         RExC_parse_set(save_parse);
10759         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10760         RExC_end = save_end;
10761         RExC_in_multi_char_class = 0;
10762         SvREFCNT_dec_NN(multi_char_matches);
10763         SvREFCNT_dec(properties);
10764         SvREFCNT_dec(cp_list);
10765         SvREFCNT_dec(simple_posixes);
10766         SvREFCNT_dec(posixes);
10767         SvREFCNT_dec(nposixes);
10768         SvREFCNT_dec(cp_foldable_list);
10769         return ret;
10770     }
10771 
10772     /* If folding, we calculate all characters that could fold to or from the
10773      * ones already on the list */
10774     if (cp_foldable_list) {
10775         if (FOLD) {
10776             UV start, end;	/* End points of code point ranges */
10777 
10778             SV* fold_intersection = NULL;
10779             SV** use_list;
10780 
10781             /* Our calculated list will be for Unicode rules.  For locale
10782              * matching, we have to keep a separate list that is consulted at
10783              * runtime only when the locale indicates Unicode rules (and we
10784              * don't include potential matches in the ASCII/Latin1 range, as
10785              * any code point could fold to any other, based on the run-time
10786              * locale).   For non-locale, we just use the general list */
10787             if (LOC) {
10788                 use_list = &only_utf8_locale_list;
10789             }
10790             else {
10791                 use_list = &cp_list;
10792             }
10793 
10794             /* Only the characters in this class that participate in folds need
10795              * be checked.  Get the intersection of this class and all the
10796              * possible characters that are foldable.  This can quickly narrow
10797              * down a large class */
10798             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10799                                   &fold_intersection);
10800 
10801             /* Now look at the foldable characters in this class individually */
10802             invlist_iterinit(fold_intersection);
10803             while (invlist_iternext(fold_intersection, &start, &end)) {
10804                 UV j;
10805                 UV folded;
10806 
10807                 /* Look at every character in the range */
10808                 for (j = start; j <= end; j++) {
10809                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10810                     STRLEN foldlen;
10811                     unsigned int k;
10812                     Size_t folds_count;
10813                     U32 first_fold;
10814                     const U32 * remaining_folds;
10815 
10816                     if (j < 256) {
10817 
10818                         /* Under /l, we don't know what code points below 256
10819                          * fold to, except we do know the MICRO SIGN folds to
10820                          * an above-255 character if the locale is UTF-8, so we
10821                          * add it to the special list (in *use_list)  Otherwise
10822                          * we know now what things can match, though some folds
10823                          * are valid under /d only if the target is UTF-8.
10824                          * Those go in a separate list */
10825                         if (      IS_IN_SOME_FOLD_L1(j)
10826                             && ! (LOC && j != MICRO_SIGN))
10827                         {
10828 
10829                             /* ASCII is always matched; non-ASCII is matched
10830                              * only under Unicode rules (which could happen
10831                              * under /l if the locale is a UTF-8 one */
10832                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10833                                 *use_list = add_cp_to_invlist(*use_list,
10834                                                             PL_fold_latin1[j]);
10835                             }
10836                             else if (j != PL_fold_latin1[j]) {
10837                                 upper_latin1_only_utf8_matches
10838                                         = add_cp_to_invlist(
10839                                                 upper_latin1_only_utf8_matches,
10840                                                 PL_fold_latin1[j]);
10841                             }
10842                         }
10843 
10844                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10845                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10846                         {
10847                             add_above_Latin1_folds(pRExC_state,
10848                                                    (U8) j,
10849                                                    use_list);
10850                         }
10851                         continue;
10852                     }
10853 
10854                     /* Here is an above Latin1 character.  We don't have the
10855                      * rules hard-coded for it.  First, get its fold.  This is
10856                      * the simple fold, as the multi-character folds have been
10857                      * handled earlier and separated out */
10858                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10859                                                         (ASCII_FOLD_RESTRICTED)
10860                                                         ? FOLD_FLAGS_NOMIX_ASCII
10861                                                         : 0);
10862 
10863                     /* Single character fold of above Latin1.  Add everything
10864                      * in its fold closure to the list that this node should
10865                      * match. */
10866                     folds_count = _inverse_folds(folded, &first_fold,
10867                                                     &remaining_folds);
10868                     for (k = 0; k <= folds_count; k++) {
10869                         UV c = (k == 0)     /* First time through use itself */
10870                                 ? folded
10871                                 : (k == 1)  /* 2nd time use, the first fold */
10872                                    ? first_fold
10873 
10874                                      /* Then the remaining ones */
10875                                    : remaining_folds[k-2];
10876 
10877                         /* /aa doesn't allow folds between ASCII and non- */
10878                         if ((   ASCII_FOLD_RESTRICTED
10879                             && (isASCII(c) != isASCII(j))))
10880                         {
10881                             continue;
10882                         }
10883 
10884                         /* Folds under /l which cross the 255/256 boundary are
10885                          * added to a separate list.  (These are valid only
10886                          * when the locale is UTF-8.) */
10887                         if (c < 256 && LOC) {
10888                             *use_list = add_cp_to_invlist(*use_list, c);
10889                             continue;
10890                         }
10891 
10892                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10893                         {
10894                             cp_list = add_cp_to_invlist(cp_list, c);
10895                         }
10896                         else {
10897                             /* Similarly folds involving non-ascii Latin1
10898                              * characters under /d are added to their list */
10899                             upper_latin1_only_utf8_matches
10900                                     = add_cp_to_invlist(
10901                                                 upper_latin1_only_utf8_matches,
10902                                                 c);
10903                         }
10904                     }
10905                 }
10906             }
10907             SvREFCNT_dec_NN(fold_intersection);
10908         }
10909 
10910         /* Now that we have finished adding all the folds, there is no reason
10911          * to keep the foldable list separate */
10912         _invlist_union(cp_list, cp_foldable_list, &cp_list);
10913         SvREFCNT_dec_NN(cp_foldable_list);
10914     }
10915 
10916     /* And combine the result (if any) with any inversion lists from posix
10917      * classes.  The lists are kept separate up to now because we don't want to
10918      * fold the classes */
10919     if (simple_posixes) {   /* These are the classes known to be unaffected by
10920                                /a, /aa, and /d */
10921         if (cp_list) {
10922             _invlist_union(cp_list, simple_posixes, &cp_list);
10923             SvREFCNT_dec_NN(simple_posixes);
10924         }
10925         else {
10926             cp_list = simple_posixes;
10927         }
10928     }
10929     if (posixes || nposixes) {
10930         if (! DEPENDS_SEMANTICS) {
10931 
10932             /* For everything but /d, we can just add the current 'posixes' and
10933              * 'nposixes' to the main list */
10934             if (posixes) {
10935                 if (cp_list) {
10936                     _invlist_union(cp_list, posixes, &cp_list);
10937                     SvREFCNT_dec_NN(posixes);
10938                 }
10939                 else {
10940                     cp_list = posixes;
10941                 }
10942             }
10943             if (nposixes) {
10944                 if (cp_list) {
10945                     _invlist_union(cp_list, nposixes, &cp_list);
10946                     SvREFCNT_dec_NN(nposixes);
10947                 }
10948                 else {
10949                     cp_list = nposixes;
10950                 }
10951             }
10952         }
10953         else {
10954             /* Under /d, things like \w match upper Latin1 characters only if
10955              * the target string is in UTF-8.  But things like \W match all the
10956              * upper Latin1 characters if the target string is not in UTF-8.
10957              *
10958              * Handle the case with something like \W separately */
10959             if (nposixes) {
10960                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10961 
10962                 /* A complemented posix class matches all upper Latin1
10963                  * characters if not in UTF-8.  And it matches just certain
10964                  * ones when in UTF-8.  That means those certain ones are
10965                  * matched regardless, so can just be added to the
10966                  * unconditional list */
10967                 if (cp_list) {
10968                     _invlist_union(cp_list, nposixes, &cp_list);
10969                     SvREFCNT_dec_NN(nposixes);
10970                     nposixes = NULL;
10971                 }
10972                 else {
10973                     cp_list = nposixes;
10974                 }
10975 
10976                 /* Likewise for 'posixes' */
10977                 _invlist_union(posixes, cp_list, &cp_list);
10978                 SvREFCNT_dec(posixes);
10979 
10980                 /* Likewise for anything else in the range that matched only
10981                  * under UTF-8 */
10982                 if (upper_latin1_only_utf8_matches) {
10983                     _invlist_union(cp_list,
10984                                    upper_latin1_only_utf8_matches,
10985                                    &cp_list);
10986                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10987                     upper_latin1_only_utf8_matches = NULL;
10988                 }
10989 
10990                 /* If we don't match all the upper Latin1 characters regardless
10991                  * of UTF-8ness, we have to set a flag to match the rest when
10992                  * not in UTF-8 */
10993                 _invlist_subtract(only_non_utf8_list, cp_list,
10994                                   &only_non_utf8_list);
10995                 if (_invlist_len(only_non_utf8_list) != 0) {
10996                     anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10997                 }
10998                 SvREFCNT_dec_NN(only_non_utf8_list);
10999             }
11000             else {
11001                 /* Here there were no complemented posix classes.  That means
11002                  * the upper Latin1 characters in 'posixes' match only when the
11003                  * target string is in UTF-8.  So we have to add them to the
11004                  * list of those types of code points, while adding the
11005                  * remainder to the unconditional list.
11006                  *
11007                  * First calculate what they are */
11008                 SV* nonascii_but_latin1_properties = NULL;
11009                 _invlist_intersection(posixes, PL_UpperLatin1,
11010                                       &nonascii_but_latin1_properties);
11011 
11012                 /* And add them to the final list of such characters. */
11013                 _invlist_union(upper_latin1_only_utf8_matches,
11014                                nonascii_but_latin1_properties,
11015                                &upper_latin1_only_utf8_matches);
11016 
11017                 /* Remove them from what now becomes the unconditional list */
11018                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
11019                                   &posixes);
11020 
11021                 /* And add those unconditional ones to the final list */
11022                 if (cp_list) {
11023                     _invlist_union(cp_list, posixes, &cp_list);
11024                     SvREFCNT_dec_NN(posixes);
11025                     posixes = NULL;
11026                 }
11027                 else {
11028                     cp_list = posixes;
11029                 }
11030 
11031                 SvREFCNT_dec(nonascii_but_latin1_properties);
11032 
11033                 /* Get rid of any characters from the conditional list that we
11034                  * now know are matched unconditionally, which may make that
11035                  * list empty */
11036                 _invlist_subtract(upper_latin1_only_utf8_matches,
11037                                   cp_list,
11038                                   &upper_latin1_only_utf8_matches);
11039                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
11040                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11041                     upper_latin1_only_utf8_matches = NULL;
11042                 }
11043             }
11044         }
11045     }
11046 
11047     /* And combine the result (if any) with any inversion list from properties.
11048      * The lists are kept separate up to now so that we can distinguish the two
11049      * in regards to matching above-Unicode.  A run-time warning is generated
11050      * if a Unicode property is matched against a non-Unicode code point. But,
11051      * we allow user-defined properties to match anything, without any warning,
11052      * and we also suppress the warning if there is a portion of the character
11053      * class that isn't a Unicode property, and which matches above Unicode, \W
11054      * or [\x{110000}] for example.
11055      * (Note that in this case, unlike the Posix one above, there is no
11056      * <upper_latin1_only_utf8_matches>, because having a Unicode property
11057      * forces Unicode semantics */
11058     if (properties) {
11059         if (cp_list) {
11060 
11061             /* If it matters to the final outcome, see if a non-property
11062              * component of the class matches above Unicode.  If so, the
11063              * warning gets suppressed.  This is true even if just a single
11064              * such code point is specified, as, though not strictly correct if
11065              * another such code point is matched against, the fact that they
11066              * are using above-Unicode code points indicates they should know
11067              * the issues involved */
11068             if (warn_super) {
11069                 warn_super = ! (invert
11070                                ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
11071             }
11072 
11073             _invlist_union(properties, cp_list, &cp_list);
11074             SvREFCNT_dec_NN(properties);
11075         }
11076         else {
11077             cp_list = properties;
11078         }
11079 
11080         if (warn_super) {
11081             anyof_flags |= ANYOF_WARN_SUPER__shared;
11082 
11083             /* Because an ANYOF node is the only one that warns, this node
11084              * can't be optimized into something else */
11085             optimizable = FALSE;
11086         }
11087     }
11088 
11089     /* Here, we have calculated what code points should be in the character
11090      * class.
11091      *
11092      * Now we can see about various optimizations.  Fold calculation (which we
11093      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11094      * would invert to include K, which under /i would match k, which it
11095      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
11096      * folded until runtime */
11097 
11098     /* If we didn't do folding, it's because some information isn't available
11099      * until runtime; set the run-time fold flag for these  We know to set the
11100      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11101      * at least one 0-255 range code point */
11102     if (LOC && FOLD) {
11103 
11104         /* Some things on the list might be unconditionally included because of
11105          * other components.  Remove them, and clean up the list if it goes to
11106          * 0 elements */
11107         if (only_utf8_locale_list && cp_list) {
11108             _invlist_subtract(only_utf8_locale_list, cp_list,
11109                               &only_utf8_locale_list);
11110 
11111             if (_invlist_len(only_utf8_locale_list) == 0) {
11112                 SvREFCNT_dec_NN(only_utf8_locale_list);
11113                 only_utf8_locale_list = NULL;
11114             }
11115         }
11116         if (    only_utf8_locale_list
11117             || (    cp_list
11118                 && (   _invlist_contains_cp(cp_list,
11119                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11120                     || _invlist_contains_cp(cp_list,
11121                                             LATIN_SMALL_LETTER_DOTLESS_I))))
11122         {
11123             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11124             anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11125         }
11126         else if (cp_list && invlist_lowest(cp_list) < 256) {
11127             /* If nothing is below 256, has no locale dependency; otherwise it
11128              * does */
11129             anyof_flags |= ANYOFL_FOLD;
11130             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11131 
11132             /* In a Turkish locale these could match, notify the run-time code
11133              * to check for that */
11134             if (   _invlist_contains_cp(cp_list, 'I')
11135                 || _invlist_contains_cp(cp_list, 'i'))
11136             {
11137                 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11138             }
11139         }
11140     }
11141     else if (   DEPENDS_SEMANTICS
11142              && (    upper_latin1_only_utf8_matches
11143                  || (  anyof_flags
11144                      & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11145     {
11146         RExC_seen_d_op = TRUE;
11147         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11148     }
11149 
11150     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11151      * compile time. */
11152     if (     cp_list
11153         &&   invert
11154         && ! has_runtime_dependency)
11155     {
11156         _invlist_invert(cp_list);
11157 
11158         /* Clear the invert flag since have just done it here */
11159         invert = FALSE;
11160     }
11161 
11162     /* All possible optimizations below still have these characteristics.
11163      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11164      * routine) */
11165     *flagp |= HASWIDTH|SIMPLE;
11166 
11167     if (ret_invlist) {
11168         *ret_invlist = cp_list;
11169 
11170         return (cp_list) ? RExC_emit : 0;
11171     }
11172 
11173     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11174         RExC_contains_locale = 1;
11175     }
11176 
11177     if (optimizable) {
11178 
11179         /* Some character classes are equivalent to other nodes.  Such nodes
11180          * take up less room, and some nodes require fewer operations to
11181          * execute, than ANYOF nodes.  EXACTish nodes may be joinable with
11182          * adjacent nodes to improve efficiency. */
11183         op = optimize_regclass(pRExC_state, cp_list,
11184                                             only_utf8_locale_list,
11185                                             upper_latin1_only_utf8_matches,
11186                                             has_runtime_dependency,
11187                                             posixl,
11188                                             &anyof_flags, &invert, &ret, flagp);
11189         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11190 
11191         /* If optimized to something else and emitted, clean up and return */
11192         if (ret >= 0) {
11193             SvREFCNT_dec(cp_list);
11194             SvREFCNT_dec(only_utf8_locale_list);
11195             SvREFCNT_dec(upper_latin1_only_utf8_matches);
11196             return ret;
11197         }
11198 
11199         /* If no optimization was found, an END was returned and we will now
11200          * emit an ANYOF */
11201         if (op == END) {
11202             op = ANYOF;
11203         }
11204     }
11205 
11206     /* Here are going to emit an ANYOF; set the particular type */
11207     if (op == ANYOF) {
11208         if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11209             op = ANYOFD;
11210         }
11211         else if (posixl) {
11212             op = ANYOFPOSIXL;
11213         }
11214         else if (LOC) {
11215             op = ANYOFL;
11216         }
11217     }
11218 
11219     ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11220     FILL_NODE(ret, op);        /* We set the argument later */
11221     RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11222     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11223 
11224     /* Here, <cp_list> contains all the code points we can determine at
11225      * compile time that match under all conditions.  Go through it, and
11226      * for things that belong in the bitmap, put them there, and delete from
11227      * <cp_list>.  While we are at it, see if everything above 255 is in the
11228      * list, and if so, set a flag to speed up execution */
11229 
11230     populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11231 
11232     if (posixl) {
11233         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11234     }
11235 
11236     if (invert) {
11237         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11238     }
11239 
11240     /* Here, the bitmap has been populated with all the Latin1 code points that
11241      * always match.  Can now add to the overall list those that match only
11242      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11243      * */
11244     if (upper_latin1_only_utf8_matches) {
11245         if (cp_list) {
11246             _invlist_union(cp_list,
11247                            upper_latin1_only_utf8_matches,
11248                            &cp_list);
11249             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11250         }
11251         else {
11252             cp_list = upper_latin1_only_utf8_matches;
11253         }
11254         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11255     }
11256 
11257     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11258                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11259                    ? listsv
11260                    : NULL,
11261                   only_utf8_locale_list);
11262 
11263     SvREFCNT_dec(cp_list);
11264     SvREFCNT_dec(only_utf8_locale_list);
11265     return ret;
11266 }
11267 
11268 STATIC U8
S_optimize_regclass(pTHX_ RExC_state_t * pRExC_state,SV * cp_list,SV * only_utf8_locale_list,SV * upper_latin1_only_utf8_matches,const U32 has_runtime_dependency,const U32 posixl,U8 * anyof_flags,bool * invert,regnode_offset * ret,I32 * flagp)11269 S_optimize_regclass(pTHX_
11270                     RExC_state_t *pRExC_state,
11271                     SV * cp_list,
11272                     SV* only_utf8_locale_list,
11273                     SV* upper_latin1_only_utf8_matches,
11274                     const U32 has_runtime_dependency,
11275                     const U32 posixl,
11276                     U8  * anyof_flags,
11277                     bool * invert,
11278                     regnode_offset * ret,
11279                     I32 *flagp
11280                   )
11281 {
11282     /* This function exists just to make S_regclass() smaller.  It extracts out
11283      * the code that looks for potential optimizations away from a full generic
11284      * ANYOF node.  The parameter names are the same as the corresponding
11285      * variables in S_regclass.
11286      *
11287      * It returns the new op (the impossible END one if no optimization found)
11288      * and sets *ret to any created regnode.  If the new op is sufficiently
11289      * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11290      *
11291      * Certain of the parameters may be updated as a result of the changes
11292      * herein */
11293 
11294     U8 op = END;    /* The returned node-type, initialized to an impossible
11295                       one. */
11296     UV value = 0;
11297     PERL_UINT_FAST8_T i;
11298     UV partial_cp_count = 0;
11299     UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11300     UV   end[MAX_FOLD_FROMS+1] = { 0 };
11301     bool single_range = FALSE;
11302     UV lowest_cp = 0, highest_cp = 0;
11303 
11304     PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11305 
11306     if (cp_list) { /* Count the code points in enough ranges that we would see
11307                       all the ones possible in any fold in this version of
11308                       Unicode */
11309 
11310         invlist_iterinit(cp_list);
11311         for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11312             if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11313                 break;
11314             }
11315             partial_cp_count += end[i] - start[i] + 1;
11316         }
11317 
11318         if (i == 1) {
11319             single_range = TRUE;
11320         }
11321         invlist_iterfinish(cp_list);
11322 
11323         /* If we know at compile time that this matches every possible code
11324          * point, any run-time dependencies don't matter */
11325         if (start[0] == 0 && end[0] == UV_MAX) {
11326             if (*invert) {
11327                 goto return_OPFAIL;
11328             }
11329             else {
11330                 goto return_SANY;
11331             }
11332         }
11333 
11334         /* Use a clearer mnemonic for below */
11335         lowest_cp = start[0];
11336 
11337         highest_cp = invlist_highest(cp_list);
11338     }
11339 
11340     /* Similarly, for /l posix classes, if both a class and its complement
11341      * match, any run-time dependencies don't matter */
11342     if (posixl) {
11343         int namedclass;
11344         for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11345             if (   POSIXL_TEST(posixl, namedclass)      /* class */
11346                 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11347             {
11348                 if (*invert) {
11349                     goto return_OPFAIL;
11350                 }
11351                 goto return_SANY;
11352             }
11353         }
11354 
11355         /* For well-behaved locales, some classes are subsets of others, so
11356          * complementing the subset and including the non-complemented superset
11357          * should match everything, like [\D[:alnum:]], and
11358          * [[:^alpha:][:alnum:]], but some implementations of locales are
11359          * buggy, and khw thinks its a bad idea to have optimization change
11360          * behavior, even if it avoids an OS bug in a given case */
11361 
11362 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11363 
11364         /* If is a single posix /l class, can optimize to just that op.  Such a
11365          * node will not match anything in the Latin1 range, as that is not
11366          * determinable until runtime, but will match whatever the class does
11367          * outside that range.  (Note that some classes won't match anything
11368          * outside the range, like [:ascii:]) */
11369         if (   isSINGLE_BIT_SET(posixl)
11370             && (partial_cp_count == 0 || lowest_cp > 255))
11371         {
11372             U8 classnum;
11373             SV * class_above_latin1 = NULL;
11374             bool already_inverted;
11375             bool are_equivalent;
11376 
11377 
11378             namedclass = single_1bit_pos32(posixl);
11379             classnum = namedclass_to_classnum(namedclass);
11380 
11381             /* The named classes are such that the inverted number is one
11382              * larger than the non-inverted one */
11383             already_inverted = namedclass - classnum_to_namedclass(classnum);
11384 
11385             /* Create an inversion list of the official property, inverted if
11386              * the constructed node list is inverted, and restricted to only
11387              * the above latin1 code points, which are the only ones known at
11388              * compile time */
11389             _invlist_intersection_maybe_complement_2nd(
11390                                                 PL_AboveLatin1,
11391                                                 PL_XPosix_ptrs[classnum],
11392                                                 already_inverted,
11393                                                 &class_above_latin1);
11394             are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11395             SvREFCNT_dec_NN(class_above_latin1);
11396 
11397             if (are_equivalent) {
11398 
11399                 /* Resolve the run-time inversion flag with this possibly
11400                  * inverted class */
11401                 *invert = *invert ^ already_inverted;
11402 
11403                 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11404                 *ret = reg_node(pRExC_state, op);
11405                 FLAGS(REGNODE_p(*ret)) = classnum;
11406                 return op;
11407             }
11408         }
11409     }
11410 
11411     /* khw can't think of any other possible transformation involving these. */
11412     if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11413         return END;
11414     }
11415 
11416     if (! has_runtime_dependency) {
11417 
11418         /* If the list is empty, nothing matches.  This happens, for example,
11419          * when a Unicode property that doesn't match anything is the only
11420          * element in the character class (perluniprops.pod notes such
11421          * properties). */
11422         if (partial_cp_count == 0) {
11423             if (*invert) {
11424                 goto return_SANY;
11425             }
11426             else {
11427                 goto return_OPFAIL;
11428             }
11429         }
11430 
11431         /* If matches everything but \n */
11432         if (   start[0] == 0 && end[0] == '\n' - 1
11433             && start[1] == '\n' + 1 && end[1] == UV_MAX)
11434         {
11435             assert (! *invert);
11436             op = REG_ANY;
11437             *ret = reg_node(pRExC_state, op);
11438             MARK_NAUGHTY(1);
11439             return op;
11440         }
11441     }
11442 
11443     /* Next see if can optimize classes that contain just a few code points
11444      * into an EXACTish node.  The reason to do this is to let the optimizer
11445      * join this node with adjacent EXACTish ones, and ANYOF nodes require
11446      * runtime conversion to code point from UTF-8, which we'd like to avoid.
11447      *
11448      * An EXACTFish node can be generated even if not under /i, and vice versa.
11449      * But care must be taken.  An EXACTFish node has to be such that it only
11450      * matches precisely the code points in the class, but we want to generate
11451      * the least restrictive one that does that, to increase the odds of being
11452      * able to join with an adjacent node.  For example, if the class contains
11453      * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11454      * from matching.  Whether we are under /i or not is irrelevant in this
11455      * case.  Less obvious is the pattern qr/[\x{02BC}]n/i.  U+02BC is MODIFIER
11456      * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11457      * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE.  And so even though there
11458      * is no simple fold that includes \X{02BC}, there is a multi-char fold
11459      * that does, and so the node generated for it must be an EXACTFish one.
11460      * On the other hand qr/:/i should generate a plain EXACT node since the
11461      * colon participates in no fold whatsoever, and having it be EXACT tells
11462      * the optimizer the target string cannot match unless it has a colon in
11463      * it. */
11464     if (   ! posixl
11465         && ! *invert
11466 
11467             /* Only try if there are no more code points in the class than in
11468              * the max possible fold */
11469         &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11470     {
11471         /* We can always make a single code point class into an EXACTish node.
11472          * */
11473         if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11474             if (LOC) {
11475 
11476                 /* Here is /l:  Use EXACTL, except if there is a fold not known
11477                  * until runtime so shows as only a single code point here.
11478                  * For code points above 255, we know which can cause problems
11479                  * by having a potential fold to the Latin1 range. */
11480                 if (  ! FOLD
11481                     || (     lowest_cp > 255
11482                         && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11483                 {
11484                     op = EXACTL;
11485                 }
11486                 else {
11487                     op = EXACTFL;
11488                 }
11489             }
11490             else if (! FOLD) { /* Not /l and not /i */
11491                 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11492             }
11493             else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11494                                           small */
11495 
11496                 /* Under /i, it gets a little tricky.  A code point that
11497                  * doesn't participate in a fold should be an EXACT node.  We
11498                  * know this one isn't the result of a simple fold, or there'd
11499                  * be more than one code point in the list, but it could be
11500                  * part of a multi-character fold.  In that case we better not
11501                  * create an EXACT node, as we would wrongly be telling the
11502                  * optimizer that this code point must be in the target string,
11503                  * and that is wrong.  This is because if the sequence around
11504                  * this code point forms a multi-char fold, what needs to be in
11505                  * the string could be the code point that folds to the
11506                  * sequence.
11507                  *
11508                  * This handles the case of below-255 code points, as we have
11509                  * an easy look up for those.  The next clause handles the
11510                  * above-256 one */
11511                 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11512                      ? EXACTFU
11513                      : EXACT;
11514             }
11515             else {  /* /i, larger code point.  Since we are under /i, and have
11516                        just this code point, we know that it can't fold to
11517                        something else, so PL_InMultiCharFold applies to it */
11518                 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11519                          ? EXACTFU_REQ8
11520                          : EXACT_REQ8;
11521                 }
11522 
11523                 value = lowest_cp;
11524         }
11525         else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11526                  && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11527         {
11528             /* Here, the only runtime dependency, if any, is from /d, and the
11529              * class matches more than one code point, and the lowest code
11530              * point participates in some fold.  It might be that the other
11531              * code points are /i equivalent to this one, and hence they would
11532              * be representable by an EXACTFish node.  Above, we eliminated
11533              * classes that contain too many code points to be EXACTFish, with
11534              * the test for MAX_FOLD_FROMS
11535              *
11536              * First, special case the ASCII fold pairs, like 'B' and 'b'.  We
11537              * do this because we have EXACTFAA at our disposal for the ASCII
11538              * range */
11539             if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11540 
11541                 /* The only ASCII characters that participate in folds are
11542                  * alphabetics */
11543                 assert(isALPHA(lowest_cp));
11544                 if (   end[0] == start[0]   /* First range is a single
11545                                                character, so 2nd exists */
11546                     && isALPHA_FOLD_EQ(start[0], start[1]))
11547                 {
11548                     /* Here, is part of an ASCII fold pair */
11549 
11550                     if (   ASCII_FOLD_RESTRICTED
11551                         || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11552                     {
11553                         /* If the second clause just above was true, it means
11554                          * we can't be under /i, or else the list would have
11555                          * included more than this fold pair.  Therefore we
11556                          * have to exclude the possibility of whatever else it
11557                          * is that folds to these, by using EXACTFAA */
11558                         op = EXACTFAA;
11559                     }
11560                     else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11561 
11562                         /* Here, there's no simple fold that lowest_cp is part
11563                          * of, but there is a multi-character one.  If we are
11564                          * not under /i, we want to exclude that possibility;
11565                          * if under /i, we want to include it */
11566                         op = (FOLD) ? EXACTFU : EXACTFAA;
11567                     }
11568                     else {
11569 
11570                         /* Here, the only possible fold lowest_cp participates in
11571                          * is with start[1].  /i or not isn't relevant */
11572                         op = EXACTFU;
11573                     }
11574 
11575                     value = toFOLD(lowest_cp);
11576                 }
11577             }
11578             else if (  ! upper_latin1_only_utf8_matches
11579                      || (   _invlist_len(upper_latin1_only_utf8_matches) == 2
11580                          && PL_fold_latin1[
11581                            invlist_highest(upper_latin1_only_utf8_matches)]
11582                          == lowest_cp))
11583             {
11584                 /* Here, the smallest character is non-ascii or there are more
11585                  * than 2 code points matched by this node.  Also, we either
11586                  * don't have /d UTF-8 dependent matches, or if we do, they
11587                  * look like they could be a single character that is the fold
11588                  * of the lowest one is in the always-match list.  This test
11589                  * quickly excludes most of the false positives when there are
11590                  * /d UTF-8 depdendent matches.  These are like LATIN CAPITAL
11591                  * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11592                  * iff the target string is UTF-8.  (We don't have to worry
11593                  * above about exceeding the array bounds of PL_fold_latin1[]
11594                  * because any code point in 'upper_latin1_only_utf8_matches'
11595                  * is below 256.)
11596                  *
11597                  * EXACTFAA would apply only to pairs (hence exactly 2 code
11598                  * points) in the ASCII range, so we can't use it here to
11599                  * artificially restrict the fold domain, so we check if the
11600                  * class does or does not match some EXACTFish node.  Further,
11601                  * if we aren't under /i, and and the folded-to character is
11602                  * part of a multi-character fold, we can't do this
11603                  * optimization, as the sequence around it could be that
11604                  * multi-character fold, and we don't here know the context, so
11605                  * we have to assume it is that multi-char fold, to prevent
11606                  * potential bugs.
11607                  *
11608                  * To do the general case, we first find the fold of the lowest
11609                  * code point (which may be higher than that lowest unfolded
11610                  * one), then find everything that folds to it.  (The data
11611                  * structure we have only maps from the folded code points, so
11612                  * we have to do the earlier step.) */
11613 
11614                 Size_t foldlen;
11615                 U8 foldbuf[UTF8_MAXBYTES_CASE];
11616                 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11617                 U32 first_fold;
11618                 const U32 * remaining_folds;
11619                 Size_t folds_to_this_cp_count = _inverse_folds(
11620                                                             folded,
11621                                                             &first_fold,
11622                                                             &remaining_folds);
11623                 Size_t folds_count = folds_to_this_cp_count + 1;
11624                 SV * fold_list = _new_invlist(folds_count);
11625                 unsigned int i;
11626 
11627                 /* If there are UTF-8 dependent matches, create a temporary
11628                  * list of what this node matches, including them. */
11629                 SV * all_cp_list = NULL;
11630                 SV ** use_this_list = &cp_list;
11631 
11632                 if (upper_latin1_only_utf8_matches) {
11633                     all_cp_list = _new_invlist(0);
11634                     use_this_list = &all_cp_list;
11635                     _invlist_union(cp_list,
11636                                    upper_latin1_only_utf8_matches,
11637                                    use_this_list);
11638                 }
11639 
11640                 /* Having gotten everything that participates in the fold
11641                  * containing the lowest code point, we turn that into an
11642                  * inversion list, making sure everything is included. */
11643                 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11644                 fold_list = add_cp_to_invlist(fold_list, folded);
11645                 if (folds_to_this_cp_count > 0) {
11646                     fold_list = add_cp_to_invlist(fold_list, first_fold);
11647                     for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11648                         fold_list = add_cp_to_invlist(fold_list,
11649                                                     remaining_folds[i]);
11650                     }
11651                 }
11652 
11653                 /* If the fold list is identical to what's in this ANYOF node,
11654                  * the node can be represented by an EXACTFish one instead */
11655                 if (_invlistEQ(*use_this_list, fold_list,
11656                                0 /* Don't complement */ )
11657                 ) {
11658 
11659                     /* But, we have to be careful, as mentioned above.  Just
11660                      * the right sequence of characters could match this if it
11661                      * is part of a multi-character fold.  That IS what we want
11662                      * if we are under /i.  But it ISN'T what we want if not
11663                      * under /i, as it could match when it shouldn't.  So, when
11664                      * we aren't under /i and this character participates in a
11665                      * multi-char fold, we don't optimize into an EXACTFish
11666                      * node.  So, for each case below we have to check if we
11667                      * are folding, and if not, if it is not part of a
11668                      * multi-char fold.  */
11669                     if (lowest_cp > 255) {    /* Highish code point */
11670                         if (FOLD || ! _invlist_contains_cp(
11671                                                    PL_InMultiCharFold, folded))
11672                         {
11673                             op = (LOC)
11674                                  ? EXACTFLU8
11675                                  : (ASCII_FOLD_RESTRICTED)
11676                                    ? EXACTFAA
11677                                    : EXACTFU_REQ8;
11678                             value = folded;
11679                         }
11680                     }   /* Below, the lowest code point < 256 */
11681                     else if (    FOLD
11682                              &&  folded == 's'
11683                              &&  DEPENDS_SEMANTICS)
11684                     {   /* An EXACTF node containing a single character 's',
11685                            can be an EXACTFU if it doesn't get joined with an
11686                            adjacent 's' */
11687                         op = EXACTFU_S_EDGE;
11688                         value = folded;
11689                     }
11690                     else if (     FOLD
11691                              || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11692                     {
11693                         if (upper_latin1_only_utf8_matches) {
11694                             op = EXACTF;
11695 
11696                             /* We can't use the fold, as that only matches
11697                              * under UTF-8 */
11698                             value = lowest_cp;
11699                         }
11700                         else if (     UNLIKELY(lowest_cp == MICRO_SIGN)
11701                                  && ! UTF)
11702                         {   /* EXACTFUP is a special node for this character */
11703                             op = (ASCII_FOLD_RESTRICTED)
11704                                  ? EXACTFAA
11705                                  : EXACTFUP;
11706                             value = MICRO_SIGN;
11707                         }
11708                         else if (     ASCII_FOLD_RESTRICTED
11709                                  && ! isASCII(lowest_cp))
11710                         {   /* For ASCII under /iaa, we can use EXACTFU below
11711                              */
11712                             op = EXACTFAA;
11713                             value = folded;
11714                         }
11715                         else {
11716                             op = EXACTFU;
11717                             value = folded;
11718                         }
11719                     }
11720                 }
11721 
11722                 SvREFCNT_dec_NN(fold_list);
11723                 SvREFCNT_dec(all_cp_list);
11724             }
11725         }
11726 
11727         if (op != END) {
11728             U8 len;
11729 
11730             /* Here, we have calculated what EXACTish node to use.  Have to
11731              * convert to UTF-8 if not already there */
11732             if (value > 255) {
11733                 if (! UTF) {
11734                     SvREFCNT_dec(cp_list);
11735                     REQUIRE_UTF8(flagp);
11736                 }
11737 
11738                 /* This is a kludge to the special casing issues with this
11739                  * ligature under /aa.  FB05 should fold to FB06, but the call
11740                  * above to _to_uni_fold_flags() didn't find this, as it didn't
11741                  * use the /aa restriction in order to not miss other folds
11742                  * that would be affected.  This is the only instance likely to
11743                  * ever be a problem in all of Unicode.  So special case it. */
11744                 if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
11745                     && ASCII_FOLD_RESTRICTED)
11746                 {
11747                     value = LATIN_SMALL_LIGATURE_ST;
11748                 }
11749             }
11750 
11751             len = (UTF) ? UVCHR_SKIP(value) : 1;
11752 
11753             *ret = REGNODE_GUTS(pRExC_state, op, len);
11754             FILL_NODE(*ret, op);
11755             RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11756             setSTR_LEN(REGNODE_p(*ret), len);
11757             if (len == 1) {
11758                 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11759             }
11760             else {
11761                 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11762             }
11763 
11764             return op;
11765         }
11766     }
11767 
11768     if (! has_runtime_dependency) {
11769 
11770         /* See if this can be turned into an ANYOFM node.  Think about the bit
11771          * patterns in two different bytes.  In some positions, the bits in
11772          * each will be 1; and in other positions both will be 0; and in some
11773          * positions the bit will be 1 in one byte, and 0 in the other.  Let
11774          * 'n' be the number of positions where the bits differ.  We create a
11775          * mask which has exactly 'n' 0 bits, each in a position where the two
11776          * bytes differ.  Now take the set of all bytes that when ANDed with
11777          * the mask yield the same result.  That set has 2**n elements, and is
11778          * representable by just two 8 bit numbers: the result and the mask.
11779          * Importantly, matching the set can be vectorized by creating a word
11780          * full of the result bytes, and a word full of the mask bytes,
11781          * yielding a significant speed up.  Here, see if this node matches
11782          * such a set.  As a concrete example consider [01], and the byte
11783          * representing '0' which is 0x30 on ASCII machines.  It has the bits
11784          * 0011 0000.  Take the mask 1111 1110.  If we AND 0x31 and 0x30 with
11785          * that mask we get 0x30.  Any other bytes ANDed yield something else.
11786          * So [01], which is a common usage, is optimizable into ANYOFM, and
11787          * can benefit from the speed up.  We can only do this on UTF-8
11788          * invariant bytes, because they have the same bit patterns under UTF-8
11789          * as not. */
11790         PERL_UINT_FAST8_T inverted = 0;
11791 
11792         /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11793          * EBCDIC */
11794         const PERL_UINT_FAST8_T max_permissible
11795                                     = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11796 
11797         /* If doesn't fit the criteria for ANYOFM, invert and try again.  If
11798          * that works we will instead later generate an NANYOFM, and invert
11799          * back when through */
11800         if (highest_cp > max_permissible) {
11801             _invlist_invert(cp_list);
11802             inverted = 1;
11803         }
11804 
11805         if (invlist_highest(cp_list) <= max_permissible) {
11806             UV this_start, this_end;
11807             UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
11808             U8 bits_differing = 0;
11809             Size_t full_cp_count = 0;
11810             bool first_time = TRUE;
11811 
11812             /* Go through the bytes and find the bit positions that differ */
11813             invlist_iterinit(cp_list);
11814             while (invlist_iternext(cp_list, &this_start, &this_end)) {
11815                 unsigned int i = this_start;
11816 
11817                 if (first_time) {
11818                     if (! UVCHR_IS_INVARIANT(i)) {
11819                         goto done_anyofm;
11820                     }
11821 
11822                     first_time = FALSE;
11823                     lowest_cp = this_start;
11824 
11825                     /* We have set up the code point to compare with.  Don't
11826                      * compare it with itself */
11827                     i++;
11828                 }
11829 
11830                 /* Find the bit positions that differ from the lowest code
11831                  * point in the node.  Keep track of all such positions by
11832                  * OR'ing */
11833                 for (; i <= this_end; i++) {
11834                     if (! UVCHR_IS_INVARIANT(i)) {
11835                         goto done_anyofm;
11836                     }
11837 
11838                     bits_differing  |= i ^ lowest_cp;
11839                 }
11840 
11841                 full_cp_count += this_end - this_start + 1;
11842             }
11843 
11844             /* At the end of the loop, we count how many bits differ from the
11845              * bits in lowest code point, call the count 'd'.  If the set we
11846              * found contains 2**d elements, it is the closure of all code
11847              * points that differ only in those bit positions.  To convince
11848              * yourself of that, first note that the number in the closure must
11849              * be a power of 2, which we test for.  The only way we could have
11850              * that count and it be some differing set, is if we got some code
11851              * points that don't differ from the lowest code point in any
11852              * position, but do differ from each other in some other position.
11853              * That means one code point has a 1 in that position, and another
11854              * has a 0.  But that would mean that one of them differs from the
11855              * lowest code point in that position, which possibility we've
11856              * already excluded.  */
11857             if (  (inverted || full_cp_count > 1)
11858                 && full_cp_count == 1U << PL_bitcount[bits_differing])
11859             {
11860                 U8 ANYOFM_mask;
11861 
11862                 op = ANYOFM + inverted;
11863 
11864                 /* We need to make the bits that differ be 0's */
11865                 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11866 
11867                 /* The argument is the lowest code point */
11868                 *ret = reg1node(pRExC_state, op, lowest_cp);
11869                 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11870             }
11871 
11872           done_anyofm:
11873             invlist_iterfinish(cp_list);
11874         }
11875 
11876         if (inverted) {
11877             _invlist_invert(cp_list);
11878         }
11879 
11880         if (op != END) {
11881             return op;
11882         }
11883 
11884         /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11885          * were invariants, it wasn't inverted, and there is a single range.
11886          * This would be faster than some of the posix nodes we create below
11887          * like /\d/a, but would be twice the size.  Without having actually
11888          * measured the gain, khw doesn't think the tradeoff is really worth it
11889          * */
11890     }
11891 
11892     if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11893         PERL_UINT_FAST8_T type;
11894         SV * intersection = NULL;
11895         SV* d_invlist = NULL;
11896 
11897         /* See if this matches any of the POSIX classes.  The POSIXA and POSIXD
11898          * ones are about the same speed as ANYOF ops, but take less room; the
11899          * ones that have above-Latin1 code point matches are somewhat faster
11900          * than ANYOF. */
11901 
11902         for (type = POSIXA; type >= POSIXD; type--) {
11903             int posix_class;
11904 
11905             if (type == POSIXL) {   /* But not /l posix classes */
11906                 continue;
11907             }
11908 
11909             for (posix_class = 0;
11910                  posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11911                  posix_class++)
11912             {
11913                 SV** our_code_points = &cp_list;
11914                 SV** official_code_points;
11915                 int try_inverted;
11916 
11917                 if (type == POSIXA) {
11918                     official_code_points = &PL_Posix_ptrs[posix_class];
11919                 }
11920                 else {
11921                     official_code_points = &PL_XPosix_ptrs[posix_class];
11922                 }
11923 
11924                 /* Skip non-existent classes of this type.  e.g. \v only has an
11925                  * entry in PL_XPosix_ptrs */
11926                 if (! *official_code_points) {
11927                     continue;
11928                 }
11929 
11930                 /* Try both the regular class, and its inversion */
11931                 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11932                     bool this_inverted = *invert ^ try_inverted;
11933 
11934                     if (type != POSIXD) {
11935 
11936                         /* This class that isn't /d can't match if we have /d
11937                          * dependencies */
11938                         if (has_runtime_dependency
11939                                                 & HAS_D_RUNTIME_DEPENDENCY)
11940                         {
11941                             continue;
11942                         }
11943                     }
11944                     else /* is /d */ if (! this_inverted) {
11945 
11946                         /* /d classes don't match anything non-ASCII below 256
11947                          * unconditionally (which cp_list contains) */
11948                         _invlist_intersection(cp_list, PL_UpperLatin1,
11949                                                        &intersection);
11950                         if (_invlist_len(intersection) != 0) {
11951                             continue;
11952                         }
11953 
11954                         SvREFCNT_dec(d_invlist);
11955                         d_invlist = invlist_clone(cp_list, NULL);
11956 
11957                         /* But under UTF-8 it turns into using /u rules.  Add
11958                          * the things it matches under these conditions so that
11959                          * we check below that these are identical to what the
11960                          * tested class should match */
11961                         if (upper_latin1_only_utf8_matches) {
11962                             _invlist_union(
11963                                         d_invlist,
11964                                         upper_latin1_only_utf8_matches,
11965                                         &d_invlist);
11966                         }
11967                         our_code_points = &d_invlist;
11968                     }
11969                     else {  /* POSIXD, inverted.  If this doesn't have this
11970                                flag set, it isn't /d. */
11971                         if (! ( *anyof_flags
11972                                & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11973                         {
11974                             continue;
11975                         }
11976 
11977                         our_code_points = &cp_list;
11978                     }
11979 
11980                     /* Here, have weeded out some things.  We want to see if
11981                      * the list of characters this node contains
11982                      * ('*our_code_points') precisely matches those of the
11983                      * class we are currently checking against
11984                      * ('*official_code_points'). */
11985                     if (_invlistEQ(*our_code_points,
11986                                    *official_code_points,
11987                                    try_inverted))
11988                     {
11989                         /* Here, they precisely match.  Optimize this ANYOF
11990                          * node into its equivalent POSIX one of the correct
11991                          * type, possibly inverted.
11992                          *
11993                          * Some of these nodes match a single range of
11994                          * characters (or [:alpha:] matches two parallel ranges
11995                          * on ASCII platforms).  The array lookup at execution
11996                          * time could be replaced by a range check for such
11997                          * nodes.  But regnodes are a finite resource, and the
11998                          * possible performance boost isn't large, so this
11999                          * hasn't been done.  An attempt to use just one node
12000                          * (and its inverse) to encompass all such cases was
12001                          * made in d62feba66bf43f35d092bb026694f927e9f94d38.
12002                          * But the shifting/masking it used ended up being
12003                          * slower than the array look up, so it was reverted */
12004                         op = (try_inverted)
12005                             ? type + NPOSIXA - POSIXA
12006                             : type;
12007                         *ret = reg_node(pRExC_state, op);
12008                         FLAGS(REGNODE_p(*ret)) = posix_class;
12009                         SvREFCNT_dec(d_invlist);
12010                         SvREFCNT_dec(intersection);
12011                         return op;
12012                     }
12013                 }
12014             }
12015         }
12016         SvREFCNT_dec(d_invlist);
12017         SvREFCNT_dec(intersection);
12018     }
12019 
12020     /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
12021      * in size and speed.  Currently, a 20 bit range base (smallest code point
12022      * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
12023      * This allows for using it on all of the Unicode code points except for
12024      * the highest plane, which is only for private use code points.  khw
12025      * doubts that a bigger delta is likely in real world applications */
12026     if (     single_range
12027         && ! has_runtime_dependency
12028         &&   *anyof_flags == 0
12029         &&   start[0] < (1 << ANYOFR_BASE_BITS)
12030         &&   end[0] - start[0]
12031                 < ((1U << (sizeof(ARG1u_LOC(NULL))
12032                                * CHARBITS - ANYOFR_BASE_BITS))))
12033 
12034     {
12035         U8 low_utf8[UTF8_MAXBYTES+1];
12036         U8 high_utf8[UTF8_MAXBYTES+1];
12037 
12038         op = ANYOFR;
12039         *ret = reg1node(pRExC_state, op,
12040                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
12041 
12042         /* Place the lowest UTF-8 start byte in the flags field, so as to allow
12043          * efficient ruling out at run time of many possible inputs.  */
12044         (void) uvchr_to_utf8(low_utf8, start[0]);
12045         (void) uvchr_to_utf8(high_utf8, end[0]);
12046 
12047         /* If all code points share the same first byte, this can be an
12048          * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
12049          * quickly rule out many inputs at run-time without having to compute
12050          * the code point from UTF-8.  For EBCDIC, we use I8, as not doing that
12051          * transformation would not rule out nearly so many things */
12052         if (low_utf8[0] == high_utf8[0]) {
12053             op = ANYOFRb;
12054             OP(REGNODE_p(*ret)) = op;
12055             ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
12056         }
12057         else {
12058             ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
12059         }
12060 
12061         return op;
12062     }
12063 
12064     /* If didn't find an optimization and there is no need for a bitmap,
12065      * of the lowest code points, optimize to indicate that */
12066     if (     lowest_cp >= NUM_ANYOF_CODE_POINTS
12067         && ! LOC
12068         && ! upper_latin1_only_utf8_matches
12069         &&   *anyof_flags == 0)
12070     {
12071         U8 low_utf8[UTF8_MAXBYTES+1];
12072         UV highest_cp = invlist_highest(cp_list);
12073 
12074         /* Currently the maximum allowed code point by the system is IV_MAX.
12075          * Higher ones are reserved for future internal use.  This particular
12076          * regnode can be used for higher ones, but we can't calculate the code
12077          * point of those.  IV_MAX suffices though, as it will be a large first
12078          * byte */
12079         Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
12080                        - low_utf8;
12081 
12082         /* We store the lowest possible first byte of the UTF-8 representation,
12083          * using the flags field.  This allows for quick ruling out of some
12084          * inputs without having to convert from UTF-8 to code point.  For
12085          * EBCDIC, we use I8, as not doing that transformation would not rule
12086          * out nearly so many things */
12087         *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
12088 
12089         op = ANYOFH;
12090 
12091         /* If the first UTF-8 start byte for the highest code point in the
12092          * range is suitably small, we may be able to get an upper bound as
12093          * well */
12094         if (highest_cp <= IV_MAX) {
12095             U8 high_utf8[UTF8_MAXBYTES+1];
12096             Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
12097 
12098             /* If the lowest and highest are the same, we can get an exact
12099              * first byte instead of a just minimum or even a sequence of exact
12100              * leading bytes.  We signal these with different regnodes */
12101             if (low_utf8[0] == high_utf8[0]) {
12102                 Size_t len = find_first_differing_byte_pos(low_utf8,
12103                                                            high_utf8,
12104                                                    MIN(low_len, high_len));
12105                 if (len == 1) {
12106 
12107                     /* No need to convert to I8 for EBCDIC as this is an exact
12108                      * match */
12109                     *anyof_flags = low_utf8[0];
12110 
12111                     if (high_len == 2) {
12112                         /* If the elements matched all have a 2-byte UTF-8
12113                          * representation, with the first byte being the same,
12114                          * we can use a compact, fast regnode. capable of
12115                          * matching any combination of continuation byte
12116                          * patterns.
12117                          *
12118                          * (A similar regnode could be created for the Latin1
12119                          * range; the complication being that it could match
12120                          * non-UTF8 targets.  The internal bitmap would serve
12121                          * both cases; with some extra code in regexec.c) */
12122                         op = ANYOFHbbm;
12123                         *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12124                         FILL_NODE(*ret, op);
12125                         FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12126 
12127                         /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12128                          * directly based on the continuation byte, without
12129                          * needing to convert to code point */
12130                         populate_bitmap_from_invlist(
12131                             cp_list,
12132 
12133                             /* The base code point is from the start byte */
12134                             TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12135                                                     UTF_CONTINUATION_MARK | 0),
12136 
12137                             ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12138                             REGNODE_BBM_BITMAP_LEN);
12139                         RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12140                         return op;
12141                     }
12142                     else {
12143                         op = ANYOFHb;
12144                     }
12145                 }
12146                 else {
12147                     op = ANYOFHs;
12148                     *ret = REGNODE_GUTS(pRExC_state, op,
12149                                        REGNODE_ARG_LEN(op) + STR_SZ(len));
12150                     FILL_NODE(*ret, op);
12151                     STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12152                                                                     = len;
12153                     Copy(low_utf8,  /* Add the common bytes */
12154                     ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12155                        len, U8);
12156                     RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12157                     set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12158                                               NULL, only_utf8_locale_list);
12159                     return op;
12160                 }
12161             }
12162             else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12163 
12164                 /* Here, the high byte is not the same as the low, but is small
12165                  * enough that its reasonable to have a loose upper bound,
12166                  * which is packed in with the strict lower bound.  See
12167                  * comments at the definition of MAX_ANYOF_HRx_BYTE.  On EBCDIC
12168                  * platforms, I8 is used.  On ASCII platforms I8 is the same
12169                  * thing as UTF-8 */
12170 
12171                 U8 bits = 0;
12172                 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12173                 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12174                             - *anyof_flags;
12175 
12176                 if (range_diff <= max_range_diff / 8) {
12177                     bits = 3;
12178                 }
12179                 else if (range_diff <= max_range_diff / 4) {
12180                     bits = 2;
12181                 }
12182                 else if (range_diff <= max_range_diff / 2) {
12183                     bits = 1;
12184                 }
12185                 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12186                 op = ANYOFHr;
12187             }
12188         }
12189     }
12190 
12191     return op;
12192 
12193   return_OPFAIL:
12194     op = OPFAIL;
12195     *flagp &= ~(SIMPLE|HASWIDTH);
12196     *ret = reg1node(pRExC_state, op, 0);
12197     return op;
12198 
12199   return_SANY:
12200     op = SANY;
12201     *ret = reg_node(pRExC_state, op);
12202     MARK_NAUGHTY(1);
12203     return op;
12204 }
12205 
12206 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12207 
12208 #ifdef PERL_RE_BUILD_AUX
12209 void
Perl_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)12210 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12211                 regnode* const node,
12212                 SV* const cp_list,
12213                 SV* const runtime_defns,
12214                 SV* const only_utf8_locale_list)
12215 {
12216     /* Sets the arg field of an ANYOF-type node 'node', using information about
12217      * the node passed-in.  If only the bitmap is needed to determine what
12218      * matches, the arg is set appropriately to either
12219      *      1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12220      *      2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12221      *
12222      * Otherwise, it sets the argument to the count returned by reg_add_data(),
12223      * having allocated and stored an array, av, as follows:
12224      *  av[0] stores the inversion list defining this class as far as known at
12225      *        this time, or PL_sv_undef if nothing definite is now known.
12226      *  av[1] stores the inversion list of code points that match only if the
12227      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
12228      *        av[2], or no entry otherwise.
12229      *  av[2] stores the list of user-defined properties whose subroutine
12230      *        definitions aren't known at this time, or no entry if none. */
12231 
12232     UV n;
12233 
12234     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12235 
12236     /* If this is set, the final disposition won't be known until runtime, so
12237      * we can't do any of the compile time optimizations */
12238     if (! runtime_defns) {
12239 
12240         /* On plain ANYOF nodes without the possibility of a runtime locale
12241          * making a difference, maybe there's no information to be gleaned
12242          * except for what's in the bitmap */
12243         if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12244 
12245             /* There are two such cases:
12246              *  1)  there is no list of code points matched outside the bitmap
12247              */
12248             if (! cp_list) {
12249                 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12250                 return;
12251             }
12252 
12253             /*  2)  the list indicates everything outside the bitmap matches */
12254             if (   invlist_highest(cp_list) == UV_MAX
12255                 && invlist_highest_range_start(cp_list)
12256                                                        <= NUM_ANYOF_CODE_POINTS)
12257             {
12258                 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12259                 return;
12260             }
12261 
12262             /* In all other cases there are things outside the bitmap that we
12263              * may need to check at runtime. */
12264         }
12265 
12266         /* Here, we have resolved all the possible run-time matches, and they
12267          * are stored in one or both of two possible lists.  (While some match
12268          * only under certain runtime circumstances, we know all the possible
12269          * ones for each such circumstance.)
12270          *
12271          * It may very well be that the pattern being compiled contains an
12272          * identical class, already encountered.  Reusing that class here saves
12273          * space.  Look through all classes so far encountered. */
12274         U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12275         for (unsigned int i = 0; i < existing_items; i++) {
12276 
12277             /* Only look at auxiliary data of this type */
12278             if (RExC_rxi->data->what[i] != 's') {
12279                 continue;
12280             }
12281 
12282             SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12283             AV * const av = MUTABLE_AV(SvRV(rv));
12284 
12285             /* If the already encountered class has data that won't be known
12286              * until runtime (stored in the final element of the array), we
12287              * can't share */
12288             if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12289                 continue;
12290             }
12291 
12292             SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12293                                                 false /* no lvalue */);
12294 
12295             /* The new and the existing one both have to have or both not
12296              * have this element, for this one to duplicate that one */
12297             if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12298                 continue;
12299             }
12300 
12301             /* If the inversion lists aren't equivalent, can't share */
12302             if (cp_list && ! _invlistEQ(cp_list,
12303                                         *stored_cp_list_ptr,
12304                                         FALSE /* don't complement */))
12305             {
12306                 continue;
12307             }
12308 
12309             /* Similarly for the other list */
12310             SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12311                                                 av,
12312                                                 ONLY_LOCALE_MATCHES_INDEX,
12313                                                 false /* no lvalue */);
12314             if (   cBOOL(only_utf8_locale_list)
12315                 != cBOOL(stored_only_utf8_locale_list_ptr))
12316             {
12317                 continue;
12318             }
12319 
12320             if (only_utf8_locale_list && ! _invlistEQ(
12321                                          only_utf8_locale_list,
12322                                          *stored_only_utf8_locale_list_ptr,
12323                                          FALSE /* don't complement */))
12324             {
12325                 continue;
12326             }
12327 
12328             /* Here, the existence and contents of both compile-time lists
12329              * are identical between the new and existing data.  Re-use the
12330              * existing one */
12331             ARG1u_SET(node, i);
12332             return;
12333         } /* end of loop through existing classes */
12334     }
12335 
12336     /* Here, we need to create a new auxiliary data element; either because
12337      * this doesn't duplicate an existing one, or we can't tell at this time if
12338      * it eventually will */
12339 
12340     AV * const av = newAV();
12341     SV *rv;
12342 
12343     if (cp_list) {
12344         av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12345     }
12346 
12347     /* (Note that if any of this changes, the size calculations in
12348      * S_optimize_regclass() might need to be updated.) */
12349 
12350     if (only_utf8_locale_list) {
12351         av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12352                                        SvREFCNT_inc_NN(only_utf8_locale_list));
12353     }
12354 
12355     if (runtime_defns) {
12356         av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12357                      SvREFCNT_inc_NN(runtime_defns));
12358     }
12359 
12360     rv = newRV_noinc(MUTABLE_SV(av));
12361     n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12362     RExC_rxi->data->data[n] = (void*)rv;
12363     ARG1u_SET(node, n);
12364 }
12365 #endif /* PERL_RE_BUILD_AUX */
12366 
12367 SV *
12368 
12369 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_aux_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)12370 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12371 #else
12372 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12373 #endif
12374 
12375 {
12376     /* For internal core use only.
12377      * Returns the inversion list for the input 'node' in the regex 'prog'.
12378      * If <doinit> is 'true', will attempt to create the inversion list if not
12379      *    already done.  If it is created, it will add to the normal inversion
12380      *    list any that comes from user-defined properties.  It croaks if this
12381      *    is called before such a list is ready to be generated, that is when a
12382      *    user-defined property has been declared, buyt still not yet defined.
12383      * If <listsvp> is non-null, will return the printable contents of the
12384      *    property definition.  This can be used to get debugging information
12385      *    even before the inversion list exists, by calling this function with
12386      *    'doinit' set to false, in which case the components that will be used
12387      *    to eventually create the inversion list are returned  (in a printable
12388      *    form).
12389      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12390      *    store an inversion list of code points that should match only if the
12391      *    execution-time locale is a UTF-8 one.
12392      * If <output_invlist> is not NULL, it is where this routine is to store an
12393      *    inversion list of the code points that would be instead returned in
12394      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
12395      *    when this parameter is used, is just the non-code point data that
12396      *    will go into creating the inversion list.  This currently should be just
12397      *    user-defined properties whose definitions were not known at compile
12398      *    time.  Using this parameter allows for easier manipulation of the
12399      *    inversion list's data by the caller.  It is illegal to call this
12400      *    function with this parameter set, but not <listsvp>
12401      *
12402      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
12403      * that, in spite of this function's name, the inversion list it returns
12404      * may include the bitmap data as well */
12405 
12406     SV *si  = NULL;         /* Input initialization string */
12407     SV* invlist = NULL;
12408 
12409     RXi_GET_DECL_NULL(prog, progi);
12410     const struct reg_data * const data = prog ? progi->data : NULL;
12411 
12412 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12413     PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12414 #else
12415     PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12416 #endif
12417     assert(! output_invlist || listsvp);
12418 
12419     if (data && data->count) {
12420         const U32 n = ARG1u(node);
12421 
12422         if (data->what[n] == 's') {
12423             SV * const rv = MUTABLE_SV(data->data[n]);
12424             AV * const av = MUTABLE_AV(SvRV(rv));
12425             SV **const ary = AvARRAY(av);
12426 
12427             invlist = ary[INVLIST_INDEX];
12428 
12429             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12430                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12431             }
12432 
12433             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12434                 si = ary[DEFERRED_USER_DEFINED_INDEX];
12435             }
12436 
12437             if (doinit && (si || invlist)) {
12438                 if (si) {
12439                     bool user_defined;
12440                     SV * msg = newSVpvs_flags("", SVs_TEMP);
12441 
12442                     SV * prop_definition = handle_user_defined_property(
12443                             "", 0, FALSE,   /* There is no \p{}, \P{} */
12444                             SvPVX_const(si)[1] - '0',   /* /i or not has been
12445                                                            stored here for just
12446                                                            this occasion */
12447                             TRUE,           /* run time */
12448                             FALSE,          /* This call must find the defn */
12449                             si,             /* The property definition  */
12450                             &user_defined,
12451                             msg,
12452                             0               /* base level call */
12453                            );
12454 
12455                     if (SvCUR(msg)) {
12456                         assert(prop_definition == NULL);
12457 
12458                         Perl_croak(aTHX_ "%" UTF8f,
12459                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12460                     }
12461 
12462                     if (invlist) {
12463                         _invlist_union(invlist, prop_definition, &invlist);
12464                         SvREFCNT_dec_NN(prop_definition);
12465                     }
12466                     else {
12467                         invlist = prop_definition;
12468                     }
12469 
12470                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12471                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12472 
12473                     ary[INVLIST_INDEX] = invlist;
12474                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12475                                  ? ONLY_LOCALE_MATCHES_INDEX
12476                                  : INVLIST_INDEX);
12477                     si = NULL;
12478                 }
12479             }
12480         }
12481     }
12482 
12483     /* If requested, return a printable version of what this ANYOF node matches
12484      * */
12485     if (listsvp) {
12486         SV* matches_string = NULL;
12487 
12488         /* This function can be called at compile-time, before everything gets
12489          * resolved, in which case we return the currently best available
12490          * information, which is the string that will eventually be used to do
12491          * that resolving, 'si' */
12492         if (si) {
12493             /* Here, we only have 'si' (and possibly some passed-in data in
12494              * 'invlist', which is handled below)  If the caller only wants
12495              * 'si', use that.  */
12496             if (! output_invlist) {
12497                 matches_string = newSVsv(si);
12498             }
12499             else {
12500                 /* But if the caller wants an inversion list of the node, we
12501                  * need to parse 'si' and place as much as possible in the
12502                  * desired output inversion list, making 'matches_string' only
12503                  * contain the currently unresolvable things */
12504                 const char *si_string = SvPVX(si);
12505                 STRLEN remaining = SvCUR(si);
12506                 UV prev_cp = 0;
12507                 U8 count = 0;
12508 
12509                 /* Ignore everything before and including the first new-line */
12510                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12511                 assert (si_string != NULL);
12512                 si_string++;
12513                 remaining = SvPVX(si) + SvCUR(si) - si_string;
12514 
12515                 while (remaining > 0) {
12516 
12517                     /* The data consists of just strings defining user-defined
12518                      * property names, but in prior incarnations, and perhaps
12519                      * somehow from pluggable regex engines, it could still
12520                      * hold hex code point definitions, all of which should be
12521                      * legal (or it wouldn't have gotten this far).  Each
12522                      * component of a range would be separated by a tab, and
12523                      * each range by a new-line.  If these are found, instead
12524                      * add them to the inversion list */
12525                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
12526                                      |PERL_SCAN_SILENT_NON_PORTABLE;
12527                     STRLEN len = remaining;
12528                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12529 
12530                     /* If the hex decode routine found something, it should go
12531                      * up to the next \n */
12532                     if (   *(si_string + len) == '\n') {
12533                         if (count) {    /* 2nd code point on line */
12534                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12535                         }
12536                         else {
12537                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12538                         }
12539                         count = 0;
12540                         goto prepare_for_next_iteration;
12541                     }
12542 
12543                     /* If the hex decode was instead for the lower range limit,
12544                      * save it, and go parse the upper range limit */
12545                     if (*(si_string + len) == '\t') {
12546                         assert(count == 0);
12547 
12548                         prev_cp = cp;
12549                         count = 1;
12550                       prepare_for_next_iteration:
12551                         si_string += len + 1;
12552                         remaining -= len + 1;
12553                         continue;
12554                     }
12555 
12556                     /* Here, didn't find a legal hex number.  Just add the text
12557                      * from here up to the next \n, omitting any trailing
12558                      * markers. */
12559 
12560                     remaining -= len;
12561                     len = strcspn(si_string,
12562                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12563                     remaining -= len;
12564                     if (matches_string) {
12565                         sv_catpvn(matches_string, si_string, len);
12566                     }
12567                     else {
12568                         matches_string = newSVpvn(si_string, len);
12569                     }
12570                     sv_catpvs(matches_string, " ");
12571 
12572                     si_string += len;
12573                     if (   remaining
12574                         && UCHARAT(si_string)
12575                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12576                     {
12577                         si_string++;
12578                         remaining--;
12579                     }
12580                     if (remaining && UCHARAT(si_string) == '\n') {
12581                         si_string++;
12582                         remaining--;
12583                     }
12584                 } /* end of loop through the text */
12585 
12586                 assert(matches_string);
12587                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
12588                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12589                 }
12590             } /* end of has an 'si' */
12591         }
12592 
12593         /* Add the stuff that's already known */
12594         if (invlist) {
12595 
12596             /* Again, if the caller doesn't want the output inversion list, put
12597              * everything in 'matches-string' */
12598             if (! output_invlist) {
12599                 if ( ! matches_string) {
12600                     matches_string = newSVpvs("\n");
12601                 }
12602                 sv_catsv(matches_string, invlist_contents(invlist,
12603                                                   TRUE /* traditional style */
12604                                                   ));
12605             }
12606             else if (! *output_invlist) {
12607                 *output_invlist = invlist_clone(invlist, NULL);
12608             }
12609             else {
12610                 _invlist_union(*output_invlist, invlist, output_invlist);
12611             }
12612         }
12613 
12614         *listsvp = matches_string;
12615     }
12616 
12617     return invlist;
12618 }
12619 
12620 /* reg_skipcomment()
12621 
12622    Absorbs an /x style # comment from the input stream,
12623    returning a pointer to the first character beyond the comment, or if the
12624    comment terminates the pattern without anything following it, this returns
12625    one past the final character of the pattern (in other words, RExC_end) and
12626    sets the REG_RUN_ON_COMMENT_SEEN flag.
12627 
12628    Note it's the callers responsibility to ensure that we are
12629    actually in /x mode
12630 
12631 */
12632 
12633 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)12634 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12635 {
12636     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12637 
12638     assert(*p == '#');
12639 
12640     while (p < RExC_end) {
12641         if (*(++p) == '\n') {
12642             return p+1;
12643         }
12644     }
12645 
12646     /* we ran off the end of the pattern without ending the comment, so we have
12647      * to add an \n when wrapping */
12648     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12649     return p;
12650 }
12651 
12652 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)12653 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12654                                 char ** p,
12655                                 const bool force_to_xmod
12656                          )
12657 {
12658     /* If the text at the current parse position '*p' is a '(?#...)' comment,
12659      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12660      * is /x whitespace, advance '*p' so that on exit it points to the first
12661      * byte past all such white space and comments */
12662 
12663     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12664 
12665     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12666 
12667     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12668 
12669     for (;;) {
12670         if (RExC_end - (*p) >= 3
12671             && *(*p)     == '('
12672             && *(*p + 1) == '?'
12673             && *(*p + 2) == '#')
12674         {
12675             while (*(*p) != ')') {
12676                 if ((*p) == RExC_end)
12677                     FAIL("Sequence (?#... not terminated");
12678                 (*p)++;
12679             }
12680             (*p)++;
12681             continue;
12682         }
12683 
12684         if (use_xmod) {
12685             const char * save_p = *p;
12686             while ((*p) < RExC_end) {
12687                 STRLEN len;
12688                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12689                     (*p) += len;
12690                 }
12691                 else if (*(*p) == '#') {
12692                     (*p) = reg_skipcomment(pRExC_state, (*p));
12693                 }
12694                 else {
12695                     break;
12696                 }
12697             }
12698             if (*p != save_p) {
12699                 continue;
12700             }
12701         }
12702 
12703         break;
12704     }
12705 
12706     return;
12707 }
12708 
12709 /* nextchar()
12710 
12711    Advances the parse position by one byte, unless that byte is the beginning
12712    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
12713    those two cases, the parse position is advanced beyond all such comments and
12714    white space.
12715 
12716    This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12717 */
12718 
12719 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)12720 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12721 {
12722     PERL_ARGS_ASSERT_NEXTCHAR;
12723 
12724     if (RExC_parse < RExC_end) {
12725         assert(   ! UTF
12726                || UTF8_IS_INVARIANT(*RExC_parse)
12727                || UTF8_IS_START(*RExC_parse));
12728 
12729         RExC_parse_inc_safe();
12730 
12731         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12732                                 FALSE /* Don't force /x */ );
12733     }
12734 }
12735 
12736 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)12737 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12738 {
12739     /* 'size' is the delta number of smallest regnode equivalents to add or
12740      * subtract from the current memory allocated to the regex engine being
12741      * constructed. */
12742 
12743     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12744 
12745     RExC_size += size;
12746 
12747     Renewc(RExC_rxi,
12748            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12749                                                 /* +1 for REG_MAGIC */
12750            char,
12751            regexp_internal);
12752     if ( RExC_rxi == NULL )
12753         FAIL("Regexp out of space");
12754     RXi_SET(RExC_rx, RExC_rxi);
12755 
12756     RExC_emit_start = RExC_rxi->program;
12757     if (size > 0) {
12758         Zero(REGNODE_p(RExC_emit), size, regnode);
12759     }
12760 }
12761 
12762 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const STRLEN extra_size)12763 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12764 {
12765     /* Allocate a regnode that is (1 + extra_size) times as big as the
12766      * smallest regnode worth of space, and also aligns and increments
12767      * RExC_size appropriately.
12768      *
12769      * It returns the regnode's offset into the regex engine program */
12770 
12771     const regnode_offset ret = RExC_emit;
12772 
12773     PERL_ARGS_ASSERT_REGNODE_GUTS;
12774 
12775     SIZE_ALIGN(RExC_size);
12776     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12777     NODE_ALIGN_FILL(REGNODE_p(ret));
12778     return(ret);
12779 }
12780 
12781 #ifdef DEBUGGING
12782 
12783 STATIC regnode_offset
S_regnode_guts_debug(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size)12784 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12785     PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12786     assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12787     return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12788 }
12789 
12790 #endif
12791 
12792 
12793 
12794 /*
12795 - reg_node - emit a node
12796 */
12797 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)12798 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12799 {
12800     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12801     regnode_offset ptr = ret;
12802 
12803     PERL_ARGS_ASSERT_REG_NODE;
12804 
12805     assert(REGNODE_ARG_LEN(op) == 0);
12806 
12807     FILL_ADVANCE_NODE(ptr, op);
12808     RExC_emit = ptr;
12809     return(ret);
12810 }
12811 
12812 /*
12813 - reg1node - emit a node with an argument
12814 */
12815 STATIC regnode_offset /* Location. */
S_reg1node(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)12816 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12817 {
12818     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12819     regnode_offset ptr = ret;
12820 
12821     PERL_ARGS_ASSERT_REG1NODE;
12822 
12823     /* ANYOF are special cased to allow non-length 1 args */
12824     assert(REGNODE_ARG_LEN(op) == 1);
12825 
12826     FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12827     RExC_emit = ptr;
12828     return(ret);
12829 }
12830 
12831 /*
12832 - regpnode - emit a temporary node with a SV* argument
12833 */
12834 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)12835 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12836 {
12837     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12838     regnode_offset ptr = ret;
12839 
12840     PERL_ARGS_ASSERT_REGPNODE;
12841 
12842     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12843     RExC_emit = ptr;
12844     return(ret);
12845 }
12846 
12847 STATIC regnode_offset
S_reg2node(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)12848 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12849 {
12850     /* emit a node with U32 and I32 arguments */
12851 
12852     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12853     regnode_offset ptr = ret;
12854 
12855     PERL_ARGS_ASSERT_REG2NODE;
12856 
12857     assert(REGNODE_ARG_LEN(op) == 2);
12858 
12859     FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12860     RExC_emit = ptr;
12861     return(ret);
12862 }
12863 
12864 /*
12865 - reginsert - insert an operator in front of already-emitted operand
12866 *
12867 * That means that on exit 'operand' is the offset of the newly inserted
12868 * operator, and the original operand has been relocated.
12869 *
12870 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
12871 * set up NEXT_OFF() of the inserted node if needed. Something like this:
12872 *
12873 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12874 *   NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12875 *
12876 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12877 */
12878 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)12879 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12880                   const regnode_offset operand, const U32 depth)
12881 {
12882     regnode *src;
12883     regnode *dst;
12884     regnode *place;
12885     const int offset = REGNODE_ARG_LEN((U8)op);
12886     const int size = NODE_STEP_REGNODE + offset;
12887     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12888 
12889     PERL_ARGS_ASSERT_REGINSERT;
12890     PERL_UNUSED_CONTEXT;
12891     PERL_UNUSED_ARG(depth);
12892     DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12893     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12894                                     studying. If this is wrong then we need to adjust RExC_recurse
12895                                     below like we do with RExC_open_parens/RExC_close_parens. */
12896     change_engine_size(pRExC_state, (Ptrdiff_t) size);
12897     src = REGNODE_p(RExC_emit);
12898     RExC_emit += size;
12899     dst = REGNODE_p(RExC_emit);
12900 
12901     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12902      * and [perl #133871] shows this can lead to problems, so skip this
12903      * realignment of parens until a later pass when they are reliable */
12904     if (! IN_PARENS_PASS && RExC_open_parens) {
12905         int paren;
12906         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12907         /* remember that RExC_npar is rex->nparens + 1,
12908          * iow it is 1 more than the number of parens seen in
12909          * the pattern so far. */
12910         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12911             /* note, RExC_open_parens[0] is the start of the
12912              * regex, it can't move. RExC_close_parens[0] is the end
12913              * of the regex, it *can* move. */
12914             if ( paren && RExC_open_parens[paren] >= operand ) {
12915                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12916                 RExC_open_parens[paren] += size;
12917             } else {
12918                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12919             }
12920             if ( RExC_close_parens[paren] >= operand ) {
12921                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12922                 RExC_close_parens[paren] += size;
12923             } else {
12924                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12925             }
12926         }
12927     }
12928     if (RExC_end_op)
12929         RExC_end_op += size;
12930 
12931     while (src > REGNODE_p(operand)) {
12932         StructCopy(--src, --dst, regnode);
12933     }
12934 
12935     place = REGNODE_p(operand);	/* Op node, where operand used to be. */
12936     src = place + 1; /* NOT REGNODE_AFTER! */
12937     FLAGS(place) = 0;
12938     FILL_NODE(operand, op);
12939 
12940     /* Zero out any arguments in the new node */
12941     Zero(src, offset, regnode);
12942 }
12943 
12944 /*
12945 - regtail - set the next-pointer at the end of a node chain of p to val.  If
12946             that value won't fit in the space available, instead returns FALSE.
12947             (Except asserts if we can't fit in the largest space the regex
12948             engine is designed for.)
12949 - SEE ALSO: regtail_study
12950 */
12951 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)12952 S_regtail(pTHX_ RExC_state_t * pRExC_state,
12953                 const regnode_offset p,
12954                 const regnode_offset val,
12955                 const U32 depth)
12956 {
12957     regnode_offset scan;
12958     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12959 
12960     PERL_ARGS_ASSERT_REGTAIL;
12961 #ifndef DEBUGGING
12962     PERL_UNUSED_ARG(depth);
12963 #endif
12964 
12965     /* The final node in the chain is the first one with a nonzero next pointer
12966      * */
12967     scan = (regnode_offset) p;
12968     for (;;) {
12969         regnode * const temp = regnext(REGNODE_p(scan));
12970         DEBUG_PARSE_r({
12971             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12972             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12973             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
12974                 SvPV_nolen_const(RExC_mysv), scan,
12975                     (temp == NULL ? "->" : ""),
12976                     (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12977             );
12978         });
12979         if (temp == NULL)
12980             break;
12981         scan = REGNODE_OFFSET(temp);
12982     }
12983 
12984     /* Populate this node's next pointer */
12985     assert(val >= scan);
12986     if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12987         assert((UV) (val - scan) <= U32_MAX);
12988         ARG1u_SET(REGNODE_p(scan), val - scan);
12989     }
12990     else {
12991         if (val - scan > U16_MAX) {
12992             /* Populate this with something that won't loop and will likely
12993              * lead to a crash if the caller ignores the failure return, and
12994              * execution continues */
12995             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12996             return FALSE;
12997         }
12998         NEXT_OFF(REGNODE_p(scan)) = val - scan;
12999     }
13000 
13001     return TRUE;
13002 }
13003 
13004 #ifdef DEBUGGING
13005 /*
13006 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13007 - Look for optimizable sequences at the same time.
13008 - currently only looks for EXACT chains.
13009 
13010 This is experimental code. The idea is to use this routine to perform
13011 in place optimizations on branches and groups as they are constructed,
13012 with the long term intention of removing optimization from study_chunk so
13013 that it is purely analytical.
13014 
13015 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13016 to control which is which.
13017 
13018 This used to return a value that was ignored.  It was a problem that it is
13019 #ifdef'd to be another function that didn't return a value.  khw has changed it
13020 so both currently return a pass/fail return.
13021 
13022 */
13023 /* TODO: All four parms should be const */
13024 
13025 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)13026 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
13027                       const regnode_offset val, U32 depth)
13028 {
13029     regnode_offset scan;
13030     U8 exact = PSEUDO;
13031 #ifdef EXPERIMENTAL_INPLACESCAN
13032     I32 min = 0;
13033 #endif
13034     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13035 
13036     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13037 
13038 
13039     /* Find last node. */
13040 
13041     scan = p;
13042     for (;;) {
13043         regnode * const temp = regnext(REGNODE_p(scan));
13044 #ifdef EXPERIMENTAL_INPLACESCAN
13045         if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
13046             bool unfolded_multi_char;	/* Unexamined in this routine */
13047             if (join_exact(pRExC_state, scan, &min,
13048                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
13049                 return TRUE; /* Was return EXACT */
13050         }
13051 #endif
13052         if ( exact ) {
13053             if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
13054                 if (exact == PSEUDO )
13055                     exact= OP(REGNODE_p(scan));
13056                 else if (exact != OP(REGNODE_p(scan)) )
13057                     exact= 0;
13058             }
13059             else if (OP(REGNODE_p(scan)) != NOTHING) {
13060                 exact= 0;
13061             }
13062         }
13063         DEBUG_PARSE_r({
13064             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13065             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
13066             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
13067                 SvPV_nolen_const(RExC_mysv),
13068                 scan,
13069                 REGNODE_NAME(exact));
13070         });
13071         if (temp == NULL)
13072             break;
13073         scan = REGNODE_OFFSET(temp);
13074     }
13075     DEBUG_PARSE_r({
13076         DEBUG_PARSE_MSG("");
13077         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
13078         Perl_re_printf( aTHX_
13079                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
13080                       SvPV_nolen_const(RExC_mysv),
13081                       (IV)val,
13082                       (IV)(val - scan)
13083         );
13084     });
13085     if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
13086         assert((UV) (val - scan) <= U32_MAX);
13087         ARG1u_SET(REGNODE_p(scan), val - scan);
13088     }
13089     else {
13090         if (val - scan > U16_MAX) {
13091             /* Populate this with something that won't loop and will likely
13092              * lead to a crash if the caller ignores the failure return, and
13093              * execution continues */
13094             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
13095             return FALSE;
13096         }
13097         NEXT_OFF(REGNODE_p(scan)) = val - scan;
13098     }
13099 
13100     return TRUE; /* Was 'return exact' */
13101 }
13102 #endif
13103 
13104 
13105 #ifdef PERL_RE_BUILD_AUX
13106 SV*
Perl_get_ANYOFM_contents(pTHX_ const regnode * n)13107 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13108 
13109     /* Returns an inversion list of all the code points matched by the
13110      * ANYOFM/NANYOFM node 'n' */
13111 
13112     SV * cp_list = _new_invlist(-1);
13113     const U8 lowest = (U8) ARG1u(n);
13114     unsigned int i;
13115     U8 count = 0;
13116     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13117 
13118     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13119 
13120     /* Starting with the lowest code point, any code point that ANDed with the
13121      * mask yields the lowest code point is in the set */
13122     for (i = lowest; i <= 0xFF; i++) {
13123         if ((i & FLAGS(n)) == ARG1u(n)) {
13124             cp_list = add_cp_to_invlist(cp_list, i);
13125             count++;
13126 
13127             /* We know how many code points (a power of two) that are in the
13128              * set.  No use looking once we've got that number */
13129             if (count >= needed) break;
13130         }
13131     }
13132 
13133     if (OP(n) == NANYOFM) {
13134         _invlist_invert(cp_list);
13135     }
13136     return cp_list;
13137 }
13138 
13139 SV *
Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n)13140 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13141     PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13142 
13143     SV * cp_list = NULL;
13144     populate_invlist_from_bitmap(
13145               ((struct regnode_bbm *) n)->bitmap,
13146               REGNODE_BBM_BITMAP_LEN * CHARBITS,
13147               &cp_list,
13148 
13149               /* The base cp is from the start byte plus a zero continuation */
13150               TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13151                                       UTF_CONTINUATION_MARK | 0));
13152     return cp_list;
13153 }
13154 #endif /* PERL_RE_BUILD_AUX */
13155 
13156 
13157 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)13158 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13159 {				/* Assume that RE_INTUIT is set */
13160     /* Returns an SV containing a string that must appear in the target for it
13161      * to match, or NULL if nothing is known that must match.
13162      *
13163      * CAUTION: the SV can be freed during execution of the regex engine */
13164 
13165     struct regexp *const prog = ReANY(r);
13166     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13167 
13168     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13169     PERL_UNUSED_CONTEXT;
13170 
13171     DEBUG_COMPILE_r(
13172         {
13173             if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13174                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13175                       ? prog->check_utf8 : prog->check_substr);
13176 
13177                 if (!PL_colorset) reginitcolors();
13178                 Perl_re_printf( aTHX_
13179                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13180                       PL_colors[4],
13181                       RX_UTF8(r) ? "utf8 " : "",
13182                       PL_colors[5], PL_colors[0],
13183                       s,
13184                       PL_colors[1],
13185                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13186             }
13187         } );
13188 
13189     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13190     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13191 }
13192 
13193 /*
13194    pregfree()
13195 
13196    handles refcounting and freeing the perl core regexp structure. When
13197    it is necessary to actually free the structure the first thing it
13198    does is call the 'free' method of the regexp_engine associated to
13199    the regexp, allowing the handling of the void *pprivate; member
13200    first. (This routine is not overridable by extensions, which is why
13201    the extensions free is called first.)
13202 
13203    See regdupe and regdupe_internal if you change anything here.
13204 */
13205 #ifndef PERL_IN_XSUB_RE
13206 void
Perl_pregfree(pTHX_ REGEXP * r)13207 Perl_pregfree(pTHX_ REGEXP *r)
13208 {
13209     SvREFCNT_dec(r);
13210 }
13211 
13212 void
Perl_pregfree2(pTHX_ REGEXP * rx)13213 Perl_pregfree2(pTHX_ REGEXP *rx)
13214 {
13215     struct regexp *const r = ReANY(rx);
13216     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13217 
13218     PERL_ARGS_ASSERT_PREGFREE2;
13219 
13220     if (! r)
13221         return;
13222 
13223     if (r->mother_re) {
13224         ReREFCNT_dec(r->mother_re);
13225     } else {
13226         CALLREGFREE_PVT(rx); /* free the private data */
13227         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13228     }
13229     if (r->substrs) {
13230         int i;
13231         for (i = 0; i < 2; i++) {
13232             SvREFCNT_dec(r->substrs->data[i].substr);
13233             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13234         }
13235         Safefree(r->substrs);
13236     }
13237     RX_MATCH_COPY_FREE(rx);
13238 #ifdef PERL_ANY_COW
13239     SvREFCNT_dec(r->saved_copy);
13240 #endif
13241     Safefree(RXp_OFFSp(r));
13242     if (r->logical_to_parno) {
13243         Safefree(r->logical_to_parno);
13244         Safefree(r->parno_to_logical);
13245         Safefree(r->parno_to_logical_next);
13246     }
13247 
13248     SvREFCNT_dec(r->qr_anoncv);
13249     if (r->recurse_locinput)
13250         Safefree(r->recurse_locinput);
13251 }
13252 
13253 
13254 /*  reg_temp_copy()
13255 
13256     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13257     except that dsv will be created if NULL.
13258 
13259     This function is used in two main ways. First to implement
13260         $r = qr/....; $s = $$r;
13261 
13262     Secondly, it is used as a hacky workaround to the structural issue of
13263     match results
13264     being stored in the regexp structure which is in turn stored in
13265     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13266     could be PL_curpm in multiple contexts, and could require multiple
13267     result sets being associated with the pattern simultaneously, such
13268     as when doing a recursive match with (??{$qr})
13269 
13270     The solution is to make a lightweight copy of the regexp structure
13271     when a qr// is returned from the code executed by (??{$qr}) this
13272     lightweight copy doesn't actually own any of its data except for
13273     the starp/end and the actual regexp structure itself.
13274 
13275 */
13276 
13277 
13278 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)13279 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13280 {
13281     struct regexp *drx;
13282     struct regexp *const srx = ReANY(ssv);
13283     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13284 
13285     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13286 
13287     if (!dsv)
13288         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13289     else {
13290         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13291 
13292         /* our only valid caller, sv_setsv_flags(), should have done
13293          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13294         assert(!SvOOK(dsv));
13295         assert(!SvIsCOW(dsv));
13296         assert(!SvROK(dsv));
13297 
13298         if (SvPVX_const(dsv)) {
13299             if (SvLEN(dsv))
13300                 Safefree(SvPVX(dsv));
13301             SvPVX(dsv) = NULL;
13302         }
13303         SvLEN_set(dsv, 0);
13304         SvCUR_set(dsv, 0);
13305         SvOK_off((SV *)dsv);
13306 
13307         if (islv) {
13308             /* For PVLVs, the head (sv_any) points to an XPVLV, while
13309              * the LV's xpvlenu_rx will point to a regexp body, which
13310              * we allocate here */
13311             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13312             assert(!SvPVX(dsv));
13313             /* We "steal" the body from the newly allocated SV temp, changing
13314              * the pointer in its HEAD to NULL. We then change its type to
13315              * SVt_NULL so that when we immediately release its only reference,
13316              * no memory deallocation happens.
13317              *
13318              * The body will eventually be freed (from the PVLV) either in
13319              * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13320              * the regexp body needs to be removed)
13321              * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13322              * the PVLV itself is deallocated). */
13323             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13324             temp->sv_any = NULL;
13325             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13326             SvREFCNT_dec_NN(temp);
13327             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13328                ing below will not set it. */
13329             SvCUR_set(dsv, SvCUR(ssv));
13330         }
13331     }
13332     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13333        sv_force_normal(sv) is called.  */
13334     SvFAKE_on(dsv);
13335     drx = ReANY(dsv);
13336 
13337     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13338     SvPV_set(dsv, RX_WRAPPED(ssv));
13339     /* We share the same string buffer as the original regexp, on which we
13340        hold a reference count, incremented when mother_re is set below.
13341        The string pointer is copied here, being part of the regexp struct.
13342      */
13343     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13344            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13345 
13346     if (!islv)
13347         SvLEN_set(dsv, 0);
13348     if (RXp_OFFSp(srx)) {
13349         const I32 npar = srx->nparens+1;
13350         NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13351     }
13352     if (srx->substrs) {
13353         int i;
13354         Newx(drx->substrs, 1, struct reg_substr_data);
13355         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13356 
13357         for (i = 0; i < 2; i++) {
13358             SvREFCNT_inc_void(drx->substrs->data[i].substr);
13359             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13360         }
13361 
13362         /* check_substr and check_utf8, if non-NULL, point to either their
13363            anchored or float namesakes, and don't hold a second reference.  */
13364     }
13365     if (srx->logical_to_parno) {
13366         NewCopy(srx->logical_to_parno,
13367                 drx->logical_to_parno,
13368                 srx->nparens+1, I32);
13369         NewCopy(srx->parno_to_logical,
13370                 drx->parno_to_logical,
13371                 srx->nparens+1, I32);
13372         NewCopy(srx->parno_to_logical_next,
13373                 drx->parno_to_logical_next,
13374                 srx->nparens+1, I32);
13375     } else {
13376         drx->logical_to_parno = NULL;
13377         drx->parno_to_logical = NULL;
13378         drx->parno_to_logical_next = NULL;
13379     }
13380     drx->logical_nparens = srx->logical_nparens;
13381 
13382     RX_MATCH_COPIED_off(dsv);
13383 #ifdef PERL_ANY_COW
13384     RXp_SAVED_COPY(drx) = NULL;
13385 #endif
13386     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13387     SvREFCNT_inc_void(drx->qr_anoncv);
13388     if (srx->recurse_locinput)
13389         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13390 
13391     return dsv;
13392 }
13393 #endif
13394 
13395 
13396 /* regfree_internal()
13397 
13398    Free the private data in a regexp. This is overloadable by
13399    extensions. Perl takes care of the regexp structure in pregfree(),
13400    this covers the *pprivate pointer which technically perl doesn't
13401    know about, however of course we have to handle the
13402    regexp_internal structure when no extension is in use.
13403 
13404    Note this is called before freeing anything in the regexp
13405    structure.
13406  */
13407 
13408 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)13409 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13410 {
13411     struct regexp *const r = ReANY(rx);
13412     RXi_GET_DECL(r, ri);
13413     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13414 
13415     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13416 
13417     if (! ri) {
13418         return;
13419     }
13420 
13421     DEBUG_COMPILE_r({
13422         if (!PL_colorset)
13423             reginitcolors();
13424         {
13425             SV *dsv= sv_newmortal();
13426             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13427                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13428             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13429                 PL_colors[4], PL_colors[5], s);
13430         }
13431     });
13432 
13433     if (ri->code_blocks)
13434         S_free_codeblocks(aTHX_ ri->code_blocks);
13435 
13436     if (ri->data) {
13437         int n = ri->data->count;
13438 
13439         while (--n >= 0) {
13440           /* If you add a ->what type here, update the comment in regcomp.h */
13441             switch (ri->data->what[n]) {
13442             case 'a':
13443             case 'r':
13444             case 's':
13445             case 'S':
13446             case 'u':
13447                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13448                 break;
13449             case 'f':
13450                 Safefree(ri->data->data[n]);
13451                 break;
13452             case 'l':
13453             case 'L':
13454                 break;
13455             case 'T':
13456                 { /* Aho Corasick add-on structure for a trie node.
13457                      Used in stclass optimization only */
13458                     U32 refcount;
13459                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13460                     OP_REFCNT_LOCK;
13461                     refcount = --aho->refcount;
13462                     OP_REFCNT_UNLOCK;
13463                     if ( !refcount ) {
13464                         PerlMemShared_free(aho->states);
13465                         PerlMemShared_free(aho->fail);
13466                          /* do this last!!!! */
13467                         PerlMemShared_free(ri->data->data[n]);
13468                         /* we should only ever get called once, so
13469                          * assert as much, and also guard the free
13470                          * which /might/ happen twice. At the least
13471                          * it will make code anlyzers happy and it
13472                          * doesn't cost much. - Yves */
13473                         assert(ri->regstclass);
13474                         if (ri->regstclass) {
13475                             PerlMemShared_free(ri->regstclass);
13476                             ri->regstclass = 0;
13477                         }
13478                     }
13479                 }
13480                 break;
13481             case 't':
13482                 {
13483                     /* trie structure. */
13484                     U32 refcount;
13485                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13486                     OP_REFCNT_LOCK;
13487                     refcount = --trie->refcount;
13488                     OP_REFCNT_UNLOCK;
13489                     if ( !refcount ) {
13490                         PerlMemShared_free(trie->charmap);
13491                         PerlMemShared_free(trie->states);
13492                         PerlMemShared_free(trie->trans);
13493                         if (trie->bitmap)
13494                             PerlMemShared_free(trie->bitmap);
13495                         if (trie->jump)
13496                             PerlMemShared_free(trie->jump);
13497                         if (trie->j_before_paren)
13498                             PerlMemShared_free(trie->j_before_paren);
13499                         if (trie->j_after_paren)
13500                             PerlMemShared_free(trie->j_after_paren);
13501                         PerlMemShared_free(trie->wordinfo);
13502                         /* do this last!!!! */
13503                         PerlMemShared_free(ri->data->data[n]);
13504                     }
13505                 }
13506                 break;
13507             case '%':
13508                 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13509                  * always returns non-zero, this should only ever happen in the
13510                  * 0 index */
13511                 assert(n==0);
13512                 break;
13513             default:
13514                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13515                                                     ri->data->what[n]);
13516             }
13517         }
13518         Safefree(ri->data->what);
13519         Safefree(ri->data);
13520     }
13521 
13522     Safefree(ri);
13523 }
13524 
13525 #define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
13526 
13527 /*
13528 =for apidoc re_dup_guts
13529 Duplicate a regexp.
13530 
13531 This routine is expected to clone a given regexp structure. It is only
13532 compiled under USE_ITHREADS.
13533 
13534 After all of the core data stored in struct regexp is duplicated
13535 the C<regexp_engine.dupe> method is used to copy any private data
13536 stored in the *pprivate pointer. This allows extensions to handle
13537 any duplication they need to do.
13538 
13539 =cut
13540 
13541    See pregfree() and regfree_internal() if you change anything here.
13542 */
13543 #if defined(USE_ITHREADS)
13544 #ifndef PERL_IN_XSUB_RE
13545 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)13546 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13547 {
13548     I32 npar;
13549     const struct regexp *r = ReANY(sstr);
13550     struct regexp *ret = ReANY(dstr);
13551 
13552     PERL_ARGS_ASSERT_RE_DUP_GUTS;
13553 
13554     npar = r->nparens+1;
13555     NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13556 
13557     if (ret->substrs) {
13558         /* Do it this way to avoid reading from *r after the StructCopy().
13559            That way, if any of the sv_dup_inc()s dislodge *r from the L1
13560            cache, it doesn't matter.  */
13561         int i;
13562         const bool anchored = r->check_substr
13563             ? r->check_substr == r->substrs->data[0].substr
13564             : r->check_utf8   == r->substrs->data[0].utf8_substr;
13565         Newx(ret->substrs, 1, struct reg_substr_data);
13566         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13567 
13568         for (i = 0; i < 2; i++) {
13569             ret->substrs->data[i].substr =
13570                         sv_dup_inc(ret->substrs->data[i].substr, param);
13571             ret->substrs->data[i].utf8_substr =
13572                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13573         }
13574 
13575         /* check_substr and check_utf8, if non-NULL, point to either their
13576            anchored or float namesakes, and don't hold a second reference.  */
13577 
13578         if (ret->check_substr) {
13579             if (anchored) {
13580                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13581 
13582                 ret->check_substr = ret->substrs->data[0].substr;
13583                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
13584             } else {
13585                 assert(r->check_substr == r->substrs->data[1].substr);
13586                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
13587 
13588                 ret->check_substr = ret->substrs->data[1].substr;
13589                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
13590             }
13591         } else if (ret->check_utf8) {
13592             if (anchored) {
13593                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13594             } else {
13595                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13596             }
13597         }
13598     }
13599 
13600     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13601     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13602     if (r->recurse_locinput)
13603         Newx(ret->recurse_locinput, r->nparens + 1, char *);
13604 
13605     if (ret->pprivate)
13606         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13607 
13608     if (RX_MATCH_COPIED(dstr))
13609         RXp_SUBBEG(ret)  = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13610     else
13611         RXp_SUBBEG(ret) = NULL;
13612 #ifdef PERL_ANY_COW
13613     RXp_SAVED_COPY(ret) = NULL;
13614 #endif
13615 
13616     if (r->logical_to_parno) {
13617         /* we use total_parens for all three just for symmetry */
13618         ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13619         ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13620         ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13621     } else {
13622         ret->logical_to_parno = NULL;
13623         ret->parno_to_logical = NULL;
13624         ret->parno_to_logical_next = NULL;
13625     }
13626 
13627     ret->logical_nparens = r->logical_nparens;
13628 
13629     /* Whether mother_re be set or no, we need to copy the string.  We
13630        cannot refrain from copying it when the storage points directly to
13631        our mother regexp, because that's
13632                1: a buffer in a different thread
13633                2: something we no longer hold a reference on
13634                so we need to copy it locally.  */
13635     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13636     /* set malloced length to a non-zero value so it will be freed
13637      * (otherwise in combination with SVf_FAKE it looks like an alien
13638      * buffer). It doesn't have to be the actual malloced size, since it
13639      * should never be grown */
13640     SvLEN_set(dstr, SvCUR(sstr)+1);
13641     ret->mother_re   = NULL;
13642 }
13643 #endif /* PERL_IN_XSUB_RE */
13644 
13645 /*
13646    regdupe_internal()
13647 
13648    This is the internal complement to regdupe() which is used to copy
13649    the structure pointed to by the *pprivate pointer in the regexp.
13650    This is the core version of the extension overridable cloning hook.
13651    The regexp structure being duplicated will be copied by perl prior
13652    to this and will be provided as the regexp *r argument, however
13653    with the /old/ structures pprivate pointer value. Thus this routine
13654    may override any copying normally done by perl.
13655 
13656    It returns a pointer to the new regexp_internal structure.
13657 */
13658 
13659 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)13660 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13661 {
13662     struct regexp *const r = ReANY(rx);
13663     regexp_internal *reti;
13664     int len;
13665     RXi_GET_DECL(r, ri);
13666 
13667     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13668 
13669     len = ProgLen(ri);
13670 
13671     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13672           char, regexp_internal);
13673     Copy(ri->program, reti->program, len+1, regnode);
13674 
13675 
13676     if (ri->code_blocks) {
13677         int n;
13678         Newx(reti->code_blocks, 1, struct reg_code_blocks);
13679         Newx(reti->code_blocks->cb, ri->code_blocks->count,
13680                     struct reg_code_block);
13681         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13682              ri->code_blocks->count, struct reg_code_block);
13683         for (n = 0; n < ri->code_blocks->count; n++)
13684              reti->code_blocks->cb[n].src_regex = (REGEXP*)
13685                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13686         reti->code_blocks->count = ri->code_blocks->count;
13687         reti->code_blocks->refcnt = 1;
13688     }
13689     else
13690         reti->code_blocks = NULL;
13691 
13692     reti->regstclass = NULL;
13693 
13694     if (ri->data) {
13695         struct reg_data *d;
13696         const int count = ri->data->count;
13697         int i;
13698 
13699         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13700                 char, struct reg_data);
13701         Newx(d->what, count, U8);
13702 
13703         d->count = count;
13704         for (i = 0; i < count; i++) {
13705             d->what[i] = ri->data->what[i];
13706             switch (d->what[i]) {
13707                 /* see also regcomp.h and regfree_internal() */
13708             case 'a': /* actually an AV, but the dup function is identical.
13709                          values seem to be "plain sv's" generally. */
13710             case 'r': /* a compiled regex (but still just another SV) */
13711             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13712                          this use case should go away, the code could have used
13713                          'a' instead - see S_set_ANYOF_arg() for array contents. */
13714             case 'S': /* actually an SV, but the dup function is identical.  */
13715             case 'u': /* actually an HV, but the dup function is identical.
13716                          values are "plain sv's" */
13717                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13718                 break;
13719             case 'f':
13720                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13721                  * patterns which could start with several different things. Pre-TRIE
13722                  * this was more important than it is now, however this still helps
13723                  * in some places, for instance /x?a+/ might produce a SSC equivalent
13724                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13725                  * in regexec.c
13726                  */
13727                 /* This is cheating. */
13728                 Newx(d->data[i], 1, regnode_ssc);
13729                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13730                 reti->regstclass = (regnode*)d->data[i];
13731                 break;
13732             case 'T':
13733                 /* AHO-CORASICK fail table */
13734                 /* Trie stclasses are readonly and can thus be shared
13735                  * without duplication. We free the stclass in pregfree
13736                  * when the corresponding reg_ac_data struct is freed.
13737                  */
13738                 reti->regstclass= ri->regstclass;
13739                 /* FALLTHROUGH */
13740             case 't':
13741                 /* TRIE transition table */
13742                 OP_REFCNT_LOCK;
13743                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13744                 OP_REFCNT_UNLOCK;
13745                 /* FALLTHROUGH */
13746             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13747             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13748                          is not from another regexp */
13749                 d->data[i] = ri->data->data[i];
13750                 break;
13751             case '%':
13752                 /* this is a placeholder type, it exists purely so that
13753                  * reg_add_data always returns a non-zero value, this type of
13754                  * entry should ONLY be present in the 0 slot of the array */
13755                 assert(i == 0);
13756                 d->data[i]= ri->data->data[i];
13757                 break;
13758             default:
13759                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13760                                                            ri->data->what[i]);
13761             }
13762         }
13763 
13764         reti->data = d;
13765     }
13766     else
13767         reti->data = NULL;
13768 
13769     if (ri->regstclass && !reti->regstclass) {
13770         /* Assume that the regstclass is a regnode which is inside of the
13771          * program which we have to copy over */
13772         regnode *node= ri->regstclass;
13773         assert(node >= ri->program && (node - ri->program) < len);
13774         reti->regstclass = reti->program + (node - ri->program);
13775     }
13776 
13777 
13778     reti->name_list_idx = ri->name_list_idx;
13779 
13780     SetProgLen(reti, len);
13781 
13782     return (void*)reti;
13783 }
13784 
13785 #endif    /* USE_ITHREADS */
13786 
13787 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)13788 S_re_croak(pTHX_ bool utf8, const char* pat,...)
13789 {
13790     va_list args;
13791     STRLEN len = strlen(pat);
13792     char buf[512];
13793     SV *msv;
13794     const char *message;
13795 
13796     PERL_ARGS_ASSERT_RE_CROAK;
13797 
13798     if (len > 510)
13799         len = 510;
13800     Copy(pat, buf, len , char);
13801     buf[len] = '\n';
13802     buf[len + 1] = '\0';
13803     va_start(args, pat);
13804     msv = vmess(buf, &args);
13805     va_end(args);
13806     message = SvPV_const(msv, len);
13807     if (len > 512)
13808         len = 512;
13809     Copy(message, buf, len , char);
13810     /* len-1 to avoid \n */
13811     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13812 }
13813 
13814 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13815 
13816 #ifndef PERL_IN_XSUB_RE
13817 void
Perl_save_re_context(pTHX)13818 Perl_save_re_context(pTHX)
13819 {
13820     I32 nparens = -1;
13821     I32 i;
13822 
13823     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13824 
13825     if (PL_curpm) {
13826         const REGEXP * const rx = PM_GETRE(PL_curpm);
13827         if (rx)
13828             nparens = RX_NPARENS(rx);
13829     }
13830 
13831     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13832      * that PL_curpm will be null, but that utf8.pm and the modules it
13833      * loads will only use $1..$3.
13834      * The t/porting/re_context.t test file checks this assumption.
13835      */
13836     if (nparens == -1)
13837         nparens = 3;
13838 
13839     for (i = 1; i <= nparens; i++) {
13840         char digits[TYPE_CHARS(long)];
13841         const STRLEN len = my_snprintf(digits, sizeof(digits),
13842                                        "%lu", (long)i);
13843         GV *const *const gvp
13844             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13845 
13846         if (gvp) {
13847             GV * const gv = *gvp;
13848             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13849                 save_scalar(gv);
13850         }
13851     }
13852 }
13853 #endif
13854 
13855 #ifndef PERL_IN_XSUB_RE
13856 
13857 #  include "uni_keywords.h"
13858 
13859 void
Perl_init_uniprops(pTHX)13860 Perl_init_uniprops(pTHX)
13861 {
13862 
13863 #  ifdef DEBUGGING
13864     char * dump_len_string;
13865 
13866     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13867     if (   ! dump_len_string
13868         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13869     {
13870         PL_dump_re_max_len = 60;    /* A reasonable default */
13871     }
13872 #  endif
13873 
13874     PL_user_def_props = newHV();
13875 
13876 #  ifdef USE_ITHREADS
13877 
13878     HvSHAREKEYS_off(PL_user_def_props);
13879     PL_user_def_props_aTHX = aTHX;
13880 
13881 #  endif
13882 
13883     /* Set up the inversion list interpreter-level variables */
13884 
13885     PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13886     PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13887     PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13888     PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13889     PL_XPosix_ptrs[CC_CASED_] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13890     PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13891     PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13892     PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13893     PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13894     PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13895     PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13896     PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13897     PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13898     PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13899     PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13900     PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13901 
13902     PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13903     PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13904     PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13905     PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13906     PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13907     PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13908     PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13909     PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13910     PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13911     PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13912     PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13913     PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13914     PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13915     PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13916     PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13917     PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13918 
13919     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13920     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13921     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13922     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13923     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13924 
13925     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13926     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13927     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13928     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13929 
13930     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13931 
13932     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13933     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13934 
13935     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13936     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13937 
13938     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13939     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13940                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13941     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13942                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13943     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13944     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13945     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13946     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13947     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13948     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13949     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13950     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13951     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13952 
13953 #  ifdef UNI_XIDC
13954     /* The below are used only by deprecated functions.  They could be removed */
13955     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13956     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13957     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13958 #  endif
13959 }
13960 
13961 /* These four functions are compiled only in regcomp.c, where they have access
13962  * to the data they return.  They are a way for re_comp.c to get access to that
13963  * data without having to compile the whole data structures. */
13964 
13965 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)13966 Perl_do_uniprop_match(const char * const key, const U16 key_len)
13967 {
13968     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13969 
13970     return match_uniprop((U8 *) key, key_len);
13971 }
13972 
13973 SV *
Perl_get_prop_definition(pTHX_ const int table_index)13974 Perl_get_prop_definition(pTHX_ const int table_index)
13975 {
13976     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13977 
13978     /* Create and return the inversion list */
13979     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13980 }
13981 
13982 const char * const *
Perl_get_prop_values(const int table_index)13983 Perl_get_prop_values(const int table_index)
13984 {
13985     PERL_ARGS_ASSERT_GET_PROP_VALUES;
13986 
13987     return UNI_prop_value_ptrs[table_index];
13988 }
13989 
13990 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)13991 Perl_get_deprecated_property_msg(const Size_t warning_offset)
13992 {
13993     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13994 
13995     return deprecated_property_msgs[warning_offset];
13996 }
13997 
13998 #  if 0
13999 
14000 This code was mainly added for backcompat to give a warning for non-portable
14001 code points in user-defined properties.  But experiments showed that the
14002 warning in earlier perls were only omitted on overflow, which should be an
14003 error, so there really isnt a backcompat issue, and actually adding the
14004 warning when none was present before might cause breakage, for little gain.  So
14005 khw left this code in, but not enabled.  Tests were never added.
14006 
14007 embed.fnc entry:
14008 Ei	|const char *|get_extended_utf8_msg|const UV cp
14009 
14010 PERL_STATIC_INLINE const char *
14011 S_get_extended_utf8_msg(pTHX_ const UV cp)
14012 {
14013     U8 dummy[UTF8_MAXBYTES + 1];
14014     HV *msgs;
14015     SV **msg;
14016 
14017     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
14018                              &msgs);
14019 
14020     msg = hv_fetchs(msgs, "text", 0);
14021     assert(msg);
14022 
14023     (void) sv_2mortal((SV *) msgs);
14024 
14025     return SvPVX(*msg);
14026 }
14027 
14028 #  endif
14029 #endif /* end of ! PERL_IN_XSUB_RE */
14030 
14031 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)14032 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
14033                          const bool ignore_case)
14034 {
14035     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
14036      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
14037      * because nothing outside of ASCII will match.  Use /m because the input
14038      * string may be a bunch of lines strung together.
14039      *
14040      * Also sets up the debugging info */
14041 
14042     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
14043     U32 rx_flags;
14044     SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
14045     REGEXP * subpattern_re;
14046     DECLARE_AND_GET_RE_DEBUG_FLAGS;
14047 
14048     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
14049 
14050     if (ignore_case) {
14051         flags |= PMf_FOLD;
14052     }
14053     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
14054 
14055     /* Like in op.c, we copy the compile time pm flags to the rx ones */
14056     rx_flags = flags & RXf_PMf_COMPILETIME;
14057 
14058 #ifndef PERL_IN_XSUB_RE
14059     /* Use the core engine if this file is regcomp.c.  That means no
14060      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
14061     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14062                                              &PL_core_reg_engine,
14063                                              NULL, NULL,
14064                                              rx_flags, flags);
14065 #else
14066     if (isDEBUG_WILDCARD) {
14067         /* Use the special debugging engine if this file is re_comp.c and wants
14068          * to output the wildcard matching.  This uses whatever
14069          * 'use re "Debug ..." is in effect */
14070         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14071                                                  &my_reg_engine,
14072                                                  NULL, NULL,
14073                                                  rx_flags, flags);
14074     }
14075     else {
14076         /* Use the special wildcard engine if this file is re_comp.c and
14077          * doesn't want to output the wildcard matching.  This uses whatever
14078          * 'use re "Debug ..." is in effect for compilation, but this engine
14079          * structure has been set up so that it uses the core engine for
14080          * execution, so no execution debugging as a result of re.pm will be
14081          * displayed. */
14082         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14083                                                  &wild_reg_engine,
14084                                                  NULL, NULL,
14085                                                  rx_flags, flags);
14086         /* XXX The above has the effect that any user-supplied regex engine
14087          * won't be called for matching wildcards.  That might be good, or bad.
14088          * It could be changed in several ways.  The reason it is done the
14089          * current way is to avoid having to save and restore
14090          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
14091          * could be used.  Another suggestion is to keep the authoritative
14092          * value of the debug flags in a thread-local variable and add set/get
14093          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
14094          * Still another is to pass a flag, say in the engine's intflags that
14095          * would be checked each time before doing the debug output */
14096     }
14097 #endif
14098 
14099     assert(subpattern_re);  /* Should have died if didn't compile successfully */
14100     return subpattern_re;
14101 }
14102 
14103 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)14104 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14105          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14106 {
14107     I32 result;
14108     DECLARE_AND_GET_RE_DEBUG_FLAGS;
14109 
14110     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14111 
14112     ENTER;
14113 
14114     /* The compilation has set things up so that if the program doesn't want to
14115      * see the wildcard matching procedure, it will get the core execution
14116      * engine, which is subject only to -Dr.  So we have to turn that off
14117      * around this procedure */
14118     if (! isDEBUG_WILDCARD) {
14119         /* Note! Casts away 'volatile' */
14120         SAVEI32(PL_debug);
14121         PL_debug &= ~ DEBUG_r_FLAG;
14122     }
14123 
14124     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14125                          NULL, nosave);
14126     LEAVE;
14127 
14128     return result;
14129 }
14130 
14131 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)14132 S_handle_user_defined_property(pTHX_
14133 
14134     /* Parses the contents of a user-defined property definition; returning the
14135      * expanded definition if possible.  If so, the return is an inversion
14136      * list.
14137      *
14138      * If there are subroutines that are part of the expansion and which aren't
14139      * known at the time of the call to this function, this returns what
14140      * parse_uniprop_string() returned for the first one encountered.
14141      *
14142      * If an error was found, NULL is returned, and 'msg' gets a suitable
14143      * message appended to it.  (Appending allows the back trace of how we got
14144      * to the faulty definition to be displayed through nested calls of
14145      * user-defined subs.)
14146      *
14147      * The caller IS responsible for freeing any returned SV.
14148      *
14149      * The syntax of the contents is pretty much described in perlunicode.pod,
14150      * but we also allow comments on each line */
14151 
14152     const char * name,          /* Name of property */
14153     const STRLEN name_len,      /* The name's length in bytes */
14154     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14155     const bool to_fold,         /* ? Is this under /i */
14156     const bool runtime,         /* ? Are we in compile- or run-time */
14157     const bool deferrable,      /* Is it ok for this property's full definition
14158                                    to be deferred until later? */
14159     SV* contents,               /* The property's definition */
14160     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
14161                                    getting called unless this is thought to be
14162                                    a user-defined property */
14163     SV * msg,                   /* Any error or warning msg(s) are appended to
14164                                    this */
14165     const STRLEN level)         /* Recursion level of this call */
14166 {
14167     STRLEN len;
14168     const char * string         = SvPV_const(contents, len);
14169     const char * const e        = string + len;
14170     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14171     const STRLEN msgs_length_on_entry = SvCUR(msg);
14172 
14173     const char * s0 = string;   /* Points to first byte in the current line
14174                                    being parsed in 'string' */
14175     const char overflow_msg[] = "Code point too large in \"";
14176     SV* running_definition = NULL;
14177 
14178     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14179 
14180     *user_defined_ptr = TRUE;
14181 
14182     /* Look at each line */
14183     while (s0 < e) {
14184         const char * s;     /* Current byte */
14185         char op = '+';      /* Default operation is 'union' */
14186         IV   min = 0;       /* range begin code point */
14187         IV   max = -1;      /* and range end */
14188         SV* this_definition;
14189 
14190         /* Skip comment lines */
14191         if (*s0 == '#') {
14192             s0 = strchr(s0, '\n');
14193             if (s0 == NULL) {
14194                 break;
14195             }
14196             s0++;
14197             continue;
14198         }
14199 
14200         /* For backcompat, allow an empty first line */
14201         if (*s0 == '\n') {
14202             s0++;
14203             continue;
14204         }
14205 
14206         /* First character in the line may optionally be the operation */
14207         if (   *s0 == '+'
14208             || *s0 == '!'
14209             || *s0 == '-'
14210             || *s0 == '&')
14211         {
14212             op = *s0++;
14213         }
14214 
14215         /* If the line is one or two hex digits separated by blank space, its
14216          * a range; otherwise it is either another user-defined property or an
14217          * error */
14218 
14219         s = s0;
14220 
14221         if (! isXDIGIT(*s)) {
14222             goto check_if_property;
14223         }
14224 
14225         do { /* Each new hex digit will add 4 bits. */
14226             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14227                 s = strchr(s, '\n');
14228                 if (s == NULL) {
14229                     s = e;
14230                 }
14231                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14232                 sv_catpv(msg, overflow_msg);
14233                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14234                                      UTF8fARG(is_contents_utf8, s - s0, s0));
14235                 sv_catpvs(msg, "\"");
14236                 goto return_failure;
14237             }
14238 
14239             /* Accumulate this digit into the value */
14240             min = (min << 4) + READ_XDIGIT(s);
14241         } while (isXDIGIT(*s));
14242 
14243         while (isBLANK(*s)) { s++; }
14244 
14245         /* We allow comments at the end of the line */
14246         if (*s == '#') {
14247             s = strchr(s, '\n');
14248             if (s == NULL) {
14249                 s = e;
14250             }
14251             s++;
14252         }
14253         else if (s < e && *s != '\n') {
14254             if (! isXDIGIT(*s)) {
14255                 goto check_if_property;
14256             }
14257 
14258             /* Look for the high point of the range */
14259             max = 0;
14260             do {
14261                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14262                     s = strchr(s, '\n');
14263                     if (s == NULL) {
14264                         s = e;
14265                     }
14266                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14267                     sv_catpv(msg, overflow_msg);
14268                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14269                                       UTF8fARG(is_contents_utf8, s - s0, s0));
14270                     sv_catpvs(msg, "\"");
14271                     goto return_failure;
14272                 }
14273 
14274                 max = (max << 4) + READ_XDIGIT(s);
14275             } while (isXDIGIT(*s));
14276 
14277             while (isBLANK(*s)) { s++; }
14278 
14279             if (*s == '#') {
14280                 s = strchr(s, '\n');
14281                 if (s == NULL) {
14282                     s = e;
14283                 }
14284             }
14285             else if (s < e && *s != '\n') {
14286                 goto check_if_property;
14287             }
14288         }
14289 
14290         if (max == -1) {    /* The line only had one entry */
14291             max = min;
14292         }
14293         else if (max < min) {
14294             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14295             sv_catpvs(msg, "Illegal range in \"");
14296             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14297                                 UTF8fARG(is_contents_utf8, s - s0, s0));
14298             sv_catpvs(msg, "\"");
14299             goto return_failure;
14300         }
14301 
14302 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
14303 
14304         if (   UNICODE_IS_PERL_EXTENDED(min)
14305             || UNICODE_IS_PERL_EXTENDED(max))
14306         {
14307             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14308 
14309             /* If both code points are non-portable, warn only on the lower
14310              * one. */
14311             sv_catpv(msg, get_extended_utf8_msg(
14312                                             (UNICODE_IS_PERL_EXTENDED(min))
14313                                             ? min : max));
14314             sv_catpvs(msg, " in \"");
14315             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14316                                  UTF8fARG(is_contents_utf8, s - s0, s0));
14317             sv_catpvs(msg, "\"");
14318         }
14319 
14320 #  endif
14321 
14322         /* Here, this line contains a legal range */
14323         this_definition = sv_2mortal(_new_invlist(2));
14324         this_definition = _add_range_to_invlist(this_definition, min, max);
14325         goto calculate;
14326 
14327       check_if_property:
14328 
14329         /* Here it isn't a legal range line.  See if it is a legal property
14330          * line.  First find the end of the meat of the line */
14331         s = strpbrk(s, "#\n");
14332         if (s == NULL) {
14333             s = e;
14334         }
14335 
14336         /* Ignore trailing blanks in keeping with the requirements of
14337          * parse_uniprop_string() */
14338         s--;
14339         while (s > s0 && isBLANK_A(*s)) {
14340             s--;
14341         }
14342         s++;
14343 
14344         this_definition = parse_uniprop_string(s0, s - s0,
14345                                                is_utf8, to_fold, runtime,
14346                                                deferrable,
14347                                                NULL,
14348                                                user_defined_ptr, msg,
14349                                                (name_len == 0)
14350                                                 ? level /* Don't increase level
14351                                                            if input is empty */
14352                                                 : level + 1
14353                                               );
14354         if (this_definition == NULL) {
14355             goto return_failure;    /* 'msg' should have had the reason
14356                                        appended to it by the above call */
14357         }
14358 
14359         if (! is_invlist(this_definition)) {    /* Unknown at this time */
14360             return newSVsv(this_definition);
14361         }
14362 
14363         if (*s != '\n') {
14364             s = strchr(s, '\n');
14365             if (s == NULL) {
14366                 s = e;
14367             }
14368         }
14369 
14370       calculate:
14371 
14372         switch (op) {
14373             case '+':
14374                 _invlist_union(running_definition, this_definition,
14375                                                         &running_definition);
14376                 break;
14377             case '-':
14378                 _invlist_subtract(running_definition, this_definition,
14379                                                         &running_definition);
14380                 break;
14381             case '&':
14382                 _invlist_intersection(running_definition, this_definition,
14383                                                         &running_definition);
14384                 break;
14385             case '!':
14386                 _invlist_union_complement_2nd(running_definition,
14387                                         this_definition, &running_definition);
14388                 break;
14389             default:
14390                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14391                                  __FILE__, __LINE__, op);
14392                 break;
14393         }
14394 
14395         /* Position past the '\n' */
14396         s0 = s + 1;
14397     }   /* End of loop through the lines of 'contents' */
14398 
14399     /* Here, we processed all the lines in 'contents' without error.  If we
14400      * didn't add any warnings, simply return success */
14401     if (msgs_length_on_entry == SvCUR(msg)) {
14402 
14403         /* If the expansion was empty, the answer isn't nothing: its an empty
14404          * inversion list */
14405         if (running_definition == NULL) {
14406             running_definition = _new_invlist(1);
14407         }
14408 
14409         return running_definition;
14410     }
14411 
14412     /* Otherwise, add some explanatory text, but we will return success */
14413     goto return_msg;
14414 
14415   return_failure:
14416     running_definition = NULL;
14417 
14418   return_msg:
14419 
14420     if (name_len > 0) {
14421         sv_catpvs(msg, " in expansion of ");
14422         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14423     }
14424 
14425     return running_definition;
14426 }
14427 
14428 /* As explained below, certain operations need to take place in the first
14429  * thread created.  These macros switch contexts */
14430 #  ifdef USE_ITHREADS
14431 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
14432                                         PerlInterpreter * save_aTHX = aTHX;
14433 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
14434                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14435 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
14436 #    define CUR_CONTEXT      aTHX
14437 #    define ORIGINAL_CONTEXT save_aTHX
14438 #  else
14439 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
14440 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
14441 #    define RESTORE_CONTEXT                   NOOP
14442 #    define CUR_CONTEXT                       NULL
14443 #    define ORIGINAL_CONTEXT                  NULL
14444 #  endif
14445 
14446 STATIC void
S_delete_recursion_entry(pTHX_ void * key)14447 S_delete_recursion_entry(pTHX_ void *key)
14448 {
14449     /* Deletes the entry used to detect recursion when expanding user-defined
14450      * properties.  This is a function so it can be set up to be called even if
14451      * the program unexpectedly quits */
14452 
14453     SV ** current_entry;
14454     const STRLEN key_len = strlen((const char *) key);
14455     DECLARATION_FOR_GLOBAL_CONTEXT;
14456 
14457     SWITCH_TO_GLOBAL_CONTEXT;
14458 
14459     /* If the entry is one of these types, it is a permanent entry, and not the
14460      * one used to detect recursions.  This function should delete only the
14461      * recursion entry */
14462     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14463     if (     current_entry
14464         && ! is_invlist(*current_entry)
14465         && ! SvPOK(*current_entry))
14466     {
14467         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14468                                                                     G_DISCARD);
14469     }
14470 
14471     RESTORE_CONTEXT;
14472 }
14473 
14474 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)14475 S_get_fq_name(pTHX_
14476               const char * const name,    /* The first non-blank in the \p{}, \P{} */
14477               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
14478               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14479               const bool has_colon_colon
14480              )
14481 {
14482     /* Returns a mortal SV containing the fully qualified version of the input
14483      * name */
14484 
14485     SV * fq_name;
14486 
14487     fq_name = newSVpvs_flags("", SVs_TEMP);
14488 
14489     /* Use the current package if it wasn't included in our input */
14490     if (! has_colon_colon) {
14491         const HV * pkg = (IN_PERL_COMPILETIME)
14492                          ? PL_curstash
14493                          : CopSTASH(PL_curcop);
14494         const char* pkgname = HvNAME(pkg);
14495 
14496         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14497                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14498         sv_catpvs(fq_name, "::");
14499     }
14500 
14501     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14502                          UTF8fARG(is_utf8, name_len, name));
14503     return fq_name;
14504 }
14505 
14506 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)14507 S_parse_uniprop_string(pTHX_
14508 
14509     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
14510      * now.  If so, the return is an inversion list.
14511      *
14512      * If the property is user-defined, it is a subroutine, which in turn
14513      * may call other subroutines.  This function will call the whole nest of
14514      * them to get the definition they return; if some aren't known at the time
14515      * of the call to this function, the fully qualified name of the highest
14516      * level sub is returned.  It is an error to call this function at runtime
14517      * without every sub defined.
14518      *
14519      * If an error was found, NULL is returned, and 'msg' gets a suitable
14520      * message appended to it.  (Appending allows the back trace of how we got
14521      * to the faulty definition to be displayed through nested calls of
14522      * user-defined subs.)
14523      *
14524      * The caller should NOT try to free any returned inversion list.
14525      *
14526      * Other parameters will be set on return as described below */
14527 
14528     const char * const name,    /* The first non-blank in the \p{}, \P{} */
14529     Size_t name_len,            /* Its length in bytes, not including any
14530                                    trailing space */
14531     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14532     const bool to_fold,         /* ? Is this under /i */
14533     const bool runtime,         /* TRUE if this is being called at run time */
14534     const bool deferrable,      /* TRUE if it's ok for the definition to not be
14535                                    known at this call */
14536     AV ** strings,              /* To return string property values, like named
14537                                    sequences */
14538     bool *user_defined_ptr,     /* Upon return from this function it will be
14539                                    set to TRUE if any component is a
14540                                    user-defined property */
14541     SV * msg,                   /* Any error or warning msg(s) are appended to
14542                                    this */
14543     const STRLEN level)         /* Recursion level of this call */
14544 {
14545     char* lookup_name;          /* normalized name for lookup in our tables */
14546     unsigned lookup_len;        /* Its length */
14547     enum { Not_Strict = 0,      /* Some properties have stricter name */
14548            Strict,              /* normalization rules, which we decide */
14549            As_Is                /* upon based on parsing */
14550          } stricter = Not_Strict;
14551 
14552     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14553      * (though it requires extra effort to download them from Unicode and
14554      * compile perl to know about them) */
14555     bool is_nv_type = FALSE;
14556 
14557     unsigned int i = 0, i_zero = 0, j = 0;
14558     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
14559     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
14560     int table_index = 0;    /* The entry number for this property in the table
14561                                of all Unicode property names */
14562     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
14563     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
14564                                    the normalized name in certain situations */
14565     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
14566                                    part of a package name */
14567     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
14568     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
14569                                              property rather than a Unicode
14570                                              one. */
14571     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
14572                                      if an error.  If it is an inversion list,
14573                                      it is the definition.  Otherwise it is a
14574                                      string containing the fully qualified sub
14575                                      name of 'name' */
14576     SV * fq_name = NULL;        /* For user-defined properties, the fully
14577                                    qualified name */
14578     bool invert_return = FALSE; /* ? Do we need to complement the result before
14579                                      returning it */
14580     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14581                                        explicit utf8:: package that we strip
14582                                        off  */
14583     /* The expansion of properties that could be either user-defined or
14584      * official unicode ones is deferred until runtime, including a marker for
14585      * those that might be in the latter category.  This boolean indicates if
14586      * we've seen that marker.  If not, what we're parsing can't be such an
14587      * official Unicode property whose expansion was deferred */
14588     bool could_be_deferred_official = FALSE;
14589 
14590     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14591 
14592     /* The input will be normalized into 'lookup_name' */
14593     Newx(lookup_name, name_len, char);
14594     SAVEFREEPV(lookup_name);
14595 
14596     /* Parse the input. */
14597     for (i = 0; i < name_len; i++) {
14598         char cur = name[i];
14599 
14600         /* Most of the characters in the input will be of this ilk, being parts
14601          * of a name */
14602         if (isIDCONT_A(cur)) {
14603 
14604             /* Case differences are ignored.  Our lookup routine assumes
14605              * everything is lowercase, so normalize to that */
14606             if (isUPPER_A(cur)) {
14607                 lookup_name[j++] = toLOWER_A(cur);
14608                 continue;
14609             }
14610 
14611             if (cur == '_') { /* Don't include these in the normalized name */
14612                 continue;
14613             }
14614 
14615             lookup_name[j++] = cur;
14616 
14617             /* The first character in a user-defined name must be of this type.
14618              * */
14619             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14620                 could_be_user_defined = FALSE;
14621             }
14622 
14623             continue;
14624         }
14625 
14626         /* Here, the character is not something typically in a name,  But these
14627          * two types of characters (and the '_' above) can be freely ignored in
14628          * most situations.  Later it may turn out we shouldn't have ignored
14629          * them, and we have to reparse, but we don't have enough information
14630          * yet to make that decision */
14631         if (cur == '-' || isSPACE_A(cur)) {
14632             could_be_user_defined = FALSE;
14633             continue;
14634         }
14635 
14636         /* An equals sign or single colon mark the end of the first part of
14637          * the property name */
14638         if (    cur == '='
14639             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14640         {
14641             lookup_name[j++] = '='; /* Treat the colon as an '=' */
14642             equals_pos = j; /* Note where it occurred in the input */
14643             could_be_user_defined = FALSE;
14644             break;
14645         }
14646 
14647         /* If this looks like it is a marker we inserted at compile time,
14648          * set a flag and otherwise ignore it.  If it isn't in the final
14649          * position, keep it as it would have been user input. */
14650         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14651             && ! deferrable
14652             &&   could_be_user_defined
14653             &&   i == name_len - 1)
14654         {
14655             name_len--;
14656             could_be_deferred_official = TRUE;
14657             continue;
14658         }
14659 
14660         /* Otherwise, this character is part of the name. */
14661         lookup_name[j++] = cur;
14662 
14663         /* Here it isn't a single colon, so if it is a colon, it must be a
14664          * double colon */
14665         if (cur == ':') {
14666 
14667             /* A double colon should be a package qualifier.  We note its
14668              * position and continue.  Note that one could have
14669              *      pkg1::pkg2::...::foo
14670              * so that the position at the end of the loop will be just after
14671              * the final qualifier */
14672 
14673             i++;
14674             non_pkg_begin = i + 1;
14675             lookup_name[j++] = ':';
14676             lun_non_pkg_begin = j;
14677         }
14678         else { /* Only word chars (and '::') can be in a user-defined name */
14679             could_be_user_defined = FALSE;
14680         }
14681     } /* End of parsing through the lhs of the property name (or all of it if
14682          no rhs) */
14683 
14684     /* If there is a single package name 'utf8::', it is ambiguous.  It could
14685      * be for a user-defined property, or it could be a Unicode property, as
14686      * all of them are considered to be for that package.  For the purposes of
14687      * parsing the rest of the property, strip it off */
14688     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14689         lookup_name += STRLENs("utf8::");
14690         j           -= STRLENs("utf8::");
14691         equals_pos  -= STRLENs("utf8::");
14692         i_zero       = STRLENs("utf8::");   /* When resetting 'i' to reparse
14693                                                from the beginning, it has to be
14694                                                set past what we're stripping
14695                                                off */
14696         stripped_utf8_pkg = TRUE;
14697     }
14698 
14699     /* Here, we are either done with the whole property name, if it was simple;
14700      * or are positioned just after the '=' if it is compound. */
14701 
14702     if (equals_pos >= 0) {
14703         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14704 
14705         /* Space immediately after the '=' is ignored */
14706         i++;
14707         for (; i < name_len; i++) {
14708             if (! isSPACE_A(name[i])) {
14709                 break;
14710             }
14711         }
14712 
14713         /* Most punctuation after the equals indicates a subpattern, like
14714          * \p{foo=/bar/} */
14715         if (   isPUNCT_A(name[i])
14716             &&  name[i] != '-'
14717             &&  name[i] != '+'
14718             &&  name[i] != '_'
14719             &&  name[i] != '{'
14720                 /* A backslash means the real delimiter is the next character,
14721                  * but it must be punctuation */
14722             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14723         {
14724             bool special_property = memEQs(lookup_name, j - 1, "name")
14725                                  || memEQs(lookup_name, j - 1, "na");
14726             if (! special_property) {
14727                 /* Find the property.  The table includes the equals sign, so
14728                  * we use 'j' as-is */
14729                 table_index = do_uniprop_match(lookup_name, j);
14730             }
14731             if (special_property || table_index) {
14732                 REGEXP * subpattern_re;
14733                 char open = name[i++];
14734                 char close;
14735                 const char * pos_in_brackets;
14736                 const char * const * prop_values;
14737                 bool escaped = 0;
14738 
14739                 /* Backslash => delimiter is the character following.  We
14740                  * already checked that it is punctuation */
14741                 if (open == '\\') {
14742                     open = name[i++];
14743                     escaped = 1;
14744                 }
14745 
14746                 /* This data structure is constructed so that the matching
14747                  * closing bracket is 3 past its matching opening.  The second
14748                  * set of closing is so that if the opening is something like
14749                  * ']', the closing will be that as well.  Something similar is
14750                  * done in toke.c */
14751                 pos_in_brackets = memCHRs("([<)]>)]>", open);
14752                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14753 
14754                 if (    i >= name_len
14755                     ||  name[name_len-1] != close
14756                     || (escaped && name[name_len-2] != '\\')
14757                         /* Also make sure that there are enough characters.
14758                          * e.g., '\\\' would show up incorrectly as legal even
14759                          * though it is too short */
14760                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
14761                 {
14762                     sv_catpvs(msg, "Unicode property wildcard not terminated");
14763                     goto append_name_to_msg;
14764                 }
14765 
14766                 Perl_ck_warner_d(aTHX_
14767                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14768                     "The Unicode property wildcards feature is experimental");
14769 
14770                 if (special_property) {
14771                     const char * error_msg;
14772                     const char * revised_name = name + i;
14773                     Size_t revised_name_len = name_len - (i + 1 + escaped);
14774 
14775                     /* Currently, the only 'special_property' is name, which we
14776                      * lookup in _charnames.pm */
14777 
14778                     if (! load_charnames(newSVpvs("placeholder"),
14779                                          revised_name, revised_name_len,
14780                                          &error_msg))
14781                     {
14782                         sv_catpv(msg, error_msg);
14783                         goto append_name_to_msg;
14784                     }
14785 
14786                     /* Farm this out to a function just to make the current
14787                      * function less unwieldy */
14788                     if (handle_names_wildcard(revised_name, revised_name_len,
14789                                               &prop_definition,
14790                                               strings))
14791                     {
14792                         return prop_definition;
14793                     }
14794 
14795                     goto failed;
14796                 }
14797 
14798                 prop_values = get_prop_values(table_index);
14799 
14800                 /* Now create and compile the wildcard subpattern.  Use /i
14801                  * because the property values are supposed to match with case
14802                  * ignored. */
14803                 subpattern_re = compile_wildcard(name + i,
14804                                                  name_len - i - 1 - escaped,
14805                                                  TRUE /* /i */
14806                                                 );
14807 
14808                 /* For each legal property value, see if the supplied pattern
14809                  * matches it. */
14810                 while (*prop_values) {
14811                     const char * const entry = *prop_values;
14812                     const Size_t len = strlen(entry);
14813                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14814 
14815                     if (execute_wildcard(subpattern_re,
14816                                  (char *) entry,
14817                                  (char *) entry + len,
14818                                  (char *) entry, 0,
14819                                  entry_sv,
14820                                  0))
14821                     { /* Here, matched.  Add to the returned list */
14822                         Size_t total_len = j + len;
14823                         SV * sub_invlist = NULL;
14824                         char * this_string;
14825 
14826                         /* We know this is a legal \p{property=value}.  Call
14827                          * the function to return the list of code points that
14828                          * match it */
14829                         Newxz(this_string, total_len + 1, char);
14830                         Copy(lookup_name, this_string, j, char);
14831                         my_strlcat(this_string, entry, total_len + 1);
14832                         SAVEFREEPV(this_string);
14833                         sub_invlist = parse_uniprop_string(this_string,
14834                                                            total_len,
14835                                                            is_utf8,
14836                                                            to_fold,
14837                                                            runtime,
14838                                                            deferrable,
14839                                                            NULL,
14840                                                            user_defined_ptr,
14841                                                            msg,
14842                                                            level + 1);
14843                         _invlist_union(prop_definition, sub_invlist,
14844                                        &prop_definition);
14845                     }
14846 
14847                     prop_values++;  /* Next iteration, look at next propvalue */
14848                 } /* End of looking through property values; (the data
14849                      structure is terminated by a NULL ptr) */
14850 
14851                 SvREFCNT_dec_NN(subpattern_re);
14852 
14853                 if (prop_definition) {
14854                     return prop_definition;
14855                 }
14856 
14857                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14858                 goto append_name_to_msg;
14859             }
14860 
14861             /* Here's how khw thinks we should proceed to handle the properties
14862              * not yet done:    Bidi Mirroring Glyph        can map to ""
14863                                 Bidi Paired Bracket         can map to ""
14864                                 Case Folding  (both full and simple)
14865                                             Shouldn't /i be good enough for Full
14866                                 Decomposition Mapping
14867                                 Equivalent Unified Ideograph    can map to ""
14868                                 Lowercase Mapping  (both full and simple)
14869                                 NFKC Case Fold                  can map to ""
14870                                 Titlecase Mapping  (both full and simple)
14871                                 Uppercase Mapping  (both full and simple)
14872              * Handle these the same way Name is done, using say, _wild.pm, but
14873              * having both loose and full, like in charclass_invlists.h.
14874              * Perhaps move block and script to that as they are somewhat large
14875              * in charclass_invlists.h.
14876              * For properties where the default is the code point itself, such
14877              * as any of the case changing mappings, the string would otherwise
14878              * consist of all Unicode code points in UTF-8 strung together.
14879              * This would be impractical.  So instead, examine their compiled
14880              * pattern, looking at the ssc.  If none, reject the pattern as an
14881              * error.  Otherwise run the pattern against every code point in
14882              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
14883              * And it might be good to create an API to return the ssc.
14884              * Or handle them like the algorithmic names are done
14885              */
14886         } /* End of is a wildcard subppattern */
14887 
14888         /* \p{name=...} is handled specially.  Instead of using the normal
14889          * mechanism involving charclass_invlists.h, it uses _charnames.pm
14890          * which has the necessary (huge) data accessible to it, and which
14891          * doesn't get loaded unless necessary.  The legal syntax for names is
14892          * somewhat different than other properties due both to the vagaries of
14893          * a few outlier official names, and the fact that only a few ASCII
14894          * characters are permitted in them */
14895         if (   memEQs(lookup_name, j - 1, "name")
14896             || memEQs(lookup_name, j - 1, "na"))
14897         {
14898             dSP;
14899             HV * table;
14900             SV * character;
14901             const char * error_msg;
14902             CV* lookup_loose;
14903             SV * character_name;
14904             STRLEN character_len;
14905             UV cp;
14906 
14907             stricter = As_Is;
14908 
14909             /* Since the RHS (after skipping initial space) is passed unchanged
14910              * to charnames, and there are different criteria for what are
14911              * legal characters in the name, just parse it here.  A character
14912              * name must begin with an ASCII alphabetic */
14913             if (! isALPHA(name[i])) {
14914                 goto failed;
14915             }
14916             lookup_name[j++] = name[i];
14917 
14918             for (++i; i < name_len; i++) {
14919                 /* Official names can only be in the ASCII range, and only
14920                  * certain characters */
14921                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14922                     goto failed;
14923                 }
14924                 lookup_name[j++] = name[i];
14925             }
14926 
14927             /* Finished parsing, save the name into an SV */
14928             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14929 
14930             /* Make sure _charnames is loaded.  (The parameters give context
14931              * for any errors generated */
14932             table = load_charnames(character_name, name, name_len, &error_msg);
14933             if (table == NULL) {
14934                 sv_catpv(msg, error_msg);
14935                 goto append_name_to_msg;
14936             }
14937 
14938             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14939             if (! lookup_loose) {
14940                 Perl_croak(aTHX_
14941                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
14942             }
14943 
14944             PUSHSTACKi(PERLSI_REGCOMP);
14945             ENTER ;
14946             SAVETMPS;
14947             save_re_context();
14948 
14949             PUSHMARK(SP) ;
14950             XPUSHs(character_name);
14951             PUTBACK;
14952             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14953 
14954             SPAGAIN ;
14955 
14956             character = POPs;
14957             SvREFCNT_inc_simple_void_NN(character);
14958 
14959             PUTBACK ;
14960             FREETMPS ;
14961             LEAVE ;
14962             POPSTACK;
14963 
14964             if (! SvOK(character)) {
14965                 goto failed;
14966             }
14967 
14968             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14969             if (character_len == SvCUR(character)) {
14970                 prop_definition = add_cp_to_invlist(NULL, cp);
14971             }
14972             else {
14973                 AV * this_string;
14974 
14975                 /* First of the remaining characters in the string. */
14976                 char * remaining = SvPVX(character) + character_len;
14977 
14978                 if (strings == NULL) {
14979                     goto failed;    /* XXX Perhaps a specific msg instead, like
14980                                        'not available here' */
14981                 }
14982 
14983                 if (*strings == NULL) {
14984                     *strings = newAV();
14985                 }
14986 
14987                 this_string = newAV();
14988                 av_push_simple(this_string, newSVuv(cp));
14989 
14990                 do {
14991                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14992                     av_push_simple(this_string, newSVuv(cp));
14993                     remaining += character_len;
14994                 } while (remaining < SvEND(character));
14995 
14996                 av_push_simple(*strings, (SV *) this_string);
14997             }
14998 
14999             return prop_definition;
15000         }
15001 
15002         /* Certain properties whose values are numeric need special handling.
15003          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
15004          * purposes of checking if this is one of those properties */
15005         if (memBEGINPs(lookup_name, j, "is")) {
15006             lookup_offset = 2;
15007         }
15008 
15009         /* Then check if it is one of these specially-handled properties.  The
15010          * possibilities are hard-coded because easier this way, and the list
15011          * is unlikely to change.
15012          *
15013          * All numeric value type properties are of this ilk, and are also
15014          * special in a different way later on.  So find those first.  There
15015          * are several numeric value type properties in the Unihan DB (which is
15016          * unlikely to be compiled with perl, but we handle it here in case it
15017          * does get compiled).  They all end with 'numeric'.  The interiors
15018          * aren't checked for the precise property.  This would stop working if
15019          * a cjk property were to be created that ended with 'numeric' and
15020          * wasn't a numeric type */
15021         is_nv_type = memEQs(lookup_name + lookup_offset,
15022                        j - 1 - lookup_offset, "numericvalue")
15023                   || memEQs(lookup_name + lookup_offset,
15024                       j - 1 - lookup_offset, "nv")
15025                   || (   memENDPs(lookup_name + lookup_offset,
15026                             j - 1 - lookup_offset, "numeric")
15027                       && (   memBEGINPs(lookup_name + lookup_offset,
15028                                       j - 1 - lookup_offset, "cjk")
15029                           || memBEGINPs(lookup_name + lookup_offset,
15030                                       j - 1 - lookup_offset, "k")));
15031         if (   is_nv_type
15032             || memEQs(lookup_name + lookup_offset,
15033                       j - 1 - lookup_offset, "canonicalcombiningclass")
15034             || memEQs(lookup_name + lookup_offset,
15035                       j - 1 - lookup_offset, "ccc")
15036             || memEQs(lookup_name + lookup_offset,
15037                       j - 1 - lookup_offset, "age")
15038             || memEQs(lookup_name + lookup_offset,
15039                       j - 1 - lookup_offset, "in")
15040             || memEQs(lookup_name + lookup_offset,
15041                       j - 1 - lookup_offset, "presentin"))
15042         {
15043             unsigned int k;
15044 
15045             /* Since the stuff after the '=' is a number, we can't throw away
15046              * '-' willy-nilly, as those could be a minus sign.  Other stricter
15047              * rules also apply.  However, these properties all can have the
15048              * rhs not be a number, in which case they contain at least one
15049              * alphabetic.  In those cases, the stricter rules don't apply.
15050              * But the numeric type properties can have the alphas [Ee] to
15051              * signify an exponent, and it is still a number with stricter
15052              * rules.  So look for an alpha that signifies not-strict */
15053             stricter = Strict;
15054             for (k = i; k < name_len; k++) {
15055                 if (   isALPHA_A(name[k])
15056                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
15057                 {
15058                     stricter = Not_Strict;
15059                     break;
15060                 }
15061             }
15062         }
15063 
15064         if (stricter) {
15065 
15066             /* A number may have a leading '+' or '-'.  The latter is retained
15067              * */
15068             if (name[i] == '+') {
15069                 i++;
15070             }
15071             else if (name[i] == '-') {
15072                 lookup_name[j++] = '-';
15073                 i++;
15074             }
15075 
15076             /* Skip leading zeros including single underscores separating the
15077              * zeros, or between the final leading zero and the first other
15078              * digit */
15079             for (; i < name_len - 1; i++) {
15080                 if (    name[i] != '0'
15081                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15082                 {
15083                     break;
15084                 }
15085             }
15086 
15087             /* Turn nv=-0 into nv=0.  These should be equivalent, but vary by
15088              * underling libc implementation. */
15089             if (   i == name_len - 1
15090                 && name[name_len-1] == '0'
15091                 && lookup_name[j-1] == '-')
15092             {
15093                 j--;
15094             }
15095         }
15096     }
15097     else {  /* No '=' */
15098 
15099        /* Only a few properties without an '=' should be parsed with stricter
15100         * rules.  The list is unlikely to change. */
15101         if (   memBEGINPs(lookup_name, j, "perl")
15102             && memNEs(lookup_name + 4, j - 4, "space")
15103             && memNEs(lookup_name + 4, j - 4, "word"))
15104         {
15105             stricter = Strict;
15106 
15107             /* We set the inputs back to 0 and the code below will reparse,
15108              * using strict */
15109             i = i_zero;
15110             j = 0;
15111         }
15112     }
15113 
15114     /* Here, we have either finished the property, or are positioned to parse
15115      * the remainder, and we know if stricter rules apply.  Finish out, if not
15116      * already done */
15117     for (; i < name_len; i++) {
15118         char cur = name[i];
15119 
15120         /* In all instances, case differences are ignored, and we normalize to
15121          * lowercase */
15122         if (isUPPER_A(cur)) {
15123             lookup_name[j++] = toLOWER(cur);
15124             continue;
15125         }
15126 
15127         /* An underscore is skipped, but not under strict rules unless it
15128          * separates two digits */
15129         if (cur == '_') {
15130             if (    stricter
15131                 && (   i == i_zero || (int) i == equals_pos || i == name_len- 1
15132                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15133             {
15134                 lookup_name[j++] = '_';
15135             }
15136             continue;
15137         }
15138 
15139         /* Hyphens are skipped except under strict */
15140         if (cur == '-' && ! stricter) {
15141             continue;
15142         }
15143 
15144         /* XXX Bug in documentation.  It says white space skipped adjacent to
15145          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
15146          * in a number */
15147         if (isSPACE_A(cur) && ! stricter) {
15148             continue;
15149         }
15150 
15151         lookup_name[j++] = cur;
15152 
15153         /* Unless this is a non-trailing slash, we are done with it */
15154         if (i >= name_len - 1 || cur != '/') {
15155             continue;
15156         }
15157 
15158         slash_pos = j;
15159 
15160         /* A slash in the 'numeric value' property indicates that what follows
15161          * is a denominator.  It can have a leading '+' and '0's that should be
15162          * skipped.  But we have never allowed a negative denominator, so treat
15163          * a minus like every other character.  (No need to rule out a second
15164          * '/', as that won't match anything anyway */
15165         if (is_nv_type) {
15166             i++;
15167             if (i < name_len && name[i] == '+') {
15168                 i++;
15169             }
15170 
15171             /* Skip leading zeros including underscores separating digits */
15172             for (; i < name_len - 1; i++) {
15173                 if (   name[i] != '0'
15174                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15175                 {
15176                     break;
15177                 }
15178             }
15179 
15180             /* Store the first real character in the denominator */
15181             if (i < name_len) {
15182                 lookup_name[j++] = name[i];
15183             }
15184         }
15185     }
15186 
15187     /* Here are completely done parsing the input 'name', and 'lookup_name'
15188      * contains a copy, normalized.
15189      *
15190      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15191      * different from without the underscores.  */
15192     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
15193            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15194         && UNLIKELY(name[name_len-1] == '_'))
15195     {
15196         lookup_name[j++] = '&';
15197     }
15198 
15199     /* If the original input began with 'In' or 'Is', it could be a subroutine
15200      * call to a user-defined property instead of a Unicode property name. */
15201     if (    name_len - non_pkg_begin > 2
15202         &&  name[non_pkg_begin+0] == 'I'
15203         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15204     {
15205         /* Names that start with In have different characteristics than those
15206          * that start with Is */
15207         if (name[non_pkg_begin+1] == 's') {
15208             starts_with_Is = TRUE;
15209         }
15210     }
15211     else {
15212         could_be_user_defined = FALSE;
15213     }
15214 
15215     if (could_be_user_defined) {
15216         CV* user_sub;
15217 
15218         /* If the user defined property returns the empty string, it could
15219          * easily be because the pattern is being compiled before the data it
15220          * actually needs to compile is available.  This could be argued to be
15221          * a bug in the perl code, but this is a change of behavior for Perl,
15222          * so we handle it.  This means that intentionally returning nothing
15223          * will not be resolved until runtime */
15224         bool empty_return = FALSE;
15225 
15226         /* Here, the name could be for a user defined property, which are
15227          * implemented as subs. */
15228         user_sub = get_cvn_flags(name, name_len, 0);
15229         if (! user_sub) {
15230 
15231             /* Here, the property name could be a user-defined one, but there
15232              * is no subroutine to handle it (as of now).   Defer handling it
15233              * until runtime.  Otherwise, a block defined by Unicode in a later
15234              * release would get the synonym InFoo added for it, and existing
15235              * code that used that name would suddenly break if it referred to
15236              * the property before the sub was declared.  See [perl #134146] */
15237             if (deferrable) {
15238                 goto definition_deferred;
15239             }
15240 
15241             /* Here, we are at runtime, and didn't find the user property.  It
15242              * could be an official property, but only if no package was
15243              * specified, or just the utf8:: package. */
15244             if (could_be_deferred_official) {
15245                 lookup_name += lun_non_pkg_begin;
15246                 j -= lun_non_pkg_begin;
15247             }
15248             else if (! stripped_utf8_pkg) {
15249                 goto unknown_user_defined;
15250             }
15251 
15252             /* Drop down to look up in the official properties */
15253         }
15254         else {
15255             const char insecure[] = "Insecure user-defined property";
15256 
15257             /* Here, there is a sub by the correct name.  Normally we call it
15258              * to get the property definition */
15259             dSP;
15260             SV * user_sub_sv = MUTABLE_SV(user_sub);
15261             SV * error;     /* Any error returned by calling 'user_sub' */
15262             SV * key;       /* The key into the hash of user defined sub names
15263                              */
15264             SV * placeholder;
15265             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
15266 
15267             /* How many times to retry when another thread is in the middle of
15268              * expanding the same definition we want */
15269             PERL_INT_FAST8_T retry_countdown = 10;
15270 
15271             DECLARATION_FOR_GLOBAL_CONTEXT;
15272 
15273             /* If we get here, we know this property is user-defined */
15274             *user_defined_ptr = TRUE;
15275 
15276             /* We refuse to call a potentially tainted subroutine; returning an
15277              * error instead */
15278             if (TAINT_get) {
15279                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15280                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15281                 goto append_name_to_msg;
15282             }
15283 
15284             /* In principal, we only call each subroutine property definition
15285              * once during the life of the program.  This guarantees that the
15286              * property definition never changes.  The results of the single
15287              * sub call are stored in a hash, which is used instead for future
15288              * references to this property.  The property definition is thus
15289              * immutable.  But, to allow the user to have a /i-dependent
15290              * definition, we call the sub once for non-/i, and once for /i,
15291              * should the need arise, passing the /i status as a parameter.
15292              *
15293              * We start by constructing the hash key name, consisting of the
15294              * fully qualified subroutine name, preceded by the /i status, so
15295              * that there is a key for /i and a different key for non-/i */
15296             key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15297             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15298                                           non_pkg_begin != 0);
15299             sv_catsv(key, fq_name);
15300 
15301             /* We only call the sub once throughout the life of the program
15302              * (with the /i, non-/i exception noted above).  That means the
15303              * hash must be global and accessible to all threads.  It is
15304              * created at program start-up, before any threads are created, so
15305              * is accessible to all children.  But this creates some
15306              * complications.
15307              *
15308              * 1) The keys can't be shared, or else problems arise; sharing is
15309              *    turned off at hash creation time
15310              * 2) All SVs in it are there for the remainder of the life of the
15311              *    program, and must be created in the same interpreter context
15312              *    as the hash, or else they will be freed from the wrong pool
15313              *    at global destruction time.  This is handled by switching to
15314              *    the hash's context to create each SV going into it, and then
15315              *    immediately switching back
15316              * 3) All accesses to the hash must be controlled by a mutex, to
15317              *    prevent two threads from getting an unstable state should
15318              *    they simultaneously be accessing it.  The code below is
15319              *    crafted so that the mutex is locked whenever there is an
15320              *    access and unlocked only when the next stable state is
15321              *    achieved.
15322              *
15323              * The hash stores either the definition of the property if it was
15324              * valid, or, if invalid, the error message that was raised.  We
15325              * use the type of SV to distinguish.
15326              *
15327              * There's also the need to guard against the definition expansion
15328              * from infinitely recursing.  This is handled by storing the aTHX
15329              * of the expanding thread during the expansion.  Again the SV type
15330              * is used to distinguish this from the other two cases.  If we
15331              * come to here and the hash entry for this property is our aTHX,
15332              * it means we have recursed, and the code assumes that we would
15333              * infinitely recurse, so instead stops and raises an error.
15334              * (Any recursion has always been treated as infinite recursion in
15335              * this feature.)
15336              *
15337              * If instead, the entry is for a different aTHX, it means that
15338              * that thread has gotten here first, and hasn't finished expanding
15339              * the definition yet.  We just have to wait until it is done.  We
15340              * sleep and retry a few times, returning an error if the other
15341              * thread doesn't complete. */
15342 
15343           re_fetch:
15344             USER_PROP_MUTEX_LOCK;
15345 
15346             /* If we have an entry for this key, the subroutine has already
15347              * been called once with this /i status. */
15348             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15349                                                    SvPVX(key), SvCUR(key), 0);
15350             if (saved_user_prop_ptr) {
15351 
15352                 /* If the saved result is an inversion list, it is the valid
15353                  * definition of this property */
15354                 if (is_invlist(*saved_user_prop_ptr)) {
15355                     prop_definition = *saved_user_prop_ptr;
15356 
15357                     /* The SV in the hash won't be removed until global
15358                      * destruction, so it is stable and we can unlock */
15359                     USER_PROP_MUTEX_UNLOCK;
15360 
15361                     /* The caller shouldn't try to free this SV */
15362                     return prop_definition;
15363                 }
15364 
15365                 /* Otherwise, if it is a string, it is the error message
15366                  * that was returned when we first tried to evaluate this
15367                  * property.  Fail, and append the message */
15368                 if (SvPOK(*saved_user_prop_ptr)) {
15369                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15370                     sv_catsv(msg, *saved_user_prop_ptr);
15371 
15372                     /* The SV in the hash won't be removed until global
15373                      * destruction, so it is stable and we can unlock */
15374                     USER_PROP_MUTEX_UNLOCK;
15375 
15376                     return NULL;
15377                 }
15378 
15379                 assert(SvIOK(*saved_user_prop_ptr));
15380 
15381                 /* Here, we have an unstable entry in the hash.  Either another
15382                  * thread is in the middle of expanding the property's
15383                  * definition, or we are ourselves recursing.  We use the aTHX
15384                  * in it to distinguish */
15385                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15386 
15387                     /* Here, it's another thread doing the expanding.  We've
15388                      * looked as much as we are going to at the contents of the
15389                      * hash entry.  It's safe to unlock. */
15390                     USER_PROP_MUTEX_UNLOCK;
15391 
15392                     /* Retry a few times */
15393                     if (retry_countdown-- > 0) {
15394                         PerlProc_sleep(1);
15395                         goto re_fetch;
15396                     }
15397 
15398                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15399                     sv_catpvs(msg, "Timeout waiting for another thread to "
15400                                    "define");
15401                     goto append_name_to_msg;
15402                 }
15403 
15404                 /* Here, we are recursing; don't dig any deeper */
15405                 USER_PROP_MUTEX_UNLOCK;
15406 
15407                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15408                 sv_catpvs(msg,
15409                           "Infinite recursion in user-defined property");
15410                 goto append_name_to_msg;
15411             }
15412 
15413             /* Here, this thread has exclusive control, and there is no entry
15414              * for this property in the hash.  So we have the go ahead to
15415              * expand the definition ourselves. */
15416 
15417             PUSHSTACKi(PERLSI_REGCOMP);
15418             ENTER;
15419 
15420             /* Create a temporary placeholder in the hash to detect recursion
15421              * */
15422             SWITCH_TO_GLOBAL_CONTEXT;
15423             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15424             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15425             RESTORE_CONTEXT;
15426 
15427             /* Now that we have a placeholder, we can let other threads
15428              * continue */
15429             USER_PROP_MUTEX_UNLOCK;
15430 
15431             /* Make sure the placeholder always gets destroyed */
15432             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15433 
15434             PUSHMARK(SP);
15435             SAVETMPS;
15436 
15437             /* Call the user's function, with the /i status as a parameter.
15438              * Note that we have gone to a lot of trouble to keep this call
15439              * from being within the locked mutex region. */
15440             XPUSHs(boolSV(to_fold));
15441             PUTBACK;
15442 
15443             /* The following block was taken from swash_init().  Presumably
15444              * they apply to here as well, though we no longer use a swash --
15445              * khw */
15446             SAVEHINTS();
15447             save_re_context();
15448             /* We might get here via a subroutine signature which uses a utf8
15449              * parameter name, at which point PL_subname will have been set
15450              * but not yet used. */
15451             save_item(PL_subname);
15452 
15453             /* G_SCALAR guarantees a single return value */
15454             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15455 
15456             SPAGAIN;
15457 
15458             error = ERRSV;
15459             if (TAINT_get || SvTRUE(error)) {
15460                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15461                 if (SvTRUE(error)) {
15462                     sv_catpvs(msg, "Error \"");
15463                     sv_catsv(msg, error);
15464                     sv_catpvs(msg, "\"");
15465                 }
15466                 if (TAINT_get) {
15467                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
15468                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15469                 }
15470 
15471                 if (name_len > 0) {
15472                     sv_catpvs(msg, " in expansion of ");
15473                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15474                                                                   name_len,
15475                                                                   name));
15476                 }
15477 
15478                 (void) POPs;
15479                 prop_definition = NULL;
15480             }
15481             else {
15482                 SV * contents = POPs;
15483 
15484                 /* The contents is supposed to be the expansion of the property
15485                  * definition.  If the definition is deferrable, and we got an
15486                  * empty string back, set a flag to later defer it (after clean
15487                  * up below). */
15488                 if (      deferrable
15489                     && (! SvPOK(contents) || SvCUR(contents) == 0))
15490                 {
15491                         empty_return = TRUE;
15492                 }
15493                 else { /* Otherwise, call a function to check for valid syntax,
15494                           and handle it */
15495 
15496                     prop_definition = handle_user_defined_property(
15497                                                     name, name_len,
15498                                                     is_utf8, to_fold, runtime,
15499                                                     deferrable,
15500                                                     contents, user_defined_ptr,
15501                                                     msg,
15502                                                     level);
15503                 }
15504             }
15505 
15506             /* Here, we have the results of the expansion.  Delete the
15507              * placeholder, and if the definition is now known, replace it with
15508              * that definition.  We need exclusive access to the hash, and we
15509              * can't let anyone else in, between when we delete the placeholder
15510              * and add the permanent entry */
15511             USER_PROP_MUTEX_LOCK;
15512 
15513             S_delete_recursion_entry(aTHX_ SvPVX(key));
15514 
15515             if (    ! empty_return
15516                 && (! prop_definition || is_invlist(prop_definition)))
15517             {
15518                 /* If we got success we use the inversion list defining the
15519                  * property; otherwise use the error message */
15520                 SWITCH_TO_GLOBAL_CONTEXT;
15521                 (void) hv_store_ent(PL_user_def_props,
15522                                     key,
15523                                     ((prop_definition)
15524                                      ? newSVsv(prop_definition)
15525                                      : newSVsv(msg)),
15526                                     0);
15527                 RESTORE_CONTEXT;
15528             }
15529 
15530             /* All done, and the hash now has a permanent entry for this
15531              * property.  Give up exclusive control */
15532             USER_PROP_MUTEX_UNLOCK;
15533 
15534             FREETMPS;
15535             LEAVE;
15536             POPSTACK;
15537 
15538             if (empty_return) {
15539                 goto definition_deferred;
15540             }
15541 
15542             if (prop_definition) {
15543 
15544                 /* If the definition is for something not known at this time,
15545                  * we toss it, and go return the main property name, as that's
15546                  * the one the user will be aware of */
15547                 if (! is_invlist(prop_definition)) {
15548                     SvREFCNT_dec_NN(prop_definition);
15549                     goto definition_deferred;
15550                 }
15551 
15552                 sv_2mortal(prop_definition);
15553             }
15554 
15555             /* And return */
15556             return prop_definition;
15557 
15558         }   /* End of calling the subroutine for the user-defined property */
15559     }       /* End of it could be a user-defined property */
15560 
15561     /* Here it wasn't a user-defined property that is known at this time.  See
15562      * if it is a Unicode property */
15563 
15564     lookup_len = j;     /* This is a more mnemonic name than 'j' */
15565 
15566     /* Get the index into our pointer table of the inversion list corresponding
15567      * to the property */
15568     table_index = do_uniprop_match(lookup_name, lookup_len);
15569 
15570     /* If it didn't find the property ... */
15571     if (table_index == 0) {
15572 
15573         /* Try again stripping off any initial 'Is'.  This is because we
15574          * promise that an initial Is is optional.  The same isn't true of
15575          * names that start with 'In'.  Those can match only blocks, and the
15576          * lookup table already has those accounted for.  The lookup table also
15577          * has already accounted for Perl extensions (without and = sign)
15578          * starting with 'i's'. */
15579         if (starts_with_Is && equals_pos >= 0) {
15580             lookup_name += 2;
15581             lookup_len -= 2;
15582             equals_pos -= 2;
15583             slash_pos -= 2;
15584 
15585             table_index = do_uniprop_match(lookup_name, lookup_len);
15586         }
15587 
15588         if (table_index == 0) {
15589             char * canonical;
15590 
15591             /* Here, we didn't find it.  If not a numeric type property, and
15592              * can't be a user-defined one, it isn't a legal property */
15593             if (! is_nv_type) {
15594                 if (! could_be_user_defined) {
15595                     goto failed;
15596                 }
15597 
15598                 /* Here, the property name is legal as a user-defined one.   At
15599                  * compile time, it might just be that the subroutine for that
15600                  * property hasn't been encountered yet, but at runtime, it's
15601                  * an error to try to use an undefined one */
15602                 if (! deferrable) {
15603                     goto unknown_user_defined;
15604                 }
15605 
15606                 goto definition_deferred;
15607             } /* End of isn't a numeric type property */
15608 
15609             /* The numeric type properties need more work to decide.  What we
15610              * do is make sure we have the number in canonical form and look
15611              * that up. */
15612 
15613             if (slash_pos < 0) {    /* No slash */
15614 
15615                 /* When it isn't a rational, take the input, convert it to a
15616                  * NV, then create a canonical string representation of that
15617                  * NV. */
15618 
15619                 NV value;
15620                 SSize_t value_len = lookup_len - equals_pos;
15621 
15622                 /* Get the value */
15623                 if (   value_len <= 0
15624                     || my_atof3(lookup_name + equals_pos, &value,
15625                                 value_len)
15626                           != lookup_name + lookup_len)
15627                 {
15628                     goto failed;
15629                 }
15630 
15631                 /* If the value is an integer, the canonical value is integral
15632                  * */
15633                 if (Perl_ceil(value) == value) {
15634                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15635                                             equals_pos, lookup_name, value);
15636                 }
15637                 else {  /* Otherwise, it is %e with a known precision */
15638                     char * exp_ptr;
15639 
15640                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15641                                                 equals_pos, lookup_name,
15642                                                 PL_E_FORMAT_PRECISION, value);
15643 
15644                     /* The exponent generated is expecting two digits, whereas
15645                      * %e on some systems will generate three.  Remove leading
15646                      * zeros in excess of 2 from the exponent.  We start
15647                      * looking for them after the '=' */
15648                     exp_ptr = strchr(canonical + equals_pos, 'e');
15649                     if (exp_ptr) {
15650                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15651                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15652 
15653                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15654 
15655                         if (excess_exponent_len > 0) {
15656                             SSize_t leading_zeros = strspn(cur_ptr, "0");
15657                             SSize_t excess_leading_zeros
15658                                     = MIN(leading_zeros, excess_exponent_len);
15659                             if (excess_leading_zeros > 0) {
15660                                 Move(cur_ptr + excess_leading_zeros,
15661                                      cur_ptr,
15662                                      strlen(cur_ptr) - excess_leading_zeros
15663                                        + 1,  /* Copy the NUL as well */
15664                                      char);
15665                             }
15666                         }
15667                     }
15668                 }
15669             }
15670             else {  /* Has a slash.  Create a rational in canonical form  */
15671                 UV numerator, denominator, gcd, trial;
15672                 const char * end_ptr;
15673                 const char * sign = "";
15674 
15675                 /* We can't just find the numerator, denominator, and do the
15676                  * division, then use the method above, because that is
15677                  * inexact.  And the input could be a rational that is within
15678                  * epsilon (given our precision) of a valid rational, and would
15679                  * then incorrectly compare valid.
15680                  *
15681                  * We're only interested in the part after the '=' */
15682                 const char * this_lookup_name = lookup_name + equals_pos;
15683                 lookup_len -= equals_pos;
15684                 slash_pos -= equals_pos;
15685 
15686                 /* Handle any leading minus */
15687                 if (this_lookup_name[0] == '-') {
15688                     sign = "-";
15689                     this_lookup_name++;
15690                     lookup_len--;
15691                     slash_pos--;
15692                 }
15693 
15694                 /* Convert the numerator to numeric */
15695                 end_ptr = this_lookup_name + slash_pos;
15696                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15697                     goto failed;
15698                 }
15699 
15700                 /* It better have included all characters before the slash */
15701                 if (*end_ptr != '/') {
15702                     goto failed;
15703                 }
15704 
15705                 /* Set to look at just the denominator */
15706                 this_lookup_name += slash_pos;
15707                 lookup_len -= slash_pos;
15708                 end_ptr = this_lookup_name + lookup_len;
15709 
15710                 /* Convert the denominator to numeric */
15711                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15712                     goto failed;
15713                 }
15714 
15715                 /* It better be the rest of the characters, and don't divide by
15716                  * 0 */
15717                 if (   end_ptr != this_lookup_name + lookup_len
15718                     || denominator == 0)
15719                 {
15720                     goto failed;
15721                 }
15722 
15723                 /* Get the greatest common denominator using
15724                    https://en.wikipedia.org/wiki/Euclidean_algorithm */
15725                 gcd = numerator;
15726                 trial = denominator;
15727                 while (trial != 0) {
15728                     UV temp = trial;
15729                     trial = gcd % trial;
15730                     gcd = temp;
15731                 }
15732 
15733                 /* If already in lowest possible terms, we have already tried
15734                  * looking this up */
15735                 if (gcd == 1) {
15736                     goto failed;
15737                 }
15738 
15739                 /* Reduce the rational, which should put it in canonical form
15740                  * */
15741                 numerator /= gcd;
15742                 denominator /= gcd;
15743 
15744                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15745                         equals_pos, lookup_name, sign, numerator, denominator);
15746             }
15747 
15748             /* Here, we have the number in canonical form.  Try that */
15749             table_index = do_uniprop_match(canonical, strlen(canonical));
15750             if (table_index == 0) {
15751                 goto failed;
15752             }
15753         }   /* End of still didn't find the property in our table */
15754     }       /* End of       didn't find the property in our table */
15755 
15756     /* Here, we have a non-zero return, which is an index into a table of ptrs.
15757      * A negative return signifies that the real index is the absolute value,
15758      * but the result needs to be inverted */
15759     if (table_index < 0) {
15760         invert_return = TRUE;
15761         table_index = -table_index;
15762     }
15763 
15764     /* Out-of band indices indicate a deprecated property.  The proper index is
15765      * modulo it with the table size.  And dividing by the table size yields
15766      * an offset into a table constructed by regen/mk_invlists.pl to contain
15767      * the corresponding warning message */
15768     if (table_index > MAX_UNI_KEYWORD_INDEX) {
15769         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15770         table_index %= MAX_UNI_KEYWORD_INDEX;
15771         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15772                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15773                 (int) name_len, name,
15774                 get_deprecated_property_msg(warning_offset));
15775     }
15776 
15777     /* In a few properties, a different property is used under /i.  These are
15778      * unlikely to change, so are hard-coded here. */
15779     if (to_fold) {
15780         if (   table_index == UNI_XPOSIXUPPER
15781             || table_index == UNI_XPOSIXLOWER
15782             || table_index == UNI_TITLE)
15783         {
15784             table_index = UNI_CASED;
15785         }
15786         else if (   table_index == UNI_UPPERCASELETTER
15787                  || table_index == UNI_LOWERCASELETTER
15788 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
15789                  || table_index == UNI_TITLECASELETTER
15790 #  endif
15791         ) {
15792             table_index = UNI_CASEDLETTER;
15793         }
15794         else if (  table_index == UNI_POSIXUPPER
15795                 || table_index == UNI_POSIXLOWER)
15796         {
15797             table_index = UNI_POSIXALPHA;
15798         }
15799     }
15800 
15801     /* Create and return the inversion list */
15802     prop_definition = get_prop_definition(table_index);
15803     sv_2mortal(prop_definition);
15804 
15805     /* See if there is a private use override to add to this definition */
15806     {
15807         COPHH * hinthash = (IN_PERL_COMPILETIME)
15808                            ? CopHINTHASH_get(&PL_compiling)
15809                            : CopHINTHASH_get(PL_curcop);
15810         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15811 
15812         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15813 
15814             /* See if there is an element in the hints hash for this table */
15815             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15816             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15817 
15818             if (pos) {
15819                 bool dummy;
15820                 SV * pu_definition;
15821                 SV * pu_invlist;
15822                 SV * expanded_prop_definition =
15823                             sv_2mortal(invlist_clone(prop_definition, NULL));
15824 
15825                 /* If so, it's definition is the string from here to the next
15826                  * \a character.  And its format is the same as a user-defined
15827                  * property */
15828                 pos += SvCUR(pu_lookup);
15829                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15830                 pu_invlist = handle_user_defined_property(lookup_name,
15831                                                           lookup_len,
15832                                                           0, /* Not UTF-8 */
15833                                                           0, /* Not folded */
15834                                                           runtime,
15835                                                           deferrable,
15836                                                           pu_definition,
15837                                                           &dummy,
15838                                                           msg,
15839                                                           level);
15840                 if (TAINT_get) {
15841                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15842                     sv_catpvs(msg, "Insecure private-use override");
15843                     goto append_name_to_msg;
15844                 }
15845 
15846                 /* For now, as a safety measure, make sure that it doesn't
15847                  * override non-private use code points */
15848                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15849 
15850                 /* Add it to the list to be returned */
15851                 _invlist_union(prop_definition, pu_invlist,
15852                                &expanded_prop_definition);
15853                 prop_definition = expanded_prop_definition;
15854                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15855             }
15856         }
15857     }
15858 
15859     if (invert_return) {
15860         _invlist_invert(prop_definition);
15861     }
15862     return prop_definition;
15863 
15864   unknown_user_defined:
15865     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15866     sv_catpvs(msg, "Unknown user-defined property name");
15867     goto append_name_to_msg;
15868 
15869   failed:
15870     if (non_pkg_begin != 0) {
15871         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15872         sv_catpvs(msg, "Illegal user-defined property name");
15873     }
15874     else {
15875         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15876         sv_catpvs(msg, "Can't find Unicode property definition");
15877     }
15878     /* FALLTHROUGH */
15879 
15880   append_name_to_msg:
15881     {
15882         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
15883         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
15884 
15885         sv_catpv(msg, prefix);
15886         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15887         sv_catpv(msg, suffix);
15888     }
15889 
15890     return NULL;
15891 
15892   definition_deferred:
15893 
15894     {
15895         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
15896 
15897         /* Here it could yet to be defined, so defer evaluation of this until
15898          * its needed at runtime.  We need the fully qualified property name to
15899          * avoid ambiguity */
15900         if (! fq_name) {
15901             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15902                                                                 is_qualified);
15903         }
15904 
15905         /* If it didn't come with a package, or the package is utf8::, this
15906          * actually could be an official Unicode property whose inclusion we
15907          * are deferring until runtime to make sure that it isn't overridden by
15908          * a user-defined property of the same name (which we haven't
15909          * encountered yet).  Add a marker to indicate this possibility, for
15910          * use at such time when we first need the definition during pattern
15911          * matching execution */
15912         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15913             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15914         }
15915 
15916         /* We also need a trailing newline */
15917         sv_catpvs(fq_name, "\n");
15918 
15919         *user_defined_ptr = TRUE;
15920         return fq_name;
15921     }
15922 }
15923 
15924 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)15925 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15926                               const STRLEN wname_len, /* Its length */
15927                               SV ** prop_definition,
15928                               AV ** strings)
15929 {
15930     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15931      * any matches, adding them to prop_definition */
15932 
15933     dSP;
15934 
15935     CV * get_names_info;        /* entry to charnames.pm to get info we need */
15936     SV * names_string;          /* Contains all character names, except algo */
15937     SV * algorithmic_names;     /* Contains info about algorithmically
15938                                    generated character names */
15939     REGEXP * subpattern_re;     /* The user's pattern to match with */
15940     struct regexp * prog;       /* The compiled pattern */
15941     char * all_names_start;     /* lib/unicore/Name.pl string of every
15942                                    (non-algorithmic) character name */
15943     char * cur_pos;             /* We match, effectively using /gc; this is
15944                                    where we are now */
15945     bool found_matches = FALSE; /* Did any name match so far? */
15946     SV * empty;                 /* For matching zero length names */
15947     SV * must_sv;               /* Contains the substring, if any, that must be
15948                                    in a name for the subpattern to match */
15949     const char * must;          /* The PV of 'must' */
15950     STRLEN must_len;            /* And its length */
15951     SV * syllable_name = NULL;  /* For Hangul syllables */
15952     const char hangul_prefix[] = "HANGUL SYLLABLE ";
15953     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15954 
15955     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15956      * syllable name, and these are immutable and guaranteed by the Unicode
15957      * standard to never be extended */
15958     const STRLEN syl_max_len = hangul_prefix_len + 7;
15959 
15960     IV i;
15961 
15962     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15963 
15964     /* Make sure _charnames is loaded.  (The parameters give context
15965      * for any errors generated */
15966     get_names_info = get_cv("_charnames::_get_names_info", 0);
15967     if (! get_names_info) {
15968         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15969     }
15970 
15971     /* Get the charnames data */
15972     PUSHSTACKi(PERLSI_REGCOMP);
15973     ENTER ;
15974     SAVETMPS;
15975     save_re_context();
15976 
15977     PUSHMARK(SP) ;
15978     PUTBACK;
15979 
15980     /* Special _charnames entry point that returns the info this routine
15981      * requires */
15982     call_sv(MUTABLE_SV(get_names_info), G_LIST);
15983 
15984     SPAGAIN ;
15985 
15986     /* Data structure for names which end in their very own code points */
15987     algorithmic_names = POPs;
15988     SvREFCNT_inc_simple_void_NN(algorithmic_names);
15989 
15990     /* The lib/unicore/Name.pl string */
15991     names_string = POPs;
15992     SvREFCNT_inc_simple_void_NN(names_string);
15993 
15994     PUTBACK ;
15995     FREETMPS ;
15996     LEAVE ;
15997     POPSTACK;
15998 
15999     if (   ! SvROK(names_string)
16000         || ! SvROK(algorithmic_names))
16001     {   /* Perhaps should panic instead XXX */
16002         SvREFCNT_dec(names_string);
16003         SvREFCNT_dec(algorithmic_names);
16004         return FALSE;
16005     }
16006 
16007     names_string = sv_2mortal(SvRV(names_string));
16008     all_names_start = SvPVX(names_string);
16009     cur_pos = all_names_start;
16010 
16011     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
16012 
16013     /* Compile the subpattern consisting of the name being looked for */
16014     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
16015 
16016     must_sv = re_intuit_string(subpattern_re);
16017     if (must_sv) {
16018         /* regexec.c can free the re_intuit_string() return. GH #17734 */
16019         must_sv = sv_2mortal(newSVsv(must_sv));
16020         must = SvPV(must_sv, must_len);
16021     }
16022     else {
16023         must = "";
16024         must_len = 0;
16025     }
16026 
16027     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
16028      * This works because the NUL causes the function to return early, thus
16029      * showing that there are characters in it other than the acceptable ones,
16030      * which is our desired result.) */
16031 
16032     prog = ReANY(subpattern_re);
16033 
16034     /* If only nothing is matched, skip to where empty names are looked for */
16035     if (prog->maxlen == 0) {
16036         goto check_empty;
16037     }
16038 
16039     /* And match against the string of all names /gc.  Don't even try if it
16040      * must match a character not found in any name. */
16041     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
16042     {
16043         while (execute_wildcard(subpattern_re,
16044                                 cur_pos,
16045                                 SvEND(names_string),
16046                                 all_names_start, 0,
16047                                 names_string,
16048                                 0))
16049         { /* Here, matched. */
16050 
16051             /* Note the string entries look like
16052              *      00001\nSTART OF HEADING\n\n
16053              * so we could match anywhere in that string.  We have to rule out
16054              * matching a code point line */
16055             char * this_name_start = all_names_start
16056                                                 + RX_OFFS_START(subpattern_re,0);
16057             char * this_name_end   = all_names_start
16058                                                 + RX_OFFS_END(subpattern_re,0);
16059             char * cp_start;
16060             char * cp_end;
16061             UV cp = 0;      /* Silences some compilers */
16062             AV * this_string = NULL;
16063             bool is_multi = FALSE;
16064 
16065             /* If matched nothing, advance to next possible match */
16066             if (this_name_start == this_name_end) {
16067                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
16068                                           SvEND(names_string) - this_name_end);
16069                 if (cur_pos == NULL) {
16070                     break;
16071                 }
16072             }
16073             else {
16074                 /* Position the next match to start beyond the current returned
16075                  * entry */
16076                 cur_pos = (char *) memchr(this_name_end, '\n',
16077                                           SvEND(names_string) - this_name_end);
16078             }
16079 
16080             /* Back up to the \n just before the beginning of the character. */
16081             cp_end = (char *) my_memrchr(all_names_start,
16082                                          '\n',
16083                                          this_name_start - all_names_start);
16084 
16085             /* If we didn't find a \n, it means it matched somewhere in the
16086              * initial '00000' in the string, so isn't a real match */
16087             if (cp_end == NULL) {
16088                 continue;
16089             }
16090 
16091             this_name_start = cp_end + 1;   /* The name starts just after */
16092             cp_end--;                       /* the \n, and the code point */
16093                                             /* ends just before it */
16094 
16095             /* All code points are 5 digits long */
16096             cp_start = cp_end - 4;
16097 
16098             /* This shouldn't happen, as we found a \n, and the first \n is
16099              * further along than what we subtracted */
16100             assert(cp_start >= all_names_start);
16101 
16102             if (cp_start == all_names_start) {
16103                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16104                 continue;
16105             }
16106 
16107             /* If the character is a blank, we either have a named sequence, or
16108              * something is wrong */
16109             if (*(cp_start - 1) == ' ') {
16110                 cp_start = (char *) my_memrchr(all_names_start,
16111                                                '\n',
16112                                                cp_start - all_names_start);
16113                 cp_start++;
16114             }
16115 
16116             assert(cp_start != NULL && cp_start >= all_names_start + 2);
16117 
16118             /* Except for the first line in the string, the sequence before the
16119              * code point is \n\n.  If that isn't the case here, we didn't
16120              * match the name of a character.  (We could have matched a named
16121              * sequence, not currently handled */
16122             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16123                 continue;
16124             }
16125 
16126             /* We matched!  Add this to the list */
16127             found_matches = TRUE;
16128 
16129             /* Loop through all the code points in the sequence */
16130             while (cp_start < cp_end) {
16131 
16132                 /* Calculate this code point from its 5 digits */
16133                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16134                    + (XDIGIT_VALUE(cp_start[1]) << 12)
16135                    + (XDIGIT_VALUE(cp_start[2]) << 8)
16136                    + (XDIGIT_VALUE(cp_start[3]) << 4)
16137                    +  XDIGIT_VALUE(cp_start[4]);
16138 
16139                 cp_start += 6;  /* Go past any blank */
16140 
16141                 if (cp_start < cp_end || is_multi) {
16142                     if (this_string == NULL) {
16143                         this_string = newAV();
16144                     }
16145 
16146                     is_multi = TRUE;
16147                     av_push_simple(this_string, newSVuv(cp));
16148                 }
16149             }
16150 
16151             if (is_multi) { /* Was more than one code point */
16152                 if (*strings == NULL) {
16153                     *strings = newAV();
16154                 }
16155 
16156                 av_push_simple(*strings, (SV *) this_string);
16157             }
16158             else {  /* Only a single code point */
16159                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16160             }
16161         } /* End of loop through the non-algorithmic names string */
16162     }
16163 
16164     /* There are also character names not in 'names_string'.  These are
16165      * algorithmically generatable.  Try this pattern on each possible one.
16166      * (khw originally planned to leave this out given the large number of
16167      * matches attempted; but the speed turned out to be quite acceptable
16168      *
16169      * There are plenty of opportunities to optimize to skip many of the tests.
16170      * beyond the rudimentary ones already here */
16171 
16172     /* First see if the subpattern matches any of the algorithmic generatable
16173      * Hangul syllable names.
16174      *
16175      * We know none of these syllable names will match if the input pattern
16176      * requires more bytes than any syllable has, or if the input pattern only
16177      * matches an empty name, or if the pattern has something it must match and
16178      * one of the characters in that isn't in any Hangul syllable. */
16179     if (    prog->minlen <= (SSize_t) syl_max_len
16180         &&  prog->maxlen > 0
16181         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16182     {
16183         /* These constants, names, values, and algorithm are adapted from the
16184          * Unicode standard, version 5.1, section 3.12, and should never
16185          * change. */
16186         const char * JamoL[] = {
16187             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16188             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16189         };
16190         const int LCount = C_ARRAY_LENGTH(JamoL);
16191 
16192         const char * JamoV[] = {
16193             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16194             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16195             "I"
16196         };
16197         const int VCount = C_ARRAY_LENGTH(JamoV);
16198 
16199         const char * JamoT[] = {
16200             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16201             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16202             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16203         };
16204         const int TCount = C_ARRAY_LENGTH(JamoT);
16205 
16206         int L, V, T;
16207 
16208         /* This is the initial Hangul syllable code point; each time through the
16209          * inner loop, it maps to the next higher code point.  For more info,
16210          * see the Hangul syllable section of the Unicode standard. */
16211         int cp = 0xAC00;
16212 
16213         syllable_name = sv_2mortal(newSV(syl_max_len));
16214         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16215 
16216         for (L = 0; L < LCount; L++) {
16217             for (V = 0; V < VCount; V++) {
16218                 for (T = 0; T < TCount; T++) {
16219 
16220                     /* Truncate back to the prefix, which is unvarying */
16221                     SvCUR_set(syllable_name, hangul_prefix_len);
16222 
16223                     sv_catpv(syllable_name, JamoL[L]);
16224                     sv_catpv(syllable_name, JamoV[V]);
16225                     sv_catpv(syllable_name, JamoT[T]);
16226 
16227                     if (execute_wildcard(subpattern_re,
16228                                 SvPVX(syllable_name),
16229                                 SvEND(syllable_name),
16230                                 SvPVX(syllable_name), 0,
16231                                 syllable_name,
16232                                 0))
16233                     {
16234                         *prop_definition = add_cp_to_invlist(*prop_definition,
16235                                                              cp);
16236                         found_matches = TRUE;
16237                     }
16238 
16239                     cp++;
16240                 }
16241             }
16242         }
16243     }
16244 
16245     /* The rest of the algorithmically generatable names are of the form
16246      * "PREFIX-code_point".  The prefixes and the code point limits of each
16247      * were returned to us in the array 'algorithmic_names' from data in
16248      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
16249     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16250         IV j;
16251 
16252         /* Each element of the array is a hash, giving the details for the
16253          * series of names it covers.  There is the base name of the characters
16254          * in the series, and the low and high code points in the series.  And,
16255          * for optimization purposes a string containing all the legal
16256          * characters that could possibly be in a name in this series. */
16257         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16258         SV * prefix = * hv_fetchs(this_series, "name", 0);
16259         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16260         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16261         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16262 
16263         /* Pre-allocate an SV with enough space */
16264         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16265                                                         SvPVX(prefix)));
16266         if (high >= 0x10000) {
16267             sv_catpvs(algo_name, "0");
16268         }
16269 
16270         /* This series can be skipped entirely if the pattern requires
16271          * something longer than any name in the series, or can only match an
16272          * empty name, or contains a character not found in any name in the
16273          * series */
16274         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
16275             &&  prog->maxlen > 0
16276             && (strspn(must, legal) == must_len))
16277         {
16278             for (j = low; j <= high; j++) { /* For each code point in the series */
16279 
16280                 /* Get its name, and see if it matches the subpattern */
16281                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16282                                      (unsigned) j);
16283 
16284                 if (execute_wildcard(subpattern_re,
16285                                     SvPVX(algo_name),
16286                                     SvEND(algo_name),
16287                                     SvPVX(algo_name), 0,
16288                                     algo_name,
16289                                     0))
16290                 {
16291                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
16292                     found_matches = TRUE;
16293                 }
16294             }
16295         }
16296     }
16297 
16298   check_empty:
16299     /* Finally, see if the subpattern matches an empty string */
16300     empty = newSVpvs("");
16301     if (execute_wildcard(subpattern_re,
16302                          SvPVX(empty),
16303                          SvEND(empty),
16304                          SvPVX(empty), 0,
16305                          empty,
16306                          0))
16307     {
16308         /* Many code points have empty names.  Currently these are the \p{GC=C}
16309          * ones, minus CC and CF */
16310 
16311         SV * empty_names_ref = get_prop_definition(UNI_C);
16312         SV * empty_names = invlist_clone(empty_names_ref, NULL);
16313 
16314         SV * subtract = get_prop_definition(UNI_CC);
16315 
16316         _invlist_subtract(empty_names, subtract, &empty_names);
16317         SvREFCNT_dec_NN(empty_names_ref);
16318         SvREFCNT_dec_NN(subtract);
16319 
16320         subtract = get_prop_definition(UNI_CF);
16321         _invlist_subtract(empty_names, subtract, &empty_names);
16322         SvREFCNT_dec_NN(subtract);
16323 
16324         _invlist_union(*prop_definition, empty_names, prop_definition);
16325         found_matches = TRUE;
16326         SvREFCNT_dec_NN(empty_names);
16327     }
16328     SvREFCNT_dec_NN(empty);
16329 
16330 #if 0
16331     /* If we ever were to accept aliases for, say private use names, we would
16332      * need to do something fancier to find empty names.  The code below works
16333      * (at the time it was written), and is slower than the above */
16334     const char empties_pat[] = "^.";
16335     if (strNE(name, empties_pat)) {
16336         SV * empty = newSVpvs("");
16337         if (execute_wildcard(subpattern_re,
16338                     SvPVX(empty),
16339                     SvEND(empty),
16340                     SvPVX(empty), 0,
16341                     empty,
16342                     0))
16343         {
16344             SV * empties = NULL;
16345 
16346             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16347 
16348             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16349             SvREFCNT_dec_NN(empties);
16350 
16351             found_matches = TRUE;
16352         }
16353         SvREFCNT_dec_NN(empty);
16354     }
16355 #endif
16356 
16357     SvREFCNT_dec_NN(subpattern_re);
16358     return found_matches;
16359 }
16360 
16361 /*
16362  * ex: set ts=8 sts=4 sw=4 et:
16363  */
16364