xref: /openbsd/gnu/usr.bin/perl/regcomp.c (revision e0680481)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 /*
35  * pregcomp and pregexec -- regsub and regerror are not used in perl
36  *
37  *	Copyright (c) 1986 by University of Toronto.
38  *	Written by Henry Spencer.  Not derived from licensed software.
39  *
40  *	Permission is granted to anyone to use this software for any
41  *	purpose on any computer system, and to redistribute it freely,
42  *	subject to the following restrictions:
43  *
44  *	1. The author is not responsible for the consequences of use of
45  *		this software, no matter how awful, even if they arise
46  *		from defects in it.
47  *
48  *	2. The origin of this software must not be misrepresented, either
49  *		by explicit claim or by omission.
50  *
51  *	3. Altered versions must be plainly marked as such, and must not
52  *		be misrepresented as being the original software.
53  *
54  *
55  ****    Alterations to Henry's code are...
56  ****
57  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59  ****    by Larry Wall and others
60  ****
61  ****    You may distribute under the terms of either the GNU General Public
62  ****    License or the Artistic License, as specified in the README file.
63 
64  *
65  * Beware that some of this code is subtly aware of the way operator
66  * precedence is structured in regular expressions.  Serious changes in
67  * regular-expression syntax might require a total rethink.
68  */
69 
70 /* Note on debug output:
71  *
72  * This is set up so that -Dr turns on debugging like all other flags that are
73  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
74  * all regular expressions encountered in a program, and gives a huge amount of
75  * output for all but the shortest programs.
76  *
77  * The ability to output pattern debugging information lexically, and with much
78  * finer grained control was added, with 'use re qw(Debug ....);' available even
79  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
80  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81  * Those files are compiled and linked into the perl executable, and they are
82  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83  * to re.pm.
84  *
85  * That would normally mean linking errors when two functions of the same name
86  * are attempted to be placed into the same executable.  That is solved in one
87  * of four ways:
88  *  1)  Static functions aren't known outside the file they are in, so for the
89  *      many functions of that type in this file, it just isn't a problem.
90  *  2)  Most externally known functions are enclosed in
91  *          #ifndef PERL_IN_XSUB_RE
92  *          ...
93  *          #endif
94  *      blocks, so there is only one definition for them in the whole
95  *      executable, the one in regcomp.c (or regexec.c).  The implication of
96  *      that is any debugging info that comes from them is controlled only by
97  *      -Dr.  Further, any static function they call will also be the version
98  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
100  *      have different names, so that what gets loaded in the executable is
101  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
103  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104  *      versions and their callees are under control of re.pm.   The catch is
105  *      that references to all these go through the regexp_engine structure,
106  *      which is initialized in regcomp.h to the Perl_foo versions, and
107  *      substituted out in lexical scopes where 'use re' is in effect to the
108  *      'my_foo' ones.   That structure is public API, so it would be a hard
109  *      sell to add any additional members.
110  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
111  *      respectively, regexec.c and re_exec.c, they can have two different
112  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113  *      embed.fnc.
114  *
115  * The bottom line is that if you add code to one of the public functions
116  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
117  * a new function that needs to do debugging or there is a chain of calls from
118  * it that need to do debugging, all functions in the chain should use options
119  * 2) or 4) above.
120  *
121  * A function may have to be split so that debugging stuff is static, but it
122  * calls out to some other function that only gets compiled in regcomp.c to
123  * access data that we don't want to duplicate.
124  */
125 
126 #ifdef PERL_EXT_RE_BUILD
127 #include "re_top.h"
128 #endif
129 
130 #include "EXTERN.h"
131 #define PERL_IN_REGEX_ENGINE
132 #define PERL_IN_REGCOMP_ANY
133 #define PERL_IN_REGCOMP_C
134 #include "perl.h"
135 
136 #ifdef PERL_IN_XSUB_RE
137 #  include "re_comp.h"
138 EXTERN_C const struct regexp_engine my_reg_engine;
139 EXTERN_C const struct regexp_engine wild_reg_engine;
140 #else
141 #  include "regcomp.h"
142 #endif
143 
144 #include "invlist_inline.h"
145 #include "unicode_constants.h"
146 #include "regcomp_internal.h"
147 
148 /* =========================================================
149  * BEGIN edit_distance stuff.
150  *
151  * This calculates how many single character changes of any type are needed to
152  * transform a string into another one.  It is taken from version 3.1 of
153  *
154  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155  */
156 
157 /* Our unsorted dictionary linked list.   */
158 /* Note we use UVs, not chars. */
159 
160 struct dictionary{
161   UV key;
162   UV value;
163   struct dictionary* next;
164 };
165 typedef struct dictionary item;
166 
167 
168 PERL_STATIC_INLINE item*
push(UV key,item * curr)169 push(UV key, item* curr)
170 {
171     item* head;
172     Newx(head, 1, item);
173     head->key = key;
174     head->value = 0;
175     head->next = curr;
176     return head;
177 }
178 
179 
180 PERL_STATIC_INLINE item*
find(item * head,UV key)181 find(item* head, UV key)
182 {
183     item* iterator = head;
184     while (iterator){
185         if (iterator->key == key){
186             return iterator;
187         }
188         iterator = iterator->next;
189     }
190 
191     return NULL;
192 }
193 
194 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)195 uniquePush(item* head, UV key)
196 {
197     item* iterator = head;
198 
199     while (iterator){
200         if (iterator->key == key) {
201             return head;
202         }
203         iterator = iterator->next;
204     }
205 
206     return push(key, head);
207 }
208 
209 PERL_STATIC_INLINE void
dict_free(item * head)210 dict_free(item* head)
211 {
212     item* iterator = head;
213 
214     while (iterator) {
215         item* temp = iterator;
216         iterator = iterator->next;
217         Safefree(temp);
218     }
219 
220     head = NULL;
221 }
222 
223 /* End of Dictionary Stuff */
224 
225 /* All calculations/work are done here */
226 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)227 S_edit_distance(const UV* src,
228                 const UV* tgt,
229                 const STRLEN x,             /* length of src[] */
230                 const STRLEN y,             /* length of tgt[] */
231                 const SSize_t maxDistance
232 )
233 {
234     item *head = NULL;
235     UV swapCount, swapScore, targetCharCount, i, j;
236     UV *scores;
237     UV score_ceil = x + y;
238 
239     PERL_ARGS_ASSERT_EDIT_DISTANCE;
240 
241     /* initialize matrix start values */
242     Newx(scores, ( (x + 2) * (y + 2)), UV);
243     scores[0] = score_ceil;
244     scores[1 * (y + 2) + 0] = score_ceil;
245     scores[0 * (y + 2) + 1] = score_ceil;
246     scores[1 * (y + 2) + 1] = 0;
247     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248 
249     /* work loops    */
250     /* i = src index */
251     /* j = tgt index */
252     for (i=1;i<=x;i++) {
253         if (i < x)
254             head = uniquePush(head, src[i]);
255         scores[(i+1) * (y + 2) + 1] = i;
256         scores[(i+1) * (y + 2) + 0] = score_ceil;
257         swapCount = 0;
258 
259         for (j=1;j<=y;j++) {
260             if (i == 1) {
261                 if(j < y)
262                 head = uniquePush(head, tgt[j]);
263                 scores[1 * (y + 2) + (j + 1)] = j;
264                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
265             }
266 
267             targetCharCount = find(head, tgt[j-1])->value;
268             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269 
270             if (src[i-1] != tgt[j-1]){
271                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272             }
273             else {
274                 swapCount = j;
275                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276             }
277         }
278 
279         find(head, src[i-1])->value = i;
280     }
281 
282     {
283         IV score = scores[(x+1) * (y + 2) + (y + 1)];
284         dict_free(head);
285         Safefree(scores);
286         return (maxDistance != 0 && maxDistance < score)?(-1):score;
287     }
288 }
289 
290 /* END of edit_distance() stuff
291  * ========================================================= */
292 
293 /* add a data member to the struct reg_data attached to this regex, it should
294  * always return a non-zero return. the 's' argument is the type of the items
295  * being added and the n is the number of items. The length of 's' should match
296  * the number of items. */
297 U32
Perl_reg_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)298 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
299 {
300     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
301 
302     PERL_ARGS_ASSERT_REG_ADD_DATA;
303 
304     /* in the below expression we have (count + n - 1), the minus one is there
305      * because the struct that we allocate already contains a slot for 1 data
306      * item, so we do not need to allocate it the first time. IOW, the
307      * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
308      * to allocate. See struct reg_data in regcomp.h
309      */
310     Renewc(RExC_rxi->data,
311            sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
312            char, struct reg_data);
313     /* however in the data->what expression we use (count + n) and do not
314      * subtract one from the result because the data structure contains a
315      * pointer to an array, and does not allocate the first element as part of
316      * the data struct. */
317     if (count > 1)
318         Renew(RExC_rxi->data->what, (count + n), U8);
319     else {
320         /* when count == 1 it means we have not initialized anything.
321          * we always fill the 0 slot of the data array with a '%' entry, which
322          * means "zero" (all the other types are letters) which exists purely
323          * so the return from reg_add_data is ALWAYS true, so we can tell it apart
324          * from a "no value" idx=0 in places where we would return an index
325          * into reg_add_data.  This is particularly important with the new "single
326          * pass, usually, but not always" strategy that we use, where the code
327          * will use a 0 to represent "not able to compute this yet".
328          */
329         Newx(RExC_rxi->data->what, n+1, U8);
330         /* fill in the placeholder slot of 0 with a what of '%', we use
331          * this because it sorta looks like a zero (0/0) and it is not a letter
332          * like any of the other "whats", this type should never be created
333          * any other way but here. '%' happens to also not appear in this
334          * file for any other reason (at the time of writing this comment)*/
335         RExC_rxi->data->what[0]= '%';
336         RExC_rxi->data->data[0]= NULL;
337     }
338     RExC_rxi->data->count = count + n;
339     Copy(s, RExC_rxi->data->what + count, n, U8);
340     assert(count>0);
341     return count;
342 }
343 
344 /*XXX: todo make this not included in a non debugging perl, but appears to be
345  * used anyway there, in 'use re' */
346 #ifndef PERL_IN_XSUB_RE
347 void
Perl_reginitcolors(pTHX)348 Perl_reginitcolors(pTHX)
349 {
350     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
351     if (s) {
352         char *t = savepv(s);
353         int i = 0;
354         PL_colors[0] = t;
355         while (++i < 6) {
356             t = strchr(t, '\t');
357             if (t) {
358                 *t = '\0';
359                 PL_colors[i] = ++t;
360             }
361             else
362                 PL_colors[i] = t = (char *)"";
363         }
364     } else {
365         int i = 0;
366         while (i < 6)
367             PL_colors[i++] = (char *)"";
368     }
369     PL_colorset = 1;
370 }
371 #endif
372 
373 
374 #ifdef TRIE_STUDY_OPT
375 /* search for "restudy" in this file for a detailed explanation */
376 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
377     STMT_START {                                            \
378         if (                                                \
379               (data.flags & SCF_TRIE_RESTUDY)               \
380               && ! restudied++                              \
381         ) {                                                 \
382             dOsomething;                                    \
383             goto reStudy;                                   \
384         }                                                   \
385     } STMT_END
386 #else
387 #define CHECK_RESTUDY_GOTO_butfirst
388 #endif
389 
390 /*
391  * pregcomp - compile a regular expression into internal code
392  *
393  * Decides which engine's compiler to call based on the hint currently in
394  * scope
395  */
396 
397 #ifndef PERL_IN_XSUB_RE
398 
399 /* return the currently in-scope regex engine (or the default if none)  */
400 
401 regexp_engine const *
Perl_current_re_engine(pTHX)402 Perl_current_re_engine(pTHX)
403 {
404     if (IN_PERL_COMPILETIME) {
405         HV * const table = GvHV(PL_hintgv);
406         SV **ptr;
407 
408         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
409             return &PL_core_reg_engine;
410         ptr = hv_fetchs(table, "regcomp", FALSE);
411         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
412             return &PL_core_reg_engine;
413         return INT2PTR(regexp_engine*, SvIV(*ptr));
414     }
415     else {
416         SV *ptr;
417         if (!PL_curcop->cop_hints_hash)
418             return &PL_core_reg_engine;
419         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
420         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
421             return &PL_core_reg_engine;
422         return INT2PTR(regexp_engine*, SvIV(ptr));
423     }
424 }
425 
426 
427 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)428 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
429 {
430     regexp_engine const *eng = current_re_engine();
431     DECLARE_AND_GET_RE_DEBUG_FLAGS;
432 
433     PERL_ARGS_ASSERT_PREGCOMP;
434 
435     /* Dispatch a request to compile a regexp to correct regexp engine. */
436     DEBUG_COMPILE_r({
437         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
438                         PTR2UV(eng));
439     });
440     return CALLREGCOMP_ENG(eng, pattern, flags);
441 }
442 #endif
443 
444 /*
445 =for apidoc re_compile
446 
447 Compile the regular expression pattern C<pattern>, returning a pointer to the
448 compiled object for later matching with the internal regex engine.
449 
450 This function is typically used by a custom regexp engine C<.comp()> function
451 to hand off to the core regexp engine those patterns it doesn't want to handle
452 itself (typically passing through the same flags it was called with).  In
453 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
454 to compile using the currently active regexp engine.
455 
456 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
457 pointer to the input.  Otherwise the PV is extracted and treated like a string
458 representing a pattern.  See L<perlre>.
459 
460 The possible flags for C<rx_flags> are documented in L<perlreapi>.  Their names
461 all begin with C<RXf_>.
462 
463 =cut
464 
465  * public entry point for the perl core's own regex compiling code.
466  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
467  * pattern rather than a list of OPs, and uses the internal engine rather
468  * than the current one */
469 
470 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)471 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
472 {
473     SV *pat = pattern; /* defeat constness! */
474 
475     PERL_ARGS_ASSERT_RE_COMPILE;
476 
477     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
478 #ifdef PERL_IN_XSUB_RE
479                                 &my_reg_engine,
480 #else
481                                 &PL_core_reg_engine,
482 #endif
483                                 NULL, NULL, rx_flags, 0);
484 }
485 
486 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)487 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
488 {
489     int n;
490 
491     if (--cbs->refcnt > 0)
492         return;
493     for (n = 0; n < cbs->count; n++) {
494         REGEXP *rx = cbs->cb[n].src_regex;
495         if (rx) {
496             cbs->cb[n].src_regex = NULL;
497             SvREFCNT_dec_NN(rx);
498         }
499     }
500     Safefree(cbs->cb);
501     Safefree(cbs);
502 }
503 
504 
505 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)506 S_alloc_code_blocks(pTHX_  int ncode)
507 {
508      struct reg_code_blocks *cbs;
509     Newx(cbs, 1, struct reg_code_blocks);
510     cbs->count = ncode;
511     cbs->refcnt = 1;
512     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
513     if (ncode)
514         Newx(cbs->cb, ncode, struct reg_code_block);
515     else
516         cbs->cb = NULL;
517     return cbs;
518 }
519 
520 
521 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
522  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
523  * point to the realloced string and length.
524  *
525  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
526  * stuff added */
527 
528 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)529 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
530                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
531 {
532     U8 *const src = (U8*)*pat_p;
533     U8 *dst, *d;
534     int n=0;
535     STRLEN s = 0;
536     bool do_end = 0;
537     DECLARE_AND_GET_RE_DEBUG_FLAGS;
538 
539     DEBUG_PARSE_r(Perl_re_printf( aTHX_
540         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
541 
542     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
543     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
544     d = dst;
545 
546     while (s < *plen_p) {
547         append_utf8_from_native_byte(src[s], &d);
548 
549         if (n < num_code_blocks) {
550             assert(pRExC_state->code_blocks);
551             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
552                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
553                 assert(*(d - 1) == '(');
554                 do_end = 1;
555             }
556             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
557                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
558                 assert(*(d - 1) == ')');
559                 do_end = 0;
560                 n++;
561             }
562         }
563         s++;
564     }
565     *d = '\0';
566     *plen_p = d - dst;
567     *pat_p = (char*) dst;
568     SAVEFREEPV(*pat_p);
569     RExC_orig_utf8 = RExC_utf8 = 1;
570 }
571 
572 
573 
574 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
575  * while recording any code block indices, and handling overloading,
576  * nested qr// objects etc.  If pat is null, it will allocate a new
577  * string, or just return the first arg, if there's only one.
578  *
579  * Returns the malloced/updated pat.
580  * patternp and pat_count is the array of SVs to be concatted;
581  * oplist is the optional list of ops that generated the SVs;
582  * recompile_p is a pointer to a boolean that will be set if
583  *   the regex will need to be recompiled.
584  * delim, if non-null is an SV that will be inserted between each element
585  */
586 
587 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)588 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
589                 SV *pat, SV ** const patternp, int pat_count,
590                 OP *oplist, bool *recompile_p, SV *delim)
591 {
592     SV **svp;
593     int n = 0;
594     bool use_delim = FALSE;
595     bool alloced = FALSE;
596 
597     /* if we know we have at least two args, create an empty string,
598      * then concatenate args to that. For no args, return an empty string */
599     if (!pat && pat_count != 1) {
600         pat = newSVpvs("");
601         SAVEFREESV(pat);
602         alloced = TRUE;
603     }
604 
605     for (svp = patternp; svp < patternp + pat_count; svp++) {
606         SV *sv;
607         SV *rx  = NULL;
608         STRLEN orig_patlen = 0;
609         bool code = 0;
610         SV *msv = use_delim ? delim : *svp;
611         if (!msv) msv = &PL_sv_undef;
612 
613         /* if we've got a delimiter, we go round the loop twice for each
614          * svp slot (except the last), using the delimiter the second
615          * time round */
616         if (use_delim) {
617             svp--;
618             use_delim = FALSE;
619         }
620         else if (delim)
621             use_delim = TRUE;
622 
623         if (SvTYPE(msv) == SVt_PVAV) {
624             /* we've encountered an interpolated array within
625              * the pattern, e.g. /...@a..../. Expand the list of elements,
626              * then recursively append elements.
627              * The code in this block is based on S_pushav() */
628 
629             AV *const av = (AV*)msv;
630             const SSize_t maxarg = AvFILL(av) + 1;
631             SV **array;
632 
633             if (oplist) {
634                 assert(oplist->op_type == OP_PADAV
635                     || oplist->op_type == OP_RV2AV);
636                 oplist = OpSIBLING(oplist);
637             }
638 
639             if (SvRMAGICAL(av)) {
640                 SSize_t i;
641 
642                 Newx(array, maxarg, SV*);
643                 SAVEFREEPV(array);
644                 for (i=0; i < maxarg; i++) {
645                     SV ** const svp = av_fetch(av, i, FALSE);
646                     array[i] = svp ? *svp : &PL_sv_undef;
647                 }
648             }
649             else
650                 array = AvARRAY(av);
651 
652             if (maxarg > 0) {
653                 pat = S_concat_pat(aTHX_ pRExC_state, pat,
654                                    array, maxarg, NULL, recompile_p,
655                                    /* $" */
656                                    GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
657             }
658             else if (!pat) {
659                 pat = newSVpvs_flags("", SVs_TEMP);
660             }
661 
662             continue;
663         }
664 
665 
666         /* we make the assumption here that each op in the list of
667          * op_siblings maps to one SV pushed onto the stack,
668          * except for code blocks, with have both an OP_NULL and
669          * an OP_CONST.
670          * This allows us to match up the list of SVs against the
671          * list of OPs to find the next code block.
672          *
673          * Note that       PUSHMARK PADSV PADSV ..
674          * is optimised to
675          *                 PADRANGE PADSV  PADSV  ..
676          * so the alignment still works. */
677 
678         if (oplist) {
679             if (oplist->op_type == OP_NULL
680                 && (oplist->op_flags & OPf_SPECIAL))
681             {
682                 assert(n < pRExC_state->code_blocks->count);
683                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
684                 pRExC_state->code_blocks->cb[n].block = oplist;
685                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
686                 n++;
687                 code = 1;
688                 oplist = OpSIBLING(oplist); /* skip CONST */
689                 assert(oplist);
690             }
691             oplist = OpSIBLING(oplist);;
692         }
693 
694         /* apply magic and QR overloading to arg */
695 
696         SvGETMAGIC(msv);
697         if (SvROK(msv) && SvAMAGIC(msv)) {
698             SV *sv = AMG_CALLunary(msv, regexp_amg);
699             if (sv) {
700                 if (SvROK(sv))
701                     sv = SvRV(sv);
702                 if (SvTYPE(sv) != SVt_REGEXP)
703                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
704                 msv = sv;
705             }
706         }
707 
708         /* try concatenation overload ... */
709         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
710                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
711         {
712             sv_setsv(pat, sv);
713             /* overloading involved: all bets are off over literal
714              * code. Pretend we haven't seen it */
715             if (n)
716                 pRExC_state->code_blocks->count -= n;
717             n = 0;
718         }
719         else {
720             /* ... or failing that, try "" overload */
721             while (SvAMAGIC(msv)
722                     && (sv = AMG_CALLunary(msv, string_amg))
723                     && sv != msv
724                     &&  !(   SvROK(msv)
725                           && SvROK(sv)
726                           && SvRV(msv) == SvRV(sv))
727             ) {
728                 msv = sv;
729                 SvGETMAGIC(msv);
730             }
731             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
732                 msv = SvRV(msv);
733 
734             if (pat) {
735                 /* this is a partially unrolled
736                  *     sv_catsv_nomg(pat, msv);
737                  * that allows us to adjust code block indices if
738                  * needed */
739                 STRLEN dlen;
740                 char *dst = SvPV_force_nomg(pat, dlen);
741                 orig_patlen = dlen;
742                 if (SvUTF8(msv) && !SvUTF8(pat)) {
743                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
744                     sv_setpvn(pat, dst, dlen);
745                     SvUTF8_on(pat);
746                 }
747                 sv_catsv_nomg(pat, msv);
748                 rx = msv;
749             }
750             else {
751                 /* We have only one SV to process, but we need to verify
752                  * it is properly null terminated or we will fail asserts
753                  * later. In theory we probably shouldn't get such SV's,
754                  * but if we do we should handle it gracefully. */
755                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
756                     /* not a string, or a string with a trailing null */
757                     pat = msv;
758                 } else {
759                     /* a string with no trailing null, we need to copy it
760                      * so it has a trailing null */
761                     pat = sv_2mortal(newSVsv(msv));
762                 }
763             }
764 
765             if (code)
766                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
767         }
768 
769         /* extract any code blocks within any embedded qr//'s */
770         if (rx && SvTYPE(rx) == SVt_REGEXP
771             && RX_ENGINE((REGEXP*)rx)->op_comp)
772         {
773 
774             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
775             if (ri->code_blocks && ri->code_blocks->count) {
776                 int i;
777                 /* the presence of an embedded qr// with code means
778                  * we should always recompile: the text of the
779                  * qr// may not have changed, but it may be a
780                  * different closure than last time */
781                 *recompile_p = 1;
782                 if (pRExC_state->code_blocks) {
783                     int new_count = pRExC_state->code_blocks->count
784                             + ri->code_blocks->count;
785                     Renew(pRExC_state->code_blocks->cb,
786                             new_count, struct reg_code_block);
787                     pRExC_state->code_blocks->count = new_count;
788                 }
789                 else
790                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
791                                                     ri->code_blocks->count);
792 
793                 for (i=0; i < ri->code_blocks->count; i++) {
794                     struct reg_code_block *src, *dst;
795                     STRLEN offset =  orig_patlen
796                         + ReANY((REGEXP *)rx)->pre_prefix;
797                     assert(n < pRExC_state->code_blocks->count);
798                     src = &ri->code_blocks->cb[i];
799                     dst = &pRExC_state->code_blocks->cb[n];
800                     dst->start	    = src->start + offset;
801                     dst->end	    = src->end   + offset;
802                     dst->block	    = src->block;
803                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
804                                             src->src_regex
805                                                 ? src->src_regex
806                                                 : (REGEXP*)rx);
807                     n++;
808                 }
809             }
810         }
811     }
812     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
813     if (alloced)
814         SvSETMAGIC(pat);
815 
816     return pat;
817 }
818 
819 
820 
821 /* see if there are any run-time code blocks in the pattern.
822  * False positives are allowed */
823 
824 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)825 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
826                     char *pat, STRLEN plen)
827 {
828     int n = 0;
829     STRLEN s;
830 
831     PERL_UNUSED_CONTEXT;
832 
833     for (s = 0; s < plen; s++) {
834         if (   pRExC_state->code_blocks
835             && n < pRExC_state->code_blocks->count
836             && s == pRExC_state->code_blocks->cb[n].start)
837         {
838             s = pRExC_state->code_blocks->cb[n].end;
839             n++;
840             continue;
841         }
842         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
843          * positives here */
844         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
845             (pat[s+2] == '{'
846                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
847         )
848             return 1;
849     }
850     return 0;
851 }
852 
853 /* Handle run-time code blocks. We will already have compiled any direct
854  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
855  * copy of it, but with any literal code blocks blanked out and
856  * appropriate chars escaped; then feed it into
857  *
858  *    eval "qr'modified_pattern'"
859  *
860  * For example,
861  *
862  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
863  *
864  * becomes
865  *
866  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
867  *
868  * After eval_sv()-ing that, grab any new code blocks from the returned qr
869  * and merge them with any code blocks of the original regexp.
870  *
871  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
872  * instead, just save the qr and return FALSE; this tells our caller that
873  * the original pattern needs upgrading to utf8.
874  */
875 
876 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)877 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
878     char *pat, STRLEN plen)
879 {
880     SV *qr;
881 
882     DECLARE_AND_GET_RE_DEBUG_FLAGS;
883 
884     if (pRExC_state->runtime_code_qr) {
885         /* this is the second time we've been called; this should
886          * only happen if the main pattern got upgraded to utf8
887          * during compilation; re-use the qr we compiled first time
888          * round (which should be utf8 too)
889          */
890         qr = pRExC_state->runtime_code_qr;
891         pRExC_state->runtime_code_qr = NULL;
892         assert(RExC_utf8 && SvUTF8(qr));
893     }
894     else {
895         int n = 0;
896         STRLEN s;
897         char *p, *newpat;
898         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
899         SV *sv, *qr_ref;
900         dSP;
901 
902         /* determine how many extra chars we need for ' and \ escaping */
903         for (s = 0; s < plen; s++) {
904             if (pat[s] == '\'' || pat[s] == '\\')
905                 newlen++;
906         }
907 
908         Newx(newpat, newlen, char);
909         p = newpat;
910         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
911 
912         for (s = 0; s < plen; s++) {
913             if (   pRExC_state->code_blocks
914                 && n < pRExC_state->code_blocks->count
915                 && s == pRExC_state->code_blocks->cb[n].start)
916             {
917                 /* blank out literal code block so that they aren't
918                  * recompiled: eg change from/to:
919                  *     /(?{xyz})/
920                  *     /(?=====)/
921                  * and
922                  *     /(??{xyz})/
923                  *     /(?======)/
924                  * and
925                  *     /(?(?{xyz}))/
926                  *     /(?(?=====))/
927                 */
928                 assert(pat[s]   == '(');
929                 assert(pat[s+1] == '?');
930                 *p++ = '(';
931                 *p++ = '?';
932                 s += 2;
933                 while (s < pRExC_state->code_blocks->cb[n].end) {
934                     *p++ = '=';
935                     s++;
936                 }
937                 *p++ = ')';
938                 n++;
939                 continue;
940             }
941             if (pat[s] == '\'' || pat[s] == '\\')
942                 *p++ = '\\';
943             *p++ = pat[s];
944         }
945         *p++ = '\'';
946         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
947             *p++ = 'x';
948             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
949                 *p++ = 'x';
950             }
951         }
952         *p++ = '\0';
953         DEBUG_COMPILE_r({
954             Perl_re_printf( aTHX_
955                 "%sre-parsing pattern for runtime code:%s %s\n",
956                 PL_colors[4], PL_colors[5], newpat);
957         });
958 
959         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
960         Safefree(newpat);
961 
962         ENTER;
963         SAVETMPS;
964         save_re_context();
965         PUSHSTACKi(PERLSI_REQUIRE);
966         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
967          * parsing qr''; normally only q'' does this. It also alters
968          * hints handling */
969         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
970         SvREFCNT_dec_NN(sv);
971         SPAGAIN;
972         qr_ref = POPs;
973         PUTBACK;
974         {
975             SV * const errsv = ERRSV;
976             if (SvTRUE_NN(errsv))
977                 /* use croak_sv ? */
978                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
979         }
980         assert(SvROK(qr_ref));
981         qr = SvRV(qr_ref);
982         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
983         /* the leaving below frees the tmp qr_ref.
984          * Give qr a life of its own */
985         SvREFCNT_inc(qr);
986         POPSTACK;
987         FREETMPS;
988         LEAVE;
989 
990     }
991 
992     if (!RExC_utf8 && SvUTF8(qr)) {
993         /* first time through; the pattern got upgraded; save the
994          * qr for the next time through */
995         assert(!pRExC_state->runtime_code_qr);
996         pRExC_state->runtime_code_qr = qr;
997         return 0;
998     }
999 
1000 
1001     /* extract any code blocks within the returned qr//  */
1002 
1003 
1004     /* merge the main (r1) and run-time (r2) code blocks into one */
1005     {
1006         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1007         struct reg_code_block *new_block, *dst;
1008         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1009         int i1 = 0, i2 = 0;
1010         int r1c, r2c;
1011 
1012         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1013         {
1014             SvREFCNT_dec_NN(qr);
1015             return 1;
1016         }
1017 
1018         if (!r1->code_blocks)
1019             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1020 
1021         r1c = r1->code_blocks->count;
1022         r2c = r2->code_blocks->count;
1023 
1024         Newx(new_block, r1c + r2c, struct reg_code_block);
1025 
1026         dst = new_block;
1027 
1028         while (i1 < r1c || i2 < r2c) {
1029             struct reg_code_block *src;
1030             bool is_qr = 0;
1031 
1032             if (i1 == r1c) {
1033                 src = &r2->code_blocks->cb[i2++];
1034                 is_qr = 1;
1035             }
1036             else if (i2 == r2c)
1037                 src = &r1->code_blocks->cb[i1++];
1038             else if (  r1->code_blocks->cb[i1].start
1039                      < r2->code_blocks->cb[i2].start)
1040             {
1041                 src = &r1->code_blocks->cb[i1++];
1042                 assert(src->end < r2->code_blocks->cb[i2].start);
1043             }
1044             else {
1045                 assert(  r1->code_blocks->cb[i1].start
1046                        > r2->code_blocks->cb[i2].start);
1047                 src = &r2->code_blocks->cb[i2++];
1048                 is_qr = 1;
1049                 assert(src->end < r1->code_blocks->cb[i1].start);
1050             }
1051 
1052             assert(pat[src->start] == '(');
1053             assert(pat[src->end]   == ')');
1054             dst->start	    = src->start;
1055             dst->end	    = src->end;
1056             dst->block	    = src->block;
1057             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1058                                     : src->src_regex;
1059             dst++;
1060         }
1061         r1->code_blocks->count += r2c;
1062         Safefree(r1->code_blocks->cb);
1063         r1->code_blocks->cb = new_block;
1064     }
1065 
1066     SvREFCNT_dec_NN(qr);
1067     return 1;
1068 }
1069 
1070 
1071 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)1072 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1073                       struct reg_substr_datum  *rsd,
1074                       struct scan_data_substrs *sub,
1075                       STRLEN longest_length)
1076 {
1077     /* This is the common code for setting up the floating and fixed length
1078      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
1079      * as to whether succeeded or not */
1080 
1081     I32 t;
1082     SSize_t ml;
1083     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
1084     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1085 
1086     if (! (longest_length
1087            || (eol /* Can't have SEOL and MULTI */
1088                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1089           )
1090             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1091         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1092     {
1093         return FALSE;
1094     }
1095 
1096     /* copy the information about the longest from the reg_scan_data
1097         over to the program. */
1098     if (SvUTF8(sub->str)) {
1099         rsd->substr      = NULL;
1100         rsd->utf8_substr = sub->str;
1101     } else {
1102         rsd->substr      = sub->str;
1103         rsd->utf8_substr = NULL;
1104     }
1105     /* end_shift is how many chars that must be matched that
1106         follow this item. We calculate it ahead of time as once the
1107         lookbehind offset is added in we lose the ability to correctly
1108         calculate it.*/
1109     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1110     rsd->end_shift = ml - sub->min_offset
1111         - longest_length
1112             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1113              * intead? - DAPM
1114             + (SvTAIL(sub->str) != 0)
1115             */
1116         + sub->lookbehind;
1117 
1118     t = (eol/* Can't have SEOL and MULTI */
1119          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1120     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1121 
1122     return TRUE;
1123 }
1124 
1125 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)1126 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1127 {
1128     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1129      * properly wrapped with the right modifiers */
1130 
1131     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1132     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1133                                                 != REGEX_DEPENDS_CHARSET);
1134 
1135     /* The caret is output if there are any defaults: if not all the STD
1136         * flags are set, or if no character set specifier is needed */
1137     bool has_default =
1138                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1139                 || ! has_charset);
1140     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1141                                                 == REG_RUN_ON_COMMENT_SEEN);
1142     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1143                         >> RXf_PMf_STD_PMMOD_SHIFT);
1144     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
1145     char *p;
1146     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1147 
1148     /* We output all the necessary flags; we never output a minus, as all
1149         * those are defaults, so are
1150         * covered by the caret */
1151     const STRLEN wraplen = pat_len + has_p + has_runon
1152         + has_default       /* If needs a caret */
1153         + PL_bitcount[reganch] /* 1 char for each set standard flag */
1154 
1155             /* If needs a character set specifier */
1156         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1157         + (sizeof("(?:)") - 1);
1158 
1159     PERL_ARGS_ASSERT_SET_REGEX_PV;
1160 
1161     /* make sure PL_bitcount bounds not exceeded */
1162     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1163 
1164     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1165     SvPOK_on(Rx);
1166     if (RExC_utf8)
1167         SvFLAGS(Rx) |= SVf_UTF8;
1168     *p++='('; *p++='?';
1169 
1170     /* If a default, cover it using the caret */
1171     if (has_default) {
1172         *p++= DEFAULT_PAT_MOD;
1173     }
1174     if (has_charset) {
1175         STRLEN len;
1176         const char* name;
1177 
1178         name = get_regex_charset_name(RExC_rx->extflags, &len);
1179         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
1180             assert(RExC_utf8);
1181             name = UNICODE_PAT_MODS;
1182             len = sizeof(UNICODE_PAT_MODS) - 1;
1183         }
1184         Copy(name, p, len, char);
1185         p += len;
1186     }
1187     if (has_p)
1188         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1189     {
1190         char ch;
1191         while((ch = *fptr++)) {
1192             if(reganch & 1)
1193                 *p++ = ch;
1194             reganch >>= 1;
1195         }
1196     }
1197 
1198     *p++ = ':';
1199     Copy(RExC_precomp, p, pat_len, char);
1200     assert ((RX_WRAPPED(Rx) - p) < 16);
1201     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1202     p += pat_len;
1203 
1204     /* Adding a trailing \n causes this to compile properly:
1205             my $R = qr / A B C # D E/x; /($R)/
1206         Otherwise the parens are considered part of the comment */
1207     if (has_runon)
1208         *p++ = '\n';
1209     *p++ = ')';
1210     *p = 0;
1211     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1212 }
1213 
1214 /*
1215  * Perl_re_op_compile - the perl internal RE engine's function to compile a
1216  * regular expression into internal code.
1217  * The pattern may be passed either as:
1218  *    a list of SVs (patternp plus pat_count)
1219  *    a list of OPs (expr)
1220  * If both are passed, the SV list is used, but the OP list indicates
1221  * which SVs are actually pre-compiled code blocks
1222  *
1223  * The SVs in the list have magic and qr overloading applied to them (and
1224  * the list may be modified in-place with replacement SVs in the latter
1225  * case).
1226  *
1227  * If the pattern hasn't changed from old_re, then old_re will be
1228  * returned.
1229  *
1230  * eng is the current engine. If that engine has an op_comp method, then
1231  * handle directly (i.e. we assume that op_comp was us); otherwise, just
1232  * do the initial concatenation of arguments and pass on to the external
1233  * engine.
1234  *
1235  * If is_bare_re is not null, set it to a boolean indicating whether the
1236  * arg list reduced (after overloading) to a single bare regex which has
1237  * been returned (i.e. /$qr/).
1238  *
1239  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1240  *
1241  * pm_flags contains the PMf_* flags, typically based on those from the
1242  * pm_flags field of the related PMOP. Currently we're only interested in
1243  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1244  *
1245  * For many years this code had an initial sizing pass that calculated
1246  * (sometimes incorrectly, leading to security holes) the size needed for the
1247  * compiled pattern.  That was changed by commit
1248  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1249  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
1250  * references to this sizing pass.
1251  *
1252  * Now, an initial crude guess as to the size needed is made, based on the
1253  * length of the pattern.  Patches welcome to improve that guess.  That amount
1254  * of space is malloc'd and then immediately freed, and then clawed back node
1255  * by node.  This design is to minimize, to the extent possible, memory churn
1256  * when doing the reallocs.
1257  *
1258  * A separate parentheses counting pass may be needed in some cases.
1259  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
1260  * of these cases.
1261  *
1262  * The existence of a sizing pass necessitated design decisions that are no
1263  * longer needed.  There are potential areas of simplification.
1264  *
1265  * Beware that the optimization-preparation code in here knows about some
1266  * of the structure of the compiled regexp.  [I'll say.]
1267  */
1268 
1269 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)1270 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1271                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
1272                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1273 {
1274     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
1275     STRLEN plen;
1276     char *exp;
1277     regnode *scan;
1278     I32 flags;
1279     SSize_t minlen = 0;
1280     U32 rx_flags;
1281     SV *pat;
1282     SV** new_patternp = patternp;
1283 
1284     /* these are all flags - maybe they should be turned
1285      * into a single int with different bit masks */
1286     I32 sawlookahead = 0;
1287     I32 sawplus = 0;
1288     I32 sawopen = 0;
1289     I32 sawminmod = 0;
1290 
1291     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1292     bool recompile = 0;
1293     bool runtime_code = 0;
1294     scan_data_t data;
1295     RExC_state_t RExC_state;
1296     RExC_state_t * const pRExC_state = &RExC_state;
1297 #ifdef TRIE_STUDY_OPT
1298     /* search for "restudy" in this file for a detailed explanation */
1299     int restudied = 0;
1300     RExC_state_t copyRExC_state;
1301 #endif
1302     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1303 
1304     PERL_ARGS_ASSERT_RE_OP_COMPILE;
1305 
1306     DEBUG_r(if (!PL_colorset) reginitcolors());
1307 
1308 
1309     pRExC_state->warn_text = NULL;
1310     pRExC_state->unlexed_names = NULL;
1311     pRExC_state->code_blocks = NULL;
1312 
1313     if (is_bare_re)
1314         *is_bare_re = FALSE;
1315 
1316     if (expr && (expr->op_type == OP_LIST ||
1317                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1318         /* allocate code_blocks if needed */
1319         OP *o;
1320         int ncode = 0;
1321 
1322         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1323             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1324                 ncode++; /* count of DO blocks */
1325 
1326         if (ncode)
1327             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1328     }
1329 
1330     if (!pat_count) {
1331         /* compile-time pattern with just OP_CONSTs and DO blocks */
1332 
1333         int n;
1334         OP *o;
1335 
1336         /* find how many CONSTs there are */
1337         assert(expr);
1338         n = 0;
1339         if (expr->op_type == OP_CONST)
1340             n = 1;
1341         else
1342             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1343                 if (o->op_type == OP_CONST)
1344                     n++;
1345             }
1346 
1347         /* fake up an SV array */
1348 
1349         assert(!new_patternp);
1350         Newx(new_patternp, n, SV*);
1351         SAVEFREEPV(new_patternp);
1352         pat_count = n;
1353 
1354         n = 0;
1355         if (expr->op_type == OP_CONST)
1356             new_patternp[n] = cSVOPx_sv(expr);
1357         else
1358             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1359                 if (o->op_type == OP_CONST)
1360                     new_patternp[n++] = cSVOPo_sv;
1361             }
1362 
1363     }
1364 
1365     DEBUG_PARSE_r(Perl_re_printf( aTHX_
1366         "Assembling pattern from %d elements%s\n", pat_count,
1367             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1368 
1369     /* set expr to the first arg op */
1370 
1371     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1372          && expr->op_type != OP_CONST)
1373     {
1374             expr = cLISTOPx(expr)->op_first;
1375             assert(   expr->op_type == OP_PUSHMARK
1376                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1377                    || expr->op_type == OP_PADRANGE);
1378             expr = OpSIBLING(expr);
1379     }
1380 
1381     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1382                         expr, &recompile, NULL);
1383 
1384     /* handle bare (possibly after overloading) regex: foo =~ $re */
1385     {
1386         SV *re = pat;
1387         if (SvROK(re))
1388             re = SvRV(re);
1389         if (SvTYPE(re) == SVt_REGEXP) {
1390             if (is_bare_re)
1391                 *is_bare_re = TRUE;
1392             SvREFCNT_inc(re);
1393             DEBUG_PARSE_r(Perl_re_printf( aTHX_
1394                 "Precompiled pattern%s\n",
1395                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1396 
1397             return (REGEXP*)re;
1398         }
1399     }
1400 
1401     exp = SvPV_nomg(pat, plen);
1402 
1403     if (!eng->op_comp) {
1404         if ((SvUTF8(pat) && IN_BYTES)
1405                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1406         {
1407             /* make a temporary copy; either to convert to bytes,
1408              * or to avoid repeating get-magic / overloaded stringify */
1409             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1410                                         (IN_BYTES ? 0 : SvUTF8(pat)));
1411         }
1412         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1413     }
1414 
1415     /* ignore the utf8ness if the pattern is 0 length */
1416     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1417     RExC_uni_semantics = 0;
1418     RExC_contains_locale = 0;
1419     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1420     RExC_in_script_run = 0;
1421     RExC_study_started = 0;
1422     pRExC_state->runtime_code_qr = NULL;
1423     RExC_frame_head= NULL;
1424     RExC_frame_last= NULL;
1425     RExC_frame_count= 0;
1426     RExC_latest_warn_offset = 0;
1427     RExC_use_BRANCHJ = 0;
1428     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
1429     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
1430     RExC_logical_total_parens = 0;
1431     RExC_total_parens = 0;
1432     RExC_logical_to_parno = NULL;
1433     RExC_parno_to_logical = NULL;
1434     RExC_open_parens = NULL;
1435     RExC_close_parens = NULL;
1436     RExC_paren_names = NULL;
1437     RExC_size = 0;
1438     RExC_seen_d_op = FALSE;
1439 #ifdef DEBUGGING
1440     RExC_paren_name_list = NULL;
1441 #endif
1442 
1443     DEBUG_r({
1444         RExC_mysv1= sv_newmortal();
1445         RExC_mysv2= sv_newmortal();
1446     });
1447 
1448     DEBUG_COMPILE_r({
1449             SV *dsv= sv_newmortal();
1450             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1451             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
1452                           PL_colors[4], PL_colors[5], s);
1453         });
1454 
1455     /* we jump here if we have to recompile, e.g., from upgrading the pattern
1456      * to utf8 */
1457 
1458     if ((pm_flags & PMf_USE_RE_EVAL)
1459                 /* this second condition covers the non-regex literal case,
1460                  * i.e.  $foo =~ '(?{})'. */
1461                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1462     )
1463         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1464 
1465   redo_parse:
1466     /* return old regex if pattern hasn't changed */
1467     /* XXX: note in the below we have to check the flags as well as the
1468      * pattern.
1469      *
1470      * Things get a touch tricky as we have to compare the utf8 flag
1471      * independently from the compile flags.  */
1472 
1473     if (   old_re
1474         && !recompile
1475         && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1476         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1477         && RX_PRELEN(old_re) == plen
1478         && memEQ(RX_PRECOMP(old_re), exp, plen)
1479         && !runtime_code /* with runtime code, always recompile */ )
1480     {
1481         DEBUG_COMPILE_r({
1482             SV *dsv= sv_newmortal();
1483             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1484             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
1485                           PL_colors[4], PL_colors[5], s);
1486         });
1487         return old_re;
1488     }
1489 
1490     /* Allocate the pattern's SV */
1491     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1492     RExC_rx = ReANY(Rx);
1493     if ( RExC_rx == NULL )
1494         FAIL("Regexp out of space");
1495 
1496     rx_flags = orig_rx_flags;
1497 
1498     if (   toUSE_UNI_CHARSET_NOT_DEPENDS
1499         && initial_charset == REGEX_DEPENDS_CHARSET)
1500     {
1501 
1502         /* Set to use unicode semantics if the pattern is in utf8 and has the
1503          * 'depends' charset specified, as it means unicode when utf8  */
1504         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1505         RExC_uni_semantics = 1;
1506     }
1507 
1508     RExC_pm_flags = pm_flags;
1509 
1510     if (runtime_code) {
1511         assert(TAINTING_get || !TAINT_get);
1512         if (TAINT_get)
1513             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1514 
1515         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1516             /* whoops, we have a non-utf8 pattern, whilst run-time code
1517              * got compiled as utf8. Try again with a utf8 pattern */
1518             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1519                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1520             goto redo_parse;
1521         }
1522     }
1523     assert(!pRExC_state->runtime_code_qr);
1524 
1525     RExC_sawback = 0;
1526 
1527     RExC_seen = 0;
1528     RExC_maxlen = 0;
1529     RExC_in_lookaround = 0;
1530     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1531     RExC_recode_x_to_native = 0;
1532     RExC_in_multi_char_class = 0;
1533 
1534     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1535     RExC_precomp_end = RExC_end = exp + plen;
1536     RExC_nestroot = 0;
1537     RExC_whilem_seen = 0;
1538     RExC_end_op = NULL;
1539     RExC_recurse = NULL;
1540     RExC_study_chunk_recursed = NULL;
1541     RExC_study_chunk_recursed_bytes= 0;
1542     RExC_recurse_count = 0;
1543     RExC_sets_depth = 0;
1544     pRExC_state->code_index = 0;
1545 
1546     /* Initialize the string in the compiled pattern.  This is so that there is
1547      * something to output if necessary */
1548     set_regex_pv(pRExC_state, Rx);
1549 
1550     DEBUG_PARSE_r({
1551         Perl_re_printf( aTHX_
1552             "Starting parse and generation\n");
1553         RExC_lastnum=0;
1554         RExC_lastparse=NULL;
1555     });
1556 
1557     /* Allocate space and zero-initialize. Note, the two step process
1558        of zeroing when in debug mode, thus anything assigned has to
1559        happen after that */
1560     if (!  RExC_size) {
1561 
1562         /* On the first pass of the parse, we guess how big this will be.  Then
1563          * we grow in one operation to that amount and then give it back.  As
1564          * we go along, we re-allocate what we need.
1565          *
1566          * XXX Currently the guess is essentially that the pattern will be an
1567          * EXACT node with one byte input, one byte output.  This is crude, and
1568          * better heuristics are welcome.
1569          *
1570          * On any subsequent passes, we guess what we actually computed in the
1571          * latest earlier pass.  Such a pass probably didn't complete so is
1572          * missing stuff.  We could improve those guesses by knowing where the
1573          * parse stopped, and use the length so far plus apply the above
1574          * assumption to what's left. */
1575         RExC_size = STR_SZ(RExC_end - RExC_start);
1576     }
1577 
1578     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1579     if ( RExC_rxi == NULL )
1580         FAIL("Regexp out of space");
1581 
1582     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1583     RXi_SET( RExC_rx, RExC_rxi );
1584 
1585     /* We start from 0 (over from 0 in the case this is a reparse.  The first
1586      * node parsed will give back any excess memory we have allocated so far).
1587      * */
1588     RExC_size = 0;
1589 
1590     /* non-zero initialization begins here */
1591     RExC_rx->engine= eng;
1592     RExC_rx->extflags = rx_flags;
1593     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1594 
1595     if (pm_flags & PMf_IS_QR) {
1596         RExC_rxi->code_blocks = pRExC_state->code_blocks;
1597         if (RExC_rxi->code_blocks) {
1598             RExC_rxi->code_blocks->refcnt++;
1599         }
1600     }
1601 
1602     RExC_rx->intflags = 0;
1603 
1604     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
1605     RExC_parse_set(exp);
1606 
1607     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1608      * code makes sure the final byte is an uncounted NUL.  But should this
1609      * ever not be the case, lots of things could read beyond the end of the
1610      * buffer: loops like
1611      *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1612      *      strchr(RExC_parse, "foo");
1613      * etc.  So it is worth noting. */
1614     assert(*RExC_end == '\0');
1615 
1616     RExC_naughty = 0;
1617     RExC_npar = 1;
1618     RExC_logical_npar = 1;
1619     RExC_parens_buf_size = 0;
1620     RExC_emit_start = RExC_rxi->program;
1621     pRExC_state->code_index = 0;
1622 
1623     *((char*) RExC_emit_start) = (char) REG_MAGIC;
1624     RExC_emit = NODE_STEP_REGNODE;
1625 
1626     /* Do the parse */
1627     if (reg(pRExC_state, 0, &flags, 1)) {
1628 
1629         /* Success!, But we may need to redo the parse knowing how many parens
1630          * there actually are */
1631         if (IN_PARENS_PASS) {
1632             flags |= RESTART_PARSE;
1633         }
1634 
1635         /* We have that number in RExC_npar */
1636         RExC_total_parens = RExC_npar;
1637         RExC_logical_total_parens = RExC_logical_npar;
1638     }
1639     else if (! MUST_RESTART(flags)) {
1640         ReREFCNT_dec(Rx);
1641         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1642     }
1643 
1644     /* Here, we either have success, or we have to redo the parse for some reason */
1645     if (MUST_RESTART(flags)) {
1646 
1647         /* It's possible to write a regexp in ascii that represents Unicode
1648         codepoints outside of the byte range, such as via \x{100}. If we
1649         detect such a sequence we have to convert the entire pattern to utf8
1650         and then recompile, as our sizing calculation will have been based
1651         on 1 byte == 1 character, but we will need to use utf8 to encode
1652         at least some part of the pattern, and therefore must convert the whole
1653         thing.
1654         -- dmq */
1655         if (flags & NEED_UTF8) {
1656 
1657             /* We have stored the offset of the final warning output so far.
1658              * That must be adjusted.  Any variant characters between the start
1659              * of the pattern and this warning count for 2 bytes in the final,
1660              * so just add them again */
1661             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1662                 RExC_latest_warn_offset +=
1663                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
1664                                                 + RExC_latest_warn_offset);
1665             }
1666             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1667             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1668             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1669         }
1670         else {
1671             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1672         }
1673 
1674         if (ALL_PARENS_COUNTED) {
1675             /* Make enough room for all the known parens, and zero it */
1676             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1677             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1678             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
1679 
1680             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1681             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1682             /* we do NOT reinitialize  RExC_logical_to_parno and
1683              * RExC_parno_to_logical here. We need their data on the second
1684              * pass */
1685         }
1686         else { /* Parse did not complete.  Reinitialize the parentheses
1687                   structures */
1688             RExC_total_parens = 0;
1689             if (RExC_open_parens) {
1690                 Safefree(RExC_open_parens);
1691                 RExC_open_parens = NULL;
1692             }
1693             if (RExC_close_parens) {
1694                 Safefree(RExC_close_parens);
1695                 RExC_close_parens = NULL;
1696             }
1697             if (RExC_logical_to_parno) {
1698                 Safefree(RExC_logical_to_parno);
1699                 RExC_logical_to_parno = NULL;
1700             }
1701             if (RExC_parno_to_logical) {
1702                 Safefree(RExC_parno_to_logical);
1703                 RExC_parno_to_logical = NULL;
1704             }
1705         }
1706 
1707         /* Clean up what we did in this parse */
1708         SvREFCNT_dec_NN(RExC_rx_sv);
1709 
1710         goto redo_parse;
1711     }
1712 
1713     /* Here, we have successfully parsed and generated the pattern's program
1714      * for the regex engine.  We are ready to finish things up and look for
1715      * optimizations. */
1716 
1717     /* Update the string to compile, with correct modifiers, etc */
1718     set_regex_pv(pRExC_state, Rx);
1719 
1720     RExC_rx->nparens = RExC_total_parens - 1;
1721     RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1722 
1723     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1724     if (RExC_whilem_seen > 15)
1725         RExC_whilem_seen = 15;
1726 
1727     DEBUG_PARSE_r({
1728         Perl_re_printf( aTHX_
1729             "Required size %" IVdf " nodes\n", (IV)RExC_size);
1730         RExC_lastnum=0;
1731         RExC_lastparse=NULL;
1732     });
1733 
1734     SetProgLen(RExC_rxi,RExC_size);
1735 
1736     DEBUG_DUMP_PRE_OPTIMIZE_r({
1737         SV * const sv = sv_newmortal();
1738         RXi_GET_DECL(RExC_rx, ri);
1739         DEBUG_RExC_seen();
1740         Perl_re_printf( aTHX_ "Program before optimization:\n");
1741 
1742         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1743                         sv, 0, 0);
1744     });
1745 
1746     DEBUG_OPTIMISE_r(
1747         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
1748     );
1749 
1750     /* XXXX To minimize changes to RE engine we always allocate
1751        3-units-long substrs field. */
1752     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1753     if (RExC_recurse_count) {
1754         Newx(RExC_recurse, RExC_recurse_count, regnode *);
1755         SAVEFREEPV(RExC_recurse);
1756     }
1757 
1758     if (RExC_seen & REG_RECURSE_SEEN) {
1759         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1760          * So its 1 if there are no parens. */
1761         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1762                                          ((RExC_total_parens & 0x07) != 0);
1763         Newx(RExC_study_chunk_recursed,
1764              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1765         SAVEFREEPV(RExC_study_chunk_recursed);
1766     }
1767 
1768   reStudy:
1769     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1770     DEBUG_r(
1771         RExC_study_chunk_recursed_count= 0;
1772     );
1773     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1774     if (RExC_study_chunk_recursed) {
1775         Zero(RExC_study_chunk_recursed,
1776              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1777     }
1778 
1779 
1780 #ifdef TRIE_STUDY_OPT
1781     /* search for "restudy" in this file for a detailed explanation */
1782     if (!restudied) {
1783         StructCopy(&zero_scan_data, &data, scan_data_t);
1784         copyRExC_state = RExC_state;
1785     } else {
1786         U32 seen=RExC_seen;
1787         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1788 
1789         RExC_state = copyRExC_state;
1790         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1791             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1792         else
1793             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1794         StructCopy(&zero_scan_data, &data, scan_data_t);
1795     }
1796 #else
1797     StructCopy(&zero_scan_data, &data, scan_data_t);
1798 #endif
1799 
1800     /* Dig out information for optimizations. */
1801     RExC_rx->extflags = RExC_flags; /* was pm_op */
1802     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1803 
1804     if (UTF)
1805         SvUTF8_on(Rx);	/* Unicode in it? */
1806     RExC_rxi->regstclass = NULL;
1807     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
1808         RExC_rx->intflags |= PREGf_NAUGHTY;
1809     scan = RExC_rxi->program + 1;		/* First BRANCH. */
1810 
1811     /* testing for BRANCH here tells us whether there is "must appear"
1812        data in the pattern. If there is then we can use it for optimisations */
1813     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
1814                                                   */
1815         SSize_t fake_deltap;
1816         STRLEN longest_length[2];
1817         regnode_ssc ch_class; /* pointed to by data */
1818         int stclass_flag;
1819         SSize_t last_close = 0; /* pointed to by data */
1820         regnode *first= scan;
1821         regnode *first_next= regnext(first);
1822         regnode *last_close_op= NULL;
1823         int i;
1824 
1825         /*
1826          * Skip introductions and multiplicators >= 1
1827          * so that we can extract the 'meat' of the pattern that must
1828          * match in the large if() sequence following.
1829          * NOTE that EXACT is NOT covered here, as it is normally
1830          * picked up by the optimiser separately.
1831          *
1832          * This is unfortunate as the optimiser isnt handling lookahead
1833          * properly currently.
1834          *
1835          */
1836         while (1)
1837         {
1838             if (OP(first) == OPEN)
1839                 sawopen = 1;
1840             else
1841             if (OP(first) == IFMATCH && !FLAGS(first))
1842                 /* for now we can't handle lookbehind IFMATCH */
1843                 sawlookahead = 1;
1844             else
1845             if (OP(first) == PLUS)
1846                 sawplus = 1;
1847             else
1848             if (OP(first) == MINMOD)
1849                 sawminmod = 1;
1850             else
1851             if (!(
1852                 /* An OR of *one* alternative - should not happen now. */
1853                 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1854                 /* An {n,m} with n>0 */
1855                 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1856                 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1857             )){
1858                 break;
1859             }
1860 
1861             first = REGNODE_AFTER(first);
1862             first_next= regnext(first);
1863         }
1864 
1865         /* Starting-point info. */
1866       again:
1867         DEBUG_PEEP("first:", first, 0, 0);
1868         /* Ignore EXACT as we deal with it later. */
1869         if (REGNODE_TYPE(OP(first)) == EXACT) {
1870             if (! isEXACTFish(OP(first))) {
1871                 NOOP;	/* Empty, get anchored substr later. */
1872             }
1873             else
1874                 RExC_rxi->regstclass = first;
1875         }
1876 #ifdef TRIE_STCLASS
1877         else if (REGNODE_TYPE(OP(first)) == TRIE &&
1878                 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1879         {
1880             /* this can happen only on restudy
1881              * Search for "restudy" in this file to find
1882              * a comment with details. */
1883             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1884         }
1885 #endif
1886         else if (REGNODE_SIMPLE(OP(first)))
1887             RExC_rxi->regstclass = first;
1888         else if (REGNODE_TYPE(OP(first)) == BOUND ||
1889                  REGNODE_TYPE(OP(first)) == NBOUND)
1890             RExC_rxi->regstclass = first;
1891         else if (REGNODE_TYPE(OP(first)) == BOL) {
1892             RExC_rx->intflags |= (OP(first) == MBOL
1893                            ? PREGf_ANCH_MBOL
1894                            : PREGf_ANCH_SBOL);
1895             first = REGNODE_AFTER(first);
1896             goto again;
1897         }
1898         else if (OP(first) == GPOS) {
1899             RExC_rx->intflags |= PREGf_ANCH_GPOS;
1900             first = REGNODE_AFTER_type(first,tregnode_GPOS);
1901             goto again;
1902         }
1903         else if ((!sawopen || !RExC_sawback) &&
1904             !sawlookahead &&
1905             (OP(first) == STAR &&
1906             REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
1907             !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
1908         {
1909             /* turn .* into ^.* with an implied $*=1 */
1910             const int type =
1911                 (OP(REGNODE_AFTER(first)) == REG_ANY)
1912                     ? PREGf_ANCH_MBOL
1913                     : PREGf_ANCH_SBOL;
1914             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
1915             first = REGNODE_AFTER(first);
1916             goto again;
1917         }
1918         if (sawplus && !sawminmod && !sawlookahead
1919             && (!sawopen || !RExC_sawback)
1920             && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
1921             /* x+ must match at the 1st pos of run of x's */
1922             RExC_rx->intflags |= PREGf_SKIP;
1923 
1924         /* Scan is after the zeroth branch, first is atomic matcher. */
1925 #ifdef TRIE_STUDY_OPT
1926         /* search for "restudy" in this file for a detailed explanation */
1927         DEBUG_PARSE_r(
1928             if (!restudied)
1929                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
1930                               (IV)(first - scan + 1))
1931         );
1932 #else
1933         DEBUG_PARSE_r(
1934             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
1935                 (IV)(first - scan + 1))
1936         );
1937 #endif
1938 
1939 
1940         /*
1941         * If there's something expensive in the r.e., find the
1942         * longest literal string that must appear and make it the
1943         * regmust.  Resolve ties in favor of later strings, since
1944         * the regstart check works with the beginning of the r.e.
1945         * and avoiding duplication strengthens checking.  Not a
1946         * strong reason, but sufficient in the absence of others.
1947         * [Now we resolve ties in favor of the earlier string if
1948         * it happens that c_offset_min has been invalidated, since the
1949         * earlier string may buy us something the later one won't.]
1950         */
1951 
1952         data.substrs[0].str = newSVpvs("");
1953         data.substrs[1].str = newSVpvs("");
1954         data.last_found = newSVpvs("");
1955         data.cur_is_floating = 0; /* initially any found substring is fixed */
1956         ENTER_with_name("study_chunk");
1957         SAVEFREESV(data.substrs[0].str);
1958         SAVEFREESV(data.substrs[1].str);
1959         SAVEFREESV(data.last_found);
1960         first = scan;
1961         if (!RExC_rxi->regstclass) {
1962             ssc_init(pRExC_state, &ch_class);
1963             data.start_class = &ch_class;
1964             stclass_flag = SCF_DO_STCLASS_AND;
1965         } else				/* XXXX Check for BOUND? */
1966             stclass_flag = 0;
1967         data.last_closep = &last_close;
1968         data.last_close_opp = &last_close_op;
1969 
1970         DEBUG_RExC_seen();
1971         /*
1972          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
1973          * (NO top level branches)
1974          */
1975         minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
1976                              scan + RExC_size, /* Up to end */
1977             &data, -1, 0, NULL,
1978             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
1979                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
1980             0, TRUE);
1981         /* search for "restudy" in this file for a detailed explanation
1982          * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
1983 
1984 
1985         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
1986 
1987 
1988         if ( RExC_total_parens == 1 && !data.cur_is_floating
1989              && data.last_start_min == 0 && data.last_end > 0
1990              && !RExC_seen_zerolen
1991              && !(RExC_seen & REG_VERBARG_SEEN)
1992              && !(RExC_seen & REG_GPOS_SEEN)
1993         ){
1994             RExC_rx->extflags |= RXf_CHECK_ALL;
1995         }
1996         scan_commit(pRExC_state, &data,&minlen, 0);
1997 
1998 
1999         /* XXX this is done in reverse order because that's the way the
2000          * code was before it was parameterised. Don't know whether it
2001          * actually needs doing in reverse order. DAPM */
2002         for (i = 1; i >= 0; i--) {
2003             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2004 
2005             if (   !(   i
2006                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
2007                      &&    data.substrs[0].min_offset
2008                         == data.substrs[1].min_offset
2009                      &&    SvCUR(data.substrs[0].str)
2010                         == SvCUR(data.substrs[1].str)
2011                     )
2012                 && S_setup_longest (aTHX_ pRExC_state,
2013                                         &(RExC_rx->substrs->data[i]),
2014                                         &(data.substrs[i]),
2015                                         longest_length[i]))
2016             {
2017                 RExC_rx->substrs->data[i].min_offset =
2018                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
2019 
2020                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2021                 /* Don't offset infinity */
2022                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2023                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2024                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2025             }
2026             else {
2027                 RExC_rx->substrs->data[i].substr      = NULL;
2028                 RExC_rx->substrs->data[i].utf8_substr = NULL;
2029                 longest_length[i] = 0;
2030             }
2031         }
2032 
2033         LEAVE_with_name("study_chunk");
2034 
2035         if (RExC_rxi->regstclass
2036             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2037             RExC_rxi->regstclass = NULL;
2038 
2039         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2040               || RExC_rx->substrs->data[0].min_offset)
2041             && stclass_flag
2042             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2043             && is_ssc_worth_it(pRExC_state, data.start_class))
2044         {
2045             const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2046 
2047             ssc_finalize(pRExC_state, data.start_class);
2048 
2049             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2050             StructCopy(data.start_class,
2051                        (regnode_ssc*)RExC_rxi->data->data[n],
2052                        regnode_ssc);
2053             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2054             RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2055             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2056                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2057                       Perl_re_printf( aTHX_
2058                                     "synthetic stclass \"%s\".\n",
2059                                     SvPVX_const(sv));});
2060             data.start_class = NULL;
2061         }
2062 
2063         /* A temporary algorithm prefers floated substr to fixed one of
2064          * same length to dig more info. */
2065         i = (longest_length[0] <= longest_length[1]);
2066         RExC_rx->substrs->check_ix = i;
2067         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
2068         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
2069         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
2070         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2071         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2072         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2073             RExC_rx->intflags |= PREGf_NOSCAN;
2074 
2075         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2076             RExC_rx->extflags |= RXf_USE_INTUIT;
2077             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2078                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2079         }
2080 
2081         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2082         if ( (STRLEN)minlen < longest_length[1] )
2083             minlen= longest_length[1];
2084         if ( (STRLEN)minlen < longest_length[0] )
2085             minlen= longest_length[0];
2086         */
2087     }
2088     else {
2089         /* Several toplevels. Best we can is to set minlen. */
2090         SSize_t fake_deltap;
2091         regnode_ssc ch_class;
2092         SSize_t last_close = 0;
2093         regnode *last_close_op = NULL;
2094 
2095         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
2096 
2097         scan = RExC_rxi->program + 1;
2098         ssc_init(pRExC_state, &ch_class);
2099         data.start_class = &ch_class;
2100         data.last_closep = &last_close;
2101         data.last_close_opp = &last_close_op;
2102 
2103         DEBUG_RExC_seen();
2104         /*
2105          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2106          * (patterns WITH top level branches)
2107          */
2108         minlen = study_chunk(pRExC_state,
2109             &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2110             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2111                                                       ? SCF_TRIE_DOING_RESTUDY
2112                                                       : 0),
2113             0, TRUE);
2114         /* search for "restudy" in this file for a detailed explanation
2115          * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2116 
2117         CHECK_RESTUDY_GOTO_butfirst(NOOP);
2118 
2119         RExC_rx->check_substr = NULL;
2120         RExC_rx->check_utf8 = NULL;
2121         RExC_rx->substrs->data[0].substr      = NULL;
2122         RExC_rx->substrs->data[0].utf8_substr = NULL;
2123         RExC_rx->substrs->data[1].substr      = NULL;
2124         RExC_rx->substrs->data[1].utf8_substr = NULL;
2125 
2126         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2127             && is_ssc_worth_it(pRExC_state, data.start_class))
2128         {
2129             const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2130 
2131             ssc_finalize(pRExC_state, data.start_class);
2132 
2133             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2134             StructCopy(data.start_class,
2135                        (regnode_ssc*)RExC_rxi->data->data[n],
2136                        regnode_ssc);
2137             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2138             RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2139             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2140                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2141                       Perl_re_printf( aTHX_
2142                                     "synthetic stclass \"%s\".\n",
2143                                     SvPVX_const(sv));});
2144             data.start_class = NULL;
2145         }
2146     }
2147 
2148     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2149         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2150         RExC_rx->maxlen = REG_INFTY;
2151     }
2152     else {
2153         RExC_rx->maxlen = RExC_maxlen;
2154     }
2155 
2156     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2157        the "real" pattern. */
2158     DEBUG_OPTIMISE_r({
2159         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2160                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2161     });
2162     RExC_rx->minlenret = minlen;
2163     if (RExC_rx->minlen < minlen)
2164         RExC_rx->minlen = minlen;
2165 
2166     if (RExC_seen & REG_RECURSE_SEEN ) {
2167         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2168         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2169     }
2170     if (RExC_seen & REG_GPOS_SEEN)
2171         RExC_rx->intflags |= PREGf_GPOS_SEEN;
2172 
2173     if (RExC_seen & REG_PESSIMIZE_SEEN)
2174         RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2175 
2176     if (RExC_seen & REG_LOOKBEHIND_SEEN)
2177         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2178                                                 lookbehind */
2179     if (pRExC_state->code_blocks)
2180         RExC_rx->extflags |= RXf_EVAL_SEEN;
2181 
2182     if (RExC_seen & REG_VERBARG_SEEN) {
2183         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2184         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2185     }
2186 
2187     if (RExC_seen & REG_CUTGROUP_SEEN)
2188         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2189 
2190     if (pm_flags & PMf_USE_RE_EVAL)
2191         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2192 
2193     if (RExC_paren_names)
2194         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2195     else
2196         RXp_PAREN_NAMES(RExC_rx) = NULL;
2197 
2198     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2199      * so it can be used in pp.c */
2200     if (RExC_rx->intflags & PREGf_ANCH)
2201         RExC_rx->extflags |= RXf_IS_ANCHORED;
2202 
2203 
2204     {
2205         /* this is used to identify "special" patterns that might result
2206          * in Perl NOT calling the regex engine and instead doing the match "itself",
2207          * particularly special cases in split//. By having the regex compiler
2208          * do this pattern matching at a regop level (instead of by inspecting the pattern)
2209          * we avoid weird issues with equivalent patterns resulting in different behavior,
2210          * AND we allow non Perl engines to get the same optimizations by the setting the
2211          * flags appropriately - Yves */
2212         regnode *first = RExC_rxi->program + 1;
2213         U8 fop = OP(first);
2214         regnode *next = NULL;
2215         U8 nop = 0;
2216         if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2217             next = REGNODE_AFTER(first);
2218             nop = OP(next);
2219         }
2220         /* It's safe to read through *next only if OP(first) is a regop of
2221          * the right type (not EXACT, for example).
2222          */
2223         if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2224             RExC_rx->extflags |= RXf_NULL;
2225         else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2226             /* when fop is SBOL first->flags will be true only when it was
2227              * produced by parsing /\A/, and not when parsing /^/. This is
2228              * very important for the split code as there we want to
2229              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2230              * See rt #122761 for more details. -- Yves */
2231             RExC_rx->extflags |= RXf_START_ONLY;
2232         else if (fop == PLUS
2233                  && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2234                  && OP(regnext(first)) == END)
2235             RExC_rx->extflags |= RXf_WHITE;
2236         else if ( RExC_rx->extflags & RXf_SPLIT
2237                   && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2238                   && STR_LEN(first) == 1
2239                   && *(STRING(first)) == ' '
2240                   && OP(regnext(first)) == END )
2241             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2242 
2243     }
2244 
2245     if (RExC_contains_locale) {
2246         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2247     }
2248 
2249 #ifdef DEBUGGING
2250     if (RExC_paren_names) {
2251         RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2252         RExC_rxi->data->data[RExC_rxi->name_list_idx]
2253                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
2254     } else
2255 #endif
2256     RExC_rxi->name_list_idx = 0;
2257 
2258     while ( RExC_recurse_count > 0 ) {
2259         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2260         /*
2261          * This data structure is set up in study_chunk() and is used
2262          * to calculate the distance between a GOSUB regopcode and
2263          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2264          * it refers to.
2265          *
2266          * If for some reason someone writes code that optimises
2267          * away a GOSUB opcode then the assert should be changed to
2268          * an if(scan) to guard the ARG2i_SET() - Yves
2269          *
2270          */
2271         assert(scan && OP(scan) == GOSUB);
2272         ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2273     }
2274     if (RExC_logical_total_parens != RExC_total_parens) {
2275         Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2276         /* we rebuild this below */
2277         Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2278         for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2279             int logical_parno= RExC_parno_to_logical[parno];
2280             assert(logical_parno);
2281             RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2282             RExC_logical_to_parno[logical_parno] = parno;
2283         }
2284         RExC_rx->logical_to_parno = RExC_logical_to_parno;
2285         RExC_rx->parno_to_logical = RExC_parno_to_logical;
2286         RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2287         RExC_logical_to_parno = NULL;
2288         RExC_parno_to_logical = NULL;
2289         RExC_parno_to_logical_next = NULL;
2290     } else {
2291         RExC_rx->logical_to_parno = NULL;
2292         RExC_rx->parno_to_logical = NULL;
2293         RExC_rx->parno_to_logical_next = NULL;
2294     }
2295 
2296     Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2297     /* assume we don't need to swap parens around before we match */
2298     DEBUG_TEST_r({
2299         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2300             (unsigned long)RExC_study_chunk_recursed_count);
2301     });
2302     DEBUG_DUMP_r({
2303         DEBUG_RExC_seen();
2304         Perl_re_printf( aTHX_ "Final program:\n");
2305         regdump(RExC_rx);
2306     });
2307 
2308     if (RExC_open_parens) {
2309         Safefree(RExC_open_parens);
2310         RExC_open_parens = NULL;
2311     }
2312     if (RExC_close_parens) {
2313         Safefree(RExC_close_parens);
2314         RExC_close_parens = NULL;
2315     }
2316     if (RExC_logical_to_parno) {
2317         Safefree(RExC_logical_to_parno);
2318         RExC_logical_to_parno = NULL;
2319     }
2320     if (RExC_parno_to_logical) {
2321         Safefree(RExC_parno_to_logical);
2322         RExC_parno_to_logical = NULL;
2323     }
2324 
2325 #ifdef USE_ITHREADS
2326     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2327      * by setting the regexp SV to readonly-only instead. If the
2328      * pattern's been recompiled, the USEDness should remain. */
2329     if (old_re && SvREADONLY(old_re))
2330         SvREADONLY_on(Rx);
2331 #endif
2332     return Rx;
2333 }
2334 
2335 
2336 
2337 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)2338 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2339 {
2340     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2341         PERL_UNUSED_ARG(rx);
2342         if (0)
2343             return NULL;
2344         else
2345             return newSVpvs("Regexp");
2346 }
2347 
2348 /* Scans the name of a named buffer from the pattern.
2349  * If flags is REG_RSN_RETURN_NULL returns null.
2350  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2351  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2352  * to the parsed name as looked up in the RExC_paren_names hash.
2353  * If there is an error throws a vFAIL().. type exception.
2354  */
2355 
2356 #define REG_RSN_RETURN_NULL    0
2357 #define REG_RSN_RETURN_NAME    1
2358 #define REG_RSN_RETURN_DATA    2
2359 
2360 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)2361 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2362 {
2363     char *name_start = RExC_parse;
2364     SV* sv_name;
2365 
2366     PERL_ARGS_ASSERT_REG_SCAN_NAME;
2367 
2368     assert (RExC_parse <= RExC_end);
2369     if (RExC_parse == RExC_end) NOOP;
2370     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2371          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
2372           * using do...while */
2373         if (UTF)
2374             do {
2375                 RExC_parse_inc_utf8();
2376             } while (   RExC_parse < RExC_end
2377                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2378         else
2379             do {
2380                 RExC_parse_inc_by(1);
2381             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2382     } else {
2383         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2384                          character */
2385         vFAIL("Group name must start with a non-digit word character");
2386     }
2387     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2388                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2389     if ( flags == REG_RSN_RETURN_NAME)
2390         return sv_name;
2391     else if (flags==REG_RSN_RETURN_DATA) {
2392         HE *he_str = NULL;
2393         SV *sv_dat = NULL;
2394         if ( ! sv_name )      /* should not happen*/
2395             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2396         if (RExC_paren_names)
2397             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2398         if ( he_str )
2399             sv_dat = HeVAL(he_str);
2400         if ( ! sv_dat ) {   /* Didn't find group */
2401 
2402             /* It might be a forward reference; we can't fail until we
2403                 * know, by completing the parse to get all the groups, and
2404                 * then reparsing */
2405             if (ALL_PARENS_COUNTED)  {
2406                 vFAIL("Reference to nonexistent named group");
2407             }
2408             else {
2409                 REQUIRE_PARENS_PASS;
2410             }
2411         }
2412         return sv_dat;
2413     }
2414 
2415     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2416                      (unsigned long) flags);
2417 }
2418 
2419 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
2420     if (RExC_lastparse!=RExC_parse) {                           \
2421         Perl_re_printf( aTHX_  "%s",                            \
2422             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
2423                 RExC_end - RExC_parse, 16,                      \
2424                 "", "",                                         \
2425                 PERL_PV_ESCAPE_UNI_DETECT |                     \
2426                 PERL_PV_PRETTY_ELLIPSES   |                     \
2427                 PERL_PV_PRETTY_LTGT       |                     \
2428                 PERL_PV_ESCAPE_RE         |                     \
2429                 PERL_PV_PRETTY_EXACTSIZE                        \
2430             )                                                   \
2431         );                                                      \
2432     } else                                                      \
2433         Perl_re_printf( aTHX_ "%16s","");                       \
2434                                                                 \
2435     if (RExC_lastnum!=RExC_emit)                                \
2436        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
2437     else                                                        \
2438        Perl_re_printf( aTHX_ "|%4s","");                        \
2439     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
2440         (int)((depth*2)), "",                                   \
2441         (funcname)                                              \
2442     );                                                          \
2443     RExC_lastnum=RExC_emit;                                     \
2444     RExC_lastparse=RExC_parse;                                  \
2445 })
2446 
2447 
2448 
2449 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
2450     DEBUG_PARSE_MSG((funcname));                            \
2451     Perl_re_printf( aTHX_ "%4s","\n");                                  \
2452 })
2453 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
2454     DEBUG_PARSE_MSG((funcname));                            \
2455     Perl_re_printf( aTHX_ fmt "\n",args);                               \
2456 })
2457 
2458 
2459 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)2460 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2461 {
2462     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2463      * constructs, and updates RExC_flags with them.  On input, RExC_parse
2464      * should point to the first flag; it is updated on output to point to the
2465      * final ')' or ':'.  There needs to be at least one flag, or this will
2466      * abort */
2467 
2468     /* for (?g), (?gc), and (?o) warnings; warning
2469        about (?c) will warn about (?g) -- japhy    */
2470 
2471 #define WASTED_O  0x01
2472 #define WASTED_G  0x02
2473 #define WASTED_C  0x04
2474 #define WASTED_GC (WASTED_G|WASTED_C)
2475     I32 wastedflags = 0x00;
2476     U32 posflags = 0, negflags = 0;
2477     U32 *flagsp = &posflags;
2478     char has_charset_modifier = '\0';
2479     regex_charset cs;
2480     bool has_use_defaults = FALSE;
2481     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2482     int x_mod_count = 0;
2483 
2484     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2485 
2486     /* '^' as an initial flag sets certain defaults */
2487     if (UCHARAT(RExC_parse) == '^') {
2488         RExC_parse_inc_by(1);
2489         has_use_defaults = TRUE;
2490         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2491         cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2492              ? REGEX_UNICODE_CHARSET
2493              : REGEX_DEPENDS_CHARSET;
2494         set_regex_charset(&RExC_flags, cs);
2495     }
2496     else {
2497         cs = get_regex_charset(RExC_flags);
2498         if (   cs == REGEX_DEPENDS_CHARSET
2499             && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2500         {
2501             cs = REGEX_UNICODE_CHARSET;
2502         }
2503     }
2504 
2505     while (RExC_parse < RExC_end) {
2506         /* && memCHRs("iogcmsx", *RExC_parse) */
2507         /* (?g), (?gc) and (?o) are useless here
2508            and must be globally applied -- japhy */
2509         if ((RExC_pm_flags & PMf_WILDCARD)) {
2510             if (flagsp == & negflags) {
2511                 if (*RExC_parse == 'm') {
2512                     RExC_parse_inc_by(1);
2513                     /* diag_listed_as: Use of %s is not allowed in Unicode
2514                        property wildcard subpatterns in regex; marked by <--
2515                        HERE in m/%s/ */
2516                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
2517                           " property wildcard subpatterns");
2518                 }
2519             }
2520             else {
2521                 if (*RExC_parse == 's') {
2522                     goto modifier_illegal_in_wildcard;
2523                 }
2524             }
2525         }
2526 
2527         switch (*RExC_parse) {
2528 
2529             /* Code for the imsxn flags */
2530             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2531 
2532             case LOCALE_PAT_MOD:
2533                 if (has_charset_modifier) {
2534                     goto excess_modifier;
2535                 }
2536                 else if (flagsp == &negflags) {
2537                     goto neg_modifier;
2538                 }
2539                 cs = REGEX_LOCALE_CHARSET;
2540                 has_charset_modifier = LOCALE_PAT_MOD;
2541                 break;
2542             case UNICODE_PAT_MOD:
2543                 if (has_charset_modifier) {
2544                     goto excess_modifier;
2545                 }
2546                 else if (flagsp == &negflags) {
2547                     goto neg_modifier;
2548                 }
2549                 cs = REGEX_UNICODE_CHARSET;
2550                 has_charset_modifier = UNICODE_PAT_MOD;
2551                 break;
2552             case ASCII_RESTRICT_PAT_MOD:
2553                 if (flagsp == &negflags) {
2554                     goto neg_modifier;
2555                 }
2556                 if (has_charset_modifier) {
2557                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2558                         goto excess_modifier;
2559                     }
2560                     /* Doubled modifier implies more restricted */
2561                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2562                 }
2563                 else {
2564                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
2565                 }
2566                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2567                 break;
2568             case DEPENDS_PAT_MOD:
2569                 if (has_use_defaults) {
2570                     goto fail_modifiers;
2571                 }
2572                 else if (flagsp == &negflags) {
2573                     goto neg_modifier;
2574                 }
2575                 else if (has_charset_modifier) {
2576                     goto excess_modifier;
2577                 }
2578 
2579                 /* The dual charset means unicode semantics if the
2580                  * pattern (or target, not known until runtime) are
2581                  * utf8, or something in the pattern indicates unicode
2582                  * semantics */
2583                 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2584                      ? REGEX_UNICODE_CHARSET
2585                      : REGEX_DEPENDS_CHARSET;
2586                 has_charset_modifier = DEPENDS_PAT_MOD;
2587                 break;
2588               excess_modifier:
2589                 RExC_parse_inc_by(1);
2590                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2591                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2592                 }
2593                 else if (has_charset_modifier == *(RExC_parse - 1)) {
2594                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
2595                                         *(RExC_parse - 1));
2596                 }
2597                 else {
2598                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2599                 }
2600                 NOT_REACHED; /*NOTREACHED*/
2601               neg_modifier:
2602                 RExC_parse_inc_by(1);
2603                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2604                                     *(RExC_parse - 1));
2605                 NOT_REACHED; /*NOTREACHED*/
2606             case GLOBAL_PAT_MOD: /* 'g' */
2607                 if (RExC_pm_flags & PMf_WILDCARD) {
2608                     goto modifier_illegal_in_wildcard;
2609                 }
2610                 /*FALLTHROUGH*/
2611             case ONCE_PAT_MOD: /* 'o' */
2612                 if (ckWARN(WARN_REGEXP)) {
2613                     const I32 wflagbit = *RExC_parse == 'o'
2614                                          ? WASTED_O
2615                                          : WASTED_G;
2616                     if (! (wastedflags & wflagbit) ) {
2617                         wastedflags |= wflagbit;
2618                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2619                         vWARN5(
2620                             RExC_parse + 1,
2621                             "Useless (%s%c) - %suse /%c modifier",
2622                             flagsp == &negflags ? "?-" : "?",
2623                             *RExC_parse,
2624                             flagsp == &negflags ? "don't " : "",
2625                             *RExC_parse
2626                         );
2627                     }
2628                 }
2629                 break;
2630 
2631             case CONTINUE_PAT_MOD: /* 'c' */
2632                 if (RExC_pm_flags & PMf_WILDCARD) {
2633                     goto modifier_illegal_in_wildcard;
2634                 }
2635                 if (ckWARN(WARN_REGEXP)) {
2636                     if (! (wastedflags & WASTED_C) ) {
2637                         wastedflags |= WASTED_GC;
2638                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2639                         vWARN3(
2640                             RExC_parse + 1,
2641                             "Useless (%sc) - %suse /gc modifier",
2642                             flagsp == &negflags ? "?-" : "?",
2643                             flagsp == &negflags ? "don't " : ""
2644                         );
2645                     }
2646                 }
2647                 break;
2648             case KEEPCOPY_PAT_MOD: /* 'p' */
2649                 if (RExC_pm_flags & PMf_WILDCARD) {
2650                     goto modifier_illegal_in_wildcard;
2651                 }
2652                 if (flagsp == &negflags) {
2653                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2654                 } else {
2655                     *flagsp |= RXf_PMf_KEEPCOPY;
2656                 }
2657                 break;
2658             case '-':
2659                 /* A flag is a default iff it is following a minus, so
2660                  * if there is a minus, it means will be trying to
2661                  * re-specify a default which is an error */
2662                 if (has_use_defaults || flagsp == &negflags) {
2663                     goto fail_modifiers;
2664                 }
2665                 flagsp = &negflags;
2666                 wastedflags = 0;  /* reset so (?g-c) warns twice */
2667                 x_mod_count = 0;
2668                 break;
2669             case ':':
2670             case ')':
2671 
2672                 if (  (RExC_pm_flags & PMf_WILDCARD)
2673                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2674                 {
2675                     RExC_parse_inc_by(1);
2676                     /* diag_listed_as: Use of %s is not allowed in Unicode
2677                        property wildcard subpatterns in regex; marked by <--
2678                        HERE in m/%s/ */
2679                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2680                            " property wildcard subpatterns",
2681                            has_charset_modifier);
2682                 }
2683 
2684                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2685                     negflags |= RXf_PMf_EXTENDED_MORE;
2686                 }
2687                 RExC_flags |= posflags;
2688 
2689                 if (negflags & RXf_PMf_EXTENDED) {
2690                     negflags |= RXf_PMf_EXTENDED_MORE;
2691                 }
2692                 RExC_flags &= ~negflags;
2693                 set_regex_charset(&RExC_flags, cs);
2694 
2695                 return;
2696             default:
2697               fail_modifiers:
2698                 RExC_parse_inc_if_char();
2699                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2700                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2701                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2702                 NOT_REACHED; /*NOTREACHED*/
2703         }
2704 
2705         RExC_parse_inc();
2706     }
2707 
2708     vFAIL("Sequence (?... not terminated");
2709 
2710   modifier_illegal_in_wildcard:
2711     RExC_parse_inc_by(1);
2712     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2713        subpatterns in regex; marked by <-- HERE in m/%s/ */
2714     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2715            " subpatterns", *(RExC_parse - 1));
2716 }
2717 
2718 /*
2719  - reg - regular expression, i.e. main body or parenthesized thing
2720  *
2721  * Caller must absorb opening parenthesis.
2722  *
2723  * Combining parenthesis handling with the base level of regular expression
2724  * is a trifle forced, but the need to tie the tails of the branches to what
2725  * follows makes it hard to avoid.
2726  */
2727 
2728 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * backref_parse_start,char ch)2729 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2730                              I32 *flagp,
2731                              char * backref_parse_start,
2732                              char ch
2733                       )
2734 {
2735     regnode_offset ret;
2736     char* name_start = RExC_parse;
2737     U32 num = 0;
2738     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2739     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2740 
2741     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2742 
2743     if (RExC_parse != name_start && ch == '}') {
2744         while (isBLANK(*RExC_parse)) {
2745             RExC_parse_inc_by(1);
2746         }
2747     }
2748     if (RExC_parse == name_start || *RExC_parse != ch) {
2749         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2750         vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2751     }
2752 
2753     if (sv_dat) {
2754         num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2755         RExC_rxi->data->data[num]=(void*)sv_dat;
2756         SvREFCNT_inc_simple_void_NN(sv_dat);
2757     }
2758     RExC_sawback = 1;
2759     ret = reg2node(pRExC_state,
2760                    ((! FOLD)
2761                      ? REFN
2762                      : (ASCII_FOLD_RESTRICTED)
2763                        ? REFFAN
2764                        : (AT_LEAST_UNI_SEMANTICS)
2765                          ? REFFUN
2766                          : (LOC)
2767                            ? REFFLN
2768                            : REFFN),
2769                     num, RExC_nestroot);
2770     if (RExC_nestroot && num >= (U32)RExC_nestroot)
2771         FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2772     *flagp |= HASWIDTH;
2773 
2774     nextchar(pRExC_state);
2775     return ret;
2776 }
2777 
2778 /* reg_la_NOTHING()
2779  *
2780  * Maybe parse a parenthesized lookaround construct that is equivalent to a
2781  * NOTHING regop when the construct is empty.
2782  *
2783  * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2784  *
2785  * Checks for unterminated constructs and throws a "not terminated" error
2786  * with the appropriate type if necessary
2787  *
2788  * Assuming it does not throw an exception increments RExC_seen_zerolen.
2789  *
2790  * If the construct is empty generates a NOTHING op and returns its
2791  * regnode_offset, which the caller would then return to its caller.
2792  *
2793  * If the construct is not empty increments RExC_in_lookaround, and turns
2794  * on any flags provided in RExC_seen, and then returns 0 to signify
2795  * that parsing should continue.
2796  *
2797  * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2798  * any use of it would have had to be broken onto multiple lines, hence
2799  * the abbreviation.
2800  */
2801 STATIC regnode_offset
S_reg_la_NOTHING(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2802 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2803     const char *type)
2804 {
2805 
2806     PERL_ARGS_ASSERT_REG_LA_NOTHING;
2807 
2808     /* false below so we do not force /x */
2809     skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2810 
2811     if (RExC_parse >= RExC_end)
2812         vFAIL2("Sequence (%s... not terminated", type);
2813 
2814     /* Always increment as NOTHING regops are zerolen */
2815     RExC_seen_zerolen++;
2816 
2817     if (*RExC_parse == ')') {
2818         regnode_offset ret= reg_node(pRExC_state, NOTHING);
2819         nextchar(pRExC_state);
2820         return ret;
2821     }
2822 
2823     RExC_seen |= flags;
2824     RExC_in_lookaround++;
2825     return 0; /* keep parsing! */
2826 }
2827 
2828 /* reg_la_OPFAIL()
2829  *
2830  * Maybe parse a parenthesized lookaround construct that is equivalent to a
2831  * OPFAIL regop when the construct is empty.
2832  *
2833  * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2834  *
2835  * Checks for unterminated constructs and throws a "not terminated" error
2836  * if necessary.
2837  *
2838  * If the construct is empty generates an OPFAIL op and returns its
2839  * regnode_offset which the caller should then return to its caller.
2840  *
2841  * If the construct is not empty increments RExC_in_lookaround, and also
2842  * increments RExC_seen_zerolen, and turns on the flags provided in
2843  * RExC_seen, and then returns 0 to signify that parsing should continue.
2844  *
2845  * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2846  * any use of it would have had to be broken onto multiple lines, hence
2847  * the abbreviation.
2848  */
2849 
2850 STATIC regnode_offset
S_reg_la_OPFAIL(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2851 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2852     const char *type)
2853 {
2854 
2855     PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2856 
2857     /* FALSE so we don't force to /x below */;
2858     skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2859 
2860     if (RExC_parse >= RExC_end)
2861         vFAIL2("Sequence (%s... not terminated", type);
2862 
2863     if (*RExC_parse == ')') {
2864         regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2865         nextchar(pRExC_state);
2866         return ret; /* return produced regop */
2867     }
2868 
2869     /* only increment zerolen *after* we check if we produce an OPFAIL
2870      * as an OPFAIL does not match a zero length construct, as it
2871      * does not match ever. */
2872     RExC_seen_zerolen++;
2873     RExC_seen |= flags;
2874     RExC_in_lookaround++;
2875     return 0; /* keep parsing! */
2876 }
2877 
2878 /* Below are the main parsing routines.
2879  *
2880  * S_reg()      parses a whole pattern or subpattern.  It itself handles things
2881  *              like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2882  *              alternation '|' in the '...' pattern.
2883  * S_regbranch() effectively implements the concatenation operator, handling
2884  *              one alternative of '|', repeatedly calling S_regpiece on each
2885  *              segment of the input.
2886  * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2887  *              and then adds any quantifier for that chunk.
2888  * S_regatom()  parses the next chunk of the input, returning when it
2889  *              determines it has found a complete atomic chunk.  The chunk may
2890  *              be a nested subpattern, in which case S_reg is called
2891  *              recursively
2892  *
2893  * The functions generate regnodes as they go along, appending each to the
2894  * pattern data structure so far.  They return the offset of the current final
2895  * node into that structure, or 0 on failure.
2896  *
2897  * There are three parameters common to all of them:
2898  *   pRExC_state    is a structure with much information about the current
2899  *                  state of the parse.  It's easy to add new elements to
2900  *                  convey new information, but beware that an error return may
2901  *                  require clearing the element.
2902  *   flagp          is a pointer to bit flags set in a lower level to pass up
2903  *                  to higher levels information, such as the cause of a
2904  *                  failure, or some characteristic about the generated node
2905  *   depth          is roughly the recursion depth, mostly unused except for
2906  *                  pretty printing debugging info.
2907  *
2908  * There are ancillary functions that these may farm work out to, using the
2909  * same parameters.
2910  *
2911  * The protocol for handling flags is that each function will, before
2912  * returning, add into *flagp the flags it needs to pass up.  Each function has
2913  * a second flags variable, typically named 'flags', which it sets and clears
2914  * at will.  Flag bits in it are used in that function, and it calls the next
2915  * layer down with its 'flagp' parameter set to '&flags'.  Thus, upon return,
2916  * 'flags' will contain whatever it had before the call, plus whatever that
2917  * function passed up.  If it wants to pass any of these up to its caller, it
2918  * has to add them to its *flagp.  This means that it takes extra steps to keep
2919  * passing a flag upwards, and otherwise the flag bit is cleared for higher
2920  * functions.
2921  */
2922 
2923 /* On success, returns the offset at which any next node should be placed into
2924  * the regex engine program being compiled.
2925  *
2926  * Returns 0 otherwise, with *flagp set to indicate why:
2927  *  TRYAGAIN        at the end of (?) that only sets flags.
2928  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
2929  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
2930  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
2931  *  happen.  */
2932 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)2933 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
2934     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
2935      * 2 is like 1, but indicates that nextchar() has been called to advance
2936      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
2937      * this flag alerts us to the need to check for that */
2938 {
2939     regnode_offset ret = 0;    /* Will be the head of the group. */
2940     regnode_offset br;
2941     regnode_offset lastbr;
2942     regnode_offset ender = 0;
2943     I32 logical_parno = 0;
2944     I32 parno = 0;
2945     I32 flags;
2946     U32 oregflags = RExC_flags;
2947     bool have_branch = 0;
2948     bool is_open = 0;
2949     I32 freeze_paren = 0;
2950     I32 after_freeze = 0;
2951     I32 num; /* numeric backreferences */
2952     SV * max_open;  /* Max number of unclosed parens */
2953     I32 was_in_lookaround = RExC_in_lookaround;
2954     I32 fake_eval = 0; /* matches paren */
2955 
2956     /* The difference between the following variables can be seen with  *
2957      * the broken pattern /(?:foo/ where segment_parse_start will point *
2958      * at the 'f', and reg_parse_start will point at the '('            */
2959 
2960     /* the following is used for unmatched '(' errors */
2961     char * const reg_parse_start = RExC_parse;
2962 
2963     /* the following is used to track where various segments of
2964      * the pattern that we parse out started. */
2965     char * segment_parse_start = RExC_parse;
2966 
2967     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2968 
2969     PERL_ARGS_ASSERT_REG;
2970     DEBUG_PARSE("reg ");
2971 
2972     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
2973     assert(max_open);
2974     if (!SvIOK(max_open)) {
2975         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
2976     }
2977     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
2978                                               open paren */
2979         vFAIL("Too many nested open parens");
2980     }
2981 
2982     *flagp = 0;				/* Initialize. */
2983 
2984     /* Having this true makes it feasible to have a lot fewer tests for the
2985      * parse pointer being in scope.  For example, we can write
2986      *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2987      * instead of
2988      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2989      */
2990     assert(*RExC_end == '\0');
2991 
2992     /* Make an OPEN node, if parenthesized. */
2993     if (paren) {
2994 
2995         /* Under /x, space and comments can be gobbled up between the '(' and
2996          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
2997          * intervening space, as the sequence is a token, and a token should be
2998          * indivisible */
2999         bool has_intervening_patws = (paren == 2)
3000                                   && *(RExC_parse - 1) != '(';
3001 
3002         if (RExC_parse >= RExC_end) {
3003             vFAIL("Unmatched (");
3004         }
3005 
3006         if (paren == 'r') {     /* Atomic script run */
3007             paren = '>';
3008             goto parse_rest;
3009         }
3010         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3011             if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3012                 fake_eval = '{';
3013                 goto handle_qmark;
3014             }
3015 
3016             char *start_verb = RExC_parse + 1;
3017             STRLEN verb_len;
3018             char *start_arg = NULL;
3019             unsigned char op = 0;
3020             int arg_required = 0;
3021             int internal_argval = -1; /* if > -1 no argument allowed */
3022             bool has_upper = FALSE;
3023             U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3024 
3025             if (has_intervening_patws) {
3026                 RExC_parse_inc_by(1);   /* past the '*' */
3027 
3028                 /* For strict backwards compatibility, don't change the message
3029                  * now that we also have lowercase operands */
3030                 if (isUPPER(*RExC_parse)) {
3031                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3032                 }
3033                 else {
3034                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3035                 }
3036             }
3037             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3038                 if ( *RExC_parse == ':' ) {
3039                     start_arg = RExC_parse + 1;
3040                     break;
3041                 }
3042                 else if (! UTF) {
3043                     if (isUPPER(*RExC_parse)) {
3044                         has_upper = TRUE;
3045                     }
3046                     RExC_parse_inc_by(1);
3047                 }
3048                 else {
3049                     RExC_parse_inc_utf8();
3050                 }
3051             }
3052             verb_len = RExC_parse - start_verb;
3053             if ( start_arg ) {
3054                 if (RExC_parse >= RExC_end) {
3055                     goto unterminated_verb_pattern;
3056                 }
3057 
3058                 RExC_parse_inc();
3059                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3060                     RExC_parse_inc();
3061                 }
3062                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3063                   unterminated_verb_pattern:
3064                     if (has_upper) {
3065                         vFAIL("Unterminated verb pattern argument");
3066                     }
3067                     else {
3068                         vFAIL("Unterminated '(*...' argument");
3069                     }
3070                 }
3071             } else {
3072                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3073                     if (has_upper) {
3074                         vFAIL("Unterminated verb pattern");
3075                     }
3076                     else {
3077                         vFAIL("Unterminated '(*...' construct");
3078                     }
3079                 }
3080             }
3081 
3082             /* Here, we know that RExC_parse < RExC_end */
3083 
3084             switch ( *start_verb ) {
3085             case 'A':  /* (*ACCEPT) */
3086                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3087                     op = ACCEPT;
3088                     internal_argval = RExC_nestroot;
3089                 }
3090                 break;
3091             case 'C':  /* (*COMMIT) */
3092                 if ( memEQs(start_verb, verb_len,"COMMIT") )
3093                     op = COMMIT;
3094                 break;
3095             case 'F':  /* (*FAIL) */
3096                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3097                     op = OPFAIL;
3098                 }
3099                 break;
3100             case ':':  /* (*:NAME) */
3101             case 'M':  /* (*MARK:NAME) */
3102                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3103                     op = MARKPOINT;
3104                     arg_required = 1;
3105                 }
3106                 break;
3107             case 'P':  /* (*PRUNE) */
3108                 if ( memEQs(start_verb, verb_len,"PRUNE") )
3109                     op = PRUNE;
3110                 break;
3111             case 'S':   /* (*SKIP) */
3112                 if ( memEQs(start_verb, verb_len,"SKIP") )
3113                     op = SKIP;
3114                 break;
3115             case 'T':  /* (*THEN) */
3116                 /* [19:06] <TimToady> :: is then */
3117                 if ( memEQs(start_verb, verb_len,"THEN") ) {
3118                     op = CUTGROUP;
3119                     RExC_seen |= REG_CUTGROUP_SEEN;
3120                 }
3121                 break;
3122             case 'a':
3123                 if (   memEQs(start_verb, verb_len, "asr")
3124                     || memEQs(start_verb, verb_len, "atomic_script_run"))
3125                 {
3126                     paren = 'r';        /* Mnemonic: recursed run */
3127                     goto script_run;
3128                 }
3129                 else if (memEQs(start_verb, verb_len, "atomic")) {
3130                     paren = 't';    /* AtOMIC */
3131                     goto alpha_assertions;
3132                 }
3133                 break;
3134             case 'p':
3135                 if (   memEQs(start_verb, verb_len, "plb")
3136                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
3137                 {
3138                     paren = 'b';
3139                     goto lookbehind_alpha_assertions;
3140                 }
3141                 else if (   memEQs(start_verb, verb_len, "pla")
3142                          || memEQs(start_verb, verb_len, "positive_lookahead"))
3143                 {
3144                     paren = 'a';
3145                     goto alpha_assertions;
3146                 }
3147                 break;
3148             case 'n':
3149                 if (   memEQs(start_verb, verb_len, "nlb")
3150                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
3151                 {
3152                     paren = 'B';
3153                     goto lookbehind_alpha_assertions;
3154                 }
3155                 else if (   memEQs(start_verb, verb_len, "nla")
3156                          || memEQs(start_verb, verb_len, "negative_lookahead"))
3157                 {
3158                     paren = 'A';
3159                     goto alpha_assertions;
3160                 }
3161                 break;
3162             case 's':
3163                 if (   memEQs(start_verb, verb_len, "sr")
3164                     || memEQs(start_verb, verb_len, "script_run"))
3165                 {
3166                     regnode_offset atomic;
3167 
3168                     paren = 's';
3169 
3170                    script_run:
3171 
3172                     /* This indicates Unicode rules. */
3173                     REQUIRE_UNI_RULES(flagp, 0);
3174 
3175                     if (! start_arg) {
3176                         goto no_colon;
3177                     }
3178 
3179                     RExC_parse_set(start_arg);
3180 
3181                     if (RExC_in_script_run) {
3182 
3183                         /*  Nested script runs are treated as no-ops, because
3184                          *  if the nested one fails, the outer one must as
3185                          *  well.  It could fail sooner, and avoid (??{} with
3186                          *  side effects, but that is explicitly documented as
3187                          *  undefined behavior. */
3188 
3189                         ret = 0;
3190 
3191                         if (paren == 's') {
3192                             paren = ':';
3193                             goto parse_rest;
3194                         }
3195 
3196                         /* But, the atomic part of a nested atomic script run
3197                          * isn't a no-op, but can be treated just like a '(?>'
3198                          * */
3199                         paren = '>';
3200                         goto parse_rest;
3201                     }
3202 
3203                     if (paren == 's') {
3204                         /* Here, we're starting a new regular script run */
3205                         ret = reg_node(pRExC_state, SROPEN);
3206                         RExC_in_script_run = 1;
3207                         is_open = 1;
3208                         goto parse_rest;
3209                     }
3210 
3211                     /* Here, we are starting an atomic script run.  This is
3212                      * handled by recursing to deal with the atomic portion
3213                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
3214 
3215                     ret = reg_node(pRExC_state, SROPEN);
3216 
3217                     RExC_in_script_run = 1;
3218 
3219                     atomic = reg(pRExC_state, 'r', &flags, depth);
3220                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
3221                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3222                         return 0;
3223                     }
3224 
3225                     if (! REGTAIL(pRExC_state, ret, atomic)) {
3226                         REQUIRE_BRANCHJ(flagp, 0);
3227                     }
3228 
3229                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3230                                                                 SRCLOSE)))
3231                     {
3232                         REQUIRE_BRANCHJ(flagp, 0);
3233                     }
3234 
3235                     RExC_in_script_run = 0;
3236                     return ret;
3237                 }
3238 
3239                 break;
3240 
3241             lookbehind_alpha_assertions:
3242                 seen_flag_set = REG_LOOKBEHIND_SEEN;
3243                 /*FALLTHROUGH*/
3244 
3245             alpha_assertions:
3246 
3247                 if ( !start_arg ) {
3248                     goto no_colon;
3249                 }
3250 
3251                 if ( RExC_parse == start_arg ) {
3252                     if ( paren == 'A' || paren == 'B' ) {
3253                         /* An empty negative lookaround assertion is failure.
3254                          * See also: S_reg_la_OPFAIL() */
3255 
3256                         /* Note: OPFAIL is *not* zerolen. */
3257                         ret = reg1node(pRExC_state, OPFAIL, 0);
3258                         nextchar(pRExC_state);
3259                         return ret;
3260                     }
3261                     else
3262                     if ( paren == 'a' || paren == 'b' ) {
3263                         /* An empty positive lookaround assertion is success.
3264                          * See also: S_reg_la_NOTHING() */
3265 
3266                         /* Note: NOTHING is zerolen, so increment here */
3267                         RExC_seen_zerolen++;
3268                         ret = reg_node(pRExC_state, NOTHING);
3269                         nextchar(pRExC_state);
3270                         return ret;
3271                     }
3272                 }
3273 
3274                 RExC_seen_zerolen++;
3275                 RExC_in_lookaround++;
3276                 RExC_seen |= seen_flag_set;
3277 
3278                 RExC_parse_set(start_arg);
3279                 goto parse_rest;
3280 
3281               no_colon:
3282                 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3283                     UTF8fARG(UTF, verb_len, start_verb));
3284                 NOT_REACHED; /*NOTREACHED*/
3285 
3286             } /* End of switch */
3287             if ( ! op ) {
3288                 RExC_parse_inc_safe();
3289                 if (has_upper || verb_len == 0) {
3290                     vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3291                         UTF8fARG(UTF, verb_len, start_verb));
3292                 }
3293                 else {
3294                     vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3295                         UTF8fARG(UTF, verb_len, start_verb));
3296                 }
3297             }
3298             if ( RExC_parse == start_arg ) {
3299                 start_arg = NULL;
3300             }
3301             if ( arg_required && !start_arg ) {
3302                 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3303                     (int) verb_len, start_verb);
3304             }
3305             if (internal_argval == -1) {
3306                 ret = reg1node(pRExC_state, op, 0);
3307             } else {
3308                 ret = reg2node(pRExC_state, op, 0, internal_argval);
3309             }
3310             RExC_seen |= REG_VERBARG_SEEN;
3311             if (start_arg) {
3312                 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3313                 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3314                                         STR_WITH_LEN("S"));
3315                 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3316                 FLAGS(REGNODE_p(ret)) = 1;
3317             } else {
3318                 FLAGS(REGNODE_p(ret)) = 0;
3319             }
3320             if ( internal_argval != -1 )
3321                 ARG2i_SET(REGNODE_p(ret), internal_argval);
3322             nextchar(pRExC_state);
3323             return ret;
3324         }
3325         else if (*RExC_parse == '?') { /* (?...) */
3326           handle_qmark:
3327             ; /* make sure the label has a statement associated with it*/
3328             bool is_logical = 0, is_optimistic = 0;
3329             const char * const seqstart = RExC_parse;
3330             const char * endptr;
3331             const char non_existent_group_msg[]
3332                                             = "Reference to nonexistent group";
3333             const char impossible_group[] = "Invalid reference to group";
3334 
3335             if (has_intervening_patws) {
3336                 RExC_parse_inc_by(1);
3337                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3338             }
3339 
3340             RExC_parse_inc_by(1);   /* past the '?' */
3341             if (!fake_eval) {
3342                 paren = *RExC_parse;    /* might be a trailing NUL, if not
3343                                            well-formed */
3344                 is_optimistic = 0;
3345             } else {
3346                 is_optimistic = 1;
3347                 paren = fake_eval;
3348             }
3349             RExC_parse_inc();
3350             if (RExC_parse > RExC_end) {
3351                 paren = '\0';
3352             }
3353             ret = 0;			/* For look-ahead/behind. */
3354             switch (paren) {
3355 
3356             case 'P':	/* (?P...) variants for those used to PCRE/Python */
3357                 paren = *RExC_parse;
3358                 if ( paren == '<') {    /* (?P<...>) named capture */
3359                     RExC_parse_inc_by(1);
3360                     if (RExC_parse >= RExC_end) {
3361                         vFAIL("Sequence (?P<... not terminated");
3362                     }
3363                     goto named_capture;
3364                 }
3365                 else if (paren == '>') {   /* (?P>name) named recursion */
3366                     RExC_parse_inc_by(1);
3367                     if (RExC_parse >= RExC_end) {
3368                         vFAIL("Sequence (?P>... not terminated");
3369                     }
3370                     goto named_recursion;
3371                 }
3372                 else if (paren == '=') {   /* (?P=...)  named backref */
3373                     RExC_parse_inc_by(1);
3374                     return handle_named_backref(pRExC_state, flagp,
3375                                                 segment_parse_start, ')');
3376                 }
3377                 RExC_parse_inc_if_char();
3378                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3379                 vFAIL3("Sequence (%.*s...) not recognized",
3380                                 (int) (RExC_parse - seqstart), seqstart);
3381                 NOT_REACHED; /*NOTREACHED*/
3382             case '<':           /* (?<...) */
3383                 /* If you want to support (?<*...), first reconcile with GH #17363 */
3384                 if (*RExC_parse == '!') {
3385                     paren = ','; /* negative lookbehind (?<! ... ) */
3386                     RExC_parse_inc_by(1);
3387                     if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3388                         return ret;
3389                     break;
3390                 }
3391                 else
3392                 if (*RExC_parse == '=') {
3393                     /* paren = '<' - negative lookahead (?<= ... ) */
3394                     RExC_parse_inc_by(1);
3395                     if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3396                         return ret;
3397                     break;
3398                 }
3399                 else
3400               named_capture:
3401                 {               /* (?<...>) */
3402                     char *name_start;
3403                     SV *svname;
3404                     paren= '>';
3405                 /* FALLTHROUGH */
3406             case '\'':          /* (?'...') */
3407                     name_start = RExC_parse;
3408                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3409                     if (   RExC_parse == name_start
3410                         || RExC_parse >= RExC_end
3411                         || *RExC_parse != paren)
3412                     {
3413                         vFAIL2("Sequence (?%c... not terminated",
3414                             paren=='>' ? '<' : (char) paren);
3415                     }
3416                     {
3417                         HE *he_str;
3418                         SV *sv_dat = NULL;
3419                         if (!svname) /* shouldn't happen */
3420                             Perl_croak(aTHX_
3421                                 "panic: reg_scan_name returned NULL");
3422                         if (!RExC_paren_names) {
3423                             RExC_paren_names= newHV();
3424                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
3425 #ifdef DEBUGGING
3426                             RExC_paren_name_list= newAV();
3427                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3428 #endif
3429                         }
3430                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3431                         if ( he_str )
3432                             sv_dat = HeVAL(he_str);
3433                         if ( ! sv_dat ) {
3434                             /* croak baby croak */
3435                             Perl_croak(aTHX_
3436                                 "panic: paren_name hash element allocation failed");
3437                         } else if ( SvPOK(sv_dat) ) {
3438                             /* (?|...) can mean we have dupes so scan to check
3439                                its already been stored. Maybe a flag indicating
3440                                we are inside such a construct would be useful,
3441                                but the arrays are likely to be quite small, so
3442                                for now we punt -- dmq */
3443                             IV count = SvIV(sv_dat);
3444                             I32 *pv = (I32*)SvPVX(sv_dat);
3445                             IV i;
3446                             for ( i = 0 ; i < count ; i++ ) {
3447                                 if ( pv[i] == RExC_npar ) {
3448                                     count = 0;
3449                                     break;
3450                                 }
3451                             }
3452                             if ( count ) {
3453                                 pv = (I32*)SvGROW(sv_dat,
3454                                                 SvCUR(sv_dat) + sizeof(I32)+1);
3455                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3456                                 pv[count] = RExC_npar;
3457                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3458                             }
3459                         } else {
3460                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
3461                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
3462                                                                 sizeof(I32));
3463                             SvIOK_on(sv_dat);
3464                             SvIV_set(sv_dat, 1);
3465                         }
3466 #ifdef DEBUGGING
3467                         /* No, this does not cause a memory leak under
3468                          * debugging. RExC_paren_name_list is freed later
3469                          * on in the dump process. - Yves
3470                          */
3471                         if (!av_store(RExC_paren_name_list,
3472                                       RExC_npar, SvREFCNT_inc_NN(svname)))
3473                             SvREFCNT_dec_NN(svname);
3474 #endif
3475 
3476                     }
3477                     nextchar(pRExC_state);
3478                     paren = 1;
3479                     goto capturing_parens;
3480                 }
3481                 NOT_REACHED; /*NOTREACHED*/
3482             case '=':           /* (?=...) */
3483                 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3484                     return ret;
3485                 break;
3486             case '!':           /* (?!...) */
3487                 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3488                     return ret;
3489                 break;
3490             case '|':           /* (?|...) */
3491                 /* branch reset, behave like a (?:...) except that
3492                    buffers in alternations share the same numbers */
3493                 paren = ':';
3494                 after_freeze = freeze_paren = RExC_logical_npar;
3495 
3496                 /* XXX This construct currently requires an extra pass.
3497                  * Investigation would be required to see if that could be
3498                  * changed */
3499                 REQUIRE_PARENS_PASS;
3500                 break;
3501             case ':':           /* (?:...) */
3502             case '>':           /* (?>...) */
3503                 break;
3504             case '$':           /* (?$...) */
3505             case '@':           /* (?@...) */
3506                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3507                 break;
3508             case '0' :           /* (?0) */
3509             case 'R' :           /* (?R) */
3510                 if (RExC_parse == RExC_end || *RExC_parse != ')')
3511                     FAIL("Sequence (?R) not terminated");
3512                 num = 0;
3513                 RExC_seen |= REG_RECURSE_SEEN;
3514 
3515                 /* XXX These constructs currently require an extra pass.
3516                  * It probably could be changed */
3517                 REQUIRE_PARENS_PASS;
3518 
3519                 *flagp |= POSTPONED;
3520                 goto gen_recurse_regop;
3521                 /*notreached*/
3522             /* named and numeric backreferences */
3523             case '&':            /* (?&NAME) */
3524                 segment_parse_start = RExC_parse - 1;
3525               named_recursion:
3526                 {
3527                     SV *sv_dat = reg_scan_name(pRExC_state,
3528                                                REG_RSN_RETURN_DATA);
3529                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3530                 }
3531                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3532                     vFAIL("Sequence (?&... not terminated");
3533                 goto gen_recurse_regop;
3534                 /* NOTREACHED */
3535             case '+':
3536                 if (! inRANGE(RExC_parse[0], '1', '9')) {
3537                     RExC_parse_inc_by(1);
3538                     vFAIL("Illegal pattern");
3539                 }
3540                 goto parse_recursion;
3541                 /* NOTREACHED*/
3542             case '-': /* (?-1) */
3543                 if (! inRANGE(RExC_parse[0], '1', '9')) {
3544                     RExC_parse--; /* rewind to let it be handled later */
3545                     goto parse_flags;
3546                 }
3547                 /* FALLTHROUGH */
3548             case '1': case '2': case '3': case '4': /* (?1) */
3549             case '5': case '6': case '7': case '8': case '9':
3550                 RExC_parse_set((char *) seqstart + 1);  /* Point to the digit */
3551               parse_recursion:
3552                 {
3553                     bool is_neg = FALSE;
3554                     UV unum;
3555                     segment_parse_start = RExC_parse - 1;
3556                     if (*RExC_parse == '-') {
3557                         RExC_parse_inc_by(1);
3558                         is_neg = TRUE;
3559                     }
3560                     endptr = RExC_end;
3561                     if (grok_atoUV(RExC_parse, &unum, &endptr)
3562                         && unum <= I32_MAX
3563                     ) {
3564                         num = (I32)unum;
3565                         RExC_parse_set((char*)endptr);
3566                     }
3567                     else {  /* Overflow, or something like that.  Position
3568                                beyond all digits for the message */
3569                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
3570                             RExC_parse_inc_by(1);
3571                         }
3572                         vFAIL(impossible_group);
3573                     }
3574                     if (is_neg) {
3575                         /* -num is always representable on 1 and 2's complement
3576                          * machines */
3577                         num = -num;
3578                     }
3579                 }
3580                 if (*RExC_parse!=')')
3581                     vFAIL("Expecting close bracket");
3582 
3583                 if (paren == '-' || paren == '+') {
3584 
3585                     /* Don't overflow */
3586                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3587                         RExC_parse_inc_by(1);
3588                         vFAIL(impossible_group);
3589                     }
3590 
3591                     /*
3592                     Diagram of capture buffer numbering.
3593                     Top line is the normal capture buffer numbers
3594                     Bottom line is the negative indexing as from
3595                     the X (the (?-2))
3596 
3597                         1 2    3 4 5 X   Y      6 7
3598                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3599                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3600                     -   5 4    3 2 1 X   Y      x x
3601 
3602                     Resolve to absolute group.  Recall that RExC_npar is +1 of
3603                     the actual parenthesis group number.  For lookahead, we
3604                     have to compensate for that.  Using the above example, when
3605                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
3606                     want 7 for +2, and 4 for -2.
3607                     */
3608                     if ( paren == '+' ) {
3609                         num--;
3610                     }
3611 
3612                     num += RExC_npar;
3613 
3614                     if (paren == '-' && num < 1) {
3615                         RExC_parse_inc_by(1);
3616                         vFAIL(non_existent_group_msg);
3617                     }
3618                 }
3619                 else
3620                 if (num && num < RExC_logical_npar) {
3621                     num = RExC_logical_to_parno[num];
3622                 }
3623                 else
3624                 if (ALL_PARENS_COUNTED) {
3625                     if (num < RExC_logical_total_parens) {
3626                         num = RExC_logical_to_parno[num];
3627                     }
3628                     else {
3629                         RExC_parse_inc_by(1);
3630                         vFAIL(non_existent_group_msg);
3631                     }
3632                 }
3633                 else {
3634                     REQUIRE_PARENS_PASS;
3635                 }
3636 
3637 
3638               gen_recurse_regop:
3639                 if (num >= RExC_npar) {
3640 
3641                     /* It might be a forward reference; we can't fail until we
3642                      * know, by completing the parse to get all the groups, and
3643                      * then reparsing */
3644                     if (ALL_PARENS_COUNTED)  {
3645                         if (num >= RExC_total_parens) {
3646                             RExC_parse_inc_by(1);
3647                             vFAIL(non_existent_group_msg);
3648                         }
3649                     }
3650                     else {
3651                         REQUIRE_PARENS_PASS;
3652                     }
3653                 }
3654 
3655                 /* We keep track how many GOSUB items we have produced.
3656                    To start off the ARG2i() of the GOSUB holds its "id",
3657                    which is used later in conjunction with RExC_recurse
3658                    to calculate the offset we need to jump for the GOSUB,
3659                    which it will store in the final representation.
3660                    We have to defer the actual calculation until much later
3661                    as the regop may move.
3662                  */
3663                 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3664                 RExC_recurse_count++;
3665                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3666                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3667                             22, "|    |", (int)(depth * 2 + 1), "",
3668                             (UV)ARG1u(REGNODE_p(ret)),
3669                             (IV)ARG2i(REGNODE_p(ret))));
3670                 RExC_seen |= REG_RECURSE_SEEN;
3671 
3672                 *flagp |= POSTPONED;
3673                 assert(*RExC_parse == ')');
3674                 nextchar(pRExC_state);
3675                 return ret;
3676 
3677             /* NOTREACHED */
3678 
3679             case '?':           /* (??...) */
3680                 is_logical = 1;
3681                 if (*RExC_parse != '{') {
3682                     RExC_parse_inc_if_char();
3683                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3684                     vFAIL2utf8f(
3685                         "Sequence (%" UTF8f "...) not recognized",
3686                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3687                     NOT_REACHED; /*NOTREACHED*/
3688                 }
3689                 *flagp |= POSTPONED;
3690                 paren = '{';
3691                 RExC_parse_inc_by(1);
3692                 /* FALLTHROUGH */
3693             case '{':           /* (?{...}) */
3694             {
3695                 U32 n = 0;
3696                 struct reg_code_block *cb;
3697                 OP * o;
3698 
3699                 RExC_seen_zerolen++;
3700 
3701                 if (   !pRExC_state->code_blocks
3702                     || pRExC_state->code_index
3703                                         >= pRExC_state->code_blocks->count
3704                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3705                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3706                             - RExC_start)
3707                 ) {
3708                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
3709                         FAIL("panic: Sequence (?{...}): no code block found\n");
3710                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
3711                 }
3712                 /* this is a pre-compiled code block (?{...}) */
3713                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3714                 RExC_parse_set(RExC_start + cb->end);
3715                 o = cb->block;
3716                 if (cb->src_regex) {
3717                     n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3718                     RExC_rxi->data->data[n] =
3719                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
3720                     RExC_rxi->data->data[n+1] = (void*)o;
3721                 }
3722                 else {
3723                     n = reg_add_data(pRExC_state,
3724                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3725                     RExC_rxi->data->data[n] = (void*)o;
3726                 }
3727                 pRExC_state->code_index++;
3728                 nextchar(pRExC_state);
3729                 if (!is_optimistic)
3730                     RExC_seen |= REG_PESSIMIZE_SEEN;
3731 
3732                 if (is_logical) {
3733                     regnode_offset eval;
3734                     ret = reg_node(pRExC_state, LOGICAL);
3735                     FLAGS(REGNODE_p(ret)) = 2;
3736 
3737                     eval = reg2node(pRExC_state, EVAL,
3738                                        n,
3739 
3740                                        /* for later propagation into (??{})
3741                                         * return value */
3742                                        RExC_flags & RXf_PMf_COMPILETIME
3743                                       );
3744                     FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3745                     if (! REGTAIL(pRExC_state, ret, eval)) {
3746                         REQUIRE_BRANCHJ(flagp, 0);
3747                     }
3748                     return ret;
3749                 }
3750                 ret = reg2node(pRExC_state, EVAL, n, 0);
3751                 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3752 
3753                 return ret;
3754             }
3755             case '(':           /* (?(?{...})...) and (?(?=...)...) */
3756             {
3757                 int is_define= 0;
3758                 const int DEFINE_len = sizeof("DEFINE") - 1;
3759                 if (    RExC_parse < RExC_end - 1
3760                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
3761                             && (   RExC_parse[1] == '='
3762                                 || RExC_parse[1] == '!'
3763                                 || RExC_parse[1] == '<'
3764                                 || RExC_parse[1] == '{'))
3765                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
3766                             && (   RExC_parse[1] == '{'
3767                             || (   memBEGINs(RExC_parse + 1,
3768                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3769                                          "pla:")
3770                                 || memBEGINs(RExC_parse + 1,
3771                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3772                                          "plb:")
3773                                 || memBEGINs(RExC_parse + 1,
3774                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3775                                          "nla:")
3776                                 || memBEGINs(RExC_parse + 1,
3777                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3778                                          "nlb:")
3779                                 || memBEGINs(RExC_parse + 1,
3780                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3781                                          "positive_lookahead:")
3782                                 || memBEGINs(RExC_parse + 1,
3783                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3784                                          "positive_lookbehind:")
3785                                 || memBEGINs(RExC_parse + 1,
3786                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3787                                          "negative_lookahead:")
3788                                 || memBEGINs(RExC_parse + 1,
3789                                          (Size_t) (RExC_end - (RExC_parse + 1)),
3790                                          "negative_lookbehind:")))))
3791                 ) { /* Lookahead or eval. */
3792                     I32 flag;
3793                     regnode_offset tail;
3794 
3795                     ret = reg_node(pRExC_state, LOGICAL);
3796                     FLAGS(REGNODE_p(ret)) = 1;
3797 
3798                     tail = reg(pRExC_state, 1, &flag, depth+1);
3799                     RETURN_FAIL_ON_RESTART(flag, flagp);
3800                     if (! REGTAIL(pRExC_state, ret, tail)) {
3801                         REQUIRE_BRANCHJ(flagp, 0);
3802                     }
3803                     goto insert_if;
3804                 }
3805                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
3806                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3807                 {
3808                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
3809                     char *name_start= RExC_parse;
3810                     RExC_parse_inc_by(1);
3811                     U32 num = 0;
3812                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3813                     if (   RExC_parse == name_start
3814                         || RExC_parse >= RExC_end
3815                         || *RExC_parse != ch)
3816                     {
3817                         vFAIL2("Sequence (?(%c... not terminated",
3818                             (ch == '>' ? '<' : ch));
3819                     }
3820                     RExC_parse_inc_by(1);
3821                     if (sv_dat) {
3822                         num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3823                         RExC_rxi->data->data[num]=(void*)sv_dat;
3824                         SvREFCNT_inc_simple_void_NN(sv_dat);
3825                     }
3826                     ret = reg1node(pRExC_state, GROUPPN, num);
3827                     goto insert_if_check_paren;
3828                 }
3829                 else if (memBEGINs(RExC_parse,
3830                                    (STRLEN) (RExC_end - RExC_parse),
3831                                    "DEFINE"))
3832                 {
3833                     ret = reg1node(pRExC_state, DEFINEP, 0);
3834                     RExC_parse_inc_by(DEFINE_len);
3835                     is_define = 1;
3836                     goto insert_if_check_paren;
3837                 }
3838                 else if (RExC_parse[0] == 'R') {
3839                     RExC_parse_inc_by(1);
3840                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
3841                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3842                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3843                      */
3844                     parno = 0;
3845                     if (RExC_parse[0] == '0') {
3846                         parno = 1;
3847                         RExC_parse_inc_by(1);
3848                     }
3849                     else if (inRANGE(RExC_parse[0], '1', '9')) {
3850                         UV uv;
3851                         endptr = RExC_end;
3852                         if (grok_atoUV(RExC_parse, &uv, &endptr)
3853                             && uv <= I32_MAX
3854                         ) {
3855                             parno = (I32)uv + 1;
3856                             RExC_parse_set((char*)endptr);
3857                         }
3858                         /* else "Switch condition not recognized" below */
3859                     } else if (RExC_parse[0] == '&') {
3860                         SV *sv_dat;
3861                         RExC_parse_inc_by(1);
3862                         sv_dat = reg_scan_name(pRExC_state,
3863                                                REG_RSN_RETURN_DATA);
3864                         if (sv_dat)
3865                             parno = 1 + *((I32 *)SvPVX(sv_dat));
3866                     }
3867                     ret = reg1node(pRExC_state, INSUBP, parno);
3868                     goto insert_if_check_paren;
3869                 }
3870                 else if (inRANGE(RExC_parse[0], '1', '9')) {
3871                     /* (?(1)...) */
3872                     char c;
3873                     UV uv;
3874                     endptr = RExC_end;
3875                     if (grok_atoUV(RExC_parse, &uv, &endptr)
3876                         && uv <= I32_MAX
3877                     ) {
3878                         parno = (I32)uv;
3879                         RExC_parse_set((char*)endptr);
3880                     }
3881                     else {
3882                         vFAIL("panic: grok_atoUV returned FALSE");
3883                     }
3884                     ret = reg1node(pRExC_state, GROUPP, parno);
3885 
3886                  insert_if_check_paren:
3887                     if (UCHARAT(RExC_parse) != ')') {
3888                         RExC_parse_inc_safe();
3889                         vFAIL("Switch condition not recognized");
3890                     }
3891                     nextchar(pRExC_state);
3892                   insert_if:
3893                     if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3894                                                              IFTHEN, 0)))
3895                     {
3896                         REQUIRE_BRANCHJ(flagp, 0);
3897                     }
3898                     br = regbranch(pRExC_state, &flags, 1, depth+1);
3899                     if (br == 0) {
3900                         RETURN_FAIL_ON_RESTART(flags,flagp);
3901                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3902                               (UV) flags);
3903                     } else
3904                     if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
3905                                                              LONGJMP, 0)))
3906                     {
3907                         REQUIRE_BRANCHJ(flagp, 0);
3908                     }
3909                     c = UCHARAT(RExC_parse);
3910                     nextchar(pRExC_state);
3911                     if (flags&HASWIDTH)
3912                         *flagp |= HASWIDTH;
3913                     if (c == '|') {
3914                         if (is_define)
3915                             vFAIL("(?(DEFINE)....) does not allow branches");
3916 
3917                         /* Fake one for optimizer.  */
3918                         lastbr = reg1node(pRExC_state, IFTHEN, 0);
3919 
3920                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
3921                             RETURN_FAIL_ON_RESTART(flags, flagp);
3922                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3923                                   (UV) flags);
3924                         }
3925                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
3926                             REQUIRE_BRANCHJ(flagp, 0);
3927                         }
3928                         if (flags&HASWIDTH)
3929                             *flagp |= HASWIDTH;
3930                         c = UCHARAT(RExC_parse);
3931                         nextchar(pRExC_state);
3932                     }
3933                     else
3934                         lastbr = 0;
3935                     if (c != ')') {
3936                         if (RExC_parse >= RExC_end)
3937                             vFAIL("Switch (?(condition)... not terminated");
3938                         else
3939                             vFAIL("Switch (?(condition)... contains too many branches");
3940                     }
3941                     ender = reg_node(pRExC_state, TAIL);
3942                     if (! REGTAIL(pRExC_state, br, ender)) {
3943                         REQUIRE_BRANCHJ(flagp, 0);
3944                     }
3945                     if (lastbr) {
3946                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
3947                             REQUIRE_BRANCHJ(flagp, 0);
3948                         }
3949                         if (! REGTAIL(pRExC_state,
3950                                       REGNODE_OFFSET(
3951                                         REGNODE_AFTER(REGNODE_p(lastbr))),
3952                                       ender))
3953                         {
3954                             REQUIRE_BRANCHJ(flagp, 0);
3955                         }
3956                     }
3957                     else
3958                         if (! REGTAIL(pRExC_state, ret, ender)) {
3959                             REQUIRE_BRANCHJ(flagp, 0);
3960                         }
3961 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
3962                     RExC_size++; /* XXX WHY do we need this?!!
3963                                     For large programs it seems to be required
3964                                     but I can't figure out why. -- dmq*/
3965 #endif
3966                     return ret;
3967                 }
3968                 RExC_parse_inc_safe();
3969                 vFAIL("Unknown switch condition (?(...))");
3970             }
3971             case '[':           /* (?[ ... ]) */
3972                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
3973             case 0: /* A NUL */
3974                 RExC_parse--; /* for vFAIL to print correctly */
3975                 vFAIL("Sequence (? incomplete");
3976                 break;
3977 
3978             case ')':
3979                 if (RExC_strict) {  /* [perl #132851] */
3980                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
3981                 }
3982                 /* FALLTHROUGH */
3983             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
3984             /* FALLTHROUGH */
3985             default: /* e.g., (?i) */
3986                 RExC_parse_set((char *) seqstart + 1);
3987               parse_flags:
3988                 parse_lparen_question_flags(pRExC_state);
3989                 if (UCHARAT(RExC_parse) != ':') {
3990                     if (RExC_parse < RExC_end)
3991                         nextchar(pRExC_state);
3992                     *flagp = TRYAGAIN;
3993                     return 0;
3994                 }
3995                 paren = ':';
3996                 nextchar(pRExC_state);
3997                 ret = 0;
3998                 goto parse_rest;
3999             } /* end switch */
4000         }
4001         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
4002           capturing_parens:
4003             parno = RExC_npar;
4004             RExC_npar++;
4005             if (RExC_npar >= U16_MAX)
4006                 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4007 
4008             logical_parno = RExC_logical_npar;
4009             RExC_logical_npar++;
4010             if (! ALL_PARENS_COUNTED) {
4011                 /* If we are in our first pass through (and maybe only pass),
4012                  * we  need to allocate memory for the capturing parentheses
4013                  * data structures.
4014                  */
4015 
4016                 if (!RExC_parens_buf_size) {
4017                     /* first guess at number of parens we might encounter */
4018                     RExC_parens_buf_size = 10;
4019 
4020                     /* setup RExC_open_parens, which holds the address of each
4021                      * OPEN tag, and to make things simpler for the 0 index the
4022                      * start of the program - this is used later for offsets */
4023                     Newxz(RExC_open_parens, RExC_parens_buf_size,
4024                             regnode_offset);
4025                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
4026 
4027                     /* setup RExC_close_parens, which holds the address of each
4028                      * CLOSE tag, and to make things simpler for the 0 index
4029                      * the end of the program - this is used later for offsets
4030                      * */
4031                     Newxz(RExC_close_parens, RExC_parens_buf_size,
4032                             regnode_offset);
4033                     /* we don't know where end op starts yet, so we don't need to
4034                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
4035                      * above */
4036 
4037                     Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4038                     Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4039                 }
4040                 else if (RExC_npar > RExC_parens_buf_size) {
4041                     I32 old_size = RExC_parens_buf_size;
4042 
4043                     RExC_parens_buf_size *= 2;
4044 
4045                     Renew(RExC_open_parens, RExC_parens_buf_size,
4046                             regnode_offset);
4047                     Zero(RExC_open_parens + old_size,
4048                             RExC_parens_buf_size - old_size, regnode_offset);
4049 
4050                     Renew(RExC_close_parens, RExC_parens_buf_size,
4051                             regnode_offset);
4052                     Zero(RExC_close_parens + old_size,
4053                             RExC_parens_buf_size - old_size, regnode_offset);
4054 
4055                     Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4056                     Zero(RExC_logical_to_parno + old_size,
4057                          RExC_parens_buf_size - old_size, I32);
4058 
4059                     Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4060                     Zero(RExC_parno_to_logical + old_size,
4061                          RExC_parens_buf_size - old_size, I32);
4062                 }
4063             }
4064 
4065             ret = reg1node(pRExC_state, OPEN, parno);
4066             if (!RExC_nestroot)
4067                 RExC_nestroot = parno;
4068             if (RExC_open_parens && !RExC_open_parens[parno])
4069             {
4070                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4071                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4072                     22, "|    |", (int)(depth * 2 + 1), "",
4073                     (IV)parno, ret));
4074                 RExC_open_parens[parno]= ret;
4075             }
4076             if (RExC_parno_to_logical) {
4077                 RExC_parno_to_logical[parno] = logical_parno;
4078                 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4079                     RExC_logical_to_parno[logical_parno] = parno;
4080             }
4081             is_open = 1;
4082         } else {
4083             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4084             paren = ':';
4085             ret = 0;
4086         }
4087     }
4088     else                        /* ! paren */
4089         ret = 0;
4090 
4091    parse_rest:
4092     /* Pick up the branches, linking them together. */
4093     segment_parse_start = RExC_parse;
4094     I32 npar_before_regbranch = RExC_npar - 1;
4095     br = regbranch(pRExC_state, &flags, 1, depth+1);
4096 
4097     /*     branch_len = (paren != 0); */
4098 
4099     if (br == 0) {
4100         RETURN_FAIL_ON_RESTART(flags, flagp);
4101         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4102     }
4103     if (*RExC_parse == '|') {
4104         if (RExC_use_BRANCHJ) {
4105             reginsert(pRExC_state, BRANCHJ, br, depth+1);
4106             ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4107             ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4108         }
4109         else {
4110             reginsert(pRExC_state, BRANCH, br, depth+1);
4111             ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4112             ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4113         }
4114         have_branch = 1;
4115     }
4116     else if (paren == ':') {
4117         *flagp |= flags&SIMPLE;
4118     }
4119     if (is_open) {				/* Starts with OPEN. */
4120         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
4121             REQUIRE_BRANCHJ(flagp, 0);
4122         }
4123     }
4124     else if (paren != '?')		/* Not Conditional */
4125         ret = br;
4126     *flagp |= flags & (HASWIDTH | POSTPONED);
4127     lastbr = br;
4128     while (*RExC_parse == '|') {
4129         if (RExC_use_BRANCHJ) {
4130             bool shut_gcc_up;
4131 
4132             ender = reg1node(pRExC_state, LONGJMP, 0);
4133 
4134             /* Append to the previous. */
4135             shut_gcc_up = REGTAIL(pRExC_state,
4136                          REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4137                          ender);
4138             PERL_UNUSED_VAR(shut_gcc_up);
4139         }
4140         nextchar(pRExC_state);
4141         if (freeze_paren) {
4142             if (RExC_logical_npar > after_freeze)
4143                 after_freeze = RExC_logical_npar;
4144             RExC_logical_npar = freeze_paren;
4145         }
4146         br = regbranch(pRExC_state, &flags, 0, depth+1);
4147 
4148         if (br == 0) {
4149             RETURN_FAIL_ON_RESTART(flags, flagp);
4150             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4151         }
4152         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
4153             REQUIRE_BRANCHJ(flagp, 0);
4154         }
4155         assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4156         assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4157         if (OP(REGNODE_p(br)) == BRANCH) {
4158             if (OP(REGNODE_p(lastbr)) == BRANCH)
4159                 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4160             else
4161                 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4162         }
4163         else
4164         if (OP(REGNODE_p(br)) == BRANCHJ) {
4165             if (OP(REGNODE_p(lastbr)) == BRANCH)
4166                 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4167             else
4168                 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4169         }
4170 
4171         lastbr = br;
4172         *flagp |= flags & (HASWIDTH | POSTPONED);
4173     }
4174 
4175     if (have_branch || paren != ':') {
4176         regnode * br;
4177 
4178         /* Make a closing node, and hook it on the end. */
4179         switch (paren) {
4180         case ':':
4181             ender = reg_node(pRExC_state, TAIL);
4182             break;
4183         case 1: case 2:
4184             ender = reg1node(pRExC_state, CLOSE, parno);
4185             if ( RExC_close_parens ) {
4186                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4187                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4188                         22, "|    |", (int)(depth * 2 + 1), "",
4189                         (IV)parno, ender));
4190                 RExC_close_parens[parno]= ender;
4191                 if (RExC_nestroot == parno)
4192                     RExC_nestroot = 0;
4193             }
4194             break;
4195         case 's':
4196             ender = reg_node(pRExC_state, SRCLOSE);
4197             RExC_in_script_run = 0;
4198             break;
4199         /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4200         case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4201         case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4202         case '<': /* (?<= ... ) */
4203         case ',': /* (?<! ... ) */
4204             *flagp &= ~HASWIDTH;
4205             ender = reg_node(pRExC_state, LOOKBEHIND_END);
4206             break;
4207         /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4208         case 'a':
4209         case 'A':
4210         case '=':
4211         case '!':
4212             *flagp &= ~HASWIDTH;
4213             /* FALLTHROUGH */
4214         case 't':   /* aTomic */
4215         case '>':
4216             ender = reg_node(pRExC_state, SUCCEED);
4217             break;
4218         case 0:
4219             ender = reg_node(pRExC_state, END);
4220             assert(!RExC_end_op); /* there can only be one! */
4221             RExC_end_op = REGNODE_p(ender);
4222             if (RExC_close_parens) {
4223                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4224                     "%*s%*s Setting close paren #0 (END) to %zu\n",
4225                     22, "|    |", (int)(depth * 2 + 1), "",
4226                     ender));
4227 
4228                 RExC_close_parens[0]= ender;
4229             }
4230             break;
4231         }
4232         DEBUG_PARSE_r({
4233             DEBUG_PARSE_MSG("lsbr");
4234             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4235             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4236             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4237                           SvPV_nolen_const(RExC_mysv1),
4238                           (IV)lastbr,
4239                           SvPV_nolen_const(RExC_mysv2),
4240                           (IV)ender,
4241                           (IV)(ender - lastbr)
4242             );
4243         });
4244         if (OP(REGNODE_p(lastbr)) == BRANCH) {
4245             ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4246         }
4247         else
4248         if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4249             ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4250         }
4251 
4252         if (! REGTAIL(pRExC_state, lastbr, ender)) {
4253             REQUIRE_BRANCHJ(flagp, 0);
4254         }
4255 
4256         if (have_branch) {
4257             char is_nothing= 1;
4258             if (depth==1)
4259                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4260 
4261             /* Hook the tails of the branches to the closing node. */
4262             for (br = REGNODE_p(ret); br; br = regnext(br)) {
4263                 const U8 op = REGNODE_TYPE(OP(br));
4264                 regnode *nextoper = REGNODE_AFTER(br);
4265                 if (op == BRANCH) {
4266                     if (! REGTAIL_STUDY(pRExC_state,
4267                                         REGNODE_OFFSET(nextoper),
4268                                         ender))
4269                     {
4270                         REQUIRE_BRANCHJ(flagp, 0);
4271                     }
4272                     if ( OP(nextoper) != NOTHING
4273                          || regnext(nextoper) != REGNODE_p(ender))
4274                         is_nothing= 0;
4275                 }
4276                 else if (op == BRANCHJ) {
4277                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4278                                         REGNODE_OFFSET(nextoper),
4279                                         ender);
4280                     PERL_UNUSED_VAR(shut_gcc_up);
4281                     /* for now we always disable this optimisation * /
4282                     regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4283                     if ( OP(nopr) != NOTHING
4284                          || regnext(nopr) != REGNODE_p(ender))
4285                     */
4286                         is_nothing= 0;
4287                 }
4288             }
4289             if (is_nothing) {
4290                 regnode * ret_as_regnode = REGNODE_p(ret);
4291                 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4292                                ? regnext(ret_as_regnode)
4293                                : ret_as_regnode;
4294                 DEBUG_PARSE_r({
4295                     DEBUG_PARSE_MSG("NADA");
4296                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4297                                      NULL, pRExC_state);
4298                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4299                                      NULL, pRExC_state);
4300                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4301                                   SvPV_nolen_const(RExC_mysv1),
4302                                   (IV)REG_NODE_NUM(ret_as_regnode),
4303                                   SvPV_nolen_const(RExC_mysv2),
4304                                   (IV)ender,
4305                                   (IV)(ender - ret)
4306                     );
4307                 });
4308                 OP(br)= NOTHING;
4309                 if (OP(REGNODE_p(ender)) == TAIL) {
4310                     NEXT_OFF(br)= 0;
4311                     RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4312                 } else {
4313                     regnode *opt;
4314                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4315                         OP(opt)= OPTIMIZED;
4316                     NEXT_OFF(br)= REGNODE_p(ender) - br;
4317                 }
4318             }
4319         }
4320     }
4321 
4322     {
4323         const char *p;
4324          /* Even/odd or x=don't care: 010101x10x */
4325         static const char parens[] = "=!aA<,>Bbt";
4326          /* flag below is set to 0 up through 'A'; 1 for larger */
4327 
4328         if (paren && (p = strchr(parens, paren))) {
4329             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4330             int flag = (p - parens) > 3;
4331 
4332             if (paren == '>' || paren == 't') {
4333                 node = SUSPEND, flag = 0;
4334             }
4335 
4336             reginsert(pRExC_state, node, ret, depth+1);
4337             FLAGS(REGNODE_p(ret)) = flag;
4338             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4339             {
4340                 REQUIRE_BRANCHJ(flagp, 0);
4341             }
4342         }
4343     }
4344 
4345     /* Check for proper termination. */
4346     if (paren) {
4347         /* restore original flags, but keep (?p) and, if we've encountered
4348          * something in the parse that changes /d rules into /u, keep the /u */
4349         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4350         if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4351             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4352         }
4353         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4354             RExC_parse_set(reg_parse_start);
4355             vFAIL("Unmatched (");
4356         }
4357         nextchar(pRExC_state);
4358     }
4359     else if (!paren && RExC_parse < RExC_end) {
4360         if (*RExC_parse == ')') {
4361             RExC_parse_inc_by(1);
4362             vFAIL("Unmatched )");
4363         }
4364         else
4365             FAIL("Junk on end of regexp");	/* "Can't happen". */
4366         NOT_REACHED; /* NOTREACHED */
4367     }
4368 
4369     if (after_freeze > RExC_logical_npar)
4370         RExC_logical_npar = after_freeze;
4371 
4372     RExC_in_lookaround = was_in_lookaround;
4373 
4374     return(ret);
4375 }
4376 
4377 /*
4378  - regbranch - one alternative of an | operator
4379  *
4380  * Implements the concatenation operator.
4381  *
4382  * On success, returns the offset at which any next node should be placed into
4383  * the regex engine program being compiled.
4384  *
4385  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4386  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4387  * UTF-8
4388  */
4389 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)4390 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4391 {
4392     regnode_offset ret;
4393     regnode_offset chain = 0;
4394     regnode_offset latest;
4395     regnode *branch_node = NULL;
4396     I32 flags = 0, c = 0;
4397     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4398 
4399     PERL_ARGS_ASSERT_REGBRANCH;
4400 
4401     DEBUG_PARSE("brnc");
4402 
4403     if (first)
4404         ret = 0;
4405     else {
4406         if (RExC_use_BRANCHJ) {
4407             ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4408             branch_node = REGNODE_p(ret);
4409             ARG2a_SET(branch_node, (U16)RExC_npar-1);
4410         } else {
4411             ret = reg1node(pRExC_state, BRANCH, 0);
4412             branch_node = REGNODE_p(ret);
4413             ARG1a_SET(branch_node, (U16)RExC_npar-1);
4414         }
4415     }
4416 
4417     *flagp = 0;			/* Initialize. */
4418 
4419     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4420                             FALSE /* Don't force to /x */ );
4421     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4422         flags &= ~TRYAGAIN;
4423         latest = regpiece(pRExC_state, &flags, depth+1);
4424         if (latest == 0) {
4425             if (flags & TRYAGAIN)
4426                 continue;
4427             RETURN_FAIL_ON_RESTART(flags, flagp);
4428             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4429         }
4430         else if (ret == 0)
4431             ret = latest;
4432         *flagp |= flags&(HASWIDTH|POSTPONED);
4433         if (chain != 0) {
4434             /* FIXME adding one for every branch after the first is probably
4435              * excessive now we have TRIE support. (hv) */
4436             MARK_NAUGHTY(1);
4437             if (! REGTAIL(pRExC_state, chain, latest)) {
4438                 /* XXX We could just redo this branch, but figuring out what
4439                  * bookkeeping needs to be reset is a pain, and it's likely
4440                  * that other branches that goto END will also be too large */
4441                 REQUIRE_BRANCHJ(flagp, 0);
4442             }
4443         }
4444         chain = latest;
4445         c++;
4446     }
4447     if (chain == 0) {	/* Loop ran zero times. */
4448         chain = reg_node(pRExC_state, NOTHING);
4449         if (ret == 0)
4450             ret = chain;
4451     }
4452     if (c == 1) {
4453         *flagp |= flags & SIMPLE;
4454     }
4455     return ret;
4456 }
4457 
4458 #define RBRACE  0
4459 #define MIN_S   1
4460 #define MIN_E   2
4461 #define MAX_S   3
4462 #define MAX_E   4
4463 
4464 #ifndef PERL_IN_XSUB_RE
4465 bool
Perl_regcurly(const char * s,const char * e,const char * result[5])4466 Perl_regcurly(const char *s, const char *e, const char * result[5])
4467 {
4468     /* This function matches a {m,n} quantifier.  When called with a NULL final
4469      * argument, it simply parses the input from 's' up through 'e-1', and
4470      * returns a boolean as to whether or not this input is syntactically a
4471      * {m,n} quantifier.
4472      *
4473      * When called with a non-NULL final parameter, and when the function
4474      * returns TRUE, it additionally stores information into the array
4475      * specified by that parameter about what it found in the parse.  The
4476      * parameter must be a pointer into a 5 element array of 'const char *'
4477      * elements.  The returned information is as follows:
4478      *   result[RBRACE]  points to the closing brace
4479      *   result[MIN_S]   points to the first byte of the lower bound
4480      *   result[MIN_E]   points to one beyond the final byte of the lower bound
4481      *   result[MAX_S]   points to the first byte of the upper bound
4482      *   result[MAX_E]   points to one beyond the final byte of the upper bound
4483      *
4484      * If the quantifier is of the form {m,} (meaning an infinite upper
4485      * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4486      * to is irrelevant, just that it's the same place
4487      *
4488      * If instead the quantifier is of the form {m} there is actually only
4489      * one bound, and both the upper and lower result[] elements are set to
4490      * point to it.
4491      *
4492      * This function checks only for syntactic validity; it leaves checking for
4493      * semantic validity and raising any diagnostics to the caller.  This
4494      * function is called in multiple places to check for syntax, but only from
4495      * one for semantics.  It makes it as simple as possible for the
4496      * syntax-only callers, while furnishing just enough information for the
4497      * semantic caller.
4498      */
4499 
4500     const char * min_start = NULL;
4501     const char * max_start = NULL;
4502     const char * min_end = NULL;
4503     const char * max_end = NULL;
4504 
4505     bool has_comma = FALSE;
4506 
4507     PERL_ARGS_ASSERT_REGCURLY;
4508 
4509     if (s >= e || *s++ != '{')
4510         return FALSE;
4511 
4512     while (s < e && isBLANK(*s)) {
4513         s++;
4514     }
4515 
4516     if isDIGIT(*s) {
4517         min_start = s;
4518         do {
4519             s++;
4520         } while (s < e && isDIGIT(*s));
4521         min_end = s;
4522     }
4523 
4524     while (s < e && isBLANK(*s)) {
4525         s++;
4526     }
4527 
4528     if (*s == ',') {
4529         has_comma = TRUE;
4530         s++;
4531 
4532         while (s < e && isBLANK(*s)) {
4533             s++;
4534         }
4535 
4536         if isDIGIT(*s) {
4537             max_start = s;
4538             do {
4539                 s++;
4540             } while (s < e && isDIGIT(*s));
4541             max_end = s;
4542         }
4543     }
4544 
4545     while (s < e && isBLANK(*s)) {
4546         s++;
4547     }
4548                                /* Need at least one number */
4549     if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4550         return FALSE;
4551     }
4552 
4553     if (result) {
4554 
4555         result[RBRACE] = s;
4556 
4557         result[MIN_S] = min_start;
4558         result[MIN_E] = min_end;
4559         if (has_comma) {
4560             if (max_start) {
4561                 result[MAX_S] = max_start;
4562                 result[MAX_E] = max_end;
4563             }
4564             else {
4565                 /* Having no value after the comma is signalled by setting
4566                  * start and end to the same value.  What that value is isn't
4567                  * relevant; NULL is chosen simply because it will fail if the
4568                  * caller mistakenly uses it */
4569                 result[MAX_S] = result[MAX_E] = NULL;
4570             }
4571         }
4572         else {  /* No comma means lower and upper bounds are the same */
4573             result[MAX_S] = min_start;
4574             result[MAX_E] = min_end;
4575         }
4576     }
4577 
4578     return TRUE;
4579 }
4580 #endif
4581 
4582 U32
S_get_quantifier_value(pTHX_ RExC_state_t * pRExC_state,const char * start,const char * end)4583 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4584                        const char * start, const char * end)
4585 {
4586     /* This is a helper function for regpiece() to compute, given the
4587      * quantifier {m,n}, the value of either m or n, based on the starting
4588      * position 'start' in the string, through the byte 'end-1', returning it
4589      * if valid, and failing appropriately if not.  It knows the restrictions
4590      * imposed on quantifier values */
4591 
4592     UV uv;
4593     STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4594 
4595     PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4596 
4597     if (grok_atoUV(start, &uv, &end)) {
4598         if (uv < REG_INFTY) {   /* A valid, small-enough number */
4599             return (U32) uv;
4600         }
4601     }
4602     else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4603                                  leading zeros or overflow */
4604         RExC_parse_set((char * ) end);
4605 
4606         /* Perhaps too generic a msg for what is only failure from having
4607          * leading zeros, but this is how it's always behaved. */
4608         vFAIL("Invalid quantifier in {,}");
4609         NOT_REACHED; /*NOTREACHED*/
4610     }
4611 
4612     /* Here, found a quantifier, but was too large; either it overflowed or was
4613      * too big a legal number */
4614     RExC_parse_set((char * ) end);
4615     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4616 
4617     NOT_REACHED; /*NOTREACHED*/
4618     return U32_MAX; /* Perhaps some compilers will be expecting a return */
4619 }
4620 
4621 /*
4622  - regpiece - something followed by possible quantifier * + ? {n,m}
4623  *
4624  * Note that the branching code sequences used for ? and the general cases
4625  * of * and + are somewhat optimized:  they use the same NOTHING node as
4626  * both the endmarker for their branch list and the body of the last branch.
4627  * It might seem that this node could be dispensed with entirely, but the
4628  * endmarker role is not redundant.
4629  *
4630  * On success, returns the offset at which any next node should be placed into
4631  * the regex engine program being compiled.
4632  *
4633  * Returns 0 otherwise, with *flagp set to indicate why:
4634  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
4635  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
4636  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4637  */
4638 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)4639 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4640 {
4641     regnode_offset ret;
4642     char op;
4643     I32 flags;
4644     const char * const origparse = RExC_parse;
4645     I32 min;
4646     I32 max = REG_INFTY;
4647     I32 npar_before = RExC_npar-1;
4648 
4649     /* Save the original in case we change the emitted regop to a FAIL. */
4650     const regnode_offset orig_emit = RExC_emit;
4651 
4652     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4653 
4654     PERL_ARGS_ASSERT_REGPIECE;
4655 
4656     DEBUG_PARSE("piec");
4657 
4658     ret = regatom(pRExC_state, &flags, depth+1);
4659     if (ret == 0) {
4660         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4661         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4662     }
4663     I32 npar_after = RExC_npar-1;
4664 
4665     op = *RExC_parse;
4666     switch (op) {
4667         const char * regcurly_return[5];
4668 
4669       case '*':
4670         nextchar(pRExC_state);
4671         min = 0;
4672         break;
4673 
4674       case '+':
4675         nextchar(pRExC_state);
4676         min = 1;
4677         break;
4678 
4679       case '?':
4680         nextchar(pRExC_state);
4681         min = 0; max = 1;
4682         break;
4683 
4684       case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
4685                     to determine which */
4686         if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4687             const char * min_start = regcurly_return[MIN_S];
4688             const char * min_end   = regcurly_return[MIN_E];
4689             const char * max_start = regcurly_return[MAX_S];
4690             const char * max_end   = regcurly_return[MAX_E];
4691 
4692             if (min_start) {
4693                 min = get_quantifier_value(pRExC_state, min_start, min_end);
4694             }
4695             else {
4696                 min = 0;
4697             }
4698 
4699             if (max_start == max_end) {     /* Was of the form {m,} */
4700                 max = REG_INFTY;
4701             }
4702             else if (max_start == min_start) {  /* Was of the form {m} */
4703                 max = min;
4704             }
4705             else {  /* Was of the form {m,n} */
4706                 assert(max_end >= max_start);
4707 
4708                 max = get_quantifier_value(pRExC_state, max_start, max_end);
4709             }
4710 
4711             RExC_parse_set((char *) regcurly_return[RBRACE]);
4712             nextchar(pRExC_state);
4713 
4714             if (max < min) {    /* If can't match, warn and optimize to fail
4715                                    unconditionally */
4716                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4717                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4718                 NEXT_OFF(REGNODE_p(orig_emit)) =
4719                                     REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4720                 return ret;
4721             }
4722             else if (min == max && *RExC_parse == '?') {
4723                 ckWARN2reg(RExC_parse + 1,
4724                            "Useless use of greediness modifier '%c'",
4725                            *RExC_parse);
4726             }
4727 
4728             break;
4729         } /* End of is {m,n} */
4730 
4731         /* Here was a '{', but what followed it didn't form a quantifier. */
4732         /* FALLTHROUGH */
4733 
4734       default:
4735         *flagp = flags;
4736         return(ret);
4737         NOT_REACHED; /*NOTREACHED*/
4738     }
4739 
4740     /* Here we have a quantifier, and have calculated 'min' and 'max'.
4741      *
4742      * Check and possibly adjust a zero width operand */
4743     if (! (flags & (HASWIDTH|POSTPONED))) {
4744         if (max > REG_INFTY/3) {
4745             ckWARN2reg(RExC_parse,
4746                        "%" UTF8f " matches null string many times",
4747                        UTF8fARG(UTF, (RExC_parse >= origparse
4748                                      ? RExC_parse - origparse
4749                                      : 0),
4750                        origparse));
4751         }
4752 
4753         /* There's no point in trying to match something 0 length more than
4754          * once except for extra side effects, which we don't have here since
4755          * not POSTPONED */
4756         if (max > 1) {
4757             max = 1;
4758             if (min > max) {
4759                 min = max;
4760             }
4761         }
4762     }
4763 
4764     /* If this is a code block pass it up */
4765     *flagp |= (flags & POSTPONED);
4766 
4767     if (max > 0) {
4768         *flagp |= (flags & HASWIDTH);
4769         if (max == REG_INFTY)
4770             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4771     }
4772 
4773     /* 'SIMPLE' operands don't require full generality */
4774     if ((flags&SIMPLE)) {
4775         if (max == REG_INFTY) {
4776             if (min == 0) {
4777                 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4778                     goto min0_maxINF_wildcard_forbidden;
4779                 }
4780 
4781                 reginsert(pRExC_state, STAR, ret, depth+1);
4782                 MARK_NAUGHTY(4);
4783                 goto done_main_op;
4784             }
4785             else if (min == 1) {
4786                 reginsert(pRExC_state, PLUS, ret, depth+1);
4787                 MARK_NAUGHTY(3);
4788                 goto done_main_op;
4789             }
4790         }
4791 
4792         /* Here, SIMPLE, but not the '*' and '+' special cases */
4793 
4794         MARK_NAUGHTY_EXP(2, 2);
4795         reginsert(pRExC_state, CURLY, ret, depth+1);
4796     }
4797     else {  /* not SIMPLE */
4798         const regnode_offset w = reg_node(pRExC_state, WHILEM);
4799 
4800         FLAGS(REGNODE_p(w)) = 0;
4801         if (!  REGTAIL(pRExC_state, ret, w)) {
4802             REQUIRE_BRANCHJ(flagp, 0);
4803         }
4804         if (RExC_use_BRANCHJ) {
4805             reginsert(pRExC_state, LONGJMP, ret, depth+1);
4806             reginsert(pRExC_state, NOTHING, ret, depth+1);
4807             REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4808         }
4809         reginsert(pRExC_state, CURLYX, ret, depth+1);
4810         if (RExC_use_BRANCHJ)
4811             /* Go over NOTHING to LONGJMP. */
4812             REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4813 
4814         if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4815                                                   NOTHING)))
4816         {
4817             REQUIRE_BRANCHJ(flagp, 0);
4818         }
4819         RExC_whilem_seen++;
4820         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
4821     }
4822 
4823     /* Finish up the CURLY/CURLYX case */
4824     FLAGS(REGNODE_p(ret)) = 0;
4825 
4826     ARG1i_SET(REGNODE_p(ret), min);
4827     ARG2i_SET(REGNODE_p(ret), max);
4828 
4829     /* if we had a npar_after then we need to increment npar_before,
4830      * we want to track the range of parens we need to reset each iteration
4831      */
4832     if (npar_after!=npar_before) {
4833         ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4834         ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4835     } else {
4836         ARG3a_SET(REGNODE_p(ret), 0);
4837         ARG3b_SET(REGNODE_p(ret), 0);
4838     }
4839 
4840   done_main_op:
4841 
4842     /* Process any greediness modifiers */
4843     if (*RExC_parse == '?') {
4844         nextchar(pRExC_state);
4845         reginsert(pRExC_state, MINMOD, ret, depth+1);
4846         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4847             REQUIRE_BRANCHJ(flagp, 0);
4848         }
4849     }
4850     else if (*RExC_parse == '+') {
4851         regnode_offset ender;
4852         nextchar(pRExC_state);
4853         ender = reg_node(pRExC_state, SUCCEED);
4854         if (! REGTAIL(pRExC_state, ret, ender)) {
4855             REQUIRE_BRANCHJ(flagp, 0);
4856         }
4857         reginsert(pRExC_state, SUSPEND, ret, depth+1);
4858         ender = reg_node(pRExC_state, TAIL);
4859         if (! REGTAIL(pRExC_state, ret, ender)) {
4860             REQUIRE_BRANCHJ(flagp, 0);
4861         }
4862     }
4863 
4864     /* Forbid extra quantifiers */
4865     if (isQUANTIFIER(RExC_parse, RExC_end)) {
4866         RExC_parse_inc_by(1);
4867         vFAIL("Nested quantifiers");
4868     }
4869 
4870     return(ret);
4871 
4872   min0_maxINF_wildcard_forbidden:
4873 
4874     /* Here we are in a wildcard match, and the minimum match length is 0, and
4875      * the max could be infinity.  This is currently forbidden.  The only
4876      * reason is to make it harder to write patterns that take a long long time
4877      * to halt, and because the use of this construct isn't necessary in
4878      * matching Unicode property values */
4879     RExC_parse_inc_by(1);
4880     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4881        subpatterns in regex; marked by <-- HERE in m/%s/
4882      */
4883     vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4884           " subpatterns");
4885 
4886     /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4887      * legal at all in wildcards, so can't get this far */
4888 
4889     NOT_REACHED; /*NOTREACHED*/
4890 }
4891 
4892 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)4893 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4894                 regnode_offset * node_p,
4895                 UV * code_point_p,
4896                 int * cp_count,
4897                 I32 * flagp,
4898                 const bool strict,
4899                 const U32 depth
4900     )
4901 {
4902  /* This routine teases apart the various meanings of \N and returns
4903   * accordingly.  The input parameters constrain which meaning(s) is/are valid
4904   * in the current context.
4905   *
4906   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
4907   *
4908   * If <code_point_p> is not NULL, the context is expecting the result to be a
4909   * single code point.  If this \N instance turns out to a single code point,
4910   * the function returns TRUE and sets *code_point_p to that code point.
4911   *
4912   * If <node_p> is not NULL, the context is expecting the result to be one of
4913   * the things representable by a regnode.  If this \N instance turns out to be
4914   * one such, the function generates the regnode, returns TRUE and sets *node_p
4915   * to point to the offset of that regnode into the regex engine program being
4916   * compiled.
4917   *
4918   * If this instance of \N isn't legal in any context, this function will
4919   * generate a fatal error and not return.
4920   *
4921   * On input, RExC_parse should point to the first char following the \N at the
4922   * time of the call.  On successful return, RExC_parse will have been updated
4923   * to point to just after the sequence identified by this routine.  Also
4924   * *flagp has been updated as needed.
4925   *
4926   * When there is some problem with the current context and this \N instance,
4927   * the function returns FALSE, without advancing RExC_parse, nor setting
4928   * *node_p, nor *code_point_p, nor *flagp.
4929   *
4930   * If <cp_count> is not NULL, the caller wants to know the length (in code
4931   * points) that this \N sequence matches.  This is set, and the input is
4932   * parsed for errors, even if the function returns FALSE, as detailed below.
4933   *
4934   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
4935   *
4936   * Probably the most common case is for the \N to specify a single code point.
4937   * *cp_count will be set to 1, and *code_point_p will be set to that code
4938   * point.
4939   *
4940   * Another possibility is for the input to be an empty \N{}.  This is no
4941   * longer accepted, and will generate a fatal error.
4942   *
4943   * Another possibility is for a custom charnames handler to be in effect which
4944   * translates the input name to an empty string.  *cp_count will be set to 0.
4945   * *node_p will be set to a generated NOTHING node.
4946   *
4947   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
4948   * set to 0. *node_p will be set to a generated REG_ANY node.
4949   *
4950   * The fifth possibility is that \N resolves to a sequence of more than one
4951   * code points.  *cp_count will be set to the number of code points in the
4952   * sequence. *node_p will be set to a generated node returned by this
4953   * function calling S_reg().
4954   *
4955   * The sixth and final possibility is that it is premature to be calling this
4956   * function; the parse needs to be restarted.  This can happen when this
4957   * changes from /d to /u rules, or when the pattern needs to be upgraded to
4958   * UTF-8.  The latter occurs only when the fifth possibility would otherwise
4959   * be in effect, and is because one of those code points requires the pattern
4960   * to be recompiled as UTF-8.  The function returns FALSE, and sets the
4961   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
4962   * happens, the caller needs to desist from continuing parsing, and return
4963   * this information to its caller.  This is not set for when there is only one
4964   * code point, as this can be called as part of an ANYOF node, and they can
4965   * store above-Latin1 code points without the pattern having to be in UTF-8.
4966   *
4967   * For non-single-quoted regexes, the tokenizer has resolved character and
4968   * sequence names inside \N{...} into their Unicode values, normalizing the
4969   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
4970   * hex-represented code points in the sequence.  This is done there because
4971   * the names can vary based on what charnames pragma is in scope at the time,
4972   * so we need a way to take a snapshot of what they resolve to at the time of
4973   * the original parse. [perl #56444].
4974   *
4975   * That parsing is skipped for single-quoted regexes, so here we may get
4976   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
4977   * like '\N{U+41}', that code point is Unicode, and has to be translated into
4978   * the native character set for non-ASCII platforms.  The other possibilities
4979   * are already native, so no translation is done. */
4980 
4981     char * endbrace;    /* points to '}' following the name */
4982     char * e;           /* points to final non-blank before endbrace */
4983     char* p = RExC_parse; /* Temporary */
4984 
4985     SV * substitute_parse = NULL;
4986     char *orig_end;
4987     char *save_start;
4988     I32 flags;
4989 
4990     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4991 
4992     PERL_ARGS_ASSERT_GROK_BSLASH_N;
4993 
4994     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
4995     assert(! (node_p && cp_count));               /* At most 1 should be set */
4996 
4997     if (cp_count) {     /* Initialize return for the most common case */
4998         *cp_count = 1;
4999     }
5000 
5001     /* The [^\n] meaning of \N ignores spaces and comments under the /x
5002      * modifier.  The other meanings do not (except blanks adjacent to and
5003      * within the braces), so use a temporary until we find out which we are
5004      * being called with */
5005     skip_to_be_ignored_text(pRExC_state, &p,
5006                             FALSE /* Don't force to /x */ );
5007 
5008     /* Disambiguate between \N meaning a named character versus \N meaning
5009      * [^\n].  The latter is assumed when the {...} following the \N is a legal
5010      * quantifier, or if there is no '{' at all */
5011     if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5012         RExC_parse_set(p);
5013         if (cp_count) {
5014             *cp_count = -1;
5015         }
5016 
5017         if (! node_p) {
5018             return FALSE;
5019         }
5020 
5021         *node_p = reg_node(pRExC_state, REG_ANY);
5022         *flagp |= HASWIDTH|SIMPLE;
5023         MARK_NAUGHTY(1);
5024         return TRUE;
5025     }
5026 
5027     /* The test above made sure that the next real character is a '{', but
5028      * under the /x modifier, it could be separated by space (or a comment and
5029      * \n) and this is not allowed (for consistency with \x{...} and the
5030      * tokenizer handling of \N{NAME}). */
5031     if (*RExC_parse != '{') {
5032         vFAIL("Missing braces on \\N{}");
5033     }
5034 
5035     RExC_parse_inc_by(1);       /* Skip past the '{' */
5036 
5037     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5038     if (! endbrace) { /* no trailing brace */
5039         vFAIL2("Missing right brace on \\%c{}", 'N');
5040     }
5041 
5042     /* Here, we have decided it should be a named character or sequence.  These
5043      * imply Unicode semantics */
5044     REQUIRE_UNI_RULES(flagp, FALSE);
5045 
5046     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5047      * nothing at all (not allowed under strict) */
5048     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5049         RExC_parse_set(endbrace);
5050         if (strict) {
5051             RExC_parse_inc_by(1);   /* Position after the "}" */
5052             vFAIL("Zero length \\N{}");
5053         }
5054 
5055         if (cp_count) {
5056             *cp_count = 0;
5057         }
5058         nextchar(pRExC_state);
5059         if (! node_p) {
5060             return FALSE;
5061         }
5062 
5063         *node_p = reg_node(pRExC_state, NOTHING);
5064         return TRUE;
5065     }
5066 
5067     while (isBLANK(*RExC_parse)) {
5068         RExC_parse_inc_by(1);
5069     }
5070 
5071     e = endbrace;
5072     while (RExC_parse < e && isBLANK(*(e-1))) {
5073         e--;
5074     }
5075 
5076     if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5077 
5078         /* Here, the name isn't of the form  U+....  This can happen if the
5079          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
5080          * is the time to find out what the name means */
5081 
5082         const STRLEN name_len = e - RExC_parse;
5083         SV *  value_sv;     /* What does this name evaluate to */
5084         SV ** value_svp;
5085         const U8 * value;   /* string of name's value */
5086         STRLEN value_len;   /* and its length */
5087 
5088         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
5089          *  toke.c, and their values. Make sure is initialized */
5090         if (! RExC_unlexed_names) {
5091             RExC_unlexed_names = newHV();
5092         }
5093 
5094         /* If we have already seen this name in this pattern, use that.  This
5095          * allows us to only call the charnames handler once per name per
5096          * pattern.  A broken or malicious handler could return something
5097          * different each time, which could cause the results to vary depending
5098          * on if something gets added or subtracted from the pattern that
5099          * causes the number of passes to change, for example */
5100         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5101                                                       name_len, 0)))
5102         {
5103             value_sv = *value_svp;
5104         }
5105         else { /* Otherwise we have to go out and get the name */
5106             const char * error_msg = NULL;
5107             value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5108                                                       UTF,
5109                                                       &error_msg);
5110             if (error_msg) {
5111                 RExC_parse_set(endbrace);
5112                 vFAIL(error_msg);
5113             }
5114 
5115             /* If no error message, should have gotten a valid return */
5116             assert (value_sv);
5117 
5118             /* Save the name's meaning for later use */
5119             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5120                            value_sv, 0))
5121             {
5122                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5123             }
5124         }
5125 
5126         /* Here, we have the value the name evaluates to in 'value_sv' */
5127         value = (U8 *) SvPV(value_sv, value_len);
5128 
5129         /* See if the result is one code point vs 0 or multiple */
5130         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5131                                   ? UTF8SKIP(value)
5132                                   : 1)))
5133         {
5134             /* Here, exactly one code point.  If that isn't what is wanted,
5135              * fail */
5136             if (! code_point_p) {
5137                 RExC_parse_set(p);
5138                 return FALSE;
5139             }
5140 
5141             /* Convert from string to numeric code point */
5142             *code_point_p = (SvUTF8(value_sv))
5143                             ? valid_utf8_to_uvchr(value, NULL)
5144                             : *value;
5145 
5146             /* Have parsed this entire single code point \N{...}.  *cp_count
5147              * has already been set to 1, so don't do it again. */
5148             RExC_parse_set(endbrace);
5149             nextchar(pRExC_state);
5150             return TRUE;
5151         } /* End of is a single code point */
5152 
5153         /* Count the code points, if caller desires.  The API says to do this
5154          * even if we will later return FALSE */
5155         if (cp_count) {
5156             *cp_count = 0;
5157 
5158             *cp_count = (SvUTF8(value_sv))
5159                         ? utf8_length(value, value + value_len)
5160                         : value_len;
5161         }
5162 
5163         /* Fail if caller doesn't want to handle a multi-code-point sequence.
5164          * But don't back the pointer up if the caller wants to know how many
5165          * code points there are (they need to handle it themselves in this
5166          * case).  */
5167         if (! node_p) {
5168             if (! cp_count) {
5169                 RExC_parse_set(p);
5170             }
5171             return FALSE;
5172         }
5173 
5174         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5175          * reg recursively to parse it.  That way, it retains its atomicness,
5176          * while not having to worry about any special handling that some code
5177          * points may have. */
5178 
5179         substitute_parse = newSVpvs("?:");
5180         sv_catsv(substitute_parse, value_sv);
5181         sv_catpv(substitute_parse, ")");
5182 
5183         /* The value should already be native, so no need to convert on EBCDIC
5184          * platforms.*/
5185         assert(! RExC_recode_x_to_native);
5186 
5187     }
5188     else {   /* \N{U+...} */
5189         Size_t count = 0;   /* code point count kept internally */
5190 
5191         /* We can get to here when the input is \N{U+...} or when toke.c has
5192          * converted a name to the \N{U+...} form.  This include changing a
5193          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5194 
5195         RExC_parse_inc_by(2);    /* Skip past the 'U+' */
5196 
5197         /* Code points are separated by dots.  The '}' terminates the whole
5198          * thing. */
5199 
5200         do {    /* Loop until the ending brace */
5201             I32 flags = PERL_SCAN_SILENT_OVERFLOW
5202                       | PERL_SCAN_SILENT_ILLDIGIT
5203                       | PERL_SCAN_NOTIFY_ILLDIGIT
5204                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5205                       | PERL_SCAN_DISALLOW_PREFIX;
5206             STRLEN len = e - RExC_parse;
5207             NV overflow_value;
5208             char * start_digit = RExC_parse;
5209             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5210 
5211             if (len == 0) {
5212                 RExC_parse_inc_by(1);
5213               bad_NU:
5214                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5215             }
5216 
5217             RExC_parse_inc_by(len);
5218 
5219             if (cp > MAX_LEGAL_CP) {
5220                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5221             }
5222 
5223             if (RExC_parse >= e) { /* Got to the closing '}' */
5224                 if (count) {
5225                     goto do_concat;
5226                 }
5227 
5228                 /* Here, is a single code point; fail if doesn't want that */
5229                 if (! code_point_p) {
5230                     RExC_parse_set(p);
5231                     return FALSE;
5232                 }
5233 
5234                 /* A single code point is easy to handle; just return it */
5235                 *code_point_p = UNI_TO_NATIVE(cp);
5236                 RExC_parse_set(endbrace);
5237                 nextchar(pRExC_state);
5238                 return TRUE;
5239             }
5240 
5241             /* Here, the parse stopped bfore the ending brace.  This is legal
5242              * only if that character is a dot separating code points, like a
5243              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5244              * So the next character must be a dot (and the one after that
5245              * can't be the ending brace, or we'd have something like
5246              * \N{U+100.} )
5247              * */
5248             if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5249                 /*point to after 1st invalid */
5250                 RExC_parse_incf(RExC_orig_utf8);
5251                 /*Guard against malformed utf8*/
5252                 RExC_parse_set(MIN(e, RExC_parse));
5253                 goto bad_NU;
5254             }
5255 
5256             /* Here, looks like its really a multiple character sequence.  Fail
5257              * if that's not what the caller wants.  But continue with counting
5258              * and error checking if they still want a count */
5259             if (! node_p && ! cp_count) {
5260                 return FALSE;
5261             }
5262 
5263             /* What is done here is to convert this to a sub-pattern of the
5264              * form \x{char1}\x{char2}...  and then call reg recursively to
5265              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
5266              * atomicness, while not having to worry about special handling
5267              * that some code points may have.  We don't create a subpattern,
5268              * but go through the motions of code point counting and error
5269              * checking, if the caller doesn't want a node returned. */
5270 
5271             if (node_p && ! substitute_parse) {
5272                 substitute_parse = newSVpvs("?:");
5273             }
5274 
5275           do_concat:
5276 
5277             if (node_p) {
5278                 /* Convert to notation the rest of the code understands */
5279                 sv_catpvs(substitute_parse, "\\x{");
5280                 sv_catpvn(substitute_parse, start_digit,
5281                                             RExC_parse - start_digit);
5282                 sv_catpvs(substitute_parse, "}");
5283             }
5284 
5285             /* Move to after the dot (or ending brace the final time through.)
5286              * */
5287             RExC_parse_inc_by(1);
5288             count++;
5289 
5290         } while (RExC_parse < e);
5291 
5292         if (! node_p) { /* Doesn't want the node */
5293             assert (cp_count);
5294 
5295             *cp_count = count;
5296             return FALSE;
5297         }
5298 
5299         sv_catpvs(substitute_parse, ")");
5300 
5301         /* The values are Unicode, and therefore have to be converted to native
5302          * on a non-Unicode (meaning non-ASCII) platform. */
5303         SET_recode_x_to_native(1);
5304     }
5305 
5306     /* Here, we have the string the name evaluates to, ready to be parsed,
5307      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5308      * constructs.  This can be called from within a substitute parse already.
5309      * The error reporting mechanism doesn't work for 2 levels of this, but the
5310      * code above has validated this new construct, so there should be no
5311      * errors generated by the below.  And this isn't an exact copy, so the
5312      * mechanism to seamlessly deal with this won't work, so turn off warnings
5313      * during it */
5314     save_start = RExC_start;
5315     orig_end = RExC_end;
5316 
5317     RExC_start = SvPVX(substitute_parse);
5318     RExC_parse_set(RExC_start);
5319     RExC_end = RExC_parse + SvCUR(substitute_parse);
5320     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5321 
5322     *node_p = reg(pRExC_state, 1, &flags, depth+1);
5323 
5324     /* Restore the saved values */
5325     RESTORE_WARNINGS;
5326     RExC_start = save_start;
5327     RExC_parse_set(endbrace);
5328     RExC_end = orig_end;
5329     SET_recode_x_to_native(0);
5330 
5331     SvREFCNT_dec_NN(substitute_parse);
5332 
5333     if (! *node_p) {
5334         RETURN_FAIL_ON_RESTART(flags, flagp);
5335         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5336             (UV) flags);
5337     }
5338     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5339 
5340     nextchar(pRExC_state);
5341 
5342     return TRUE;
5343 }
5344 
5345 
5346 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)5347 S_compute_EXACTish(RExC_state_t *pRExC_state)
5348 {
5349     U8 op;
5350 
5351     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5352 
5353     if (! FOLD) {
5354         return (LOC)
5355                 ? EXACTL
5356                 : EXACT;
5357     }
5358 
5359     op = get_regex_charset(RExC_flags);
5360     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5361         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5362                  been, so there is no hole */
5363     }
5364 
5365     return op + EXACTF;
5366 }
5367 
5368 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
5369  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5370 
5371 static I32
S_backref_value(char * p,char * e)5372 S_backref_value(char *p, char *e)
5373 {
5374     const char* endptr = e;
5375     UV val;
5376     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5377         return (I32)val;
5378     return I32_MAX;
5379 }
5380 
5381 
5382 /*
5383  - regatom - the lowest level
5384 
5385    Try to identify anything special at the start of the current parse position.
5386    If there is, then handle it as required. This may involve generating a
5387    single regop, such as for an assertion; or it may involve recursing, such as
5388    to handle a () structure.
5389 
5390    If the string doesn't start with something special then we gobble up
5391    as much literal text as we can.  If we encounter a quantifier, we have to
5392    back off the final literal character, as that quantifier applies to just it
5393    and not to the whole string of literals.
5394 
5395    Once we have been able to handle whatever type of thing started the
5396    sequence, we return the offset into the regex engine program being compiled
5397    at which any  next regnode should be placed.
5398 
5399    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5400    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5401    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5402    Otherwise does not return 0.
5403 
5404    Note: we have to be careful with escapes, as they can be both literal
5405    and special, and in the case of \10 and friends, context determines which.
5406 
5407    A summary of the code structure is:
5408 
5409    switch (first_byte) {
5410         cases for each special:
5411             handle this special;
5412             break;
5413         case '\\':
5414             switch (2nd byte) {
5415                 cases for each unambiguous special:
5416                     handle this special;
5417                     break;
5418                 cases for each ambiguous special/literal:
5419                     disambiguate;
5420                     if (special)  handle here
5421                     else goto defchar;
5422                 default: // unambiguously literal:
5423                     goto defchar;
5424             }
5425         default:  // is a literal char
5426             // FALL THROUGH
5427         defchar:
5428             create EXACTish node for literal;
5429             while (more input and node isn't full) {
5430                 switch (input_byte) {
5431                    cases for each special;
5432                        make sure parse pointer is set so that the next call to
5433                            regatom will see this special first
5434                        goto loopdone; // EXACTish node terminated by prev. char
5435                    default:
5436                        append char to EXACTISH node;
5437                 }
5438                 get next input byte;
5439             }
5440         loopdone:
5441    }
5442    return the generated node;
5443 
5444    Specifically there are two separate switches for handling
5445    escape sequences, with the one for handling literal escapes requiring
5446    a dummy entry for all of the special escapes that are actually handled
5447    by the other.
5448 
5449 */
5450 
5451 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)5452 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5453 {
5454     regnode_offset ret = 0;
5455     I32 flags = 0;
5456     char *atom_parse_start;
5457     U8 op;
5458     int invert = 0;
5459 
5460     DECLARE_AND_GET_RE_DEBUG_FLAGS;
5461 
5462     *flagp = 0;		/* Initialize. */
5463 
5464     DEBUG_PARSE("atom");
5465 
5466     PERL_ARGS_ASSERT_REGATOM;
5467 
5468   tryagain:
5469     atom_parse_start = RExC_parse;
5470     assert(RExC_parse < RExC_end);
5471     switch ((U8)*RExC_parse) {
5472     case '^':
5473         RExC_seen_zerolen++;
5474         nextchar(pRExC_state);
5475         if (RExC_flags & RXf_PMf_MULTILINE)
5476             ret = reg_node(pRExC_state, MBOL);
5477         else
5478             ret = reg_node(pRExC_state, SBOL);
5479         break;
5480     case '$':
5481         nextchar(pRExC_state);
5482         if (*RExC_parse)
5483             RExC_seen_zerolen++;
5484         if (RExC_flags & RXf_PMf_MULTILINE)
5485             ret = reg_node(pRExC_state, MEOL);
5486         else
5487             ret = reg_node(pRExC_state, SEOL);
5488         break;
5489     case '.':
5490         nextchar(pRExC_state);
5491         if (RExC_flags & RXf_PMf_SINGLELINE)
5492             ret = reg_node(pRExC_state, SANY);
5493         else
5494             ret = reg_node(pRExC_state, REG_ANY);
5495         *flagp |= HASWIDTH|SIMPLE;
5496         MARK_NAUGHTY(1);
5497         break;
5498     case '[':
5499     {
5500         char * const cc_parse_start = ++RExC_parse;
5501         ret = regclass(pRExC_state, flagp, depth+1,
5502                        FALSE, /* means parse the whole char class */
5503                        TRUE, /* allow multi-char folds */
5504                        FALSE, /* don't silence non-portable warnings. */
5505                        (bool) RExC_strict,
5506                        TRUE, /* Allow an optimized regnode result */
5507                        NULL);
5508         if (ret == 0) {
5509             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5510             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5511                   (UV) *flagp);
5512         }
5513         if (*RExC_parse != ']') {
5514             RExC_parse_set(cc_parse_start);
5515             vFAIL("Unmatched [");
5516         }
5517         nextchar(pRExC_state);
5518         break;
5519     }
5520     case '(':
5521         nextchar(pRExC_state);
5522         ret = reg(pRExC_state, 2, &flags, depth+1);
5523         if (ret == 0) {
5524                 if (flags & TRYAGAIN) {
5525                     if (RExC_parse >= RExC_end) {
5526                          /* Make parent create an empty node if needed. */
5527                         *flagp |= TRYAGAIN;
5528                         return(0);
5529                     }
5530                     goto tryagain;
5531                 }
5532                 RETURN_FAIL_ON_RESTART(flags, flagp);
5533                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5534                                                                  (UV) flags);
5535         }
5536         *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5537         break;
5538     case '|':
5539     case ')':
5540         if (flags & TRYAGAIN) {
5541             *flagp |= TRYAGAIN;
5542             return 0;
5543         }
5544         vFAIL("Internal urp");
5545                                 /* Supposed to be caught earlier. */
5546         break;
5547     case '?':
5548     case '+':
5549     case '*':
5550         RExC_parse_inc_by(1);
5551         vFAIL("Quantifier follows nothing");
5552         break;
5553     case '\\':
5554         /* Special Escapes
5555 
5556            This switch handles escape sequences that resolve to some kind
5557            of special regop and not to literal text. Escape sequences that
5558            resolve to literal text are handled below in the switch marked
5559            "Literal Escapes".
5560 
5561            Every entry in this switch *must* have a corresponding entry
5562            in the literal escape switch. However, the opposite is not
5563            required, as the default for this switch is to jump to the
5564            literal text handling code.
5565         */
5566         RExC_parse_inc_by(1);
5567         switch ((U8)*RExC_parse) {
5568         /* Special Escapes */
5569         case 'A':
5570             RExC_seen_zerolen++;
5571             /* Under wildcards, this is changed to match \n; should be
5572              * invisible to the user, as they have to compile under /m */
5573             if (RExC_pm_flags & PMf_WILDCARD) {
5574                 ret = reg_node(pRExC_state, MBOL);
5575             }
5576             else {
5577                 ret = reg_node(pRExC_state, SBOL);
5578                 /* SBOL is shared with /^/ so we set the flags so we can tell
5579                  * /\A/ from /^/ in split. */
5580                 FLAGS(REGNODE_p(ret)) = 1;
5581             }
5582             goto finish_meta_pat;
5583         case 'G':
5584             if (RExC_pm_flags & PMf_WILDCARD) {
5585                 RExC_parse_inc_by(1);
5586                 /* diag_listed_as: Use of %s is not allowed in Unicode property
5587                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5588                  */
5589                 vFAIL("Use of '\\G' is not allowed in Unicode property"
5590                       " wildcard subpatterns");
5591             }
5592             ret = reg_node(pRExC_state, GPOS);
5593             RExC_seen |= REG_GPOS_SEEN;
5594             goto finish_meta_pat;
5595         case 'K':
5596             if (!RExC_in_lookaround) {
5597                 RExC_seen_zerolen++;
5598                 ret = reg_node(pRExC_state, KEEPS);
5599                 /* XXX:dmq : disabling in-place substitution seems to
5600                  * be necessary here to avoid cases of memory corruption, as
5601                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5602                  */
5603                 RExC_seen |= REG_LOOKBEHIND_SEEN;
5604                 goto finish_meta_pat;
5605             }
5606             else {
5607                 ++RExC_parse; /* advance past the 'K' */
5608                 vFAIL("\\K not permitted in lookahead/lookbehind");
5609             }
5610         case 'Z':
5611             if (RExC_pm_flags & PMf_WILDCARD) {
5612                 /* See comment under \A above */
5613                 ret = reg_node(pRExC_state, MEOL);
5614             }
5615             else {
5616                 ret = reg_node(pRExC_state, SEOL);
5617             }
5618             RExC_seen_zerolen++;		/* Do not optimize RE away */
5619             goto finish_meta_pat;
5620         case 'z':
5621             if (RExC_pm_flags & PMf_WILDCARD) {
5622                 /* See comment under \A above */
5623                 ret = reg_node(pRExC_state, MEOL);
5624             }
5625             else {
5626                 ret = reg_node(pRExC_state, EOS);
5627             }
5628             RExC_seen_zerolen++;		/* Do not optimize RE away */
5629             goto finish_meta_pat;
5630         case 'C':
5631             vFAIL("\\C no longer supported");
5632         case 'X':
5633             ret = reg_node(pRExC_state, CLUMP);
5634             *flagp |= HASWIDTH;
5635             goto finish_meta_pat;
5636 
5637         case 'B':
5638             invert = 1;
5639             /* FALLTHROUGH */
5640         case 'b':
5641           {
5642             U8 flags = 0;
5643             regex_charset charset = get_regex_charset(RExC_flags);
5644 
5645             RExC_seen_zerolen++;
5646             RExC_seen |= REG_LOOKBEHIND_SEEN;
5647             op = BOUND + charset;
5648 
5649             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5650                 flags = TRADITIONAL_BOUND;
5651                 if (op > BOUNDA) {  /* /aa is same as /a */
5652                     op = BOUNDA;
5653                 }
5654             }
5655             else {
5656                 STRLEN length;
5657                 char name = *RExC_parse;
5658                 char * endbrace =  (char *) memchr(RExC_parse, '}',
5659                                                    RExC_end - RExC_parse);
5660                 char * e = endbrace;
5661 
5662                 RExC_parse_inc_by(2);
5663 
5664                 if (! endbrace) {
5665                     vFAIL2("Missing right brace on \\%c{}", name);
5666                 }
5667 
5668                 while (isBLANK(*RExC_parse)) {
5669                     RExC_parse_inc_by(1);
5670                 }
5671 
5672                 while (RExC_parse < e && isBLANK(*(e - 1))) {
5673                     e--;
5674                 }
5675 
5676                 if (e == RExC_parse) {
5677                     RExC_parse_set(endbrace + 1);  /* After the '}' */
5678                     vFAIL2("Empty \\%c{}", name);
5679                 }
5680 
5681                 length = e - RExC_parse;
5682 
5683                 switch (*RExC_parse) {
5684                     case 'g':
5685                         if (    length != 1
5686                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
5687                         {
5688                             goto bad_bound_type;
5689                         }
5690                         flags = GCB_BOUND;
5691                         break;
5692                     case 'l':
5693                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5694                             goto bad_bound_type;
5695                         }
5696                         flags = LB_BOUND;
5697                         break;
5698                     case 's':
5699                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5700                             goto bad_bound_type;
5701                         }
5702                         flags = SB_BOUND;
5703                         break;
5704                     case 'w':
5705                         if (length != 2 || *(RExC_parse + 1) != 'b') {
5706                             goto bad_bound_type;
5707                         }
5708                         flags = WB_BOUND;
5709                         break;
5710                     default:
5711                       bad_bound_type:
5712                         RExC_parse_set(e);
5713                         vFAIL2utf8f(
5714                             "'%" UTF8f "' is an unknown bound type",
5715                             UTF8fARG(UTF, length, e - length));
5716                         NOT_REACHED; /*NOTREACHED*/
5717                 }
5718                 RExC_parse_set(endbrace);
5719                 REQUIRE_UNI_RULES(flagp, 0);
5720 
5721                 if (op == BOUND) {
5722                     op = BOUNDU;
5723                 }
5724                 else if (op >= BOUNDA) {  /* /aa is same as /a */
5725                     op = BOUNDU;
5726                     length += 4;
5727 
5728                     /* Don't have to worry about UTF-8, in this message because
5729                      * to get here the contents of the \b must be ASCII */
5730                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
5731                               "Using /u for '%.*s' instead of /%s",
5732                               (unsigned) length,
5733                               endbrace - length + 1,
5734                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5735                               ? ASCII_RESTRICT_PAT_MODS
5736                               : ASCII_MORE_RESTRICT_PAT_MODS);
5737                 }
5738             }
5739 
5740             if (op == BOUND) {
5741                 RExC_seen_d_op = TRUE;
5742             }
5743             else if (op == BOUNDL) {
5744                 RExC_contains_locale = 1;
5745             }
5746 
5747             if (invert) {
5748                 op += NBOUND - BOUND;
5749             }
5750 
5751             ret = reg_node(pRExC_state, op);
5752             FLAGS(REGNODE_p(ret)) = flags;
5753 
5754             goto finish_meta_pat;
5755           }
5756 
5757         case 'R':
5758             ret = reg_node(pRExC_state, LNBREAK);
5759             *flagp |= HASWIDTH|SIMPLE;
5760             goto finish_meta_pat;
5761 
5762         case 'd':
5763         case 'D':
5764         case 'h':
5765         case 'H':
5766         case 'p':
5767         case 'P':
5768         case 's':
5769         case 'S':
5770         case 'v':
5771         case 'V':
5772         case 'w':
5773         case 'W':
5774             /* These all have the same meaning inside [brackets], and it knows
5775              * how to do the best optimizations for them.  So, pretend we found
5776              * these within brackets, and let it do the work */
5777             RExC_parse--;
5778 
5779             ret = regclass(pRExC_state, flagp, depth+1,
5780                            TRUE, /* means just parse this element */
5781                            FALSE, /* don't allow multi-char folds */
5782                            FALSE, /* don't silence non-portable warnings.  It
5783                                      would be a bug if these returned
5784                                      non-portables */
5785                            (bool) RExC_strict,
5786                            TRUE, /* Allow an optimized regnode result */
5787                            NULL);
5788             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5789             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5790              * multi-char folds are allowed.  */
5791             if (!ret)
5792                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5793                       (UV) *flagp);
5794 
5795             RExC_parse--;   /* regclass() leaves this one too far ahead */
5796 
5797           finish_meta_pat:
5798                    /* The escapes above that don't take a parameter can't be
5799                     * followed by a '{'.  But 'pX', 'p{foo}' and
5800                     * correspondingly 'P' can be */
5801             if (   RExC_parse - atom_parse_start == 1
5802                 && UCHARAT(RExC_parse + 1) == '{'
5803                 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5804             {
5805                 RExC_parse_inc_by(2);
5806                 vFAIL("Unescaped left brace in regex is illegal here");
5807             }
5808             nextchar(pRExC_state);
5809             break;
5810         case 'N':
5811             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5812              * \N{...} evaluates to a sequence of more than one code points).
5813              * The function call below returns a regnode, which is our result.
5814              * The parameters cause it to fail if the \N{} evaluates to a
5815              * single code point; we handle those like any other literal.  The
5816              * reason that the multicharacter case is handled here and not as
5817              * part of the EXACtish code is because of quantifiers.  In
5818              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5819              * this way makes that Just Happen. dmq.
5820              * join_exact() will join this up with adjacent EXACTish nodes
5821              * later on, if appropriate. */
5822             ++RExC_parse;
5823             if (grok_bslash_N(pRExC_state,
5824                               &ret,     /* Want a regnode returned */
5825                               NULL,     /* Fail if evaluates to a single code
5826                                            point */
5827                               NULL,     /* Don't need a count of how many code
5828                                            points */
5829                               flagp,
5830                               RExC_strict,
5831                               depth)
5832             ) {
5833                 break;
5834             }
5835 
5836             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5837 
5838             /* Here, evaluates to a single code point.  Go get that */
5839             RExC_parse_set(atom_parse_start);
5840             goto defchar;
5841 
5842         case 'k':    /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5843       parse_named_seq:  /* Also handle non-numeric \g{...} */
5844         {
5845             char ch;
5846             if (   RExC_parse >= RExC_end - 1
5847                 || ((   ch = RExC_parse[1]) != '<'
5848                                       && ch != '\''
5849                                       && ch != '{'))
5850             {
5851                 RExC_parse_inc_by(1);
5852                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5853                 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5854             } else {
5855                 RExC_parse_inc_by(2);
5856                 if (ch == '{') {
5857                     while (isBLANK(*RExC_parse)) {
5858                         RExC_parse_inc_by(1);
5859                     }
5860                 }
5861                 ret = handle_named_backref(pRExC_state,
5862                                            flagp,
5863                                            atom_parse_start,
5864                                            (ch == '<')
5865                                            ? '>'
5866                                            : (ch == '{')
5867                                              ? '}'
5868                                              : '\'');
5869             }
5870             break;
5871         }
5872         case 'g':
5873         case '1': case '2': case '3': case '4':
5874         case '5': case '6': case '7': case '8': case '9':
5875             {
5876                 I32 num;
5877                 char * endbrace = NULL;
5878                 char * s = RExC_parse;
5879                 char * e = RExC_end;
5880 
5881                 if (*s == 'g') {
5882                     bool isrel = 0;
5883 
5884                     s++;
5885                     if (*s == '{') {
5886                         endbrace = (char *) memchr(s, '}', RExC_end - s);
5887                         if (! endbrace ) {
5888 
5889                             /* Missing '}'.  Position after the number to give
5890                              * a better indication to the user of where the
5891                              * problem is. */
5892                             s++;
5893                             if (*s == '-') {
5894                                 s++;
5895                             }
5896 
5897                             /* If it looks to be a name and not a number, go
5898                              * handle it there */
5899                             if (! isDIGIT(*s)) {
5900                                 goto parse_named_seq;
5901                             }
5902 
5903                             do {
5904                                 s++;
5905                             } while isDIGIT(*s);
5906 
5907                             RExC_parse_set(s);
5908                             vFAIL("Unterminated \\g{...} pattern");
5909                         }
5910 
5911                         s++;    /* Past the '{' */
5912 
5913                         while (isBLANK(*s)) {
5914                             s++;
5915                         }
5916 
5917                         /* Ignore trailing blanks */
5918                         e = endbrace;
5919                         while (s < e && isBLANK(*(e - 1))) {
5920                             e--;
5921                         }
5922                     }
5923 
5924                     /* Here, have isolated the meat of the construct from any
5925                      * surrounding braces */
5926 
5927                     if (*s == '-') {
5928                         isrel = 1;
5929                         s++;
5930                     }
5931 
5932                     if (endbrace && !isDIGIT(*s)) {
5933                         goto parse_named_seq;
5934                     }
5935 
5936                     RExC_parse_set(s);
5937                     num = S_backref_value(RExC_parse, RExC_end);
5938                     if (num == 0)
5939                         vFAIL("Reference to invalid group 0");
5940                     else if (num == I32_MAX) {
5941                          if (isDIGIT(*RExC_parse))
5942                             vFAIL("Reference to nonexistent group");
5943                         else
5944                             vFAIL("Unterminated \\g... pattern");
5945                     }
5946 
5947                     if (isrel) {
5948                         num = RExC_npar - num;
5949                         if (num < 1)
5950                             vFAIL("Reference to nonexistent or unclosed group");
5951                     }
5952                     else
5953                     if (num < RExC_logical_npar) {
5954                         num = RExC_logical_to_parno[num];
5955                     }
5956                     else
5957                     if (ALL_PARENS_COUNTED)  {
5958                         if (num < RExC_logical_total_parens)
5959                             num = RExC_logical_to_parno[num];
5960                         else {
5961                             num = -1;
5962                         }
5963                     }
5964                     else{
5965                         REQUIRE_PARENS_PASS;
5966                     }
5967                 }
5968                 else {
5969                     num = S_backref_value(RExC_parse, RExC_end);
5970                     /* bare \NNN might be backref or octal - if it is larger
5971                      * than or equal RExC_npar then it is assumed to be an
5972                      * octal escape. Note RExC_npar is +1 from the actual
5973                      * number of parens. */
5974                     /* Note we do NOT check if num == I32_MAX here, as that is
5975                      * handled by the RExC_npar check */
5976 
5977                     if (    /* any numeric escape < 10 is always a backref */
5978                            num > 9
5979                             /* any numeric escape < RExC_npar is a backref */
5980                         && num >= RExC_logical_npar
5981                             /* cannot be an octal escape if it starts with [89]
5982                              * */
5983                         && ! inRANGE(*RExC_parse, '8', '9')
5984                     ) {
5985                         /* Probably not meant to be a backref, instead likely
5986                          * to be an octal character escape, e.g. \35 or \777.
5987                          * The above logic should make it obvious why using
5988                          * octal escapes in patterns is problematic. - Yves */
5989                         RExC_parse_set(atom_parse_start);
5990                         goto defchar;
5991                     }
5992                     if (num < RExC_logical_npar) {
5993                         num = RExC_logical_to_parno[num];
5994                     }
5995                     else
5996                     if (ALL_PARENS_COUNTED) {
5997                         if (num < RExC_logical_total_parens) {
5998                             num = RExC_logical_to_parno[num];
5999                         } else {
6000                             num = -1;
6001                         }
6002                     } else {
6003                         REQUIRE_PARENS_PASS;
6004                     }
6005                 }
6006 
6007                 /* At this point RExC_parse points at a numeric escape like
6008                  * \12 or \88 or the digits in \g{34} or \g34 or something
6009                  * similar, which we should NOT treat as an octal escape. It
6010                  * may or may not be a valid backref escape. For instance
6011                  * \88888888 is unlikely to be a valid backref.
6012                  *
6013                  * We've already figured out what value the digits represent.
6014                  * Now, move the parse to beyond them. */
6015                 if (endbrace) {
6016                     RExC_parse_set(endbrace + 1);
6017                 }
6018                 else while (isDIGIT(*RExC_parse)) {
6019                     RExC_parse_inc_by(1);
6020                 }
6021                 if (num < 0)
6022                     vFAIL("Reference to nonexistent group");
6023 
6024                 if (num >= (I32)RExC_npar) {
6025                     /* It might be a forward reference; we can't fail until we
6026                      * know, by completing the parse to get all the groups, and
6027                      * then reparsing */
6028                     if (ALL_PARENS_COUNTED)  {
6029                         if (num >= RExC_total_parens)  {
6030                             vFAIL("Reference to nonexistent group");
6031                         }
6032                     }
6033                     else {
6034                         REQUIRE_PARENS_PASS;
6035                     }
6036                 }
6037                 RExC_sawback = 1;
6038                 ret = reg2node(pRExC_state,
6039                                ((! FOLD)
6040                                  ? REF
6041                                  : (ASCII_FOLD_RESTRICTED)
6042                                    ? REFFA
6043                                    : (AT_LEAST_UNI_SEMANTICS)
6044                                      ? REFFU
6045                                      : (LOC)
6046                                        ? REFFL
6047                                        : REFF),
6048                                 num, RExC_nestroot);
6049                 if (RExC_nestroot && num >= RExC_nestroot)
6050                     FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6051                 if (OP(REGNODE_p(ret)) == REFF) {
6052                     RExC_seen_d_op = TRUE;
6053                 }
6054                 *flagp |= HASWIDTH;
6055 
6056                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6057                                         FALSE /* Don't force to /x */ );
6058             }
6059             break;
6060         case '\0':
6061             if (RExC_parse >= RExC_end)
6062                 FAIL("Trailing \\");
6063             /* FALLTHROUGH */
6064         default:
6065             /* Do not generate "unrecognized" warnings here, we fall
6066                back into the quick-grab loop below */
6067             RExC_parse_set(atom_parse_start);
6068             goto defchar;
6069         } /* end of switch on a \foo sequence */
6070         break;
6071 
6072     case '#':
6073 
6074         /* '#' comments should have been spaced over before this function was
6075          * called */
6076         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6077         /*
6078         if (RExC_flags & RXf_PMf_EXTENDED) {
6079             RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6080             if (RExC_parse < RExC_end)
6081                 goto tryagain;
6082         }
6083         */
6084 
6085         /* FALLTHROUGH */
6086 
6087     default:
6088           defchar: {
6089 
6090             /* Here, we have determined that the next thing is probably a
6091              * literal character.  RExC_parse points to the first byte of its
6092              * definition.  (It still may be an escape sequence that evaluates
6093              * to a single character) */
6094 
6095             STRLEN len = 0;
6096             UV ender = 0;
6097             char *p;
6098             char *s, *old_s = NULL, *old_old_s = NULL;
6099             char *s0;
6100             U32 max_string_len = 255;
6101 
6102             /* We may have to reparse the node, artificially stopping filling
6103              * it early, based on info gleaned in the first parse.  This
6104              * variable gives where we stop.  Make it above the normal stopping
6105              * place first time through; otherwise it would stop too early */
6106             U32 upper_fill = max_string_len + 1;
6107 
6108             /* We start out as an EXACT node, even if under /i, until we find a
6109              * character which is in a fold.  The algorithm now segregates into
6110              * separate nodes, characters that fold from those that don't under
6111              * /i.  (This hopefully will create nodes that are fixed strings
6112              * even under /i, giving the optimizer something to grab on to.)
6113              * So, if a node has something in it and the next character is in
6114              * the opposite category, that node is closed up, and the function
6115              * returns.  Then regatom is called again, and a new node is
6116              * created for the new category. */
6117             U8 node_type = EXACT;
6118 
6119             /* Assume the node will be fully used; the excess is given back at
6120              * the end.  Under /i, we may need to temporarily add the fold of
6121              * an extra character or two at the end to check for splitting
6122              * multi-char folds, so allocate extra space for that.   We can't
6123              * make any other length assumptions, as a byte input sequence
6124              * could shrink down. */
6125             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6126                                                  + ((! FOLD)
6127                                                     ? 0
6128                                                     : 2 * ((UTF)
6129                                                            ? UTF8_MAXBYTES_CASE
6130                         /* Max non-UTF-8 expansion is 2 */ : 2)));
6131 
6132             bool next_is_quantifier;
6133             char * oldp = NULL;
6134 
6135             /* We can convert EXACTF nodes to EXACTFU if they contain only
6136              * characters that match identically regardless of the target
6137              * string's UTF8ness.  The reason to do this is that EXACTF is not
6138              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6139              * runtime.
6140              *
6141              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6142              * contain only above-Latin1 characters (hence must be in UTF8),
6143              * which don't participate in folds with Latin1-range characters,
6144              * as the latter's folds aren't known until runtime. */
6145             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6146 
6147             /* Single-character EXACTish nodes are almost always SIMPLE.  This
6148              * allows us to override this as encountered */
6149             U8 maybe_SIMPLE = SIMPLE;
6150 
6151             /* Does this node contain something that can't match unless the
6152              * target string is (also) in UTF-8 */
6153             bool requires_utf8_target = FALSE;
6154 
6155             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6156             bool has_ss = FALSE;
6157 
6158             /* So is the MICRO SIGN */
6159             bool has_micro_sign = FALSE;
6160 
6161             /* Set when we fill up the current node and there is still more
6162              * text to process */
6163             bool overflowed;
6164 
6165             /* Allocate an EXACT node.  The node_type may change below to
6166              * another EXACTish node, but since the size of the node doesn't
6167              * change, it works */
6168             ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6169             FILL_NODE(ret, node_type);
6170             RExC_emit += NODE_STEP_REGNODE;
6171 
6172             s = STRING(REGNODE_p(ret));
6173 
6174             s0 = s;
6175 
6176           reparse:
6177 
6178             p = RExC_parse;
6179             len = 0;
6180             s = s0;
6181             node_type = EXACT;
6182             oldp = NULL;
6183             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6184             maybe_SIMPLE = SIMPLE;
6185             requires_utf8_target = FALSE;
6186             has_ss = FALSE;
6187             has_micro_sign = FALSE;
6188 
6189           continue_parse:
6190 
6191             /* This breaks under rare circumstances.  If folding, we do not
6192              * want to split a node at a character that is a non-final in a
6193              * multi-char fold, as an input string could just happen to want to
6194              * match across the node boundary.  The code at the end of the loop
6195              * looks for this, and backs off until it finds not such a
6196              * character, but it is possible (though extremely, extremely
6197              * unlikely) for all characters in the node to be non-final fold
6198              * ones, in which case we just leave the node fully filled, and
6199              * hope that it doesn't match the string in just the wrong place */
6200 
6201             assert( ! UTF     /* Is at the beginning of a character */
6202                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6203                    || UTF8_IS_START(UCHARAT(RExC_parse)));
6204 
6205             overflowed = FALSE;
6206 
6207             /* Here, we have a literal character.  Find the maximal string of
6208              * them in the input that we can fit into a single EXACTish node.
6209              * We quit at the first non-literal or when the node gets full, or
6210              * under /i the categorization of folding/non-folding character
6211              * changes */
6212             while (p < RExC_end && len < upper_fill) {
6213 
6214                 /* In most cases each iteration adds one byte to the output.
6215                  * The exceptions override this */
6216                 Size_t added_len = 1;
6217 
6218                 oldp = p;
6219                 old_old_s = old_s;
6220                 old_s = s;
6221 
6222                 /* White space has already been ignored */
6223                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
6224                        || ! is_PATWS_safe((p), RExC_end, UTF));
6225 
6226                 switch ((U8)*p) {
6227                   const char* message;
6228                   U32 packed_warn;
6229                   U8 grok_c_char;
6230 
6231                 case '^':
6232                 case '$':
6233                 case '.':
6234                 case '[':
6235                 case '(':
6236                 case ')':
6237                 case '|':
6238                     goto loopdone;
6239                 case '\\':
6240                     /* Literal Escapes Switch
6241 
6242                        This switch is meant to handle escape sequences that
6243                        resolve to a literal character.
6244 
6245                        Every escape sequence that represents something
6246                        else, like an assertion or a char class, is handled
6247                        in the switch marked 'Special Escapes' above in this
6248                        routine, but also has an entry here as anything that
6249                        isn't explicitly mentioned here will be treated as
6250                        an unescaped equivalent literal.
6251                     */
6252 
6253                     switch ((U8)*++p) {
6254 
6255                     /* These are all the special escapes. */
6256                     case 'A':             /* Start assertion */
6257                     case 'b': case 'B':   /* Word-boundary assertion*/
6258                     case 'C':             /* Single char !DANGEROUS! */
6259                     case 'd': case 'D':   /* digit class */
6260                     case 'g': case 'G':   /* generic-backref, pos assertion */
6261                     case 'h': case 'H':   /* HORIZWS */
6262                     case 'k': case 'K':   /* named backref, keep marker */
6263                     case 'p': case 'P':   /* Unicode property */
6264                               case 'R':   /* LNBREAK */
6265                     case 's': case 'S':   /* space class */
6266                     case 'v': case 'V':   /* VERTWS */
6267                     case 'w': case 'W':   /* word class */
6268                     case 'X':             /* eXtended Unicode "combining
6269                                              character sequence" */
6270                     case 'z': case 'Z':   /* End of line/string assertion */
6271                         --p;
6272                         goto loopdone;
6273 
6274                     /* Anything after here is an escape that resolves to a
6275                        literal. (Except digits, which may or may not)
6276                      */
6277                     case 'n':
6278                         ender = '\n';
6279                         p++;
6280                         break;
6281                     case 'N': /* Handle a single-code point named character. */
6282                         RExC_parse_set( p + 1 );
6283                         if (! grok_bslash_N(pRExC_state,
6284                                             NULL,   /* Fail if evaluates to
6285                                                        anything other than a
6286                                                        single code point */
6287                                             &ender, /* The returned single code
6288                                                        point */
6289                                             NULL,   /* Don't need a count of
6290                                                        how many code points */
6291                                             flagp,
6292                                             RExC_strict,
6293                                             depth)
6294                         ) {
6295                             if (*flagp & NEED_UTF8)
6296                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
6297                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6298 
6299                             /* Here, it wasn't a single code point.  Go close
6300                              * up this EXACTish node.  The switch() prior to
6301                              * this switch handles the other cases */
6302                             p = oldp;
6303                             RExC_parse_set(p);
6304                             goto loopdone;
6305                         }
6306                         p = RExC_parse;
6307                         RExC_parse_set(atom_parse_start);
6308 
6309                         /* The \N{} means the pattern, if previously /d,
6310                          * becomes /u.  That means it can't be an EXACTF node,
6311                          * but an EXACTFU */
6312                         if (node_type == EXACTF) {
6313                             node_type = EXACTFU;
6314 
6315                             /* If the node already contains something that
6316                              * differs between EXACTF and EXACTFU, reparse it
6317                              * as EXACTFU */
6318                             if (! maybe_exactfu) {
6319                                 len = 0;
6320                                 s = s0;
6321                                 goto reparse;
6322                             }
6323                         }
6324 
6325                         break;
6326                     case 'r':
6327                         ender = '\r';
6328                         p++;
6329                         break;
6330                     case 't':
6331                         ender = '\t';
6332                         p++;
6333                         break;
6334                     case 'f':
6335                         ender = '\f';
6336                         p++;
6337                         break;
6338                     case 'e':
6339                         ender = ESC_NATIVE;
6340                         p++;
6341                         break;
6342                     case 'a':
6343                         ender = '\a';
6344                         p++;
6345                         break;
6346                     case 'o':
6347                         if (! grok_bslash_o(&p,
6348                                             RExC_end,
6349                                             &ender,
6350                                             &message,
6351                                             &packed_warn,
6352                                             (bool) RExC_strict,
6353                                             FALSE, /* No illegal cp's */
6354                                             UTF))
6355                         {
6356                             RExC_parse_set(p); /* going to die anyway; point to
6357                                                exact spot of failure */
6358                             vFAIL(message);
6359                         }
6360 
6361                         if (message && TO_OUTPUT_WARNINGS(p)) {
6362                             warn_non_literal_string(p, packed_warn, message);
6363                         }
6364                         break;
6365                     case 'x':
6366                         if (! grok_bslash_x(&p,
6367                                             RExC_end,
6368                                             &ender,
6369                                             &message,
6370                                             &packed_warn,
6371                                             (bool) RExC_strict,
6372                                             FALSE, /* No illegal cp's */
6373                                             UTF))
6374                         {
6375                             RExC_parse_set(p);        /* going to die anyway; point
6376                                                    to exact spot of failure */
6377                             vFAIL(message);
6378                         }
6379 
6380                         if (message && TO_OUTPUT_WARNINGS(p)) {
6381                             warn_non_literal_string(p, packed_warn, message);
6382                         }
6383 
6384 #ifdef EBCDIC
6385                         if (ender < 0x100) {
6386                             if (RExC_recode_x_to_native) {
6387                                 ender = LATIN1_TO_NATIVE(ender);
6388                             }
6389                         }
6390 #endif
6391                         break;
6392                     case 'c':
6393                         p++;
6394                         if (! grok_bslash_c(*p, &grok_c_char,
6395                                             &message, &packed_warn))
6396                         {
6397                             /* going to die anyway; point to exact spot of
6398                              * failure */
6399                             char *new_p= p + ((UTF)
6400                                               ? UTF8_SAFE_SKIP(p, RExC_end)
6401                                               : 1);
6402                             RExC_parse_set(new_p);
6403                             vFAIL(message);
6404                         }
6405 
6406                         ender = grok_c_char;
6407                         p++;
6408                         if (message && TO_OUTPUT_WARNINGS(p)) {
6409                             warn_non_literal_string(p, packed_warn, message);
6410                         }
6411 
6412                         break;
6413                     case '8': case '9': /* must be a backreference */
6414                         --p;
6415                         /* we have an escape like \8 which cannot be an octal escape
6416                          * so we exit the loop, and let the outer loop handle this
6417                          * escape which may or may not be a legitimate backref. */
6418                         goto loopdone;
6419                     case '1': case '2': case '3':case '4':
6420                     case '5': case '6': case '7':
6421 
6422                         /* When we parse backslash escapes there is ambiguity
6423                          * between backreferences and octal escapes. Any escape
6424                          * from \1 - \9 is a backreference, any multi-digit
6425                          * escape which does not start with 0 and which when
6426                          * evaluated as decimal could refer to an already
6427                          * parsed capture buffer is a back reference. Anything
6428                          * else is octal.
6429                          *
6430                          * Note this implies that \118 could be interpreted as
6431                          * 118 OR as "\11" . "8" depending on whether there
6432                          * were 118 capture buffers defined already in the
6433                          * pattern.  */
6434 
6435                         /* NOTE, RExC_npar is 1 more than the actual number of
6436                          * parens we have seen so far, hence the "<" as opposed
6437                          * to "<=" */
6438                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6439                         {  /* Not to be treated as an octal constant, go
6440                                    find backref */
6441                             p = oldp;
6442                             goto loopdone;
6443                         }
6444                         /* FALLTHROUGH */
6445                     case '0':
6446                         {
6447                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6448                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
6449                             STRLEN numlen = 3;
6450                             ender = grok_oct(p, &numlen, &flags, NULL);
6451                             p += numlen;
6452                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6453                                 && isDIGIT(*p)  /* like \08, \178 */
6454                                 && ckWARN(WARN_REGEXP))
6455                             {
6456                                 reg_warn_non_literal_string(
6457                                      p + 1,
6458                                      form_alien_digit_msg(8, numlen, p,
6459                                                         RExC_end, UTF, FALSE));
6460                             }
6461                         }
6462                         break;
6463                     case '\0':
6464                         if (p >= RExC_end)
6465                             FAIL("Trailing \\");
6466                         /* FALLTHROUGH */
6467                     default:
6468                         if (isALPHANUMERIC(*p)) {
6469                             /* An alpha followed by '{' is going to fail next
6470                              * iteration, so don't output this warning in that
6471                              * case */
6472                             if (! isALPHA(*p) || *(p + 1) != '{') {
6473                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6474                                                   " passed through", p);
6475                             }
6476                         }
6477                         goto normal_default;
6478                     } /* End of switch on '\' */
6479                     break;
6480                 case '{':
6481                     /* Trying to gain new uses for '{' without breaking too
6482                      * much existing code is hard.  The solution currently
6483                      * adopted is:
6484                      *  1)  If there is no ambiguity that a '{' should always
6485                      *      be taken literally, at the start of a construct, we
6486                      *      just do so.
6487                      *  2)  If the literal '{' conflicts with our desired use
6488                      *      of it as a metacharacter, we die.  The deprecation
6489                      *      cycles for this have come and gone.
6490                      *  3)  If there is ambiguity, we raise a simple warning.
6491                      *      This could happen, for example, if the user
6492                      *      intended it to introduce a quantifier, but slightly
6493                      *      misspelled the quantifier.  Without this warning,
6494                      *      the quantifier would silently be taken as a literal
6495                      *      string of characters instead of a meta construct */
6496                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6497                         if (      RExC_strict
6498                             || (  p > atom_parse_start + 1
6499                                 && isALPHA_A(*(p - 1))
6500                                 && *(p - 2) == '\\'))
6501                         {
6502                             RExC_parse_set(p + 1);
6503                             vFAIL("Unescaped left brace in regex is "
6504                                   "illegal here");
6505                         }
6506                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
6507                                          " passed through");
6508                     }
6509                     goto normal_default;
6510                 case '}':
6511                 case ']':
6512                     if (p > RExC_parse && RExC_strict) {
6513                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6514                     }
6515                     /*FALLTHROUGH*/
6516                 default:    /* A literal character */
6517                   normal_default:
6518                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
6519                         STRLEN numlen;
6520                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6521                                                &numlen, UTF8_ALLOW_DEFAULT);
6522                         p += numlen;
6523                     }
6524                     else
6525                         ender = (U8) *p++;
6526                     break;
6527                 } /* End of switch on the literal */
6528 
6529                 /* Here, have looked at the literal character, and <ender>
6530                  * contains its ordinal; <p> points to the character after it.
6531                  * */
6532 
6533                 if (ender > 255) {
6534                     REQUIRE_UTF8(flagp);
6535                     if (   UNICODE_IS_PERL_EXTENDED(ender)
6536                         && TO_OUTPUT_WARNINGS(p))
6537                     {
6538                         ckWARN2_non_literal_string(p,
6539                                                    packWARN(WARN_PORTABLE),
6540                                                    PL_extended_cp_format,
6541                                                    ender);
6542                     }
6543                 }
6544 
6545                 /* We need to check if the next non-ignored thing is a
6546                  * quantifier.  Move <p> to after anything that should be
6547                  * ignored, which, as a side effect, positions <p> for the next
6548                  * loop iteration */
6549                 skip_to_be_ignored_text(pRExC_state, &p,
6550                                         FALSE /* Don't force to /x */ );
6551 
6552                 /* If the next thing is a quantifier, it applies to this
6553                  * character only, which means that this character has to be in
6554                  * its own node and can't just be appended to the string in an
6555                  * existing node, so if there are already other characters in
6556                  * the node, close the node with just them, and set up to do
6557                  * this character again next time through, when it will be the
6558                  * only thing in its new node */
6559 
6560                 next_is_quantifier =    LIKELY(p < RExC_end)
6561                                      && UNLIKELY(isQUANTIFIER(p, RExC_end));
6562 
6563                 if (next_is_quantifier && LIKELY(len)) {
6564                     p = oldp;
6565                     goto loopdone;
6566                 }
6567 
6568                 /* Ready to add 'ender' to the node */
6569 
6570                 if (! FOLD) {  /* The simple case, just append the literal */
6571                   not_fold_common:
6572 
6573                     /* Don't output if it would overflow */
6574                     if (UNLIKELY(len > max_string_len - ((UTF)
6575                                                       ? UVCHR_SKIP(ender)
6576                                                       : 1)))
6577                     {
6578                         overflowed = TRUE;
6579                         break;
6580                     }
6581 
6582                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6583                         *(s++) = (char) ender;
6584                     }
6585                     else {
6586                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6587                         added_len = (char *) new_s - s;
6588                         s = (char *) new_s;
6589 
6590                         if (ender > 255)  {
6591                             requires_utf8_target = TRUE;
6592                         }
6593                     }
6594                 }
6595                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6596 
6597                     /* Here are folding under /l, and the code point is
6598                      * problematic.  If this is the first character in the
6599                      * node, change the node type to folding.   Otherwise, if
6600                      * this is the first problematic character, close up the
6601                      * existing node, so can start a new node with this one */
6602                     if (! len) {
6603                         node_type = EXACTFL;
6604                         RExC_contains_locale = 1;
6605                     }
6606                     else if (node_type == EXACT) {
6607                         p = oldp;
6608                         goto loopdone;
6609                     }
6610 
6611                     /* This problematic code point means we can't simplify
6612                      * things */
6613                     maybe_exactfu = FALSE;
6614 
6615                     /* Although these two characters have folds that are
6616                      * locale-problematic, they also have folds to above Latin1
6617                      * that aren't a problem.  Doing these now helps at
6618                      * runtime. */
6619                     if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
6620                                  || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6621                     {
6622                         goto fold_anyway;
6623                     }
6624 
6625                     /* Here, we are adding a problematic fold character.
6626                      * "Problematic" in this context means that its fold isn't
6627                      * known until runtime.  (The non-problematic code points
6628                      * are the above-Latin1 ones that fold to also all
6629                      * above-Latin1.  Their folds don't vary no matter what the
6630                      * locale is.) But here we have characters whose fold
6631                      * depends on the locale.  We just add in the unfolded
6632                      * character, and wait until runtime to fold it */
6633                     goto not_fold_common;
6634                 }
6635                 else /* regular fold; see if actually is in a fold */
6636                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6637                          || (ender > 255
6638                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6639                 {
6640                     /* Here, folding, but the character isn't in a fold.
6641                      *
6642                      * Start a new node if previous characters in the node were
6643                      * folded */
6644                     if (len && node_type != EXACT) {
6645                         p = oldp;
6646                         goto loopdone;
6647                     }
6648 
6649                     /* Here, continuing a node with non-folded characters.  Add
6650                      * this one */
6651                     goto not_fold_common;
6652                 }
6653                 else {  /* Here, does participate in some fold */
6654 
6655                     /* If this is the first character in the node, change its
6656                      * type to folding.  Otherwise, if this is the first
6657                      * folding character in the node, close up the existing
6658                      * node, so can start a new node with this one.  */
6659                     if (! len) {
6660                         node_type = compute_EXACTish(pRExC_state);
6661                     }
6662                     else if (node_type == EXACT) {
6663                         p = oldp;
6664                         goto loopdone;
6665                     }
6666 
6667                     if (UTF) {  /* Alway use the folded value for UTF-8
6668                                    patterns */
6669                         if (UVCHR_IS_INVARIANT(ender)) {
6670                             if (UNLIKELY(len + 1 > max_string_len)) {
6671                                 overflowed = TRUE;
6672                                 break;
6673                             }
6674 
6675                             *(s)++ = (U8) toFOLD(ender);
6676                         }
6677                         else {
6678                             UV folded;
6679 
6680                           fold_anyway:
6681                             folded = _to_uni_fold_flags(
6682                                     ender,
6683                                     (U8 *) s,  /* We have allocated extra space
6684                                                   in 's' so can't run off the
6685                                                   end */
6686                                     &added_len,
6687                                     FOLD_FLAGS_FULL
6688                                   | ((   ASCII_FOLD_RESTRICTED
6689                                       || node_type == EXACTFL)
6690                                     ? FOLD_FLAGS_NOMIX_ASCII
6691                                     : 0));
6692                             if (UNLIKELY(len + added_len > max_string_len)) {
6693                                 overflowed = TRUE;
6694                                 break;
6695                             }
6696 
6697                             s += added_len;
6698 
6699                             if (   folded > 255
6700                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6701                             {
6702                                 /* U+B5 folds to the MU, so its possible for a
6703                                  * non-UTF-8 target to match it */
6704                                 requires_utf8_target = TRUE;
6705                             }
6706                         }
6707                     }
6708                     else { /* Here is non-UTF8. */
6709 
6710                         /* The fold will be one or (rarely) two characters.
6711                          * Check that there's room for at least a single one
6712                          * before setting any flags, etc.  Because otherwise an
6713                          * overflowing character could cause a flag to be set
6714                          * even though it doesn't end up in this node.  (For
6715                          * the two character fold, we check again, before
6716                          * setting any flags) */
6717                         if (UNLIKELY(len + 1 > max_string_len)) {
6718                             overflowed = TRUE;
6719                             break;
6720                         }
6721 
6722 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
6723    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
6724                                       || UNICODE_DOT_DOT_VERSION > 0)
6725 
6726                         /* On non-ancient Unicodes, check for the only possible
6727                          * multi-char fold  */
6728                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6729 
6730                             /* This potential multi-char fold means the node
6731                              * can't be simple (because it could match more
6732                              * than a single char).  And in some cases it will
6733                              * match 'ss', so set that flag */
6734                             maybe_SIMPLE = 0;
6735                             has_ss = TRUE;
6736 
6737                             /* It can't change to be an EXACTFU (unless already
6738                              * is one).  We fold it iff under /u rules. */
6739                             if (node_type != EXACTFU) {
6740                                 maybe_exactfu = FALSE;
6741                             }
6742                             else {
6743                                 if (UNLIKELY(len + 2 > max_string_len)) {
6744                                     overflowed = TRUE;
6745                                     break;
6746                                 }
6747 
6748                                 *(s++) = 's';
6749                                 *(s++) = 's';
6750                                 added_len = 2;
6751 
6752                                 goto done_with_this_char;
6753                             }
6754                         }
6755                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6756                                  && LIKELY(len > 0)
6757                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6758                         {
6759                             /* Also, the sequence 'ss' is special when not
6760                              * under /u.  If the target string is UTF-8, it
6761                              * should match SHARP S; otherwise it won't.  So,
6762                              * here we have to exclude the possibility of this
6763                              * node moving to /u.*/
6764                             has_ss = TRUE;
6765                             maybe_exactfu = FALSE;
6766                         }
6767 #endif
6768                         /* Here, the fold will be a single character */
6769 
6770                         if (UNLIKELY(ender == MICRO_SIGN)) {
6771                             has_micro_sign = TRUE;
6772                         }
6773                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6774 
6775                             /* If the character's fold differs between /d and
6776                              * /u, this can't change to be an EXACTFU node */
6777                             maybe_exactfu = FALSE;
6778                         }
6779 
6780                         *(s++) = (DEPENDS_SEMANTICS)
6781                                  ? (char) toFOLD(ender)
6782 
6783                                    /* Under /u, the fold of any character in
6784                                     * the 0-255 range happens to be its
6785                                     * lowercase equivalent, except for LATIN
6786                                     * SMALL LETTER SHARP S, which was handled
6787                                     * above, and the MICRO SIGN, whose fold
6788                                     * requires UTF-8 to represent.  */
6789                                  : (char) toLOWER_L1(ender);
6790                     }
6791                 } /* End of adding current character to the node */
6792 
6793               done_with_this_char:
6794 
6795                 len += added_len;
6796 
6797                 if (next_is_quantifier) {
6798 
6799                     /* Here, the next input is a quantifier, and to get here,
6800                      * the current character is the only one in the node. */
6801                     goto loopdone;
6802                 }
6803 
6804             } /* End of loop through literal characters */
6805 
6806             /* Here we have either exhausted the input or run out of room in
6807              * the node.  If the former, we are done.  (If we encountered a
6808              * character that can't be in the node, transfer is made directly
6809              * to <loopdone>, and so we wouldn't have fallen off the end of the
6810              * loop.)  */
6811             if (LIKELY(! overflowed)) {
6812                 goto loopdone;
6813             }
6814 
6815             /* Here we have run out of room.  We can grow plain EXACT and
6816              * LEXACT nodes.  If the pattern is gigantic enough, though,
6817              * eventually we'll have to artificially chunk the pattern into
6818              * multiple nodes. */
6819             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6820                 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6821                 Size_t overhead_expansion = 0;
6822                 char temp[256];
6823                 Size_t max_nodes_for_string;
6824                 Size_t achievable;
6825                 SSize_t delta;
6826 
6827                 /* Here we couldn't fit the final character in the current
6828                  * node, so it will have to be reparsed, no matter what else we
6829                  * do */
6830                 p = oldp;
6831 
6832                 /* If would have overflowed a regular EXACT node, switch
6833                  * instead to an LEXACT.  The code below is structured so that
6834                  * the actual growing code is common to changing from an EXACT
6835                  * or just increasing the LEXACT size.  This means that we have
6836                  * to save the string in the EXACT case before growing, and
6837                  * then copy it afterwards to its new location */
6838                 if (node_type == EXACT) {
6839                     overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6840                     RExC_emit += overhead_expansion;
6841                     Copy(s0, temp, len, char);
6842                 }
6843 
6844                 /* Ready to grow.  If it was a plain EXACT, the string was
6845                  * saved, and the first few bytes of it overwritten by adding
6846                  * an argument field.  We assume, as we do elsewhere in this
6847                  * file, that one byte of remaining input will translate into
6848                  * one byte of output, and if that's too small, we grow again,
6849                  * if too large the excess memory is freed at the end */
6850 
6851                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6852                 achievable = MIN(max_nodes_for_string,
6853                                  current_string_nodes + STR_SZ(RExC_end - p));
6854                 delta = achievable - current_string_nodes;
6855 
6856                 /* If there is just no more room, go finish up this chunk of
6857                  * the pattern. */
6858                 if (delta <= 0) {
6859                     goto loopdone;
6860                 }
6861 
6862                 change_engine_size(pRExC_state, delta + overhead_expansion);
6863                 current_string_nodes += delta;
6864                 max_string_len
6865                            = sizeof(struct regnode) * current_string_nodes;
6866                 upper_fill = max_string_len + 1;
6867 
6868                 /* If the length was small, we know this was originally an
6869                  * EXACT node now converted to LEXACT, and the string has to be
6870                  * restored.  Otherwise the string was untouched.  260 is just
6871                  * a number safely above 255 so don't have to worry about
6872                  * getting it precise */
6873                 if (len < 260) {
6874                     node_type = LEXACT;
6875                     FILL_NODE(ret, node_type);
6876                     s0 = STRING(REGNODE_p(ret));
6877                     Copy(temp, s0, len, char);
6878                     s = s0 + len;
6879                 }
6880 
6881                 goto continue_parse;
6882             }
6883             else if (FOLD) {
6884                 bool splittable = FALSE;
6885                 bool backed_up = FALSE;
6886                 char * e;       /* should this be U8? */
6887                 char * s_start; /* should this be U8? */
6888 
6889                 /* Here is /i.  Running out of room creates a problem if we are
6890                  * folding, and the split happens in the middle of a
6891                  * multi-character fold, as a match that should have occurred,
6892                  * won't, due to the way nodes are matched, and our artificial
6893                  * boundary.  So back off until we aren't splitting such a
6894                  * fold.  If there is no such place to back off to, we end up
6895                  * taking the entire node as-is.  This can happen if the node
6896                  * consists entirely of 'f' or entirely of 's' characters (or
6897                  * things that fold to them) as 'ff' and 'ss' are
6898                  * multi-character folds.
6899                  *
6900                  * The Unicode standard says that multi character folds consist
6901                  * of either two or three characters.  That means we would be
6902                  * splitting one if the final character in the node is at the
6903                  * beginning of either type, or is the second of a three
6904                  * character fold.
6905                  *
6906                  * At this point:
6907                  *  ender     is the code point of the character that won't fit
6908                  *            in the node
6909                  *  s         points to just beyond the final byte in the node.
6910                  *            It's where we would place ender if there were
6911                  *            room, and where in fact we do place ender's fold
6912                  *            in the code below, as we've over-allocated space
6913                  *            for s0 (hence s) to allow for this
6914                  *  e         starts at 's' and advances as we append things.
6915                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
6916                  *            have been advanced to beyond it).
6917                  *  old_old_s points to the beginning byte of the final
6918                  *            character in the node
6919                  *  p         points to the beginning byte in the input of the
6920                  *            character beyond 'ender'.
6921                  *  oldp      points to the beginning byte in the input of
6922                  *            'ender'.
6923                  *
6924                  * In the case of /il, we haven't folded anything that could be
6925                  * affected by the locale.  That means only above-Latin1
6926                  * characters that fold to other above-latin1 characters get
6927                  * folded at compile time.  To check where a good place to
6928                  * split nodes is, everything in it will have to be folded.
6929                  * The boolean 'maybe_exactfu' keeps track in /il if there are
6930                  * any unfolded characters in the node. */
6931                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
6932 
6933                 /* If we do need to fold the node, we need a place to store the
6934                  * folded copy, and a way to map back to the unfolded original
6935                  * */
6936                 char * locfold_buf = NULL;
6937                 Size_t * loc_correspondence = NULL;
6938 
6939                 if (! need_to_fold_loc) {   /* The normal case.  Just
6940                                                initialize to the actual node */
6941                     e = s;
6942                     s_start = s0;
6943                     s = old_old_s;  /* Point to the beginning of the final char
6944                                        that fits in the node */
6945                 }
6946                 else {
6947 
6948                     /* Here, we have filled a /il node, and there are unfolded
6949                      * characters in it.  If the runtime locale turns out to be
6950                      * UTF-8, there are possible multi-character folds, just
6951                      * like when not under /l.  The node hence can't terminate
6952                      * in the middle of such a fold.  To determine this, we
6953                      * have to create a folded copy of this node.  That means
6954                      * reparsing the node, folding everything assuming a UTF-8
6955                      * locale.  (If at runtime it isn't such a locale, the
6956                      * actions here wouldn't have been necessary, but we have
6957                      * to assume the worst case.)  If we find we need to back
6958                      * off the folded string, we do so, and then map that
6959                      * position back to the original unfolded node, which then
6960                      * gets output, truncated at that spot */
6961 
6962                     char * redo_p = RExC_parse;
6963                     char * redo_e;
6964                     char * old_redo_e;
6965 
6966                     /* Allow enough space assuming a single byte input folds to
6967                      * a single byte output, plus assume that the two unparsed
6968                      * characters (that we may need) fold to the largest number
6969                      * of bytes possible, plus extra for one more worst case
6970                      * scenario.  In the loop below, if we start eating into
6971                      * that final spare space, we enlarge this initial space */
6972                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
6973 
6974                     Newxz(locfold_buf, size, char);
6975                     Newxz(loc_correspondence, size, Size_t);
6976 
6977                     /* Redo this node's parse, folding into 'locfold_buf' */
6978                     redo_p = RExC_parse;
6979                     old_redo_e = redo_e = locfold_buf;
6980                     while (redo_p <= oldp) {
6981 
6982                         old_redo_e = redo_e;
6983                         loc_correspondence[redo_e - locfold_buf]
6984                                                         = redo_p - RExC_parse;
6985 
6986                         if (UTF) {
6987                             Size_t added_len;
6988 
6989                             (void) _to_utf8_fold_flags((U8 *) redo_p,
6990                                                        (U8 *) RExC_end,
6991                                                        (U8 *) redo_e,
6992                                                        &added_len,
6993                                                        FOLD_FLAGS_FULL);
6994                             redo_e += added_len;
6995                             redo_p += UTF8SKIP(redo_p);
6996                         }
6997                         else {
6998 
6999                             /* Note that if this code is run on some ancient
7000                              * Unicode versions, SHARP S doesn't fold to 'ss',
7001                              * but rather than clutter the code with #ifdef's,
7002                              * as is done above, we ignore that possibility.
7003                              * This is ok because this code doesn't affect what
7004                              * gets matched, but merely where the node gets
7005                              * split */
7006                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7007                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7008                             }
7009                             else {
7010                                 *redo_e++ = 's';
7011                                 *redo_e++ = 's';
7012                             }
7013                             redo_p++;
7014                         }
7015 
7016 
7017                         /* If we're getting so close to the end that a
7018                          * worst-case fold in the next character would cause us
7019                          * to overflow, increase, assuming one byte output byte
7020                          * per one byte input one, plus room for another worst
7021                          * case fold */
7022                         if (   redo_p <= oldp
7023                             && redo_e > locfold_buf + size
7024                                                     - (UTF8_MAXBYTES_CASE + 1))
7025                         {
7026                             Size_t new_size = size
7027                                             + (oldp - redo_p)
7028                                             + UTF8_MAXBYTES_CASE + 1;
7029                             Ptrdiff_t e_offset = redo_e - locfold_buf;
7030 
7031                             Renew(locfold_buf, new_size, char);
7032                             Renew(loc_correspondence, new_size, Size_t);
7033                             size = new_size;
7034 
7035                             redo_e = locfold_buf + e_offset;
7036                         }
7037                     }
7038 
7039                     /* Set so that things are in terms of the folded, temporary
7040                      * string */
7041                     s = old_redo_e;
7042                     s_start = locfold_buf;
7043                     e = redo_e;
7044 
7045                 }
7046 
7047                 /* Here, we have 's', 's_start' and 'e' set up to point to the
7048                  * input that goes into the node, folded.
7049                  *
7050                  * If the final character of the node and the fold of ender
7051                  * form the first two characters of a three character fold, we
7052                  * need to peek ahead at the next (unparsed) character in the
7053                  * input to determine if the three actually do form such a
7054                  * fold.  Just looking at that character is not generally
7055                  * sufficient, as it could be, for example, an escape sequence
7056                  * that evaluates to something else, and it needs to be folded.
7057                  *
7058                  * khw originally thought to just go through the parse loop one
7059                  * extra time, but that doesn't work easily as that iteration
7060                  * could cause things to think that the parse is over and to
7061                  * goto loopdone.  The character could be a '$' for example, or
7062                  * the character beyond could be a quantifier, and other
7063                  * glitches as well.
7064                  *
7065                  * The solution used here for peeking ahead is to look at that
7066                  * next character.  If it isn't ASCII punctuation, then it will
7067                  * be something that would continue on in an EXACTish node if
7068                  * there were space.  We append the fold of it to s, having
7069                  * reserved enough room in s0 for the purpose.  If we can't
7070                  * reasonably peek ahead, we instead assume the worst case:
7071                  * that it is something that would form the completion of a
7072                  * multi-char fold.
7073                  *
7074                  * If we can't split between s and ender, we work backwards
7075                  * character-by-character down to s0.  At each current point
7076                  * see if we are at the beginning of a multi-char fold.  If so,
7077                  * that means we would be splitting the fold across nodes, and
7078                  * so we back up one and try again.
7079                  *
7080                  * If we're not at the beginning, we still could be at the
7081                  * final two characters of a (rare) three character fold.  We
7082                  * check if the sequence starting at the character before the
7083                  * current position (and including the current and next
7084                  * characters) is a three character fold.  If not, the node can
7085                  * be split here.  If it is, we have to backup two characters
7086                  * and try again.
7087                  *
7088                  * Otherwise, the node can be split at the current position.
7089                  *
7090                  * The same logic is used for UTF-8 patterns and not */
7091                 if (UTF) {
7092                     Size_t added_len;
7093 
7094                     /* Append the fold of ender */
7095                     (void) _to_uni_fold_flags(
7096                         ender,
7097                         (U8 *) e,
7098                         &added_len,
7099                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7100                                         ? FOLD_FLAGS_NOMIX_ASCII
7101                                         : 0));
7102                     e += added_len;
7103 
7104                     /* 's' and the character folded to by ender may be the
7105                      * first two of a three-character fold, in which case the
7106                      * node should not be split here.  That may mean examining
7107                      * the so-far unparsed character starting at 'p'.  But if
7108                      * ender folded to more than one character, we already have
7109                      * three characters to look at.  Also, we first check if
7110                      * the sequence consisting of s and the next character form
7111                      * the first two of some three character fold.  If not,
7112                      * there's no need to peek ahead. */
7113                     if (   added_len <= UTF8SKIP(e - added_len)
7114                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7115                     {
7116                         /* Here, the two do form the beginning of a potential
7117                          * three character fold.  The unexamined character may
7118                          * or may not complete it.  Peek at it.  It might be
7119                          * something that ends the node or an escape sequence,
7120                          * in which case we don't know without a lot of work
7121                          * what it evaluates to, so we have to assume the worst
7122                          * case: that it does complete the fold, and so we
7123                          * can't split here.  All such instances  will have
7124                          * that character be an ASCII punctuation character,
7125                          * like a backslash.  So, for that case, backup one and
7126                          * drop down to try at that position */
7127                         if (isPUNCT(*p)) {
7128                             s = (char *) utf8_hop_back((U8 *) s, -1,
7129                                        (U8 *) s_start);
7130                             backed_up = TRUE;
7131                         }
7132                         else {
7133                             /* Here, since it's not punctuation, it must be a
7134                              * real character, and we can append its fold to
7135                              * 'e' (having deliberately reserved enough space
7136                              * for this eventuality) and drop down to check if
7137                              * the three actually do form a folded sequence */
7138                             (void) _to_utf8_fold_flags(
7139                                 (U8 *) p, (U8 *) RExC_end,
7140                                 (U8 *) e,
7141                                 &added_len,
7142                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7143                                                 ? FOLD_FLAGS_NOMIX_ASCII
7144                                                 : 0));
7145                             e += added_len;
7146                         }
7147                     }
7148 
7149                     /* Here, we either have three characters available in
7150                      * sequence starting at 's', or we have two characters and
7151                      * know that the following one can't possibly be part of a
7152                      * three character fold.  We go through the node backwards
7153                      * until we find a place where we can split it without
7154                      * breaking apart a multi-character fold.  At any given
7155                      * point we have to worry about if such a fold begins at
7156                      * the current 's', and also if a three-character fold
7157                      * begins at s-1, (containing s and s+1).  Splitting in
7158                      * either case would break apart a fold */
7159                     do {
7160                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7161                                                             (U8 *) s_start);
7162 
7163                         /* If is a multi-char fold, can't split here.  Backup
7164                          * one char and try again */
7165                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7166                             s = prev_s;
7167                             backed_up = TRUE;
7168                             continue;
7169                         }
7170 
7171                         /* If the two characters beginning at 's' are part of a
7172                          * three character fold starting at the character
7173                          * before s, we can't split either before or after s.
7174                          * Backup two chars and try again */
7175                         if (   LIKELY(s > s_start)
7176                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7177                         {
7178                             s = prev_s;
7179                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7180                             backed_up = TRUE;
7181                             continue;
7182                         }
7183 
7184                         /* Here there's no multi-char fold between s and the
7185                          * next character following it.  We can split */
7186                         splittable = TRUE;
7187                         break;
7188 
7189                     } while (s > s_start); /* End of loops backing up through the node */
7190 
7191                     /* Here we either couldn't find a place to split the node,
7192                      * or else we broke out of the loop setting 'splittable' to
7193                      * true.  In the latter case, the place to split is between
7194                      * the first and second characters in the sequence starting
7195                      * at 's' */
7196                     if (splittable) {
7197                         s += UTF8SKIP(s);
7198                     }
7199                 }
7200                 else {  /* Pattern not UTF-8 */
7201                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
7202                         || ASCII_FOLD_RESTRICTED)
7203                     {
7204                         assert( toLOWER_L1(ender) < 256 );
7205                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7206                     }
7207                     else {
7208                         *e++ = 's';
7209                         *e++ = 's';
7210                     }
7211 
7212                     if (   e - s  <= 1
7213                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7214                     {
7215                         if (isPUNCT(*p)) {
7216                             s--;
7217                             backed_up = TRUE;
7218                         }
7219                         else {
7220                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7221                                 || ASCII_FOLD_RESTRICTED)
7222                             {
7223                                 assert( toLOWER_L1(ender) < 256 );
7224                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7225                             }
7226                             else {
7227                                 *e++ = 's';
7228                                 *e++ = 's';
7229                             }
7230                         }
7231                     }
7232 
7233                     do {
7234                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7235                             s--;
7236                             backed_up = TRUE;
7237                             continue;
7238                         }
7239 
7240                         if (   LIKELY(s > s_start)
7241                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7242                         {
7243                             s -= 2;
7244                             backed_up = TRUE;
7245                             continue;
7246                         }
7247 
7248                         splittable = TRUE;
7249                         break;
7250 
7251                     } while (s > s_start);
7252 
7253                     if (splittable) {
7254                         s++;
7255                     }
7256                 }
7257 
7258                 /* Here, we are done backing up.  If we didn't backup at all
7259                  * (the likely case), just proceed */
7260                 if (backed_up) {
7261 
7262                    /* If we did find a place to split, reparse the entire node
7263                     * stopping where we have calculated. */
7264                     if (splittable) {
7265 
7266                        /* If we created a temporary folded string under /l, we
7267                         * have to map that back to the original */
7268                         if (need_to_fold_loc) {
7269                             upper_fill = loc_correspondence[s - s_start];
7270                             if (upper_fill == 0) {
7271                                 FAIL2("panic: loc_correspondence[%d] is 0",
7272                                       (int) (s - s_start));
7273                             }
7274                             Safefree(locfold_buf);
7275                             Safefree(loc_correspondence);
7276                         }
7277                         else {
7278                             upper_fill = s - s0;
7279                         }
7280                         goto reparse;
7281                     }
7282 
7283                     /* Here the node consists entirely of non-final multi-char
7284                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
7285                      * decent place to split it, so give up and just take the
7286                      * whole thing */
7287                     len = old_s - s0;
7288                 }
7289 
7290                 if (need_to_fold_loc) {
7291                     Safefree(locfold_buf);
7292                     Safefree(loc_correspondence);
7293                 }
7294             }   /* End of verifying node ends with an appropriate char */
7295 
7296             /* We need to start the next node at the character that didn't fit
7297              * in this one */
7298             p = oldp;
7299 
7300           loopdone:   /* Jumped to when encounters something that shouldn't be
7301                          in the node */
7302 
7303             /* Free up any over-allocated space; cast is to silence bogus
7304              * warning in MS VC */
7305             change_engine_size(pRExC_state,
7306                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7307 
7308             /* I (khw) don't know if you can get here with zero length, but the
7309              * old code handled this situation by creating a zero-length EXACT
7310              * node.  Might as well be NOTHING instead */
7311             if (len == 0) {
7312                 OP(REGNODE_p(ret)) = NOTHING;
7313             }
7314             else {
7315 
7316                 /* If the node type is EXACT here, check to see if it
7317                  * should be EXACTL, or EXACT_REQ8. */
7318                 if (node_type == EXACT) {
7319                     if (LOC) {
7320                         node_type = EXACTL;
7321                     }
7322                     else if (requires_utf8_target) {
7323                         node_type = EXACT_REQ8;
7324                     }
7325                 }
7326                 else if (node_type == LEXACT) {
7327                     if (requires_utf8_target) {
7328                         node_type = LEXACT_REQ8;
7329                     }
7330                 }
7331                 else if (FOLD) {
7332                     if (    UNLIKELY(has_micro_sign || has_ss)
7333                         && (node_type == EXACTFU || (   node_type == EXACTF
7334                                                      && maybe_exactfu)))
7335                     {   /* These two conditions are problematic in non-UTF-8
7336                            EXACTFU nodes. */
7337                         assert(! UTF);
7338                         node_type = EXACTFUP;
7339                     }
7340                     else if (node_type == EXACTFL) {
7341 
7342                         /* 'maybe_exactfu' is deliberately set above to
7343                          * indicate this node type, where all code points in it
7344                          * are above 255 */
7345                         if (maybe_exactfu) {
7346                             node_type = EXACTFLU8;
7347                         }
7348                         else if (UNLIKELY(
7349                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7350                         {
7351                             /* A character that folds to more than one will
7352                              * match multiple characters, so can't be SIMPLE.
7353                              * We don't have to worry about this with EXACTFLU8
7354                              * nodes just above, as they have already been
7355                              * folded (since the fold doesn't vary at run
7356                              * time).  Here, if the final character in the node
7357                              * folds to multiple, it can't be simple.  (This
7358                              * only has an effect if the node has only a single
7359                              * character, hence the final one, as elsewhere we
7360                              * turn off simple for nodes whose length > 1 */
7361                             maybe_SIMPLE = 0;
7362                         }
7363                     }
7364                     else if (node_type == EXACTF) {  /* Means is /di */
7365 
7366                         /* This intermediate variable is needed solely because
7367                          * the asserts in the macro where used exceed Win32's
7368                          * literal string capacity */
7369                         char first_char = * STRING(REGNODE_p(ret));
7370 
7371                         /* If 'maybe_exactfu' is clear, then we need to stay
7372                          * /di.  If it is set, it means there are no code
7373                          * points that match differently depending on UTF8ness
7374                          * of the target string, so it can become an EXACTFU
7375                          * node */
7376                         if (! maybe_exactfu) {
7377                             RExC_seen_d_op = TRUE;
7378                         }
7379                         else if (   isALPHA_FOLD_EQ(first_char, 's')
7380                                  || isALPHA_FOLD_EQ(ender, 's'))
7381                         {
7382                             /* But, if the node begins or ends in an 's' we
7383                              * have to defer changing it into an EXACTFU, as
7384                              * the node could later get joined with another one
7385                              * that ends or begins with 's' creating an 'ss'
7386                              * sequence which would then wrongly match the
7387                              * sharp s without the target being UTF-8.  We
7388                              * create a special node that we resolve later when
7389                              * we join nodes together */
7390 
7391                             node_type = EXACTFU_S_EDGE;
7392                         }
7393                         else {
7394                             node_type = EXACTFU;
7395                         }
7396                     }
7397 
7398                     if (requires_utf8_target && node_type == EXACTFU) {
7399                         node_type = EXACTFU_REQ8;
7400                     }
7401                 }
7402 
7403                 OP(REGNODE_p(ret)) = node_type;
7404                 setSTR_LEN(REGNODE_p(ret), len);
7405                 RExC_emit += STR_SZ(len);
7406 
7407                 /* If the node isn't a single character, it can't be SIMPLE */
7408                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7409                     maybe_SIMPLE = 0;
7410                 }
7411 
7412                 *flagp |= HASWIDTH | maybe_SIMPLE;
7413             }
7414 
7415             RExC_parse_set(p);
7416 
7417             {
7418                 /* len is STRLEN which is unsigned, need to copy to signed */
7419                 IV iv = len;
7420                 if (iv < 0)
7421                     vFAIL("Internal disaster");
7422             }
7423 
7424         } /* End of label 'defchar:' */
7425         break;
7426     } /* End of giant switch on input character */
7427 
7428     /* Position parse to next real character */
7429     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7430                                             FALSE /* Don't force to /x */ );
7431     if (   *RExC_parse == '{'
7432         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7433     {
7434         if (RExC_strict) {
7435             RExC_parse_inc_by(1);
7436             vFAIL("Unescaped left brace in regex is illegal here");
7437         }
7438         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7439                                   " passed through");
7440     }
7441 
7442     return(ret);
7443 }
7444 
7445 
7446 void
Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)7447 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7448 {
7449     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
7450      * sets up the bitmap and any flags, removing those code points from the
7451      * inversion list, setting it to NULL should it become completely empty */
7452 
7453 
7454     PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7455 
7456     /* There is no bitmap for this node type */
7457     if (REGNODE_TYPE(OP(node))  != ANYOF) {
7458         return;
7459     }
7460 
7461     ANYOF_BITMAP_ZERO(node);
7462     if (*invlist_ptr) {
7463 
7464         /* This gets set if we actually need to modify things */
7465         bool change_invlist = FALSE;
7466 
7467         UV start, end;
7468 
7469         /* Start looking through *invlist_ptr */
7470         invlist_iterinit(*invlist_ptr);
7471         while (invlist_iternext(*invlist_ptr, &start, &end)) {
7472             UV high;
7473             int i;
7474 
7475             /* Quit if are above what we should change */
7476             if (start >= NUM_ANYOF_CODE_POINTS) {
7477                 break;
7478             }
7479 
7480             change_invlist = TRUE;
7481 
7482             /* Set all the bits in the range, up to the max that we are doing */
7483             high = (end < NUM_ANYOF_CODE_POINTS - 1)
7484                    ? end
7485                    : NUM_ANYOF_CODE_POINTS - 1;
7486             for (i = start; i <= (int) high; i++) {
7487                 ANYOF_BITMAP_SET(node, i);
7488             }
7489         }
7490         invlist_iterfinish(*invlist_ptr);
7491 
7492         /* Done with loop; remove any code points that are in the bitmap from
7493          * *invlist_ptr */
7494         if (change_invlist) {
7495             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7496         }
7497 
7498         /* If have completely emptied it, remove it completely */
7499         if (_invlist_len(*invlist_ptr) == 0) {
7500             SvREFCNT_dec_NN(*invlist_ptr);
7501             *invlist_ptr = NULL;
7502         }
7503     }
7504 }
7505 
7506 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7507    Character classes ([:foo:]) can also be negated ([:^foo:]).
7508    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7509    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7510    but trigger failures because they are currently unimplemented. */
7511 
7512 #define POSIXCC_DONE(c)   ((c) == ':')
7513 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7514 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7515 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7516 
7517 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
7518 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
7519 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
7520 
7521 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7522 
7523 /* 'posix_warnings' and 'warn_text' are names of variables in the following
7524  * routine. q.v. */
7525 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
7526         if (posix_warnings) {                                               \
7527             if (! RExC_warn_text ) RExC_warn_text =                         \
7528                                          (AV *) sv_2mortal((SV *) newAV()); \
7529             av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
7530                                              WARNING_PREFIX                 \
7531                                              text                           \
7532                                              REPORT_LOCATION,               \
7533                                              REPORT_LOCATION_ARGS(p)));     \
7534         }                                                                   \
7535     } STMT_END
7536 #define CLEAR_POSIX_WARNINGS()                                              \
7537     STMT_START {                                                            \
7538         if (posix_warnings && RExC_warn_text)                               \
7539             av_clear(RExC_warn_text);                                       \
7540     } STMT_END
7541 
7542 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
7543     STMT_START {                                                            \
7544         CLEAR_POSIX_WARNINGS();                                             \
7545         return ret;                                                         \
7546     } STMT_END
7547 
7548 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)7549 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7550 
7551     const char * const s,      /* Where the putative posix class begins.
7552                                   Normally, this is one past the '['.  This
7553                                   parameter exists so it can be somewhere
7554                                   besides RExC_parse. */
7555     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7556                                   NULL */
7557     AV ** posix_warnings,      /* Where to place any generated warnings, or
7558                                   NULL */
7559     const bool check_only      /* Don't die if error */
7560 )
7561 {
7562     /* This parses what the caller thinks may be one of the three POSIX
7563      * constructs:
7564      *  1) a character class, like [:blank:]
7565      *  2) a collating symbol, like [. .]
7566      *  3) an equivalence class, like [= =]
7567      * In the latter two cases, it croaks if it finds a syntactically legal
7568      * one, as these are not handled by Perl.
7569      *
7570      * The main purpose is to look for a POSIX character class.  It returns:
7571      *  a) the class number
7572      *      if it is a completely syntactically and semantically legal class.
7573      *      'updated_parse_ptr', if not NULL, is set to point to just after the
7574      *      closing ']' of the class
7575      *  b) OOB_NAMEDCLASS
7576      *      if it appears that one of the three POSIX constructs was meant, but
7577      *      its specification was somehow defective.  'updated_parse_ptr', if
7578      *      not NULL, is set to point to the character just after the end
7579      *      character of the class.  See below for handling of warnings.
7580      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7581      *      if it  doesn't appear that a POSIX construct was intended.
7582      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
7583      *      raised.
7584      *
7585      * In b) there may be errors or warnings generated.  If 'check_only' is
7586      * TRUE, then any errors are discarded.  Warnings are returned to the
7587      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
7588      * instead it is NULL, warnings are suppressed.
7589      *
7590      * The reason for this function, and its complexity is that a bracketed
7591      * character class can contain just about anything.  But it's easy to
7592      * mistype the very specific posix class syntax but yielding a valid
7593      * regular bracketed class, so it silently gets compiled into something
7594      * quite unintended.
7595      *
7596      * The solution adopted here maintains backward compatibility except that
7597      * it adds a warning if it looks like a posix class was intended but
7598      * improperly specified.  The warning is not raised unless what is input
7599      * very closely resembles one of the 14 legal posix classes.  To do this,
7600      * it uses fuzzy parsing.  It calculates how many single-character edits it
7601      * would take to transform what was input into a legal posix class.  Only
7602      * if that number is quite small does it think that the intention was a
7603      * posix class.  Obviously these are heuristics, and there will be cases
7604      * where it errs on one side or another, and they can be tweaked as
7605      * experience informs.
7606      *
7607      * The syntax for a legal posix class is:
7608      *
7609      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7610      *
7611      * What this routine considers syntactically to be an intended posix class
7612      * is this (the comments indicate some restrictions that the pattern
7613      * doesn't show):
7614      *
7615      *  qr/(?x: \[?                         # The left bracket, possibly
7616      *                                      # omitted
7617      *          \h*                         # possibly followed by blanks
7618      *          (?: \^ \h* )?               # possibly a misplaced caret
7619      *          [:;]?                       # The opening class character,
7620      *                                      # possibly omitted.  A typo
7621      *                                      # semi-colon can also be used.
7622      *          \h*
7623      *          \^?                         # possibly a correctly placed
7624      *                                      # caret, but not if there was also
7625      *                                      # a misplaced one
7626      *          \h*
7627      *          .{3,15}                     # The class name.  If there are
7628      *                                      # deviations from the legal syntax,
7629      *                                      # its edit distance must be close
7630      *                                      # to a real class name in order
7631      *                                      # for it to be considered to be
7632      *                                      # an intended posix class.
7633      *          \h*
7634      *          [[:punct:]]?                # The closing class character,
7635      *                                      # possibly omitted.  If not a colon
7636      *                                      # nor semi colon, the class name
7637      *                                      # must be even closer to a valid
7638      *                                      # one
7639      *          \h*
7640      *          \]?                         # The right bracket, possibly
7641      *                                      # omitted.
7642      *     )/
7643      *
7644      * In the above, \h must be ASCII-only.
7645      *
7646      * These are heuristics, and can be tweaked as field experience dictates.
7647      * There will be cases when someone didn't intend to specify a posix class
7648      * that this warns as being so.  The goal is to minimize these, while
7649      * maximizing the catching of things intended to be a posix class that
7650      * aren't parsed as such.
7651      */
7652 
7653     const char* p             = s;
7654     const char * const e      = RExC_end;
7655     unsigned complement       = 0;      /* If to complement the class */
7656     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
7657     bool has_opening_bracket  = FALSE;
7658     bool has_opening_colon    = FALSE;
7659     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
7660                                                    valid class */
7661     const char * possible_end = NULL;   /* used for a 2nd parse pass */
7662     const char* name_start;             /* ptr to class name first char */
7663 
7664     /* If the number of single-character typos the input name is away from a
7665      * legal name is no more than this number, it is considered to have meant
7666      * the legal name */
7667     int max_distance          = 2;
7668 
7669     /* to store the name.  The size determines the maximum length before we
7670      * decide that no posix class was intended.  Should be at least
7671      * sizeof("alphanumeric") */
7672     UV input_text[15];
7673     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7674 
7675     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7676 
7677     CLEAR_POSIX_WARNINGS();
7678 
7679     if (p >= e) {
7680         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7681     }
7682 
7683     if (*(p - 1) != '[') {
7684         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7685         found_problem = TRUE;
7686     }
7687     else {
7688         has_opening_bracket = TRUE;
7689     }
7690 
7691     /* They could be confused and think you can put spaces between the
7692      * components */
7693     if (isBLANK(*p)) {
7694         found_problem = TRUE;
7695 
7696         do {
7697             p++;
7698         } while (p < e && isBLANK(*p));
7699 
7700         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7701     }
7702 
7703     /* For [. .] and [= =].  These are quite different internally from [: :],
7704      * so they are handled separately.  */
7705     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7706                                             and 1 for at least one char in it
7707                                           */
7708     {
7709         const char open_char  = *p;
7710         const char * temp_ptr = p + 1;
7711 
7712         /* These two constructs are not handled by perl, and if we find a
7713          * syntactically valid one, we croak.  khw, who wrote this code, finds
7714          * this explanation of them very unclear:
7715          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7716          * And searching the rest of the internet wasn't very helpful either.
7717          * It looks like just about any byte can be in these constructs,
7718          * depending on the locale.  But unless the pattern is being compiled
7719          * under /l, which is very rare, Perl runs under the C or POSIX locale.
7720          * In that case, it looks like [= =] isn't allowed at all, and that
7721          * [. .] could be any single code point, but for longer strings the
7722          * constituent characters would have to be the ASCII alphabetics plus
7723          * the minus-hyphen.  Any sensible locale definition would limit itself
7724          * to these.  And any portable one definitely should.  Trying to parse
7725          * the general case is a nightmare (see [perl #127604]).  So, this code
7726          * looks only for interiors of these constructs that match:
7727          *      qr/.|[-\w]{2,}/
7728          * Using \w relaxes the apparent rules a little, without adding much
7729          * danger of mistaking something else for one of these constructs.
7730          *
7731          * [. .] in some implementations described on the internet is usable to
7732          * escape a character that otherwise is special in bracketed character
7733          * classes.  For example [.].] means a literal right bracket instead of
7734          * the ending of the class
7735          *
7736          * [= =] can legitimately contain a [. .] construct, but we don't
7737          * handle this case, as that [. .] construct will later get parsed
7738          * itself and croak then.  And [= =] is checked for even when not under
7739          * /l, as Perl has long done so.
7740          *
7741          * The code below relies on there being a trailing NUL, so it doesn't
7742          * have to keep checking if the parse ptr < e.
7743          */
7744         if (temp_ptr[1] == open_char) {
7745             temp_ptr++;
7746         }
7747         else while (    temp_ptr < e
7748                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7749         {
7750             temp_ptr++;
7751         }
7752 
7753         if (*temp_ptr == open_char) {
7754             temp_ptr++;
7755             if (*temp_ptr == ']') {
7756                 temp_ptr++;
7757                 if (! found_problem && ! check_only) {
7758                     RExC_parse_set((char *) temp_ptr);
7759                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
7760                             "extensions", open_char, open_char);
7761                 }
7762 
7763                 /* Here, the syntax wasn't completely valid, or else the call
7764                  * is to check-only */
7765                 if (updated_parse_ptr) {
7766                     *updated_parse_ptr = (char *) temp_ptr;
7767                 }
7768 
7769                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7770             }
7771         }
7772 
7773         /* If we find something that started out to look like one of these
7774          * constructs, but isn't, we continue below so that it can be checked
7775          * for being a class name with a typo of '.' or '=' instead of a colon.
7776          * */
7777     }
7778 
7779     /* Here, we think there is a possibility that a [: :] class was meant, and
7780      * we have the first real character.  It could be they think the '^' comes
7781      * first */
7782     if (*p == '^') {
7783         found_problem = TRUE;
7784         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7785         complement = 1;
7786         p++;
7787 
7788         if (isBLANK(*p)) {
7789             found_problem = TRUE;
7790 
7791             do {
7792                 p++;
7793             } while (p < e && isBLANK(*p));
7794 
7795             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7796         }
7797     }
7798 
7799     /* But the first character should be a colon, which they could have easily
7800      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7801      * distinguish from a colon, so treat that as a colon).  */
7802     if (*p == ':') {
7803         p++;
7804         has_opening_colon = TRUE;
7805     }
7806     else if (*p == ';') {
7807         found_problem = TRUE;
7808         p++;
7809         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7810         has_opening_colon = TRUE;
7811     }
7812     else {
7813         found_problem = TRUE;
7814         ADD_POSIX_WARNING(p, "there must be a starting ':'");
7815 
7816         /* Consider an initial punctuation (not one of the recognized ones) to
7817          * be a left terminator */
7818         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7819             p++;
7820         }
7821     }
7822 
7823     /* They may think that you can put spaces between the components */
7824     if (isBLANK(*p)) {
7825         found_problem = TRUE;
7826 
7827         do {
7828             p++;
7829         } while (p < e && isBLANK(*p));
7830 
7831         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7832     }
7833 
7834     if (*p == '^') {
7835 
7836         /* We consider something like [^:^alnum:]] to not have been intended to
7837          * be a posix class, but XXX maybe we should */
7838         if (complement) {
7839             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7840         }
7841 
7842         complement = 1;
7843         p++;
7844     }
7845 
7846     /* Again, they may think that you can put spaces between the components */
7847     if (isBLANK(*p)) {
7848         found_problem = TRUE;
7849 
7850         do {
7851             p++;
7852         } while (p < e && isBLANK(*p));
7853 
7854         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7855     }
7856 
7857     if (*p == ']') {
7858 
7859         /* XXX This ']' may be a typo, and something else was meant.  But
7860          * treating it as such creates enough complications, that that
7861          * possibility isn't currently considered here.  So we assume that the
7862          * ']' is what is intended, and if we've already found an initial '[',
7863          * this leaves this construct looking like [:] or [:^], which almost
7864          * certainly weren't intended to be posix classes */
7865         if (has_opening_bracket) {
7866             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7867         }
7868 
7869         /* But this function can be called when we parse the colon for
7870          * something like qr/[alpha:]]/, so we back up to look for the
7871          * beginning */
7872         p--;
7873 
7874         if (*p == ';') {
7875             found_problem = TRUE;
7876             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7877         }
7878         else if (*p != ':') {
7879 
7880             /* XXX We are currently very restrictive here, so this code doesn't
7881              * consider the possibility that, say, /[alpha.]]/ was intended to
7882              * be a posix class. */
7883             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7884         }
7885 
7886         /* Here we have something like 'foo:]'.  There was no initial colon,
7887          * and we back up over 'foo.  XXX Unlike the going forward case, we
7888          * don't handle typos of non-word chars in the middle */
7889         has_opening_colon = FALSE;
7890         p--;
7891 
7892         while (p > RExC_start && isWORDCHAR(*p)) {
7893             p--;
7894         }
7895         p++;
7896 
7897         /* Here, we have positioned ourselves to where we think the first
7898          * character in the potential class is */
7899     }
7900 
7901     /* Now the interior really starts.  There are certain key characters that
7902      * can end the interior, or these could just be typos.  To catch both
7903      * cases, we may have to do two passes.  In the first pass, we keep on
7904      * going unless we come to a sequence that matches
7905      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
7906      * This means it takes a sequence to end the pass, so two typos in a row if
7907      * that wasn't what was intended.  If the class is perfectly formed, just
7908      * this one pass is needed.  We also stop if there are too many characters
7909      * being accumulated, but this number is deliberately set higher than any
7910      * real class.  It is set high enough so that someone who thinks that
7911      * 'alphanumeric' is a correct name would get warned that it wasn't.
7912      * While doing the pass, we keep track of where the key characters were in
7913      * it.  If we don't find an end to the class, and one of the key characters
7914      * was found, we redo the pass, but stop when we get to that character.
7915      * Thus the key character was considered a typo in the first pass, but a
7916      * terminator in the second.  If two key characters are found, we stop at
7917      * the second one in the first pass.  Again this can miss two typos, but
7918      * catches a single one
7919      *
7920      * In the first pass, 'possible_end' starts as NULL, and then gets set to
7921      * point to the first key character.  For the second pass, it starts as -1.
7922      * */
7923 
7924     name_start = p;
7925   parse_name:
7926     {
7927         bool has_blank               = FALSE;
7928         bool has_upper               = FALSE;
7929         bool has_terminating_colon   = FALSE;
7930         bool has_terminating_bracket = FALSE;
7931         bool has_semi_colon          = FALSE;
7932         unsigned int name_len        = 0;
7933         int punct_count              = 0;
7934 
7935         while (p < e) {
7936 
7937             /* Squeeze out blanks when looking up the class name below */
7938             if (isBLANK(*p) ) {
7939                 has_blank = TRUE;
7940                 found_problem = TRUE;
7941                 p++;
7942                 continue;
7943             }
7944 
7945             /* The name will end with a punctuation */
7946             if (isPUNCT(*p)) {
7947                 const char * peek = p + 1;
7948 
7949                 /* Treat any non-']' punctuation followed by a ']' (possibly
7950                  * with intervening blanks) as trying to terminate the class.
7951                  * ']]' is very likely to mean a class was intended (but
7952                  * missing the colon), but the warning message that gets
7953                  * generated shows the error position better if we exit the
7954                  * loop at the bottom (eventually), so skip it here. */
7955                 if (*p != ']') {
7956                     if (peek < e && isBLANK(*peek)) {
7957                         has_blank = TRUE;
7958                         found_problem = TRUE;
7959                         do {
7960                             peek++;
7961                         } while (peek < e && isBLANK(*peek));
7962                     }
7963 
7964                     if (peek < e && *peek == ']') {
7965                         has_terminating_bracket = TRUE;
7966                         if (*p == ':') {
7967                             has_terminating_colon = TRUE;
7968                         }
7969                         else if (*p == ';') {
7970                             has_semi_colon = TRUE;
7971                             has_terminating_colon = TRUE;
7972                         }
7973                         else {
7974                             found_problem = TRUE;
7975                         }
7976                         p = peek + 1;
7977                         goto try_posix;
7978                     }
7979                 }
7980 
7981                 /* Here we have punctuation we thought didn't end the class.
7982                  * Keep track of the position of the key characters that are
7983                  * more likely to have been class-enders */
7984                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
7985 
7986                     /* Allow just one such possible class-ender not actually
7987                      * ending the class. */
7988                     if (possible_end) {
7989                         break;
7990                     }
7991                     possible_end = p;
7992                 }
7993 
7994                 /* If we have too many punctuation characters, no use in
7995                  * keeping going */
7996                 if (++punct_count > max_distance) {
7997                     break;
7998                 }
7999 
8000                 /* Treat the punctuation as a typo. */
8001                 input_text[name_len++] = *p;
8002                 p++;
8003             }
8004             else if (isUPPER(*p)) { /* Use lowercase for lookup */
8005                 input_text[name_len++] = toLOWER(*p);
8006                 has_upper = TRUE;
8007                 found_problem = TRUE;
8008                 p++;
8009             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8010                 input_text[name_len++] = *p;
8011                 p++;
8012             }
8013             else {
8014                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8015                 p+= UTF8SKIP(p);
8016             }
8017 
8018             /* The declaration of 'input_text' is how long we allow a potential
8019              * class name to be, before saying they didn't mean a class name at
8020              * all */
8021             if (name_len >= C_ARRAY_LENGTH(input_text)) {
8022                 break;
8023             }
8024         }
8025 
8026         /* We get to here when the possible class name hasn't been properly
8027          * terminated before:
8028          *   1) we ran off the end of the pattern; or
8029          *   2) found two characters, each of which might have been intended to
8030          *      be the name's terminator
8031          *   3) found so many punctuation characters in the purported name,
8032          *      that the edit distance to a valid one is exceeded
8033          *   4) we decided it was more characters than anyone could have
8034          *      intended to be one. */
8035 
8036         found_problem = TRUE;
8037 
8038         /* In the final two cases, we know that looking up what we've
8039          * accumulated won't lead to a match, even a fuzzy one. */
8040         if (   name_len >= C_ARRAY_LENGTH(input_text)
8041             || punct_count > max_distance)
8042         {
8043             /* If there was an intermediate key character that could have been
8044              * an intended end, redo the parse, but stop there */
8045             if (possible_end && possible_end != (char *) -1) {
8046                 possible_end = (char *) -1; /* Special signal value to say
8047                                                we've done a first pass */
8048                 p = name_start;
8049                 goto parse_name;
8050             }
8051 
8052             /* Otherwise, it can't have meant to have been a class */
8053             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8054         }
8055 
8056         /* If we ran off the end, and the final character was a punctuation
8057          * one, back up one, to look at that final one just below.  Later, we
8058          * will restore the parse pointer if appropriate */
8059         if (name_len && p == e && isPUNCT(*(p-1))) {
8060             p--;
8061             name_len--;
8062         }
8063 
8064         if (p < e && isPUNCT(*p)) {
8065             if (*p == ']') {
8066                 has_terminating_bracket = TRUE;
8067 
8068                 /* If this is a 2nd ']', and the first one is just below this
8069                  * one, consider that to be the real terminator.  This gives a
8070                  * uniform and better positioning for the warning message  */
8071                 if (   possible_end
8072                     && possible_end != (char *) -1
8073                     && *possible_end == ']'
8074                     && name_len && input_text[name_len - 1] == ']')
8075                 {
8076                     name_len--;
8077                     p = possible_end;
8078 
8079                     /* And this is actually equivalent to having done the 2nd
8080                      * pass now, so set it to not try again */
8081                     possible_end = (char *) -1;
8082                 }
8083             }
8084             else {
8085                 if (*p == ':') {
8086                     has_terminating_colon = TRUE;
8087                 }
8088                 else if (*p == ';') {
8089                     has_semi_colon = TRUE;
8090                     has_terminating_colon = TRUE;
8091                 }
8092                 p++;
8093             }
8094         }
8095 
8096     try_posix:
8097 
8098         /* Here, we have a class name to look up.  We can short circuit the
8099          * stuff below for short names that can't possibly be meant to be a
8100          * class name.  (We can do this on the first pass, as any second pass
8101          * will yield an even shorter name) */
8102         if (name_len < 3) {
8103             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8104         }
8105 
8106         /* Find which class it is.  Initially switch on the length of the name.
8107          * */
8108         switch (name_len) {
8109             case 4:
8110                 if (memEQs(name_start, 4, "word")) {
8111                     /* this is not POSIX, this is the Perl \w */
8112                     class_number = ANYOF_WORDCHAR;
8113                 }
8114                 break;
8115             case 5:
8116                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8117                  *                        graph lower print punct space upper
8118                  * Offset 4 gives the best switch position.  */
8119                 switch (name_start[4]) {
8120                     case 'a':
8121                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
8122                             class_number = ANYOF_ALPHA;
8123                         break;
8124                     case 'e':
8125                         if (memBEGINs(name_start, 5, "spac")) /* space */
8126                             class_number = ANYOF_SPACE;
8127                         break;
8128                     case 'h':
8129                         if (memBEGINs(name_start, 5, "grap")) /* graph */
8130                             class_number = ANYOF_GRAPH;
8131                         break;
8132                     case 'i':
8133                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
8134                             class_number = ANYOF_ASCII;
8135                         break;
8136                     case 'k':
8137                         if (memBEGINs(name_start, 5, "blan")) /* blank */
8138                             class_number = ANYOF_BLANK;
8139                         break;
8140                     case 'l':
8141                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8142                             class_number = ANYOF_CNTRL;
8143                         break;
8144                     case 'm':
8145                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8146                             class_number = ANYOF_ALPHANUMERIC;
8147                         break;
8148                     case 'r':
8149                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
8150                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8151                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8152                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8153                         break;
8154                     case 't':
8155                         if (memBEGINs(name_start, 5, "digi")) /* digit */
8156                             class_number = ANYOF_DIGIT;
8157                         else if (memBEGINs(name_start, 5, "prin")) /* print */
8158                             class_number = ANYOF_PRINT;
8159                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
8160                             class_number = ANYOF_PUNCT;
8161                         break;
8162                 }
8163                 break;
8164             case 6:
8165                 if (memEQs(name_start, 6, "xdigit"))
8166                     class_number = ANYOF_XDIGIT;
8167                 break;
8168         }
8169 
8170         /* If the name exactly matches a posix class name the class number will
8171          * here be set to it, and the input almost certainly was meant to be a
8172          * posix class, so we can skip further checking.  If instead the syntax
8173          * is exactly correct, but the name isn't one of the legal ones, we
8174          * will return that as an error below.  But if neither of these apply,
8175          * it could be that no posix class was intended at all, or that one
8176          * was, but there was a typo.  We tease these apart by doing fuzzy
8177          * matching on the name */
8178         if (class_number == OOB_NAMEDCLASS && found_problem) {
8179             const UV posix_names[][6] = {
8180                                                 { 'a', 'l', 'n', 'u', 'm' },
8181                                                 { 'a', 'l', 'p', 'h', 'a' },
8182                                                 { 'a', 's', 'c', 'i', 'i' },
8183                                                 { 'b', 'l', 'a', 'n', 'k' },
8184                                                 { 'c', 'n', 't', 'r', 'l' },
8185                                                 { 'd', 'i', 'g', 'i', 't' },
8186                                                 { 'g', 'r', 'a', 'p', 'h' },
8187                                                 { 'l', 'o', 'w', 'e', 'r' },
8188                                                 { 'p', 'r', 'i', 'n', 't' },
8189                                                 { 'p', 'u', 'n', 'c', 't' },
8190                                                 { 's', 'p', 'a', 'c', 'e' },
8191                                                 { 'u', 'p', 'p', 'e', 'r' },
8192                                                 { 'w', 'o', 'r', 'd' },
8193                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
8194                                             };
8195             /* The names of the above all have added NULs to make them the same
8196              * size, so we need to also have the real lengths */
8197             const UV posix_name_lengths[] = {
8198                                                 sizeof("alnum") - 1,
8199                                                 sizeof("alpha") - 1,
8200                                                 sizeof("ascii") - 1,
8201                                                 sizeof("blank") - 1,
8202                                                 sizeof("cntrl") - 1,
8203                                                 sizeof("digit") - 1,
8204                                                 sizeof("graph") - 1,
8205                                                 sizeof("lower") - 1,
8206                                                 sizeof("print") - 1,
8207                                                 sizeof("punct") - 1,
8208                                                 sizeof("space") - 1,
8209                                                 sizeof("upper") - 1,
8210                                                 sizeof("word")  - 1,
8211                                                 sizeof("xdigit")- 1
8212                                             };
8213             unsigned int i;
8214             int temp_max = max_distance;    /* Use a temporary, so if we
8215                                                reparse, we haven't changed the
8216                                                outer one */
8217 
8218             /* Use a smaller max edit distance if we are missing one of the
8219              * delimiters */
8220             if (   has_opening_bracket + has_opening_colon < 2
8221                 || has_terminating_bracket + has_terminating_colon < 2)
8222             {
8223                 temp_max--;
8224             }
8225 
8226             /* See if the input name is close to a legal one */
8227             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8228 
8229                 /* Short circuit call if the lengths are too far apart to be
8230                  * able to match */
8231                 if (abs( (int) (name_len - posix_name_lengths[i]))
8232                     > temp_max)
8233                 {
8234                     continue;
8235                 }
8236 
8237                 if (edit_distance(input_text,
8238                                   posix_names[i],
8239                                   name_len,
8240                                   posix_name_lengths[i],
8241                                   temp_max
8242                                  )
8243                     > -1)
8244                 { /* If it is close, it probably was intended to be a class */
8245                     goto probably_meant_to_be;
8246                 }
8247             }
8248 
8249             /* Here the input name is not close enough to a valid class name
8250              * for us to consider it to be intended to be a posix class.  If
8251              * we haven't already done so, and the parse found a character that
8252              * could have been terminators for the name, but which we absorbed
8253              * as typos during the first pass, repeat the parse, signalling it
8254              * to stop at that character */
8255             if (possible_end && possible_end != (char *) -1) {
8256                 possible_end = (char *) -1;
8257                 p = name_start;
8258                 goto parse_name;
8259             }
8260 
8261             /* Here neither pass found a close-enough class name */
8262             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8263         }
8264 
8265     probably_meant_to_be:
8266 
8267         /* Here we think that a posix specification was intended.  Update any
8268          * parse pointer */
8269         if (updated_parse_ptr) {
8270             *updated_parse_ptr = (char *) p;
8271         }
8272 
8273         /* If a posix class name was intended but incorrectly specified, we
8274          * output or return the warnings */
8275         if (found_problem) {
8276 
8277             /* We set flags for these issues in the parse loop above instead of
8278              * adding them to the list of warnings, because we can parse it
8279              * twice, and we only want one warning instance */
8280             if (has_upper) {
8281                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8282             }
8283             if (has_blank) {
8284                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8285             }
8286             if (has_semi_colon) {
8287                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8288             }
8289             else if (! has_terminating_colon) {
8290                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8291             }
8292             if (! has_terminating_bracket) {
8293                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8294             }
8295 
8296             if (   posix_warnings
8297                 && RExC_warn_text
8298                 && av_count(RExC_warn_text) > 0)
8299             {
8300                 *posix_warnings = RExC_warn_text;
8301             }
8302         }
8303         else if (class_number != OOB_NAMEDCLASS) {
8304             /* If it is a known class, return the class.  The class number
8305              * #defines are structured so each complement is +1 to the normal
8306              * one */
8307             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8308         }
8309         else if (! check_only) {
8310 
8311             /* Here, it is an unrecognized class.  This is an error (unless the
8312             * call is to check only, which we've already handled above) */
8313             const char * const complement_string = (complement)
8314                                                    ? "^"
8315                                                    : "";
8316             RExC_parse_set((char *) p);
8317             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8318                         complement_string,
8319                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8320         }
8321     }
8322 
8323     return OOB_NAMEDCLASS;
8324 }
8325 #undef ADD_POSIX_WARNING
8326 
8327 STATIC unsigned  int
S_regex_set_precedence(const U8 my_operator)8328 S_regex_set_precedence(const U8 my_operator) {
8329 
8330     /* Returns the precedence in the (?[...]) construct of the input operator,
8331      * specified by its character representation.  The precedence follows
8332      * general Perl rules, but it extends this so that ')' and ']' have (low)
8333      * precedence even though they aren't really operators */
8334 
8335     switch (my_operator) {
8336         case '!':
8337             return 5;
8338         case '&':
8339             return 4;
8340         case '^':
8341         case '|':
8342         case '+':
8343         case '-':
8344             return 3;
8345         case ')':
8346             return 2;
8347         case ']':
8348             return 1;
8349     }
8350 
8351     NOT_REACHED; /* NOTREACHED */
8352     return 0;   /* Silence compiler warning */
8353 }
8354 
8355 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth)8356 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8357                     I32 *flagp, U32 depth)
8358 {
8359     /* Handle the (?[...]) construct to do set operations */
8360 
8361     U8 curchar;                     /* Current character being parsed */
8362     UV start, end;	            /* End points of code point ranges */
8363     SV* final = NULL;               /* The end result inversion list */
8364     SV* result_string;              /* 'final' stringified */
8365     AV* stack;                      /* stack of operators and operands not yet
8366                                        resolved */
8367     AV* fence_stack = NULL;         /* A stack containing the positions in
8368                                        'stack' of where the undealt-with left
8369                                        parens would be if they were actually
8370                                        put there */
8371     /* The 'volatile' is a workaround for an optimiser bug
8372      * in Solaris Studio 12.3. See RT #127455 */
8373     volatile IV fence = 0;          /* Position of where most recent undealt-
8374                                        with left paren in stack is; -1 if none.
8375                                      */
8376     STRLEN len;                     /* Temporary */
8377     regnode_offset node;            /* Temporary, and final regnode returned by
8378                                        this function */
8379     const bool save_fold = FOLD;    /* Temporary */
8380     char *save_end, *save_parse;    /* Temporaries */
8381     const bool in_locale = LOC;     /* we turn off /l during processing */
8382 
8383     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8384 
8385     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8386 
8387     DEBUG_PARSE("xcls");
8388 
8389     if (in_locale) {
8390         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8391     }
8392 
8393     /* The use of this operator implies /u.  This is required so that the
8394      * compile time values are valid in all runtime cases */
8395     REQUIRE_UNI_RULES(flagp, 0);
8396 
8397     /* Everything in this construct is a metacharacter.  Operands begin with
8398      * either a '\' (for an escape sequence), or a '[' for a bracketed
8399      * character class.  Any other character should be an operator, or
8400      * parenthesis for grouping.  Both types of operands are handled by calling
8401      * regclass() to parse them.  It is called with a parameter to indicate to
8402      * return the computed inversion list.  The parsing here is implemented via
8403      * a stack.  Each entry on the stack is a single character representing one
8404      * of the operators; or else a pointer to an operand inversion list. */
8405 
8406 #define IS_OPERATOR(a) SvIOK(a)
8407 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
8408 
8409     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
8410      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8411      * with pronouncing it called it Reverse Polish instead, but now that YOU
8412      * know how to pronounce it you can use the correct term, thus giving due
8413      * credit to the person who invented it, and impressing your geek friends.
8414      * Wikipedia says that the pronunciation of "Ł" has been changing so that
8415      * it is now more like an English initial W (as in wonk) than an L.)
8416      *
8417      * This means that, for example, 'a | b & c' is stored on the stack as
8418      *
8419      * c  [4]
8420      * b  [3]
8421      * &  [2]
8422      * a  [1]
8423      * |  [0]
8424      *
8425      * where the numbers in brackets give the stack [array] element number.
8426      * In this implementation, parentheses are not stored on the stack.
8427      * Instead a '(' creates a "fence" so that the part of the stack below the
8428      * fence is invisible except to the corresponding ')' (this allows us to
8429      * replace testing for parens, by using instead subtraction of the fence
8430      * position).  As new operands are processed they are pushed onto the stack
8431      * (except as noted in the next paragraph).  New operators of higher
8432      * precedence than the current final one are inserted on the stack before
8433      * the lhs operand (so that when the rhs is pushed next, everything will be
8434      * in the correct positions shown above.  When an operator of equal or
8435      * lower precedence is encountered in parsing, all the stacked operations
8436      * of equal or higher precedence are evaluated, leaving the result as the
8437      * top entry on the stack.  This makes higher precedence operations
8438      * evaluate before lower precedence ones, and causes operations of equal
8439      * precedence to left associate.
8440      *
8441      * The only unary operator '!' is immediately pushed onto the stack when
8442      * encountered.  When an operand is encountered, if the top of the stack is
8443      * a '!", the complement is immediately performed, and the '!' popped.  The
8444      * resulting value is treated as a new operand, and the logic in the
8445      * previous paragraph is executed.  Thus in the expression
8446      *      [a] + ! [b]
8447      * the stack looks like
8448      *
8449      * !
8450      * a
8451      * +
8452      *
8453      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8454      * becomes
8455      *
8456      * !b
8457      * a
8458      * +
8459      *
8460      * A ')' is treated as an operator with lower precedence than all the
8461      * aforementioned ones, which causes all operations on the stack above the
8462      * corresponding '(' to be evaluated down to a single resultant operand.
8463      * Then the fence for the '(' is removed, and the operand goes through the
8464      * algorithm above, without the fence.
8465      *
8466      * A separate stack is kept of the fence positions, so that the position of
8467      * the latest so-far unbalanced '(' is at the top of it.
8468      *
8469      * The ']' ending the construct is treated as the lowest operator of all,
8470      * so that everything gets evaluated down to a single operand, which is the
8471      * result */
8472 
8473     stack = (AV*)newSV_type_mortal(SVt_PVAV);
8474     fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8475 
8476     while (RExC_parse < RExC_end) {
8477         I32 top_index;              /* Index of top-most element in 'stack' */
8478         SV** top_ptr;               /* Pointer to top 'stack' element */
8479         SV* current = NULL;         /* To contain the current inversion list
8480                                        operand */
8481         SV* only_to_avoid_leaks;
8482 
8483         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8484                                 TRUE /* Force /x */ );
8485         if (RExC_parse >= RExC_end) {   /* Fail */
8486             break;
8487         }
8488 
8489         curchar = UCHARAT(RExC_parse);
8490 
8491 redo_curchar:
8492 
8493 #ifdef ENABLE_REGEX_SETS_DEBUGGING
8494                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8495         DEBUG_U(dump_regex_sets_structures(pRExC_state,
8496                                            stack, fence, fence_stack));
8497 #endif
8498 
8499         top_index = av_tindex_skip_len_mg(stack);
8500 
8501         switch (curchar) {
8502             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
8503             char stacked_operator;  /* The topmost operator on the 'stack'. */
8504             SV* lhs;                /* Operand to the left of the operator */
8505             SV* rhs;                /* Operand to the right of the operator */
8506             SV* fence_ptr;          /* Pointer to top element of the fence
8507                                        stack */
8508             case '(':
8509 
8510                 if (   RExC_parse < RExC_end - 2
8511                     && UCHARAT(RExC_parse + 1) == '?'
8512                     && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8513                 {
8514                     const regnode_offset orig_emit = RExC_emit;
8515                     SV * resultant_invlist;
8516 
8517                     /* Here it could be an embedded '(?flags:(?[...])'.
8518                      * This happens when we have some thing like
8519                      *
8520                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8521                      *   ...
8522                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
8523                      *
8524                      * Here we would be handling the interpolated
8525                      * '$thai_or_lao'.  We handle this by a recursive call to
8526                      * reg which returns the inversion list the
8527                      * interpolated expression evaluates to.  Actually, the
8528                      * return is a special regnode containing a pointer to that
8529                      * inversion list.  If the return isn't that regnode alone,
8530                      * we know that this wasn't such an interpolation, which is
8531                      * an error: we need to get a single inversion list back
8532                      * from the recursion */
8533 
8534                     RExC_parse_inc_by(1);
8535                     RExC_sets_depth++;
8536 
8537                     node = reg(pRExC_state, 2, flagp, depth+1);
8538                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8539 
8540                     if (   OP(REGNODE_p(node)) != REGEX_SET
8541                            /* If more than a single node returned, the nested
8542                             * parens evaluated to more than just a (?[...]),
8543                             * which isn't legal */
8544                         || RExC_emit != orig_emit
8545                                       + NODE_STEP_REGNODE
8546                                       + REGNODE_ARG_LEN(REGEX_SET))
8547                     {
8548                         vFAIL("Expecting interpolated extended charclass");
8549                     }
8550                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8551                     current = invlist_clone(resultant_invlist, NULL);
8552                     SvREFCNT_dec(resultant_invlist);
8553 
8554                     RExC_sets_depth--;
8555                     RExC_emit = orig_emit;
8556                     goto handle_operand;
8557                 }
8558 
8559                 /* A regular '('.  Look behind for illegal syntax */
8560                 if (top_index - fence >= 0) {
8561                     /* If the top entry on the stack is an operator, it had
8562                      * better be a '!', otherwise the entry below the top
8563                      * operand should be an operator */
8564                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
8565                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8566                         || (   IS_OPERAND(*top_ptr)
8567                             && (   top_index - fence < 1
8568                                 || ! (stacked_ptr = av_fetch(stack,
8569                                                              top_index - 1,
8570                                                              FALSE))
8571                                 || ! IS_OPERATOR(*stacked_ptr))))
8572                     {
8573                         RExC_parse_inc_by(1);
8574                         vFAIL("Unexpected '(' with no preceding operator");
8575                     }
8576                 }
8577 
8578                 /* Stack the position of this undealt-with left paren */
8579                 av_push_simple(fence_stack, newSViv(fence));
8580                 fence = top_index + 1;
8581                 break;
8582 
8583             case '\\':
8584                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8585                  * multi-char folds are allowed.  */
8586                 if (!regclass(pRExC_state, flagp, depth+1,
8587                               TRUE, /* means parse just the next thing */
8588                               FALSE, /* don't allow multi-char folds */
8589                               FALSE, /* don't silence non-portable warnings.  */
8590                               TRUE,  /* strict */
8591                               FALSE, /* Require return to be an ANYOF */
8592                               &current))
8593                 {
8594                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8595                     goto regclass_failed;
8596                 }
8597 
8598                 assert(current);
8599 
8600                 /* regclass() will return with parsing just the \ sequence,
8601                  * leaving the parse pointer at the next thing to parse */
8602                 RExC_parse--;
8603                 goto handle_operand;
8604 
8605             case '[':   /* Is a bracketed character class */
8606             {
8607                 /* See if this is a [:posix:] class. */
8608                 bool is_posix_class = (OOB_NAMEDCLASS
8609                             < handle_possible_posix(pRExC_state,
8610                                                 RExC_parse + 1,
8611                                                 NULL,
8612                                                 NULL,
8613                                                 TRUE /* checking only */));
8614                 /* If it is a posix class, leave the parse pointer at the '['
8615                  * to fool regclass() into thinking it is part of a
8616                  * '[[:posix:]]'. */
8617                 if (! is_posix_class) {
8618                     RExC_parse_inc_by(1);
8619                 }
8620 
8621                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8622                  * multi-char folds are allowed.  */
8623                 if (!regclass(pRExC_state, flagp, depth+1,
8624                                 is_posix_class, /* parse the whole char
8625                                                     class only if not a
8626                                                     posix class */
8627                                 FALSE, /* don't allow multi-char folds */
8628                                 TRUE, /* silence non-portable warnings. */
8629                                 TRUE, /* strict */
8630                                 FALSE, /* Require return to be an ANYOF */
8631                                 &current))
8632                 {
8633                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
8634                     goto regclass_failed;
8635                 }
8636 
8637                 assert(current);
8638 
8639                 /* function call leaves parse pointing to the ']', except if we
8640                  * faked it */
8641                 if (is_posix_class) {
8642                     RExC_parse--;
8643                 }
8644 
8645                 goto handle_operand;
8646             }
8647 
8648             case ']':
8649                 if (top_index >= 1) {
8650                     goto join_operators;
8651                 }
8652 
8653                 /* Only a single operand on the stack: are done */
8654                 goto done;
8655 
8656             case ')':
8657                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8658                     if (UCHARAT(RExC_parse - 1) == ']')  {
8659                         break;
8660                     }
8661                     RExC_parse_inc_by(1);
8662                     vFAIL("Unexpected ')'");
8663                 }
8664 
8665                 /* If nothing after the fence, is missing an operand */
8666                 if (top_index - fence < 0) {
8667                     RExC_parse_inc_by(1);
8668                     goto bad_syntax;
8669                 }
8670                 /* If at least two things on the stack, treat this as an
8671                   * operator */
8672                 if (top_index - fence >= 1) {
8673                     goto join_operators;
8674                 }
8675 
8676                 /* Here only a single thing on the fenced stack, and there is a
8677                  * fence.  Get rid of it */
8678                 fence_ptr = av_pop(fence_stack);
8679                 assert(fence_ptr);
8680                 fence = SvIV(fence_ptr);
8681                 SvREFCNT_dec_NN(fence_ptr);
8682                 fence_ptr = NULL;
8683 
8684                 if (fence < 0) {
8685                     fence = 0;
8686                 }
8687 
8688                 /* Having gotten rid of the fence, we pop the operand at the
8689                  * stack top and process it as a newly encountered operand */
8690                 current = av_pop(stack);
8691                 if (IS_OPERAND(current)) {
8692                     goto handle_operand;
8693                 }
8694 
8695                 RExC_parse_inc_by(1);
8696                 goto bad_syntax;
8697 
8698             case '&':
8699             case '|':
8700             case '+':
8701             case '-':
8702             case '^':
8703 
8704                 /* These binary operators should have a left operand already
8705                  * parsed */
8706                 if (   top_index - fence < 0
8707                     || top_index - fence == 1
8708                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8709                     || ! IS_OPERAND(*top_ptr))
8710                 {
8711                     goto unexpected_binary;
8712                 }
8713 
8714                 /* If only the one operand is on the part of the stack visible
8715                  * to us, we just place this operator in the proper position */
8716                 if (top_index - fence < 2) {
8717 
8718                     /* Place the operator before the operand */
8719 
8720                     SV* lhs = av_pop(stack);
8721                     av_push_simple(stack, newSVuv(curchar));
8722                     av_push_simple(stack, lhs);
8723                     break;
8724                 }
8725 
8726                 /* But if there is something else on the stack, we need to
8727                  * process it before this new operator if and only if the
8728                  * stacked operation has equal or higher precedence than the
8729                  * new one */
8730 
8731              join_operators:
8732 
8733                 /* The operator on the stack is supposed to be below both its
8734                  * operands */
8735                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8736                     || IS_OPERAND(*stacked_ptr))
8737                 {
8738                     /* But if not, it's legal and indicates we are completely
8739                      * done if and only if we're currently processing a ']',
8740                      * which should be the final thing in the expression */
8741                     if (curchar == ']') {
8742                         goto done;
8743                     }
8744 
8745                   unexpected_binary:
8746                     RExC_parse_inc_by(1);
8747                     vFAIL2("Unexpected binary operator '%c' with no "
8748                            "preceding operand", curchar);
8749                 }
8750                 stacked_operator = (char) SvUV(*stacked_ptr);
8751 
8752                 if (regex_set_precedence(curchar)
8753                     > regex_set_precedence(stacked_operator))
8754                 {
8755                     /* Here, the new operator has higher precedence than the
8756                      * stacked one.  This means we need to add the new one to
8757                      * the stack to await its rhs operand (and maybe more
8758                      * stuff).  We put it before the lhs operand, leaving
8759                      * untouched the stacked operator and everything below it
8760                      * */
8761                     lhs = av_pop(stack);
8762                     assert(IS_OPERAND(lhs));
8763                     av_push_simple(stack, newSVuv(curchar));
8764                     av_push_simple(stack, lhs);
8765                     break;
8766                 }
8767 
8768                 /* Here, the new operator has equal or lower precedence than
8769                  * what's already there.  This means the operation already
8770                  * there should be performed now, before the new one. */
8771 
8772                 rhs = av_pop(stack);
8773                 if (! IS_OPERAND(rhs)) {
8774 
8775                     /* This can happen when a ! is not followed by an operand,
8776                      * like in /(?[\t &!])/ */
8777                     goto bad_syntax;
8778                 }
8779 
8780                 lhs = av_pop(stack);
8781 
8782                 if (! IS_OPERAND(lhs)) {
8783 
8784                     /* This can happen when there is an empty (), like in
8785                      * /(?[[0]+()+])/ */
8786                     goto bad_syntax;
8787                 }
8788 
8789                 switch (stacked_operator) {
8790                     case '&':
8791                         _invlist_intersection(lhs, rhs, &rhs);
8792                         break;
8793 
8794                     case '|':
8795                     case '+':
8796                         _invlist_union(lhs, rhs, &rhs);
8797                         break;
8798 
8799                     case '-':
8800                         _invlist_subtract(lhs, rhs, &rhs);
8801                         break;
8802 
8803                     case '^':   /* The union minus the intersection */
8804                     {
8805                         SV* i = NULL;
8806                         SV* u = NULL;
8807 
8808                         _invlist_union(lhs, rhs, &u);
8809                         _invlist_intersection(lhs, rhs, &i);
8810                         _invlist_subtract(u, i, &rhs);
8811                         SvREFCNT_dec_NN(i);
8812                         SvREFCNT_dec_NN(u);
8813                         break;
8814                     }
8815                 }
8816                 SvREFCNT_dec(lhs);
8817 
8818                 /* Here, the higher precedence operation has been done, and the
8819                  * result is in 'rhs'.  We overwrite the stacked operator with
8820                  * the result.  Then we redo this code to either push the new
8821                  * operator onto the stack or perform any higher precedence
8822                  * stacked operation */
8823                 only_to_avoid_leaks = av_pop(stack);
8824                 SvREFCNT_dec(only_to_avoid_leaks);
8825                 av_push_simple(stack, rhs);
8826                 goto redo_curchar;
8827 
8828             case '!':   /* Highest priority, right associative */
8829 
8830                 /* If what's already at the top of the stack is another '!",
8831                  * they just cancel each other out */
8832                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
8833                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8834                 {
8835                     only_to_avoid_leaks = av_pop(stack);
8836                     SvREFCNT_dec(only_to_avoid_leaks);
8837                 }
8838                 else { /* Otherwise, since it's right associative, just push
8839                           onto the stack */
8840                     av_push_simple(stack, newSVuv(curchar));
8841                 }
8842                 break;
8843 
8844             default:
8845                 RExC_parse_inc();
8846                 if (RExC_parse >= RExC_end) {
8847                     break;
8848                 }
8849                 vFAIL("Unexpected character");
8850 
8851           handle_operand:
8852 
8853             /* Here 'current' is the operand.  If something is already on the
8854              * stack, we have to check if it is a !.  But first, the code above
8855              * may have altered the stack in the time since we earlier set
8856              * 'top_index'.  */
8857 
8858             top_index = av_tindex_skip_len_mg(stack);
8859             if (top_index - fence >= 0) {
8860                 /* If the top entry on the stack is an operator, it had better
8861                  * be a '!', otherwise the entry below the top operand should
8862                  * be an operator */
8863                 top_ptr = av_fetch(stack, top_index, FALSE);
8864                 assert(top_ptr);
8865                 if (IS_OPERATOR(*top_ptr)) {
8866 
8867                     /* The only permissible operator at the top of the stack is
8868                      * '!', which is applied immediately to this operand. */
8869                     curchar = (char) SvUV(*top_ptr);
8870                     if (curchar != '!') {
8871                         SvREFCNT_dec(current);
8872                         vFAIL2("Unexpected binary operator '%c' with no "
8873                                 "preceding operand", curchar);
8874                     }
8875 
8876                     _invlist_invert(current);
8877 
8878                     only_to_avoid_leaks = av_pop(stack);
8879                     SvREFCNT_dec(only_to_avoid_leaks);
8880 
8881                     /* And we redo with the inverted operand.  This allows
8882                      * handling multiple ! in a row */
8883                     goto handle_operand;
8884                 }
8885                           /* Single operand is ok only for the non-binary ')'
8886                            * operator */
8887                 else if ((top_index - fence == 0 && curchar != ')')
8888                          || (top_index - fence > 0
8889                              && (! (stacked_ptr = av_fetch(stack,
8890                                                            top_index - 1,
8891                                                            FALSE))
8892                                  || IS_OPERAND(*stacked_ptr))))
8893                 {
8894                     SvREFCNT_dec(current);
8895                     vFAIL("Operand with no preceding operator");
8896                 }
8897             }
8898 
8899             /* Here there was nothing on the stack or the top element was
8900              * another operand.  Just add this new one */
8901             av_push_simple(stack, current);
8902 
8903         } /* End of switch on next parse token */
8904 
8905         RExC_parse_inc();
8906     } /* End of loop parsing through the construct */
8907 
8908     vFAIL("Syntax error in (?[...])");
8909 
8910   done:
8911 
8912     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
8913         if (RExC_parse < RExC_end) {
8914             RExC_parse_inc_by(1);
8915         }
8916 
8917         vFAIL("Unexpected ']' with no following ')' in (?[...");
8918     }
8919 
8920     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
8921         vFAIL("Unmatched (");
8922     }
8923 
8924     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
8925         || ((final = av_pop(stack)) == NULL)
8926         || ! IS_OPERAND(final)
8927         || ! is_invlist(final)
8928         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
8929     {
8930       bad_syntax:
8931         SvREFCNT_dec(final);
8932         vFAIL("Incomplete expression within '(?[ ])'");
8933     }
8934 
8935     /* Here, 'final' is the resultant inversion list from evaluating the
8936      * expression.  Return it if so requested */
8937     if (return_invlist) {
8938         *return_invlist = final;
8939         return END;
8940     }
8941 
8942     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
8943                                regnode */
8944         RExC_parse_inc_by(1);
8945         node = regpnode(pRExC_state, REGEX_SET, final);
8946     }
8947     else {
8948 
8949         /* Otherwise generate a resultant node, based on 'final'.  regclass()
8950          * is expecting a string of ranges and individual code points */
8951         invlist_iterinit(final);
8952         result_string = newSVpvs("");
8953         while (invlist_iternext(final, &start, &end)) {
8954             if (start == end) {
8955                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
8956             }
8957             else {
8958                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
8959                                                         UVXf "}", start, end);
8960             }
8961         }
8962 
8963         /* About to generate an ANYOF (or similar) node from the inversion list
8964          * we have calculated */
8965         save_parse = RExC_parse;
8966         RExC_parse_set(SvPV(result_string, len));
8967         save_end = RExC_end;
8968         RExC_end = RExC_parse + len;
8969         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
8970 
8971         /* We turn off folding around the call, as the class we have
8972          * constructed already has all folding taken into consideration, and we
8973          * don't want regclass() to add to that */
8974         RExC_flags &= ~RXf_PMf_FOLD;
8975         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
8976          * folds are allowed.  */
8977         node = regclass(pRExC_state, flagp, depth+1,
8978                         FALSE, /* means parse the whole char class */
8979                         FALSE, /* don't allow multi-char folds */
8980                         TRUE, /* silence non-portable warnings.  The above may
8981                                  very well have generated non-portable code
8982                                  points, but they're valid on this machine */
8983                         FALSE, /* similarly, no need for strict */
8984 
8985                         /* We can optimize into something besides an ANYOF,
8986                          * except under /l, which needs to be ANYOF because of
8987                          * runtime checks for locale sanity, etc */
8988                     ! in_locale,
8989                         NULL
8990                     );
8991 
8992         RESTORE_WARNINGS;
8993         RExC_parse_set(save_parse + 1);
8994         RExC_end = save_end;
8995         SvREFCNT_dec_NN(final);
8996         SvREFCNT_dec_NN(result_string);
8997 
8998         if (save_fold) {
8999             RExC_flags |= RXf_PMf_FOLD;
9000         }
9001 
9002         if (!node) {
9003             RETURN_FAIL_ON_RESTART(*flagp, flagp);
9004             goto regclass_failed;
9005         }
9006 
9007         /* Fix up the node type if we are in locale.  (We have pretended we are
9008          * under /u for the purposes of regclass(), as this construct will only
9009          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
9010          * (so as to cause any warnings about bad locales to be output in
9011          * regexec.c), and add the flag that indicates to check if not in a
9012          * UTF-8 locale.  The reason we above forbid optimization into
9013          * something other than an ANYOF node is simply to minimize the number
9014          * of code changes in regexec.c.  Otherwise we would have to create new
9015          * EXACTish node types and deal with them.  This decision could be
9016          * revisited should this construct become popular.
9017          *
9018          * (One might think we could look at the resulting ANYOF node and
9019          * suppress the flag if everything is above 255, as those would be
9020          * UTF-8 only, but this isn't true, as the components that led to that
9021          * result could have been locale-affected, and just happen to cancel
9022          * each other out under UTF-8 locales.) */
9023         if (in_locale) {
9024             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9025 
9026             assert(OP(REGNODE_p(node)) == ANYOF);
9027 
9028             OP(REGNODE_p(node)) = ANYOFL;
9029             ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9030         }
9031     }
9032 
9033     nextchar(pRExC_state);
9034     return node;
9035 
9036   regclass_failed:
9037     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9038                                                                 (UV) *flagp);
9039 }
9040 
9041 #ifdef ENABLE_REGEX_SETS_DEBUGGING
9042 
9043 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)9044 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9045                              AV * stack, const IV fence, AV * fence_stack)
9046 {   /* Dumps the stacks in handle_regex_sets() */
9047 
9048     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9049     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9050     SSize_t i;
9051 
9052     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9053 
9054     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9055 
9056     if (stack_top < 0) {
9057         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9058     }
9059     else {
9060         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9061         for (i = stack_top; i >= 0; i--) {
9062             SV ** element_ptr = av_fetch(stack, i, FALSE);
9063             if (! element_ptr) {
9064             }
9065 
9066             if (IS_OPERATOR(*element_ptr)) {
9067                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9068                                             (int) i, (int) SvIV(*element_ptr));
9069             }
9070             else {
9071                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9072                 sv_dump(*element_ptr);
9073             }
9074         }
9075     }
9076 
9077     if (fence_stack_top < 0) {
9078         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9079     }
9080     else {
9081         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9082         for (i = fence_stack_top; i >= 0; i--) {
9083             SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9084             if (! element_ptr) {
9085             }
9086 
9087             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9088                                             (int) i, (int) SvIV(*element_ptr));
9089         }
9090     }
9091 }
9092 
9093 #endif
9094 
9095 #undef IS_OPERATOR
9096 #undef IS_OPERAND
9097 
9098 void
Perl_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)9099 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9100 {
9101     /* This adds the Latin1/above-Latin1 folding rules.
9102      *
9103      * This should be called only for a Latin1-range code points, cp, which is
9104      * known to be involved in a simple fold with other code points above
9105      * Latin1.  It would give false results if /aa has been specified.
9106      * Multi-char folds are outside the scope of this, and must be handled
9107      * specially. */
9108 
9109     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9110 
9111     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9112 
9113     /* The rules that are valid for all Unicode versions are hard-coded in */
9114     switch (cp) {
9115         case 'k':
9116         case 'K':
9117           *invlist =
9118              add_cp_to_invlist(*invlist, KELVIN_SIGN);
9119             break;
9120         case 's':
9121         case 'S':
9122           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9123             break;
9124         case MICRO_SIGN:
9125           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9126           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9127             break;
9128         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9129         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9130           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9131             break;
9132         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9133           *invlist = add_cp_to_invlist(*invlist,
9134                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9135             break;
9136 
9137         default:    /* Other code points are checked against the data for the
9138                        current Unicode version */
9139           {
9140             Size_t folds_count;
9141             U32 first_fold;
9142             const U32 * remaining_folds;
9143             UV folded_cp;
9144 
9145             if (isASCII(cp)) {
9146                 folded_cp = toFOLD(cp);
9147             }
9148             else {
9149                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9150                 Size_t dummy_len;
9151                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9152             }
9153 
9154             if (folded_cp > 255) {
9155                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9156             }
9157 
9158             folds_count = _inverse_folds(folded_cp, &first_fold,
9159                                                     &remaining_folds);
9160             if (folds_count == 0) {
9161 
9162                 /* Use deprecated warning to increase the chances of this being
9163                  * output */
9164                 ckWARN2reg_d(RExC_parse,
9165                         "Perl folding rules are not up-to-date for 0x%02X;"
9166                         " please use the perlbug utility to report;", cp);
9167             }
9168             else {
9169                 unsigned int i;
9170 
9171                 if (first_fold > 255) {
9172                     *invlist = add_cp_to_invlist(*invlist, first_fold);
9173                 }
9174                 for (i = 0; i < folds_count - 1; i++) {
9175                     if (remaining_folds[i] > 255) {
9176                         *invlist = add_cp_to_invlist(*invlist,
9177                                                     remaining_folds[i]);
9178                     }
9179                 }
9180             }
9181             break;
9182          }
9183     }
9184 }
9185 
9186 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)9187 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9188 {
9189     /* Output the elements of the array given by '*posix_warnings' as REGEXP
9190      * warnings. */
9191 
9192     SV * msg;
9193     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9194 
9195     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9196 
9197     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9198         CLEAR_POSIX_WARNINGS();
9199         return;
9200     }
9201 
9202     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9203         if (first_is_fatal) {           /* Avoid leaking this */
9204             av_undef(posix_warnings);   /* This isn't necessary if the
9205                                             array is mortal, but is a
9206                                             fail-safe */
9207             (void) sv_2mortal(msg);
9208             PREPARE_TO_DIE;
9209         }
9210         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9211         SvREFCNT_dec_NN(msg);
9212     }
9213 
9214     UPDATE_WARNINGS_LOC(RExC_parse);
9215 }
9216 
9217 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)9218 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9219 {
9220     const U8 * const start = s1;
9221     const U8 * const send = start + max;
9222 
9223     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9224 
9225     while (s1 < send && *s1  == *s2) {
9226         s1++; s2++;
9227     }
9228 
9229     return s1 - start;
9230 }
9231 
9232 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)9233 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9234 {
9235     /* This adds the string scalar <multi_string> to the array
9236      * <multi_char_matches>.  <multi_string> is known to have exactly
9237      * <cp_count> code points in it.  This is used when constructing a
9238      * bracketed character class and we find something that needs to match more
9239      * than a single character.
9240      *
9241      * <multi_char_matches> is actually an array of arrays.  Each top-level
9242      * element is an array that contains all the strings known so far that are
9243      * the same length.  And that length (in number of code points) is the same
9244      * as the index of the top-level array.  Hence, the [2] element is an
9245      * array, each element thereof is a string containing TWO code points;
9246      * while element [3] is for strings of THREE characters, and so on.  Since
9247      * this is for multi-char strings there can never be a [0] nor [1] element.
9248      *
9249      * When we rewrite the character class below, we will do so such that the
9250      * longest strings are written first, so that it prefers the longest
9251      * matching strings first.  This is done even if it turns out that any
9252      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
9253      * Christiansen has agreed that this is ok.  This makes the test for the
9254      * ligature 'ffi' come before the test for 'ff', for example */
9255 
9256     AV* this_array;
9257     AV** this_array_ptr;
9258 
9259     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9260 
9261     if (! multi_char_matches) {
9262         multi_char_matches = newAV();
9263     }
9264 
9265     if (av_exists(multi_char_matches, cp_count)) {
9266         this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9267         this_array = *this_array_ptr;
9268     }
9269     else {
9270         this_array = newAV();
9271         av_store_simple(multi_char_matches, cp_count,
9272                  (SV*) this_array);
9273     }
9274     av_push_simple(this_array, multi_string);
9275 
9276     return multi_char_matches;
9277 }
9278 
9279 /* The names of properties whose definitions are not known at compile time are
9280  * stored in this SV, after a constant heading.  So if the length has been
9281  * changed since initialization, then there is a run-time definition. */
9282 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
9283                                         (SvCUR(listsv) != initial_listsv_len)
9284 
9285 /* There is a restricted set of white space characters that are legal when
9286  * ignoring white space in a bracketed character class.  This generates the
9287  * code to skip them.
9288  *
9289  * There is a line below that uses the same white space criteria but is outside
9290  * this macro.  Both here and there must use the same definition */
9291 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
9292     STMT_START {                                                        \
9293         if (do_skip) {                                                  \
9294             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
9295             {                                                           \
9296                 p++;                                                    \
9297             }                                                           \
9298         }                                                               \
9299     } STMT_END
9300 
9301 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)9302 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9303                  const bool stop_at_1,  /* Just parse the next thing, don't
9304                                            look for a full character class */
9305                  bool allow_mutiple_chars,
9306                  const bool silence_non_portable,   /* Don't output warnings
9307                                                        about too large
9308                                                        characters */
9309                  const bool strict,
9310                  bool optimizable,                  /* ? Allow a non-ANYOF return
9311                                                        node */
9312                  SV** ret_invlist  /* Return an inversion list, not a node */
9313           )
9314 {
9315     /* parse a bracketed class specification.  Most of these will produce an
9316      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9317      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
9318      * under /i with multi-character folds: it will be rewritten following the
9319      * paradigm of this example, where the <multi-fold>s are characters which
9320      * fold to multiple character sequences:
9321      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9322      * gets effectively rewritten as:
9323      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9324      * reg() gets called (recursively) on the rewritten version, and this
9325      * function will return what it constructs.  (Actually the <multi-fold>s
9326      * aren't physically removed from the [abcdefghi], it's just that they are
9327      * ignored in the recursion by means of a flag:
9328      * <RExC_in_multi_char_class>.)
9329      *
9330      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9331      * characters, with the corresponding bit set if that character is in the
9332      * list.  For characters above this, an inversion list is used.  There
9333      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9334      * determinable at compile time
9335      *
9336      * On success, returns the offset at which any next node should be placed
9337      * into the regex engine program being compiled.
9338      *
9339      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9340      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9341      * UTF-8
9342      */
9343 
9344     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9345     IV range = 0;
9346     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9347     regnode_offset ret = -1;    /* Initialized to an illegal value */
9348     STRLEN numlen;
9349     int namedclass = OOB_NAMEDCLASS;
9350     char *rangebegin = NULL;
9351     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
9352                                aren't available at the time this was called */
9353     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9354                                       than just initialized.  */
9355     SV* properties = NULL;    /* Code points that match \p{} \P{} */
9356     SV* posixes = NULL;     /* Code points that match classes like [:word:],
9357                                extended beyond the Latin1 range.  These have to
9358                                be kept separate from other code points for much
9359                                of this function because their handling  is
9360                                different under /i, and for most classes under
9361                                /d as well */
9362     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
9363                                separate for a while from the non-complemented
9364                                versions because of complications with /d
9365                                matching */
9366     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9367                                   treated more simply than the general case,
9368                                   leading to less compilation and execution
9369                                   work */
9370     UV element_count = 0;   /* Number of distinct elements in the class.
9371                                Optimizations may be possible if this is tiny */
9372     AV * multi_char_matches = NULL; /* Code points that fold to more than one
9373                                        character; used under /i */
9374     UV n;
9375     char * stop_ptr = RExC_end;    /* where to stop parsing */
9376 
9377     /* ignore unescaped whitespace? */
9378     const bool skip_white = cBOOL(   ret_invlist
9379                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9380 
9381     /* inversion list of code points this node matches only when the target
9382      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
9383      * /d) */
9384     SV* upper_latin1_only_utf8_matches = NULL;
9385 
9386     /* Inversion list of code points this node matches regardless of things
9387      * like locale, folding, utf8ness of the target string */
9388     SV* cp_list = NULL;
9389 
9390     /* Like cp_list, but code points on this list need to be checked for things
9391      * that fold to/from them under /i */
9392     SV* cp_foldable_list = NULL;
9393 
9394     /* Like cp_list, but code points on this list are valid only when the
9395      * runtime locale is UTF-8 */
9396     SV* only_utf8_locale_list = NULL;
9397 
9398     /* In a range, if one of the endpoints is non-character-set portable,
9399      * meaning that it hard-codes a code point that may mean a different
9400      * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9401      * mnemonic '\t' which each mean the same character no matter which
9402      * character set the platform is on. */
9403     unsigned int non_portable_endpoint = 0;
9404 
9405     /* Is the range unicode? which means on a platform that isn't 1-1 native
9406      * to Unicode (i.e. non-ASCII), each code point in it should be considered
9407      * to be a Unicode value.  */
9408     bool unicode_range = FALSE;
9409     bool invert = FALSE;    /* Is this class to be complemented */
9410 
9411     bool warn_super = ALWAYS_WARN_SUPER;
9412 
9413     const char * orig_parse = RExC_parse;
9414 
9415     /* This variable is used to mark where the end in the input is of something
9416      * that looks like a POSIX construct but isn't.  During the parse, when
9417      * something looks like it could be such a construct is encountered, it is
9418      * checked for being one, but not if we've already checked this area of the
9419      * input.  Only after this position is reached do we check again */
9420     char *not_posix_region_end = RExC_parse - 1;
9421 
9422     AV* posix_warnings = NULL;
9423     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9424     U8 op = ANYOF;    /* The returned node-type, initialized to the expected
9425                          type. */
9426     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
9427     U32 posixl = 0;       /* bit field of posix classes matched under /l */
9428 
9429 
9430 /* Flags as to what things aren't knowable until runtime.  (Note that these are
9431  * mutually exclusive.) */
9432 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
9433                                             haven't been defined as of yet */
9434 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
9435                                             UTF-8 or not */
9436 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
9437                                             what gets folded */
9438     U32 has_runtime_dependency = 0;     /* OR of the above flags */
9439 
9440     DECLARE_AND_GET_RE_DEBUG_FLAGS;
9441 
9442     PERL_ARGS_ASSERT_REGCLASS;
9443 #ifndef DEBUGGING
9444     PERL_UNUSED_ARG(depth);
9445 #endif
9446 
9447     assert(! (ret_invlist && allow_mutiple_chars));
9448 
9449     /* If wants an inversion list returned, we can't optimize to something
9450      * else. */
9451     if (ret_invlist) {
9452         optimizable = FALSE;
9453     }
9454 
9455     DEBUG_PARSE("clas");
9456 
9457 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
9458     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
9459                                    && UNICODE_DOT_DOT_VERSION == 0)
9460     allow_mutiple_chars = FALSE;
9461 #endif
9462 
9463     /* We include the /i status at the beginning of this so that we can
9464      * know it at runtime */
9465     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9466     initial_listsv_len = SvCUR(listsv);
9467     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
9468 
9469     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9470 
9471     assert(RExC_parse <= RExC_end);
9472 
9473     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
9474         RExC_parse_inc_by(1);
9475         invert = TRUE;
9476         allow_mutiple_chars = FALSE;
9477         MARK_NAUGHTY(1);
9478         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9479     }
9480 
9481     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9482     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9483         int maybe_class = handle_possible_posix(pRExC_state,
9484                                                 RExC_parse,
9485                                                 &not_posix_region_end,
9486                                                 NULL,
9487                                                 TRUE /* checking only */);
9488         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9489             ckWARN4reg(not_posix_region_end,
9490                     "POSIX syntax [%c %c] belongs inside character classes%s",
9491                     *RExC_parse, *RExC_parse,
9492                     (maybe_class == OOB_NAMEDCLASS)
9493                     ? ((POSIXCC_NOTYET(*RExC_parse))
9494                         ? " (but this one isn't implemented)"
9495                         : " (but this one isn't fully valid)")
9496                     : ""
9497                     );
9498         }
9499     }
9500 
9501     /* If the caller wants us to just parse a single element, accomplish this
9502      * by faking the loop ending condition */
9503     if (stop_at_1 && RExC_end > RExC_parse) {
9504         stop_ptr = RExC_parse + 1;
9505     }
9506 
9507     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9508     if (UCHARAT(RExC_parse) == ']')
9509         goto charclassloop;
9510 
9511     while (1) {
9512 
9513         if (   posix_warnings
9514             && av_tindex_skip_len_mg(posix_warnings) >= 0
9515             && RExC_parse > not_posix_region_end)
9516         {
9517             /* Warnings about posix class issues are considered tentative until
9518              * we are far enough along in the parse that we can no longer
9519              * change our mind, at which point we output them.  This is done
9520              * each time through the loop so that a later class won't zap them
9521              * before they have been dealt with. */
9522             output_posix_warnings(pRExC_state, posix_warnings);
9523         }
9524 
9525         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9526 
9527         if  (RExC_parse >= stop_ptr) {
9528             break;
9529         }
9530 
9531         if  (UCHARAT(RExC_parse) == ']') {
9532             break;
9533         }
9534 
9535       charclassloop:
9536 
9537         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9538         save_value = value;
9539         save_prevvalue = prevvalue;
9540 
9541         if (!range) {
9542             rangebegin = RExC_parse;
9543             element_count++;
9544             non_portable_endpoint = 0;
9545         }
9546         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9547             value = utf8n_to_uvchr((U8*)RExC_parse,
9548                                    RExC_end - RExC_parse,
9549                                    &numlen, UTF8_ALLOW_DEFAULT);
9550             RExC_parse_inc_by(numlen);
9551         }
9552         else {
9553             value = UCHARAT(RExC_parse);
9554             RExC_parse_inc_by(1);
9555         }
9556 
9557         if (value == '[') {
9558             char * posix_class_end;
9559             namedclass = handle_possible_posix(pRExC_state,
9560                                                RExC_parse,
9561                                                &posix_class_end,
9562                                                do_posix_warnings ? &posix_warnings : NULL,
9563                                                FALSE    /* die if error */);
9564             if (namedclass > OOB_NAMEDCLASS) {
9565 
9566                 /* If there was an earlier attempt to parse this particular
9567                  * posix class, and it failed, it was a false alarm, as this
9568                  * successful one proves */
9569                 if (   posix_warnings
9570                     && av_tindex_skip_len_mg(posix_warnings) >= 0
9571                     && not_posix_region_end >= RExC_parse
9572                     && not_posix_region_end <= posix_class_end)
9573                 {
9574                     av_undef(posix_warnings);
9575                 }
9576 
9577                 RExC_parse_set(posix_class_end);
9578             }
9579             else if (namedclass == OOB_NAMEDCLASS) {
9580                 not_posix_region_end = posix_class_end;
9581             }
9582             else {
9583                 namedclass = OOB_NAMEDCLASS;
9584             }
9585         }
9586         else if (   RExC_parse - 1 > not_posix_region_end
9587                  && MAYBE_POSIXCC(value))
9588         {
9589             (void) handle_possible_posix(
9590                         pRExC_state,
9591                         RExC_parse - 1,  /* -1 because parse has already been
9592                                             advanced */
9593                         &not_posix_region_end,
9594                         do_posix_warnings ? &posix_warnings : NULL,
9595                         TRUE /* checking only */);
9596         }
9597         else if (  strict && ! skip_white
9598                  && (   generic_isCC_(value, CC_VERTSPACE_)
9599                      || is_VERTWS_cp_high(value)))
9600         {
9601             vFAIL("Literal vertical space in [] is illegal except under /x");
9602         }
9603         else if (value == '\\') {
9604             /* Is a backslash; get the code point of the char after it */
9605 
9606             if (RExC_parse >= RExC_end) {
9607                 vFAIL("Unmatched [");
9608             }
9609 
9610             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9611                 value = utf8n_to_uvchr((U8*)RExC_parse,
9612                                    RExC_end - RExC_parse,
9613                                    &numlen, UTF8_ALLOW_DEFAULT);
9614                 RExC_parse_inc_by(numlen);
9615             }
9616             else {
9617                 value = UCHARAT(RExC_parse);
9618                 RExC_parse_inc_by(1);
9619             }
9620 
9621             /* Some compilers cannot handle switching on 64-bit integer
9622              * values, therefore value cannot be an UV.  Yes, this will
9623              * be a problem later if we want switch on Unicode.
9624              * A similar issue a little bit later when switching on
9625              * namedclass. --jhi */
9626 
9627             /* If the \ is escaping white space when white space is being
9628              * skipped, it means that that white space is wanted literally, and
9629              * is already in 'value'.  Otherwise, need to translate the escape
9630              * into what it signifies. */
9631             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9632                 const char * message;
9633                 U32 packed_warn;
9634                 U8 grok_c_char;
9635 
9636             case 'w':	namedclass = ANYOF_WORDCHAR;	break;
9637             case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
9638             case 's':	namedclass = ANYOF_SPACE;	break;
9639             case 'S':	namedclass = ANYOF_NSPACE;	break;
9640             case 'd':	namedclass = ANYOF_DIGIT;	break;
9641             case 'D':	namedclass = ANYOF_NDIGIT;	break;
9642             case 'v':	namedclass = ANYOF_VERTWS;	break;
9643             case 'V':	namedclass = ANYOF_NVERTWS;	break;
9644             case 'h':	namedclass = ANYOF_HORIZWS;	break;
9645             case 'H':	namedclass = ANYOF_NHORIZWS;	break;
9646             case 'N':  /* Handle \N{NAME} in class */
9647                 {
9648                     const char * const backslash_N_beg = RExC_parse - 2;
9649                     int cp_count;
9650 
9651                     if (! grok_bslash_N(pRExC_state,
9652                                         NULL,      /* No regnode */
9653                                         &value,    /* Yes single value */
9654                                         &cp_count, /* Multiple code pt count */
9655                                         flagp,
9656                                         strict,
9657                                         depth)
9658                     ) {
9659 
9660                         if (*flagp & NEED_UTF8)
9661                             FAIL("panic: grok_bslash_N set NEED_UTF8");
9662 
9663                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9664 
9665                         if (cp_count < 0) {
9666                             vFAIL("\\N in a character class must be a named character: \\N{...}");
9667                         }
9668                         else if (cp_count == 0) {
9669                             ckWARNreg(RExC_parse,
9670                               "Ignoring zero length \\N{} in character class");
9671                         }
9672                         else { /* cp_count > 1 */
9673                             assert(cp_count > 1);
9674                             if (! RExC_in_multi_char_class) {
9675                                 if ( ! allow_mutiple_chars
9676                                     || invert
9677                                     || range
9678                                     || *RExC_parse == '-')
9679                                 {
9680                                     if (strict) {
9681                                         RExC_parse--;
9682                                         vFAIL("\\N{} here is restricted to one character");
9683                                     }
9684                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9685                                     break; /* <value> contains the first code
9686                                               point. Drop out of the switch to
9687                                               process it */
9688                                 }
9689                                 else {
9690                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
9691                                                  RExC_parse - backslash_N_beg);
9692                                     multi_char_matches
9693                                         = add_multi_match(multi_char_matches,
9694                                                           multi_char_N,
9695                                                           cp_count);
9696                                 }
9697                             }
9698                         } /* End of cp_count != 1 */
9699 
9700                         /* This element should not be processed further in this
9701                          * class */
9702                         element_count--;
9703                         value = save_value;
9704                         prevvalue = save_prevvalue;
9705                         continue;   /* Back to top of loop to get next char */
9706                     }
9707 
9708                     /* Here, is a single code point, and <value> contains it */
9709                     unicode_range = TRUE;   /* \N{} are Unicode */
9710                 }
9711                 break;
9712             case 'p':
9713             case 'P':
9714                 {
9715                 char *e;
9716 
9717                 if (RExC_pm_flags & PMf_WILDCARD) {
9718                     RExC_parse_inc_by(1);
9719                     /* diag_listed_as: Use of %s is not allowed in Unicode
9720                        property wildcard subpatterns in regex; marked by <--
9721                        HERE in m/%s/ */
9722                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9723                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9724                 }
9725 
9726                 /* \p means they want Unicode semantics */
9727                 REQUIRE_UNI_RULES(flagp, 0);
9728 
9729                 if (RExC_parse >= RExC_end)
9730                     vFAIL2("Empty \\%c", (U8)value);
9731                 if (*RExC_parse == '{') {
9732                     const U8 c = (U8)value;
9733                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9734                     if (!e) {
9735                         RExC_parse_inc_by(1);
9736                         vFAIL2("Missing right brace on \\%c{}", c);
9737                     }
9738 
9739                     RExC_parse_inc_by(1);
9740 
9741                     /* White space is allowed adjacent to the braces and after
9742                      * any '^', even when not under /x */
9743                     while (isSPACE(*RExC_parse)) {
9744                          RExC_parse_inc_by(1);
9745                     }
9746 
9747                     if (UCHARAT(RExC_parse) == '^') {
9748 
9749                         /* toggle.  (The rhs xor gets the single bit that
9750                          * differs between P and p; the other xor inverts just
9751                          * that bit) */
9752                         value ^= 'P' ^ 'p';
9753 
9754                         RExC_parse_inc_by(1);
9755                         while (isSPACE(*RExC_parse)) {
9756                             RExC_parse_inc_by(1);
9757                         }
9758                     }
9759 
9760                     if (e == RExC_parse)
9761                         vFAIL2("Empty \\%c{}", c);
9762 
9763                     n = e - RExC_parse;
9764                     while (isSPACE(*(RExC_parse + n - 1)))
9765                         n--;
9766 
9767                 }   /* The \p isn't immediately followed by a '{' */
9768                 else if (! isALPHA(*RExC_parse)) {
9769                     RExC_parse_inc_safe();
9770                     vFAIL2("Character following \\%c must be '{' or a "
9771                            "single-character Unicode property name",
9772                            (U8) value);
9773                 }
9774                 else {
9775                     e = RExC_parse;
9776                     n = 1;
9777                 }
9778                 {
9779                     char* name = RExC_parse;
9780 
9781                     /* Any message returned about expanding the definition */
9782                     SV* msg = newSVpvs_flags("", SVs_TEMP);
9783 
9784                     /* If set TRUE, the property is user-defined as opposed to
9785                      * official Unicode */
9786                     bool user_defined = FALSE;
9787                     AV * strings = NULL;
9788 
9789                     SV * prop_definition = parse_uniprop_string(
9790                                             name, n, UTF, FOLD,
9791                                             FALSE, /* This is compile-time */
9792 
9793                                             /* We can't defer this defn when
9794                                              * the full result is required in
9795                                              * this call */
9796                                             ! cBOOL(ret_invlist),
9797 
9798                                             &strings,
9799                                             &user_defined,
9800                                             msg,
9801                                             0 /* Base level */
9802                                            );
9803                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
9804                         assert(prop_definition == NULL);
9805                         RExC_parse_set(e + 1);
9806                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
9807                                                thing so, or else the display is
9808                                                mojibake */
9809                             RExC_utf8 = TRUE;
9810                         }
9811                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9812                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9813                                     SvCUR(msg), SvPVX(msg)));
9814                     }
9815 
9816                     assert(prop_definition || strings);
9817 
9818                     if (strings) {
9819                         if (ret_invlist) {
9820                             if (! prop_definition) {
9821                                 RExC_parse_set(e + 1);
9822                                 vFAIL("Unicode string properties are not implemented in (?[...])");
9823                             }
9824                             else {
9825                                 ckWARNreg(e + 1,
9826                                     "Using just the single character results"
9827                                     " returned by \\p{} in (?[...])");
9828                             }
9829                         }
9830                         else if (! RExC_in_multi_char_class) {
9831                             if (invert ^ (value == 'P')) {
9832                                 RExC_parse_set(e + 1);
9833                                 vFAIL("Inverting a character class which contains"
9834                                     " a multi-character sequence is illegal");
9835                             }
9836 
9837                             /* For each multi-character string ... */
9838                             while (av_count(strings) > 0) {
9839                                 /* ... Each entry is itself an array of code
9840                                 * points. */
9841                                 AV * this_string = (AV *) av_shift( strings);
9842                                 STRLEN cp_count = av_count(this_string);
9843                                 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9844                                 SvPVCLEAR_FRESH(final);
9845 
9846                                 /* Create another string of sequences of \x{...} */
9847                                 while (av_count(this_string) > 0) {
9848                                     SV * character = av_shift(this_string);
9849                                     UV cp = SvUV(character);
9850 
9851                                     if (cp > 255) {
9852                                         REQUIRE_UTF8(flagp);
9853                                     }
9854                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9855                                                                         cp);
9856                                     SvREFCNT_dec_NN(character);
9857                                 }
9858                                 SvREFCNT_dec_NN(this_string);
9859 
9860                                 /* And add that to the list of such things */
9861                                 multi_char_matches
9862                                             = add_multi_match(multi_char_matches,
9863                                                             final,
9864                                                             cp_count);
9865                             }
9866                         }
9867                         SvREFCNT_dec_NN(strings);
9868                     }
9869 
9870                     if (! prop_definition) {    /* If we got only a string,
9871                                                    this iteration didn't really
9872                                                    find a character */
9873                         element_count--;
9874                     }
9875                     else if (! is_invlist(prop_definition)) {
9876 
9877                         /* Here, the definition isn't known, so we have gotten
9878                          * returned a string that will be evaluated if and when
9879                          * encountered at runtime.  We add it to the list of
9880                          * such properties, along with whether it should be
9881                          * complemented or not */
9882                         if (value == 'P') {
9883                             sv_catpvs(listsv, "!");
9884                         }
9885                         else {
9886                             sv_catpvs(listsv, "+");
9887                         }
9888                         sv_catsv(listsv, prop_definition);
9889 
9890                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9891 
9892                         /* We don't know yet what this matches, so have to flag
9893                          * it */
9894                         anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9895                     }
9896                     else {
9897                         assert (prop_definition && is_invlist(prop_definition));
9898 
9899                         /* Here we do have the complete property definition
9900                          *
9901                          * Temporary workaround for [GH #16520].  For this
9902                          * precise input that is in the .t that is failing,
9903                          * load utf8.pm, which is what the test wants, so that
9904                          * that .t passes */
9905                         if (     memEQs(RExC_start, e + 1 - RExC_start,
9906                                         "foo\\p{Alnum}")
9907                             && ! hv_common(GvHVn(PL_incgv),
9908                                            NULL,
9909                                            "utf8.pm", sizeof("utf8.pm") - 1,
9910                                            0, HV_FETCH_ISEXISTS, NULL, 0))
9911                         {
9912                             require_pv("utf8.pm");
9913                         }
9914 
9915                         if (! user_defined &&
9916                             /* We warn on matching an above-Unicode code point
9917                              * if the match would return true, except don't
9918                              * warn for \p{All}, which has exactly one element
9919                              * = 0 */
9920                             (_invlist_contains_cp(prop_definition, 0x110000)
9921                                 && (! (_invlist_len(prop_definition) == 1
9922                                        && *invlist_array(prop_definition) == 0))))
9923                         {
9924                             warn_super = TRUE;
9925                         }
9926 
9927                         /* Invert if asking for the complement */
9928                         if (value == 'P') {
9929                             _invlist_union_complement_2nd(properties,
9930                                                           prop_definition,
9931                                                           &properties);
9932                         }
9933                         else {
9934                             _invlist_union(properties, prop_definition, &properties);
9935                         }
9936                     }
9937                 }
9938 
9939                 RExC_parse_set(e + 1);
9940                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
9941                                                 named */
9942                 }
9943                 break;
9944             case 'n':	value = '\n';			break;
9945             case 'r':	value = '\r';			break;
9946             case 't':	value = '\t';			break;
9947             case 'f':	value = '\f';			break;
9948             case 'b':	value = '\b';			break;
9949             case 'e':	value = ESC_NATIVE;             break;
9950             case 'a':	value = '\a';                   break;
9951             case 'o':
9952                 RExC_parse--;	/* function expects to be pointed at the 'o' */
9953                 if (! grok_bslash_o(&RExC_parse,
9954                                             RExC_end,
9955                                             &value,
9956                                             &message,
9957                                             &packed_warn,
9958                                             strict,
9959                                             cBOOL(range), /* MAX_UV allowed for range
9960                                                       upper limit */
9961                                             UTF))
9962                 {
9963                     vFAIL(message);
9964                 }
9965                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9966                     warn_non_literal_string(RExC_parse, packed_warn, message);
9967                 }
9968 
9969                 if (value < 256) {
9970                     non_portable_endpoint++;
9971                 }
9972                 break;
9973             case 'x':
9974                 RExC_parse--;	/* function expects to be pointed at the 'x' */
9975                 if (!  grok_bslash_x(&RExC_parse,
9976                                             RExC_end,
9977                                             &value,
9978                                             &message,
9979                                             &packed_warn,
9980                                             strict,
9981                                             cBOOL(range), /* MAX_UV allowed for range
9982                                                       upper limit */
9983                                             UTF))
9984                 {
9985                     vFAIL(message);
9986                 }
9987                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9988                     warn_non_literal_string(RExC_parse, packed_warn, message);
9989                 }
9990 
9991                 if (value < 256) {
9992                     non_portable_endpoint++;
9993                 }
9994                 break;
9995             case 'c':
9996                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
9997                                                                 &packed_warn))
9998                 {
9999                     /* going to die anyway; point to exact spot of
10000                         * failure */
10001                     RExC_parse_inc_safe();
10002                     vFAIL(message);
10003                 }
10004 
10005                 value = grok_c_char;
10006                 RExC_parse_inc_by(1);
10007                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10008                     warn_non_literal_string(RExC_parse, packed_warn, message);
10009                 }
10010 
10011                 non_portable_endpoint++;
10012                 break;
10013             case '0': case '1': case '2': case '3': case '4':
10014             case '5': case '6': case '7':
10015                 {
10016                     /* Take 1-3 octal digits */
10017                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10018                               | PERL_SCAN_NOTIFY_ILLDIGIT;
10019                     numlen = (strict) ? 4 : 3;
10020                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10021                     RExC_parse_inc_by(numlen);
10022                     if (numlen != 3) {
10023                         if (strict) {
10024                             RExC_parse_inc_safe();
10025                             vFAIL("Need exactly 3 octal digits");
10026                         }
10027                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10028                                  && RExC_parse < RExC_end
10029                                  && isDIGIT(*RExC_parse)
10030                                  && ckWARN(WARN_REGEXP))
10031                         {
10032                             reg_warn_non_literal_string(
10033                                  RExC_parse + 1,
10034                                  form_alien_digit_msg(8, numlen, RExC_parse,
10035                                                         RExC_end, UTF, FALSE));
10036                         }
10037                     }
10038                     if (value < 256) {
10039                         non_portable_endpoint++;
10040                     }
10041                     break;
10042                 }
10043             default:
10044                 /* Allow \_ to not give an error */
10045                 if (isWORDCHAR(value) && value != '_') {
10046                     if (strict) {
10047                         vFAIL2("Unrecognized escape \\%c in character class",
10048                                (int)value);
10049                     }
10050                     else {
10051                         ckWARN2reg(RExC_parse,
10052                             "Unrecognized escape \\%c in character class passed through",
10053                             (int)value);
10054                     }
10055                 }
10056                 break;
10057             }   /* End of switch on char following backslash */
10058         } /* end of handling backslash escape sequences */
10059 
10060         /* Here, we have the current token in 'value' */
10061 
10062         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10063             U8 classnum;
10064 
10065             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10066              * literal, as is the character that began the false range, i.e.
10067              * the 'a' in the examples */
10068             if (range) {
10069                 const int w = (RExC_parse >= rangebegin)
10070                                 ? RExC_parse - rangebegin
10071                                 : 0;
10072                 if (strict) {
10073                     vFAIL2utf8f(
10074                         "False [] range \"%" UTF8f "\"",
10075                         UTF8fARG(UTF, w, rangebegin));
10076                 }
10077                 else {
10078                     ckWARN2reg(RExC_parse,
10079                         "False [] range \"%" UTF8f "\"",
10080                         UTF8fARG(UTF, w, rangebegin));
10081                     cp_list = add_cp_to_invlist(cp_list, '-');
10082                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10083                                                             prevvalue);
10084                 }
10085 
10086                 range = 0; /* this was not a true range */
10087                 element_count += 2; /* So counts for three values */
10088             }
10089 
10090             classnum = namedclass_to_classnum(namedclass);
10091 
10092             if (LOC && namedclass < ANYOF_POSIXL_MAX
10093 #ifndef HAS_ISASCII
10094                 && classnum != CC_ASCII_
10095 #endif
10096             ) {
10097                 SV* scratch_list = NULL;
10098 
10099                 /* What the Posix classes (like \w, [:space:]) match isn't
10100                  * generally knowable under locale until actual match time.  A
10101                  * special node is used for these which has extra space for a
10102                  * bitmap, with a bit reserved for each named class that is to
10103                  * be matched against.  (This isn't needed for \p{} and
10104                  * pseudo-classes, as they are not affected by locale, and
10105                  * hence are dealt with separately.)  However, if a named class
10106                  * and its complement are both present, then it matches
10107                  * everything, and there is no runtime dependency.  Odd numbers
10108                  * are the complements of the next lower number, so xor works.
10109                  * (Note that something like [\w\D] should match everything,
10110                  * because \d should be a proper subset of \w.  But rather than
10111                  * trust that the locale is well behaved, we leave this to
10112                  * runtime to sort out) */
10113                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10114                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10115                     POSIXL_ZERO(posixl);
10116                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10117                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10118                     continue;   /* We could ignore the rest of the class, but
10119                                    best to parse it for any errors */
10120                 }
10121                 else { /* Here, isn't the complement of any already parsed
10122                           class */
10123                     POSIXL_SET(posixl, namedclass);
10124                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10125                     anyof_flags |= ANYOF_MATCHES_POSIXL;
10126 
10127                     /* The above-Latin1 characters are not subject to locale
10128                      * rules.  Just add them to the unconditionally-matched
10129                      * list */
10130 
10131                     /* Get the list of the above-Latin1 code points this
10132                      * matches */
10133                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10134                                             PL_XPosix_ptrs[classnum],
10135 
10136                                             /* Odd numbers are complements,
10137                                              * like NDIGIT, NASCII, ... */
10138                                             namedclass % 2 != 0,
10139                                             &scratch_list);
10140                     /* Checking if 'cp_list' is NULL first saves an extra
10141                      * clone.  Its reference count will be decremented at the
10142                      * next union, etc, or if this is the only instance, at the
10143                      * end of the routine */
10144                     if (! cp_list) {
10145                         cp_list = scratch_list;
10146                     }
10147                     else {
10148                         _invlist_union(cp_list, scratch_list, &cp_list);
10149                         SvREFCNT_dec_NN(scratch_list);
10150                     }
10151                     continue;   /* Go get next character */
10152                 }
10153             }
10154             else {
10155 
10156                 /* Here, is not /l, or is a POSIX class for which /l doesn't
10157                  * matter (or is a Unicode property, which is skipped here). */
10158                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
10159                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10160 
10161                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
10162                          * nor /l make a difference in what these match,
10163                          * therefore we just add what they match to cp_list. */
10164                         if (classnum != CC_VERTSPACE_) {
10165                             assert(   namedclass == ANYOF_HORIZWS
10166                                    || namedclass == ANYOF_NHORIZWS);
10167 
10168                             /* It turns out that \h is just a synonym for
10169                              * XPosixBlank */
10170                             classnum = CC_BLANK_;
10171                         }
10172 
10173                         _invlist_union_maybe_complement_2nd(
10174                                 cp_list,
10175                                 PL_XPosix_ptrs[classnum],
10176                                 namedclass % 2 != 0,    /* Complement if odd
10177                                                           (NHORIZWS, NVERTWS)
10178                                                         */
10179                                 &cp_list);
10180                     }
10181                 }
10182                 else if (   AT_LEAST_UNI_SEMANTICS
10183                          || classnum == CC_ASCII_
10184                          || (DEPENDS_SEMANTICS && (   classnum == CC_DIGIT_
10185                                                    || classnum == CC_XDIGIT_)))
10186                 {
10187                     /* We usually have to worry about /d affecting what POSIX
10188                      * classes match, with special code needed because we won't
10189                      * know until runtime what all matches.  But there is no
10190                      * extra work needed under /u and /a; and [:ascii:] is
10191                      * unaffected by /d; and :digit: and :xdigit: don't have
10192                      * runtime differences under /d.  So we can special case
10193                      * these, and avoid some extra work below, and at runtime.
10194                      * */
10195                     _invlist_union_maybe_complement_2nd(
10196                                                      simple_posixes,
10197                                                       ((AT_LEAST_ASCII_RESTRICTED)
10198                                                        ? PL_Posix_ptrs[classnum]
10199                                                        : PL_XPosix_ptrs[classnum]),
10200                                                      namedclass % 2 != 0,
10201                                                      &simple_posixes);
10202                 }
10203                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
10204                            complement and use nposixes */
10205                     SV** posixes_ptr = namedclass % 2 == 0
10206                                        ? &posixes
10207                                        : &nposixes;
10208                     _invlist_union_maybe_complement_2nd(
10209                                                      *posixes_ptr,
10210                                                      PL_XPosix_ptrs[classnum],
10211                                                      namedclass % 2 != 0,
10212                                                      posixes_ptr);
10213                 }
10214             }
10215         } /* end of namedclass \blah */
10216 
10217         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10218 
10219         /* If 'range' is set, 'value' is the ending of a range--check its
10220          * validity.  (If value isn't a single code point in the case of a
10221          * range, we should have figured that out above in the code that
10222          * catches false ranges).  Later, we will handle each individual code
10223          * point in the range.  If 'range' isn't set, this could be the
10224          * beginning of a range, so check for that by looking ahead to see if
10225          * the next real character to be processed is the range indicator--the
10226          * minus sign */
10227 
10228         if (range) {
10229 #ifdef EBCDIC
10230             /* For unicode ranges, we have to test that the Unicode as opposed
10231              * to the native values are not decreasing.  (Above 255, there is
10232              * no difference between native and Unicode) */
10233             if (unicode_range && prevvalue < 255 && value < 255) {
10234                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10235                     goto backwards_range;
10236                 }
10237             }
10238             else
10239 #endif
10240             if (prevvalue > value) /* b-a */ {
10241                 int w;
10242 #ifdef EBCDIC
10243               backwards_range:
10244 #endif
10245                 w = RExC_parse - rangebegin;
10246                 vFAIL2utf8f(
10247                     "Invalid [] range \"%" UTF8f "\"",
10248                     UTF8fARG(UTF, w, rangebegin));
10249                 NOT_REACHED; /* NOTREACHED */
10250             }
10251         }
10252         else {
10253             prevvalue = value; /* save the beginning of the potential range */
10254             if (! stop_at_1     /* Can't be a range if parsing just one thing */
10255                 && *RExC_parse == '-')
10256             {
10257                 char* next_char_ptr = RExC_parse + 1;
10258 
10259                 /* Get the next real char after the '-' */
10260                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10261 
10262                 /* If the '-' is at the end of the class (just before the ']',
10263                  * it is a literal minus; otherwise it is a range */
10264                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10265                     RExC_parse_set(next_char_ptr);
10266 
10267                     /* a bad range like \w-, [:word:]- ? */
10268                     if (namedclass > OOB_NAMEDCLASS) {
10269                         if (strict || ckWARN(WARN_REGEXP)) {
10270                             const int w = RExC_parse >= rangebegin
10271                                           ?  RExC_parse - rangebegin
10272                                           : 0;
10273                             if (strict) {
10274                                 vFAIL4("False [] range \"%*.*s\"",
10275                                     w, w, rangebegin);
10276                             }
10277                             else {
10278                                 vWARN4(RExC_parse,
10279                                     "False [] range \"%*.*s\"",
10280                                     w, w, rangebegin);
10281                             }
10282                         }
10283                         cp_list = add_cp_to_invlist(cp_list, '-');
10284                         element_count++;
10285                     } else
10286                         range = 1;	/* yeah, it's a range! */
10287                     continue;	/* but do it the next time */
10288                 }
10289             }
10290         }
10291 
10292         if (namedclass > OOB_NAMEDCLASS) {
10293             continue;
10294         }
10295 
10296         /* Here, we have a single value this time through the loop, and
10297          * <prevvalue> is the beginning of the range, if any; or <value> if
10298          * not. */
10299 
10300         /* non-Latin1 code point implies unicode semantics. */
10301         if (value > 255) {
10302             if (value > MAX_LEGAL_CP && (   value != UV_MAX
10303                                          || prevvalue > MAX_LEGAL_CP))
10304             {
10305                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10306             }
10307             REQUIRE_UNI_RULES(flagp, 0);
10308             if (  ! silence_non_portable
10309                 &&  UNICODE_IS_PERL_EXTENDED(value)
10310                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
10311             {
10312                 ckWARN2_non_literal_string(RExC_parse,
10313                                            packWARN(WARN_PORTABLE),
10314                                            PL_extended_cp_format,
10315                                            value);
10316             }
10317         }
10318 
10319         /* Ready to process either the single value, or the completed range.
10320          * For single-valued non-inverted ranges, we consider the possibility
10321          * of multi-char folds.  (We made a conscious decision to not do this
10322          * for the other cases because it can often lead to non-intuitive
10323          * results.  For example, you have the peculiar case that:
10324          *  "s s" =~ /^[^\xDF]+$/i => Y
10325          *  "ss"  =~ /^[^\xDF]+$/i => N
10326          *
10327          * See [perl #89750] */
10328         if (FOLD && allow_mutiple_chars && value == prevvalue) {
10329             if (    value == LATIN_SMALL_LETTER_SHARP_S
10330                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10331                                                         value)))
10332             {
10333                 /* Here <value> is indeed a multi-char fold.  Get what it is */
10334 
10335                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10336                 STRLEN foldlen;
10337 
10338                 UV folded = _to_uni_fold_flags(
10339                                 value,
10340                                 foldbuf,
10341                                 &foldlen,
10342                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10343                                                    ? FOLD_FLAGS_NOMIX_ASCII
10344                                                    : 0)
10345                                 );
10346 
10347                 /* Here, <folded> should be the first character of the
10348                  * multi-char fold of <value>, with <foldbuf> containing the
10349                  * whole thing.  But, if this fold is not allowed (because of
10350                  * the flags), <fold> will be the same as <value>, and should
10351                  * be processed like any other character, so skip the special
10352                  * handling */
10353                 if (folded != value) {
10354 
10355                     /* Skip if we are recursed, currently parsing the class
10356                      * again.  Otherwise add this character to the list of
10357                      * multi-char folds. */
10358                     if (! RExC_in_multi_char_class) {
10359                         STRLEN cp_count = utf8_length(foldbuf,
10360                                                       foldbuf + foldlen);
10361                         SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10362 
10363                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10364 
10365                         multi_char_matches
10366                                         = add_multi_match(multi_char_matches,
10367                                                           multi_fold,
10368                                                           cp_count);
10369 
10370                     }
10371 
10372                     /* This element should not be processed further in this
10373                      * class */
10374                     element_count--;
10375                     value = save_value;
10376                     prevvalue = save_prevvalue;
10377                     continue;
10378                 }
10379             }
10380         }
10381 
10382         if (strict && ckWARN(WARN_REGEXP)) {
10383             if (range) {
10384 
10385                 /* If the range starts above 255, everything is portable and
10386                  * likely to be so for any forseeable character set, so don't
10387                  * warn. */
10388                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10389                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10390                 }
10391                 else if (prevvalue != value) {
10392 
10393                     /* Under strict, ranges that stop and/or end in an ASCII
10394                      * printable should have each end point be a portable value
10395                      * for it (preferably like 'A', but we don't warn if it is
10396                      * a (portable) Unicode name or code point), and the range
10397                      * must be all digits or all letters of the same case.
10398                      * Otherwise, the range is non-portable and unclear as to
10399                      * what it contains */
10400                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
10401                         && (          non_portable_endpoint
10402                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10403                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
10404                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
10405                     ))) {
10406                         vWARN(RExC_parse, "Ranges of ASCII printables should"
10407                                           " be some subset of \"0-9\","
10408                                           " \"A-Z\", or \"a-z\"");
10409                     }
10410                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10411                         SSize_t index_start;
10412                         SSize_t index_final;
10413 
10414                         /* But the nature of Unicode and languages mean we
10415                          * can't do the same checks for above-ASCII ranges,
10416                          * except in the case of digit ones.  These should
10417                          * contain only digits from the same group of 10.  The
10418                          * ASCII case is handled just above.  Hence here, the
10419                          * range could be a range of digits.  First some
10420                          * unlikely special cases.  Grandfather in that a range
10421                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10422                          * if its starting value is one of the 10 digits prior
10423                          * to it.  This is because it is an alternate way of
10424                          * writing 19D1, and some people may expect it to be in
10425                          * that group.  But it is bad, because it won't give
10426                          * the expected results.  In Unicode 5.2 it was
10427                          * considered to be in that group (of 11, hence), but
10428                          * this was fixed in the next version */
10429 
10430                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10431                             goto warn_bad_digit_range;
10432                         }
10433                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
10434                                           &&     value <= 0x1D7FF))
10435                         {
10436                             /* This is the only other case currently in Unicode
10437                              * where the algorithm below fails.  The code
10438                              * points just above are the end points of a single
10439                              * range containing only decimal digits.  It is 5
10440                              * different series of 0-9.  All other ranges of
10441                              * digits currently in Unicode are just a single
10442                              * series.  (And mktables will notify us if a later
10443                              * Unicode version breaks this.)
10444                              *
10445                              * If the range being checked is at most 9 long,
10446                              * and the digit values represented are in
10447                              * numerical order, they are from the same series.
10448                              * */
10449                             if (         value - prevvalue > 9
10450                                 ||    (((    value - 0x1D7CE) % 10)
10451                                      <= (prevvalue - 0x1D7CE) % 10))
10452                             {
10453                                 goto warn_bad_digit_range;
10454                             }
10455                         }
10456                         else {
10457 
10458                             /* For all other ranges of digits in Unicode, the
10459                              * algorithm is just to check if both end points
10460                              * are in the same series, which is the same range.
10461                              * */
10462                             index_start = _invlist_search(
10463                                                     PL_XPosix_ptrs[CC_DIGIT_],
10464                                                     prevvalue);
10465 
10466                             /* Warn if the range starts and ends with a digit,
10467                              * and they are not in the same group of 10. */
10468                             if (   index_start >= 0
10469                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10470                                 && (index_final =
10471                                     _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10472                                                     value)) != index_start
10473                                 && index_final >= 0
10474                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10475                             {
10476                               warn_bad_digit_range:
10477                                 vWARN(RExC_parse, "Ranges of digits should be"
10478                                                   " from the same group of"
10479                                                   " 10");
10480                             }
10481                         }
10482                     }
10483                 }
10484             }
10485             if ((! range || prevvalue == value) && non_portable_endpoint) {
10486                 if (isPRINT_A(value)) {
10487                     char literal[3];
10488                     unsigned d = 0;
10489                     if (isBACKSLASHED_PUNCT(value)) {
10490                         literal[d++] = '\\';
10491                     }
10492                     literal[d++] = (char) value;
10493                     literal[d++] = '\0';
10494 
10495                     vWARN4(RExC_parse,
10496                            "\"%.*s\" is more clearly written simply as \"%s\"",
10497                            (int) (RExC_parse - rangebegin),
10498                            rangebegin,
10499                            literal
10500                         );
10501                 }
10502                 else if (isMNEMONIC_CNTRL(value)) {
10503                     vWARN4(RExC_parse,
10504                            "\"%.*s\" is more clearly written simply as \"%s\"",
10505                            (int) (RExC_parse - rangebegin),
10506                            rangebegin,
10507                            cntrl_to_mnemonic((U8) value)
10508                         );
10509                 }
10510             }
10511         }
10512 
10513         /* Deal with this element of the class */
10514 
10515 #ifndef EBCDIC
10516         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10517                                                     prevvalue, value);
10518 #else
10519         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10520          * that don't require special handling, we can just add the range like
10521          * we do for ASCII platforms */
10522         if ((UNLIKELY(prevvalue == 0) && value >= 255)
10523             || ! (prevvalue < 256
10524                     && (unicode_range
10525                         || (! non_portable_endpoint
10526                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10527                                 || (isUPPER_A(prevvalue)
10528                                     && isUPPER_A(value)))))))
10529         {
10530             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10531                                                         prevvalue, value);
10532         }
10533         else {
10534             /* Here, requires special handling.  This can be because it is a
10535              * range whose code points are considered to be Unicode, and so
10536              * must be individually translated into native, or because its a
10537              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10538              * EBCDIC, but we have defined them to include only the "expected"
10539              * upper or lower case ASCII alphabetics.  Subranges above 255 are
10540              * the same in native and Unicode, so can be added as a range */
10541             U8 start = NATIVE_TO_LATIN1(prevvalue);
10542             unsigned j;
10543             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10544             for (j = start; j <= end; j++) {
10545                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10546             }
10547             if (value > 255) {
10548                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10549                                                             256, value);
10550             }
10551         }
10552 #endif
10553 
10554         range = 0; /* this range (if it was one) is done now */
10555     } /* End of loop through all the text within the brackets */
10556 
10557     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10558         output_posix_warnings(pRExC_state, posix_warnings);
10559     }
10560 
10561     /* If anything in the class expands to more than one character, we have to
10562      * deal with them by building up a substitute parse string, and recursively
10563      * calling reg() on it, instead of proceeding */
10564     if (multi_char_matches) {
10565         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10566         I32 cp_count;
10567         STRLEN len;
10568         char *save_end = RExC_end;
10569         char *save_parse = RExC_parse;
10570         char *save_start = RExC_start;
10571         Size_t constructed_prefix_len = 0; /* This gives the length of the
10572                                               constructed portion of the
10573                                               substitute parse. */
10574         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
10575                                        a "|" */
10576         I32 reg_flags;
10577 
10578         assert(! invert);
10579         /* Only one level of recursion allowed */
10580         assert(RExC_copy_start_in_constructed == RExC_precomp);
10581 
10582 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
10583            because too confusing */
10584         if (invert) {
10585             sv_catpvs(substitute_parse, "(?:");
10586         }
10587 #endif
10588 
10589         /* Look at the longest strings first */
10590         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10591                         cp_count > 0;
10592                         cp_count--)
10593         {
10594 
10595             if (av_exists(multi_char_matches, cp_count)) {
10596                 AV** this_array_ptr;
10597                 SV* this_sequence;
10598 
10599                 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10600                                                  cp_count, FALSE);
10601                 while ((this_sequence = av_pop(*this_array_ptr)) !=
10602                                                                 &PL_sv_undef)
10603                 {
10604                     if (! first_time) {
10605                         sv_catpvs(substitute_parse, "|");
10606                     }
10607                     first_time = FALSE;
10608 
10609                     sv_catpv(substitute_parse, SvPVX(this_sequence));
10610                 }
10611             }
10612         }
10613 
10614         /* If the character class contains anything else besides these
10615          * multi-character strings, have to include it in recursive parsing */
10616         if (element_count) {
10617             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10618 
10619             sv_catpvs(substitute_parse, "|");
10620             if (has_l_bracket) {    /* Add an [ if the original had one */
10621                 sv_catpvs(substitute_parse, "[");
10622             }
10623             constructed_prefix_len = SvCUR(substitute_parse);
10624             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10625 
10626             /* Put in a closing ']' to match any opening one, but not if going
10627              * off the end, as otherwise we are adding something that really
10628              * isn't there */
10629             if (has_l_bracket && RExC_parse < RExC_end) {
10630                 sv_catpvs(substitute_parse, "]");
10631             }
10632         }
10633 
10634         sv_catpvs(substitute_parse, ")");
10635 #if 0
10636         if (invert) {
10637             /* This is a way to get the parse to skip forward a whole named
10638              * sequence instead of matching the 2nd character when it fails the
10639              * first */
10640             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10641         }
10642 #endif
10643 
10644         /* Set up the data structure so that any errors will be properly
10645          * reported.  See the comments at the definition of
10646          * REPORT_LOCATION_ARGS for details */
10647         RExC_copy_start_in_input = (char *) orig_parse;
10648         RExC_start = SvPV(substitute_parse, len);
10649         RExC_parse_set( RExC_start );
10650         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10651         RExC_end = RExC_parse + len;
10652         RExC_in_multi_char_class = 1;
10653 
10654         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
10655 
10656         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10657 
10658         /* And restore so can parse the rest of the pattern */
10659         RExC_parse_set(save_parse);
10660         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10661         RExC_end = save_end;
10662         RExC_in_multi_char_class = 0;
10663         SvREFCNT_dec_NN(multi_char_matches);
10664         SvREFCNT_dec(properties);
10665         SvREFCNT_dec(cp_list);
10666         SvREFCNT_dec(simple_posixes);
10667         SvREFCNT_dec(posixes);
10668         SvREFCNT_dec(nposixes);
10669         SvREFCNT_dec(cp_foldable_list);
10670         return ret;
10671     }
10672 
10673     /* If folding, we calculate all characters that could fold to or from the
10674      * ones already on the list */
10675     if (cp_foldable_list) {
10676         if (FOLD) {
10677             UV start, end;	/* End points of code point ranges */
10678 
10679             SV* fold_intersection = NULL;
10680             SV** use_list;
10681 
10682             /* Our calculated list will be for Unicode rules.  For locale
10683              * matching, we have to keep a separate list that is consulted at
10684              * runtime only when the locale indicates Unicode rules (and we
10685              * don't include potential matches in the ASCII/Latin1 range, as
10686              * any code point could fold to any other, based on the run-time
10687              * locale).   For non-locale, we just use the general list */
10688             if (LOC) {
10689                 use_list = &only_utf8_locale_list;
10690             }
10691             else {
10692                 use_list = &cp_list;
10693             }
10694 
10695             /* Only the characters in this class that participate in folds need
10696              * be checked.  Get the intersection of this class and all the
10697              * possible characters that are foldable.  This can quickly narrow
10698              * down a large class */
10699             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10700                                   &fold_intersection);
10701 
10702             /* Now look at the foldable characters in this class individually */
10703             invlist_iterinit(fold_intersection);
10704             while (invlist_iternext(fold_intersection, &start, &end)) {
10705                 UV j;
10706                 UV folded;
10707 
10708                 /* Look at every character in the range */
10709                 for (j = start; j <= end; j++) {
10710                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10711                     STRLEN foldlen;
10712                     unsigned int k;
10713                     Size_t folds_count;
10714                     U32 first_fold;
10715                     const U32 * remaining_folds;
10716 
10717                     if (j < 256) {
10718 
10719                         /* Under /l, we don't know what code points below 256
10720                          * fold to, except we do know the MICRO SIGN folds to
10721                          * an above-255 character if the locale is UTF-8, so we
10722                          * add it to the special list (in *use_list)  Otherwise
10723                          * we know now what things can match, though some folds
10724                          * are valid under /d only if the target is UTF-8.
10725                          * Those go in a separate list */
10726                         if (      IS_IN_SOME_FOLD_L1(j)
10727                             && ! (LOC && j != MICRO_SIGN))
10728                         {
10729 
10730                             /* ASCII is always matched; non-ASCII is matched
10731                              * only under Unicode rules (which could happen
10732                              * under /l if the locale is a UTF-8 one */
10733                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10734                                 *use_list = add_cp_to_invlist(*use_list,
10735                                                             PL_fold_latin1[j]);
10736                             }
10737                             else if (j != PL_fold_latin1[j]) {
10738                                 upper_latin1_only_utf8_matches
10739                                         = add_cp_to_invlist(
10740                                                 upper_latin1_only_utf8_matches,
10741                                                 PL_fold_latin1[j]);
10742                             }
10743                         }
10744 
10745                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10746                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10747                         {
10748                             add_above_Latin1_folds(pRExC_state,
10749                                                    (U8) j,
10750                                                    use_list);
10751                         }
10752                         continue;
10753                     }
10754 
10755                     /* Here is an above Latin1 character.  We don't have the
10756                      * rules hard-coded for it.  First, get its fold.  This is
10757                      * the simple fold, as the multi-character folds have been
10758                      * handled earlier and separated out */
10759                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10760                                                         (ASCII_FOLD_RESTRICTED)
10761                                                         ? FOLD_FLAGS_NOMIX_ASCII
10762                                                         : 0);
10763 
10764                     /* Single character fold of above Latin1.  Add everything
10765                      * in its fold closure to the list that this node should
10766                      * match. */
10767                     folds_count = _inverse_folds(folded, &first_fold,
10768                                                     &remaining_folds);
10769                     for (k = 0; k <= folds_count; k++) {
10770                         UV c = (k == 0)     /* First time through use itself */
10771                                 ? folded
10772                                 : (k == 1)  /* 2nd time use, the first fold */
10773                                    ? first_fold
10774 
10775                                      /* Then the remaining ones */
10776                                    : remaining_folds[k-2];
10777 
10778                         /* /aa doesn't allow folds between ASCII and non- */
10779                         if ((   ASCII_FOLD_RESTRICTED
10780                             && (isASCII(c) != isASCII(j))))
10781                         {
10782                             continue;
10783                         }
10784 
10785                         /* Folds under /l which cross the 255/256 boundary are
10786                          * added to a separate list.  (These are valid only
10787                          * when the locale is UTF-8.) */
10788                         if (c < 256 && LOC) {
10789                             *use_list = add_cp_to_invlist(*use_list, c);
10790                             continue;
10791                         }
10792 
10793                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10794                         {
10795                             cp_list = add_cp_to_invlist(cp_list, c);
10796                         }
10797                         else {
10798                             /* Similarly folds involving non-ascii Latin1
10799                              * characters under /d are added to their list */
10800                             upper_latin1_only_utf8_matches
10801                                     = add_cp_to_invlist(
10802                                                 upper_latin1_only_utf8_matches,
10803                                                 c);
10804                         }
10805                     }
10806                 }
10807             }
10808             SvREFCNT_dec_NN(fold_intersection);
10809         }
10810 
10811         /* Now that we have finished adding all the folds, there is no reason
10812          * to keep the foldable list separate */
10813         _invlist_union(cp_list, cp_foldable_list, &cp_list);
10814         SvREFCNT_dec_NN(cp_foldable_list);
10815     }
10816 
10817     /* And combine the result (if any) with any inversion lists from posix
10818      * classes.  The lists are kept separate up to now because we don't want to
10819      * fold the classes */
10820     if (simple_posixes) {   /* These are the classes known to be unaffected by
10821                                /a, /aa, and /d */
10822         if (cp_list) {
10823             _invlist_union(cp_list, simple_posixes, &cp_list);
10824             SvREFCNT_dec_NN(simple_posixes);
10825         }
10826         else {
10827             cp_list = simple_posixes;
10828         }
10829     }
10830     if (posixes || nposixes) {
10831         if (! DEPENDS_SEMANTICS) {
10832 
10833             /* For everything but /d, we can just add the current 'posixes' and
10834              * 'nposixes' to the main list */
10835             if (posixes) {
10836                 if (cp_list) {
10837                     _invlist_union(cp_list, posixes, &cp_list);
10838                     SvREFCNT_dec_NN(posixes);
10839                 }
10840                 else {
10841                     cp_list = posixes;
10842                 }
10843             }
10844             if (nposixes) {
10845                 if (cp_list) {
10846                     _invlist_union(cp_list, nposixes, &cp_list);
10847                     SvREFCNT_dec_NN(nposixes);
10848                 }
10849                 else {
10850                     cp_list = nposixes;
10851                 }
10852             }
10853         }
10854         else {
10855             /* Under /d, things like \w match upper Latin1 characters only if
10856              * the target string is in UTF-8.  But things like \W match all the
10857              * upper Latin1 characters if the target string is not in UTF-8.
10858              *
10859              * Handle the case with something like \W separately */
10860             if (nposixes) {
10861                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10862 
10863                 /* A complemented posix class matches all upper Latin1
10864                  * characters if not in UTF-8.  And it matches just certain
10865                  * ones when in UTF-8.  That means those certain ones are
10866                  * matched regardless, so can just be added to the
10867                  * unconditional list */
10868                 if (cp_list) {
10869                     _invlist_union(cp_list, nposixes, &cp_list);
10870                     SvREFCNT_dec_NN(nposixes);
10871                     nposixes = NULL;
10872                 }
10873                 else {
10874                     cp_list = nposixes;
10875                 }
10876 
10877                 /* Likewise for 'posixes' */
10878                 _invlist_union(posixes, cp_list, &cp_list);
10879                 SvREFCNT_dec(posixes);
10880 
10881                 /* Likewise for anything else in the range that matched only
10882                  * under UTF-8 */
10883                 if (upper_latin1_only_utf8_matches) {
10884                     _invlist_union(cp_list,
10885                                    upper_latin1_only_utf8_matches,
10886                                    &cp_list);
10887                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10888                     upper_latin1_only_utf8_matches = NULL;
10889                 }
10890 
10891                 /* If we don't match all the upper Latin1 characters regardless
10892                  * of UTF-8ness, we have to set a flag to match the rest when
10893                  * not in UTF-8 */
10894                 _invlist_subtract(only_non_utf8_list, cp_list,
10895                                   &only_non_utf8_list);
10896                 if (_invlist_len(only_non_utf8_list) != 0) {
10897                     anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10898                 }
10899                 SvREFCNT_dec_NN(only_non_utf8_list);
10900             }
10901             else {
10902                 /* Here there were no complemented posix classes.  That means
10903                  * the upper Latin1 characters in 'posixes' match only when the
10904                  * target string is in UTF-8.  So we have to add them to the
10905                  * list of those types of code points, while adding the
10906                  * remainder to the unconditional list.
10907                  *
10908                  * First calculate what they are */
10909                 SV* nonascii_but_latin1_properties = NULL;
10910                 _invlist_intersection(posixes, PL_UpperLatin1,
10911                                       &nonascii_but_latin1_properties);
10912 
10913                 /* And add them to the final list of such characters. */
10914                 _invlist_union(upper_latin1_only_utf8_matches,
10915                                nonascii_but_latin1_properties,
10916                                &upper_latin1_only_utf8_matches);
10917 
10918                 /* Remove them from what now becomes the unconditional list */
10919                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
10920                                   &posixes);
10921 
10922                 /* And add those unconditional ones to the final list */
10923                 if (cp_list) {
10924                     _invlist_union(cp_list, posixes, &cp_list);
10925                     SvREFCNT_dec_NN(posixes);
10926                     posixes = NULL;
10927                 }
10928                 else {
10929                     cp_list = posixes;
10930                 }
10931 
10932                 SvREFCNT_dec(nonascii_but_latin1_properties);
10933 
10934                 /* Get rid of any characters from the conditional list that we
10935                  * now know are matched unconditionally, which may make that
10936                  * list empty */
10937                 _invlist_subtract(upper_latin1_only_utf8_matches,
10938                                   cp_list,
10939                                   &upper_latin1_only_utf8_matches);
10940                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
10941                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10942                     upper_latin1_only_utf8_matches = NULL;
10943                 }
10944             }
10945         }
10946     }
10947 
10948     /* And combine the result (if any) with any inversion list from properties.
10949      * The lists are kept separate up to now so that we can distinguish the two
10950      * in regards to matching above-Unicode.  A run-time warning is generated
10951      * if a Unicode property is matched against a non-Unicode code point. But,
10952      * we allow user-defined properties to match anything, without any warning,
10953      * and we also suppress the warning if there is a portion of the character
10954      * class that isn't a Unicode property, and which matches above Unicode, \W
10955      * or [\x{110000}] for example.
10956      * (Note that in this case, unlike the Posix one above, there is no
10957      * <upper_latin1_only_utf8_matches>, because having a Unicode property
10958      * forces Unicode semantics */
10959     if (properties) {
10960         if (cp_list) {
10961 
10962             /* If it matters to the final outcome, see if a non-property
10963              * component of the class matches above Unicode.  If so, the
10964              * warning gets suppressed.  This is true even if just a single
10965              * such code point is specified, as, though not strictly correct if
10966              * another such code point is matched against, the fact that they
10967              * are using above-Unicode code points indicates they should know
10968              * the issues involved */
10969             if (warn_super) {
10970                 warn_super = ! (invert
10971                                ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
10972             }
10973 
10974             _invlist_union(properties, cp_list, &cp_list);
10975             SvREFCNT_dec_NN(properties);
10976         }
10977         else {
10978             cp_list = properties;
10979         }
10980 
10981         if (warn_super) {
10982             anyof_flags |= ANYOF_WARN_SUPER__shared;
10983 
10984             /* Because an ANYOF node is the only one that warns, this node
10985              * can't be optimized into something else */
10986             optimizable = FALSE;
10987         }
10988     }
10989 
10990     /* Here, we have calculated what code points should be in the character
10991      * class.
10992      *
10993      * Now we can see about various optimizations.  Fold calculation (which we
10994      * did above) needs to take place before inversion.  Otherwise /[^k]/i
10995      * would invert to include K, which under /i would match k, which it
10996      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
10997      * folded until runtime */
10998 
10999     /* If we didn't do folding, it's because some information isn't available
11000      * until runtime; set the run-time fold flag for these  We know to set the
11001      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11002      * at least one 0-255 range code point */
11003     if (LOC && FOLD) {
11004 
11005         /* Some things on the list might be unconditionally included because of
11006          * other components.  Remove them, and clean up the list if it goes to
11007          * 0 elements */
11008         if (only_utf8_locale_list && cp_list) {
11009             _invlist_subtract(only_utf8_locale_list, cp_list,
11010                               &only_utf8_locale_list);
11011 
11012             if (_invlist_len(only_utf8_locale_list) == 0) {
11013                 SvREFCNT_dec_NN(only_utf8_locale_list);
11014                 only_utf8_locale_list = NULL;
11015             }
11016         }
11017         if (    only_utf8_locale_list
11018             || (    cp_list
11019                 && (   _invlist_contains_cp(cp_list,
11020                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11021                     || _invlist_contains_cp(cp_list,
11022                                             LATIN_SMALL_LETTER_DOTLESS_I))))
11023         {
11024             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11025             anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11026         }
11027         else if (cp_list && invlist_lowest(cp_list) < 256) {
11028             /* If nothing is below 256, has no locale dependency; otherwise it
11029              * does */
11030             anyof_flags |= ANYOFL_FOLD;
11031             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11032 
11033             /* In a Turkish locale these could match, notify the run-time code
11034              * to check for that */
11035             if (   _invlist_contains_cp(cp_list, 'I')
11036                 || _invlist_contains_cp(cp_list, 'i'))
11037             {
11038                 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11039             }
11040         }
11041     }
11042     else if (   DEPENDS_SEMANTICS
11043              && (    upper_latin1_only_utf8_matches
11044                  || (  anyof_flags
11045                      & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11046     {
11047         RExC_seen_d_op = TRUE;
11048         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11049     }
11050 
11051     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11052      * compile time. */
11053     if (     cp_list
11054         &&   invert
11055         && ! has_runtime_dependency)
11056     {
11057         _invlist_invert(cp_list);
11058 
11059         /* Clear the invert flag since have just done it here */
11060         invert = FALSE;
11061     }
11062 
11063     /* All possible optimizations below still have these characteristics.
11064      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11065      * routine) */
11066     *flagp |= HASWIDTH|SIMPLE;
11067 
11068     if (ret_invlist) {
11069         *ret_invlist = cp_list;
11070 
11071         return (cp_list) ? RExC_emit : 0;
11072     }
11073 
11074     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11075         RExC_contains_locale = 1;
11076     }
11077 
11078     if (optimizable) {
11079 
11080         /* Some character classes are equivalent to other nodes.  Such nodes
11081          * take up less room, and some nodes require fewer operations to
11082          * execute, than ANYOF nodes.  EXACTish nodes may be joinable with
11083          * adjacent nodes to improve efficiency. */
11084         op = optimize_regclass(pRExC_state, cp_list,
11085                                             only_utf8_locale_list,
11086                                             upper_latin1_only_utf8_matches,
11087                                             has_runtime_dependency,
11088                                             posixl,
11089                                             &anyof_flags, &invert, &ret, flagp);
11090         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11091 
11092         /* If optimized to something else and emitted, clean up and return */
11093         if (ret >= 0) {
11094             SvREFCNT_dec(cp_list);;
11095             SvREFCNT_dec(only_utf8_locale_list);
11096             SvREFCNT_dec(upper_latin1_only_utf8_matches);
11097             return ret;
11098         }
11099 
11100         /* If no optimization was found, an END was returned and we will now
11101          * emit an ANYOF */
11102         if (op == END) {
11103             op = ANYOF;
11104         }
11105     }
11106 
11107     /* Here are going to emit an ANYOF; set the particular type */
11108     if (op == ANYOF) {
11109         if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11110             op = ANYOFD;
11111         }
11112         else if (posixl) {
11113             op = ANYOFPOSIXL;
11114         }
11115         else if (LOC) {
11116             op = ANYOFL;
11117         }
11118     }
11119 
11120     ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11121     FILL_NODE(ret, op);        /* We set the argument later */
11122     RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11123     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11124 
11125     /* Here, <cp_list> contains all the code points we can determine at
11126      * compile time that match under all conditions.  Go through it, and
11127      * for things that belong in the bitmap, put them there, and delete from
11128      * <cp_list>.  While we are at it, see if everything above 255 is in the
11129      * list, and if so, set a flag to speed up execution */
11130 
11131     populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11132 
11133     if (posixl) {
11134         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11135     }
11136 
11137     if (invert) {
11138         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11139     }
11140 
11141     /* Here, the bitmap has been populated with all the Latin1 code points that
11142      * always match.  Can now add to the overall list those that match only
11143      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11144      * */
11145     if (upper_latin1_only_utf8_matches) {
11146         if (cp_list) {
11147             _invlist_union(cp_list,
11148                            upper_latin1_only_utf8_matches,
11149                            &cp_list);
11150             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11151         }
11152         else {
11153             cp_list = upper_latin1_only_utf8_matches;
11154         }
11155         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11156     }
11157 
11158     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11159                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11160                    ? listsv
11161                    : NULL,
11162                   only_utf8_locale_list);
11163 
11164     SvREFCNT_dec(cp_list);;
11165     SvREFCNT_dec(only_utf8_locale_list);
11166     return ret;
11167 }
11168 
11169 STATIC U8
S_optimize_regclass(pTHX_ RExC_state_t * pRExC_state,SV * cp_list,SV * only_utf8_locale_list,SV * upper_latin1_only_utf8_matches,const U32 has_runtime_dependency,const U32 posixl,U8 * anyof_flags,bool * invert,regnode_offset * ret,I32 * flagp)11170 S_optimize_regclass(pTHX_
11171                     RExC_state_t *pRExC_state,
11172                     SV * cp_list,
11173                     SV* only_utf8_locale_list,
11174                     SV* upper_latin1_only_utf8_matches,
11175                     const U32 has_runtime_dependency,
11176                     const U32 posixl,
11177                     U8  * anyof_flags,
11178                     bool * invert,
11179                     regnode_offset * ret,
11180                     I32 *flagp
11181                   )
11182 {
11183     /* This function exists just to make S_regclass() smaller.  It extracts out
11184      * the code that looks for potential optimizations away from a full generic
11185      * ANYOF node.  The parameter names are the same as the corresponding
11186      * variables in S_regclass.
11187      *
11188      * It returns the new op (the impossible END one if no optimization found)
11189      * and sets *ret to any created regnode.  If the new op is sufficiently
11190      * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11191      *
11192      * Certain of the parameters may be updated as a result of the changes
11193      * herein */
11194 
11195     U8 op = END;    /* The returned node-type, initialized to an impossible
11196                       one. */
11197     UV value = 0;
11198     PERL_UINT_FAST8_T i;
11199     UV partial_cp_count = 0;
11200     UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11201     UV   end[MAX_FOLD_FROMS+1] = { 0 };
11202     bool single_range = FALSE;
11203     UV lowest_cp = 0, highest_cp = 0;
11204 
11205     PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11206 
11207     if (cp_list) { /* Count the code points in enough ranges that we would see
11208                       all the ones possible in any fold in this version of
11209                       Unicode */
11210 
11211         invlist_iterinit(cp_list);
11212         for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11213             if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11214                 break;
11215             }
11216             partial_cp_count += end[i] - start[i] + 1;
11217         }
11218 
11219         if (i == 1) {
11220             single_range = TRUE;
11221         }
11222         invlist_iterfinish(cp_list);
11223 
11224         /* If we know at compile time that this matches every possible code
11225          * point, any run-time dependencies don't matter */
11226         if (start[0] == 0 && end[0] == UV_MAX) {
11227             if (*invert) {
11228                 goto return_OPFAIL;
11229             }
11230             else {
11231                 goto return_SANY;
11232             }
11233         }
11234 
11235         /* Use a clearer mnemonic for below */
11236         lowest_cp = start[0];
11237 
11238         highest_cp = invlist_highest(cp_list);
11239     }
11240 
11241     /* Similarly, for /l posix classes, if both a class and its complement
11242      * match, any run-time dependencies don't matter */
11243     if (posixl) {
11244         int namedclass;
11245         for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11246             if (   POSIXL_TEST(posixl, namedclass)      /* class */
11247                 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11248             {
11249                 if (*invert) {
11250                     goto return_OPFAIL;
11251                 }
11252                 goto return_SANY;
11253             }
11254         }
11255 
11256         /* For well-behaved locales, some classes are subsets of others, so
11257          * complementing the subset and including the non-complemented superset
11258          * should match everything, like [\D[:alnum:]], and
11259          * [[:^alpha:][:alnum:]], but some implementations of locales are
11260          * buggy, and khw thinks its a bad idea to have optimization change
11261          * behavior, even if it avoids an OS bug in a given case */
11262 
11263 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11264 
11265         /* If is a single posix /l class, can optimize to just that op.  Such a
11266          * node will not match anything in the Latin1 range, as that is not
11267          * determinable until runtime, but will match whatever the class does
11268          * outside that range.  (Note that some classes won't match anything
11269          * outside the range, like [:ascii:]) */
11270         if (   isSINGLE_BIT_SET(posixl)
11271             && (partial_cp_count == 0 || lowest_cp > 255))
11272         {
11273             U8 classnum;
11274             SV * class_above_latin1 = NULL;
11275             bool already_inverted;
11276             bool are_equivalent;
11277 
11278 
11279             namedclass = single_1bit_pos32(posixl);
11280             classnum = namedclass_to_classnum(namedclass);
11281 
11282             /* The named classes are such that the inverted number is one
11283              * larger than the non-inverted one */
11284             already_inverted = namedclass - classnum_to_namedclass(classnum);
11285 
11286             /* Create an inversion list of the official property, inverted if
11287              * the constructed node list is inverted, and restricted to only
11288              * the above latin1 code points, which are the only ones known at
11289              * compile time */
11290             _invlist_intersection_maybe_complement_2nd(
11291                                                 PL_AboveLatin1,
11292                                                 PL_XPosix_ptrs[classnum],
11293                                                 already_inverted,
11294                                                 &class_above_latin1);
11295             are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11296             SvREFCNT_dec_NN(class_above_latin1);
11297 
11298             if (are_equivalent) {
11299 
11300                 /* Resolve the run-time inversion flag with this possibly
11301                  * inverted class */
11302                 *invert = *invert ^ already_inverted;
11303 
11304                 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11305                 *ret = reg_node(pRExC_state, op);
11306                 FLAGS(REGNODE_p(*ret)) = classnum;
11307                 return op;
11308             }
11309         }
11310     }
11311 
11312     /* khw can't think of any other possible transformation involving these. */
11313     if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11314         return END;
11315     }
11316 
11317     if (! has_runtime_dependency) {
11318 
11319         /* If the list is empty, nothing matches.  This happens, for example,
11320          * when a Unicode property that doesn't match anything is the only
11321          * element in the character class (perluniprops.pod notes such
11322          * properties). */
11323         if (partial_cp_count == 0) {
11324             if (*invert) {
11325                 goto return_SANY;
11326             }
11327             else {
11328                 goto return_OPFAIL;
11329             }
11330         }
11331 
11332         /* If matches everything but \n */
11333         if (   start[0] == 0 && end[0] == '\n' - 1
11334             && start[1] == '\n' + 1 && end[1] == UV_MAX)
11335         {
11336             assert (! *invert);
11337             op = REG_ANY;
11338             *ret = reg_node(pRExC_state, op);
11339             MARK_NAUGHTY(1);
11340             return op;
11341         }
11342     }
11343 
11344     /* Next see if can optimize classes that contain just a few code points
11345      * into an EXACTish node.  The reason to do this is to let the optimizer
11346      * join this node with adjacent EXACTish ones, and ANYOF nodes require
11347      * runtime conversion to code point from UTF-8, which we'd like to avoid.
11348      *
11349      * An EXACTFish node can be generated even if not under /i, and vice versa.
11350      * But care must be taken.  An EXACTFish node has to be such that it only
11351      * matches precisely the code points in the class, but we want to generate
11352      * the least restrictive one that does that, to increase the odds of being
11353      * able to join with an adjacent node.  For example, if the class contains
11354      * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11355      * from matching.  Whether we are under /i or not is irrelevant in this
11356      * case.  Less obvious is the pattern qr/[\x{02BC}]n/i.  U+02BC is MODIFIER
11357      * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11358      * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE.  And so even though there
11359      * is no simple fold that includes \X{02BC}, there is a multi-char fold
11360      * that does, and so the node generated for it must be an EXACTFish one.
11361      * On the other hand qr/:/i should generate a plain EXACT node since the
11362      * colon participates in no fold whatsoever, and having it be EXACT tells
11363      * the optimizer the target string cannot match unless it has a colon in
11364      * it. */
11365     if (   ! posixl
11366         && ! *invert
11367 
11368             /* Only try if there are no more code points in the class than in
11369              * the max possible fold */
11370         &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11371     {
11372         /* We can always make a single code point class into an EXACTish node.
11373          * */
11374         if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11375             if (LOC) {
11376 
11377                 /* Here is /l:  Use EXACTL, except if there is a fold not known
11378                  * until runtime so shows as only a single code point here.
11379                  * For code points above 255, we know which can cause problems
11380                  * by having a potential fold to the Latin1 range. */
11381                 if (  ! FOLD
11382                     || (     lowest_cp > 255
11383                         && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11384                 {
11385                     op = EXACTL;
11386                 }
11387                 else {
11388                     op = EXACTFL;
11389                 }
11390             }
11391             else if (! FOLD) { /* Not /l and not /i */
11392                 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11393             }
11394             else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11395                                           small */
11396 
11397                 /* Under /i, it gets a little tricky.  A code point that
11398                  * doesn't participate in a fold should be an EXACT node.  We
11399                  * know this one isn't the result of a simple fold, or there'd
11400                  * be more than one code point in the list, but it could be
11401                  * part of a multi-character fold.  In that case we better not
11402                  * create an EXACT node, as we would wrongly be telling the
11403                  * optimizer that this code point must be in the target string,
11404                  * and that is wrong.  This is because if the sequence around
11405                  * this code point forms a multi-char fold, what needs to be in
11406                  * the string could be the code point that folds to the
11407                  * sequence.
11408                  *
11409                  * This handles the case of below-255 code points, as we have
11410                  * an easy look up for those.  The next clause handles the
11411                  * above-256 one */
11412                 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11413                      ? EXACTFU
11414                      : EXACT;
11415             }
11416             else {  /* /i, larger code point.  Since we are under /i, and have
11417                        just this code point, we know that it can't fold to
11418                        something else, so PL_InMultiCharFold applies to it */
11419                 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11420                          ? EXACTFU_REQ8
11421                          : EXACT_REQ8;
11422                 }
11423 
11424                 value = lowest_cp;
11425         }
11426         else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11427                  && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11428         {
11429             /* Here, the only runtime dependency, if any, is from /d, and the
11430              * class matches more than one code point, and the lowest code
11431              * point participates in some fold.  It might be that the other
11432              * code points are /i equivalent to this one, and hence they would
11433              * be representable by an EXACTFish node.  Above, we eliminated
11434              * classes that contain too many code points to be EXACTFish, with
11435              * the test for MAX_FOLD_FROMS
11436              *
11437              * First, special case the ASCII fold pairs, like 'B' and 'b'.  We
11438              * do this because we have EXACTFAA at our disposal for the ASCII
11439              * range */
11440             if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11441 
11442                 /* The only ASCII characters that participate in folds are
11443                  * alphabetics */
11444                 assert(isALPHA(lowest_cp));
11445                 if (   end[0] == start[0]   /* First range is a single
11446                                                character, so 2nd exists */
11447                     && isALPHA_FOLD_EQ(start[0], start[1]))
11448                 {
11449                     /* Here, is part of an ASCII fold pair */
11450 
11451                     if (   ASCII_FOLD_RESTRICTED
11452                         || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11453                     {
11454                         /* If the second clause just above was true, it means
11455                          * we can't be under /i, or else the list would have
11456                          * included more than this fold pair.  Therefore we
11457                          * have to exclude the possibility of whatever else it
11458                          * is that folds to these, by using EXACTFAA */
11459                         op = EXACTFAA;
11460                     }
11461                     else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11462 
11463                         /* Here, there's no simple fold that lowest_cp is part
11464                          * of, but there is a multi-character one.  If we are
11465                          * not under /i, we want to exclude that possibility;
11466                          * if under /i, we want to include it */
11467                         op = (FOLD) ? EXACTFU : EXACTFAA;
11468                     }
11469                     else {
11470 
11471                         /* Here, the only possible fold lowest_cp participates in
11472                          * is with start[1].  /i or not isn't relevant */
11473                         op = EXACTFU;
11474                     }
11475 
11476                     value = toFOLD(lowest_cp);
11477                 }
11478             }
11479             else if (  ! upper_latin1_only_utf8_matches
11480                      || (   _invlist_len(upper_latin1_only_utf8_matches) == 2
11481                          && PL_fold_latin1[
11482                            invlist_highest(upper_latin1_only_utf8_matches)]
11483                          == lowest_cp))
11484             {
11485                 /* Here, the smallest character is non-ascii or there are more
11486                  * than 2 code points matched by this node.  Also, we either
11487                  * don't have /d UTF-8 dependent matches, or if we do, they
11488                  * look like they could be a single character that is the fold
11489                  * of the lowest one is in the always-match list.  This test
11490                  * quickly excludes most of the false positives when there are
11491                  * /d UTF-8 depdendent matches.  These are like LATIN CAPITAL
11492                  * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11493                  * iff the target string is UTF-8.  (We don't have to worry
11494                  * above about exceeding the array bounds of PL_fold_latin1[]
11495                  * because any code point in 'upper_latin1_only_utf8_matches'
11496                  * is below 256.)
11497                  *
11498                  * EXACTFAA would apply only to pairs (hence exactly 2 code
11499                  * points) in the ASCII range, so we can't use it here to
11500                  * artificially restrict the fold domain, so we check if the
11501                  * class does or does not match some EXACTFish node.  Further,
11502                  * if we aren't under /i, and and the folded-to character is
11503                  * part of a multi-character fold, we can't do this
11504                  * optimization, as the sequence around it could be that
11505                  * multi-character fold, and we don't here know the context, so
11506                  * we have to assume it is that multi-char fold, to prevent
11507                  * potential bugs.
11508                  *
11509                  * To do the general case, we first find the fold of the lowest
11510                  * code point (which may be higher than that lowest unfolded
11511                  * one), then find everything that folds to it.  (The data
11512                  * structure we have only maps from the folded code points, so
11513                  * we have to do the earlier step.) */
11514 
11515                 Size_t foldlen;
11516                 U8 foldbuf[UTF8_MAXBYTES_CASE];
11517                 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11518                 U32 first_fold;
11519                 const U32 * remaining_folds;
11520                 Size_t folds_to_this_cp_count = _inverse_folds(
11521                                                             folded,
11522                                                             &first_fold,
11523                                                             &remaining_folds);
11524                 Size_t folds_count = folds_to_this_cp_count + 1;
11525                 SV * fold_list = _new_invlist(folds_count);
11526                 unsigned int i;
11527 
11528                 /* If there are UTF-8 dependent matches, create a temporary
11529                  * list of what this node matches, including them. */
11530                 SV * all_cp_list = NULL;
11531                 SV ** use_this_list = &cp_list;
11532 
11533                 if (upper_latin1_only_utf8_matches) {
11534                     all_cp_list = _new_invlist(0);
11535                     use_this_list = &all_cp_list;
11536                     _invlist_union(cp_list,
11537                                    upper_latin1_only_utf8_matches,
11538                                    use_this_list);
11539                 }
11540 
11541                 /* Having gotten everything that participates in the fold
11542                  * containing the lowest code point, we turn that into an
11543                  * inversion list, making sure everything is included. */
11544                 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11545                 fold_list = add_cp_to_invlist(fold_list, folded);
11546                 if (folds_to_this_cp_count > 0) {
11547                     fold_list = add_cp_to_invlist(fold_list, first_fold);
11548                     for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11549                         fold_list = add_cp_to_invlist(fold_list,
11550                                                     remaining_folds[i]);
11551                     }
11552                 }
11553 
11554                 /* If the fold list is identical to what's in this ANYOF node,
11555                  * the node can be represented by an EXACTFish one instead */
11556                 if (_invlistEQ(*use_this_list, fold_list,
11557                                0 /* Don't complement */ )
11558                 ) {
11559 
11560                     /* But, we have to be careful, as mentioned above.  Just
11561                      * the right sequence of characters could match this if it
11562                      * is part of a multi-character fold.  That IS what we want
11563                      * if we are under /i.  But it ISN'T what we want if not
11564                      * under /i, as it could match when it shouldn't.  So, when
11565                      * we aren't under /i and this character participates in a
11566                      * multi-char fold, we don't optimize into an EXACTFish
11567                      * node.  So, for each case below we have to check if we
11568                      * are folding, and if not, if it is not part of a
11569                      * multi-char fold.  */
11570                     if (lowest_cp > 255) {    /* Highish code point */
11571                         if (FOLD || ! _invlist_contains_cp(
11572                                                    PL_InMultiCharFold, folded))
11573                         {
11574                             op = (LOC)
11575                                  ? EXACTFLU8
11576                                  : (ASCII_FOLD_RESTRICTED)
11577                                    ? EXACTFAA
11578                                    : EXACTFU_REQ8;
11579                             value = folded;
11580                         }
11581                     }   /* Below, the lowest code point < 256 */
11582                     else if (    FOLD
11583                              &&  folded == 's'
11584                              &&  DEPENDS_SEMANTICS)
11585                     {   /* An EXACTF node containing a single character 's',
11586                            can be an EXACTFU if it doesn't get joined with an
11587                            adjacent 's' */
11588                         op = EXACTFU_S_EDGE;
11589                         value = folded;
11590                     }
11591                     else if (     FOLD
11592                              || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11593                     {
11594                         if (upper_latin1_only_utf8_matches) {
11595                             op = EXACTF;
11596 
11597                             /* We can't use the fold, as that only matches
11598                              * under UTF-8 */
11599                             value = lowest_cp;
11600                         }
11601                         else if (     UNLIKELY(lowest_cp == MICRO_SIGN)
11602                                  && ! UTF)
11603                         {   /* EXACTFUP is a special node for this character */
11604                             op = (ASCII_FOLD_RESTRICTED)
11605                                  ? EXACTFAA
11606                                  : EXACTFUP;
11607                             value = MICRO_SIGN;
11608                         }
11609                         else if (     ASCII_FOLD_RESTRICTED
11610                                  && ! isASCII(lowest_cp))
11611                         {   /* For ASCII under /iaa, we can use EXACTFU below
11612                              */
11613                             op = EXACTFAA;
11614                             value = folded;
11615                         }
11616                         else {
11617                             op = EXACTFU;
11618                             value = folded;
11619                         }
11620                     }
11621                 }
11622 
11623                 SvREFCNT_dec_NN(fold_list);
11624                 SvREFCNT_dec(all_cp_list);
11625             }
11626         }
11627 
11628         if (op != END) {
11629             U8 len;
11630 
11631             /* Here, we have calculated what EXACTish node to use.  Have to
11632              * convert to UTF-8 if not already there */
11633             if (value > 255) {
11634                 if (! UTF) {
11635                     SvREFCNT_dec(cp_list);;
11636                     REQUIRE_UTF8(flagp);
11637                 }
11638 
11639                 /* This is a kludge to the special casing issues with this
11640                  * ligature under /aa.  FB05 should fold to FB06, but the call
11641                  * above to _to_uni_fold_flags() didn't find this, as it didn't
11642                  * use the /aa restriction in order to not miss other folds
11643                  * that would be affected.  This is the only instance likely to
11644                  * ever be a problem in all of Unicode.  So special case it. */
11645                 if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
11646                     && ASCII_FOLD_RESTRICTED)
11647                 {
11648                     value = LATIN_SMALL_LIGATURE_ST;
11649                 }
11650             }
11651 
11652             len = (UTF) ? UVCHR_SKIP(value) : 1;
11653 
11654             *ret = REGNODE_GUTS(pRExC_state, op, len);
11655             FILL_NODE(*ret, op);
11656             RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11657             setSTR_LEN(REGNODE_p(*ret), len);
11658             if (len == 1) {
11659                 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11660             }
11661             else {
11662                 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11663             }
11664 
11665             return op;
11666         }
11667     }
11668 
11669     if (! has_runtime_dependency) {
11670 
11671         /* See if this can be turned into an ANYOFM node.  Think about the bit
11672          * patterns in two different bytes.  In some positions, the bits in
11673          * each will be 1; and in other positions both will be 0; and in some
11674          * positions the bit will be 1 in one byte, and 0 in the other.  Let
11675          * 'n' be the number of positions where the bits differ.  We create a
11676          * mask which has exactly 'n' 0 bits, each in a position where the two
11677          * bytes differ.  Now take the set of all bytes that when ANDed with
11678          * the mask yield the same result.  That set has 2**n elements, and is
11679          * representable by just two 8 bit numbers: the result and the mask.
11680          * Importantly, matching the set can be vectorized by creating a word
11681          * full of the result bytes, and a word full of the mask bytes,
11682          * yielding a significant speed up.  Here, see if this node matches
11683          * such a set.  As a concrete example consider [01], and the byte
11684          * representing '0' which is 0x30 on ASCII machines.  It has the bits
11685          * 0011 0000.  Take the mask 1111 1110.  If we AND 0x31 and 0x30 with
11686          * that mask we get 0x30.  Any other bytes ANDed yield something else.
11687          * So [01], which is a common usage, is optimizable into ANYOFM, and
11688          * can benefit from the speed up.  We can only do this on UTF-8
11689          * invariant bytes, because they have the same bit patterns under UTF-8
11690          * as not. */
11691         PERL_UINT_FAST8_T inverted = 0;
11692 
11693         /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11694          * EBCDIC */
11695         const PERL_UINT_FAST8_T max_permissible
11696                                     = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11697 
11698         /* If doesn't fit the criteria for ANYOFM, invert and try again.  If
11699          * that works we will instead later generate an NANYOFM, and invert
11700          * back when through */
11701         if (highest_cp > max_permissible) {
11702             _invlist_invert(cp_list);
11703             inverted = 1;
11704         }
11705 
11706         if (invlist_highest(cp_list) <= max_permissible) {
11707             UV this_start, this_end;
11708             UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
11709             U8 bits_differing = 0;
11710             Size_t full_cp_count = 0;
11711             bool first_time = TRUE;
11712 
11713             /* Go through the bytes and find the bit positions that differ */
11714             invlist_iterinit(cp_list);
11715             while (invlist_iternext(cp_list, &this_start, &this_end)) {
11716                 unsigned int i = this_start;
11717 
11718                 if (first_time) {
11719                     if (! UVCHR_IS_INVARIANT(i)) {
11720                         goto done_anyofm;
11721                     }
11722 
11723                     first_time = FALSE;
11724                     lowest_cp = this_start;
11725 
11726                     /* We have set up the code point to compare with.  Don't
11727                      * compare it with itself */
11728                     i++;
11729                 }
11730 
11731                 /* Find the bit positions that differ from the lowest code
11732                  * point in the node.  Keep track of all such positions by
11733                  * OR'ing */
11734                 for (; i <= this_end; i++) {
11735                     if (! UVCHR_IS_INVARIANT(i)) {
11736                         goto done_anyofm;
11737                     }
11738 
11739                     bits_differing  |= i ^ lowest_cp;
11740                 }
11741 
11742                 full_cp_count += this_end - this_start + 1;
11743             }
11744 
11745             /* At the end of the loop, we count how many bits differ from the
11746              * bits in lowest code point, call the count 'd'.  If the set we
11747              * found contains 2**d elements, it is the closure of all code
11748              * points that differ only in those bit positions.  To convince
11749              * yourself of that, first note that the number in the closure must
11750              * be a power of 2, which we test for.  The only way we could have
11751              * that count and it be some differing set, is if we got some code
11752              * points that don't differ from the lowest code point in any
11753              * position, but do differ from each other in some other position.
11754              * That means one code point has a 1 in that position, and another
11755              * has a 0.  But that would mean that one of them differs from the
11756              * lowest code point in that position, which possibility we've
11757              * already excluded.  */
11758             if (  (inverted || full_cp_count > 1)
11759                 && full_cp_count == 1U << PL_bitcount[bits_differing])
11760             {
11761                 U8 ANYOFM_mask;
11762 
11763                 op = ANYOFM + inverted;;
11764 
11765                 /* We need to make the bits that differ be 0's */
11766                 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11767 
11768                 /* The argument is the lowest code point */
11769                 *ret = reg1node(pRExC_state, op, lowest_cp);
11770                 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11771             }
11772 
11773           done_anyofm:
11774             invlist_iterfinish(cp_list);
11775         }
11776 
11777         if (inverted) {
11778             _invlist_invert(cp_list);
11779         }
11780 
11781         if (op != END) {
11782             return op;
11783         }
11784 
11785         /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11786          * were invariants, it wasn't inverted, and there is a single range.
11787          * This would be faster than some of the posix nodes we create below
11788          * like /\d/a, but would be twice the size.  Without having actually
11789          * measured the gain, khw doesn't think the tradeoff is really worth it
11790          * */
11791     }
11792 
11793     if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11794         PERL_UINT_FAST8_T type;
11795         SV * intersection = NULL;
11796         SV* d_invlist = NULL;
11797 
11798         /* See if this matches any of the POSIX classes.  The POSIXA and POSIXD
11799          * ones are about the same speed as ANYOF ops, but take less room; the
11800          * ones that have above-Latin1 code point matches are somewhat faster
11801          * than ANYOF. */
11802 
11803         for (type = POSIXA; type >= POSIXD; type--) {
11804             int posix_class;
11805 
11806             if (type == POSIXL) {   /* But not /l posix classes */
11807                 continue;
11808             }
11809 
11810             for (posix_class = 0;
11811                  posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11812                  posix_class++)
11813             {
11814                 SV** our_code_points = &cp_list;
11815                 SV** official_code_points;
11816                 int try_inverted;
11817 
11818                 if (type == POSIXA) {
11819                     official_code_points = &PL_Posix_ptrs[posix_class];
11820                 }
11821                 else {
11822                     official_code_points = &PL_XPosix_ptrs[posix_class];
11823                 }
11824 
11825                 /* Skip non-existent classes of this type.  e.g. \v only has an
11826                  * entry in PL_XPosix_ptrs */
11827                 if (! *official_code_points) {
11828                     continue;
11829                 }
11830 
11831                 /* Try both the regular class, and its inversion */
11832                 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11833                     bool this_inverted = *invert ^ try_inverted;
11834 
11835                     if (type != POSIXD) {
11836 
11837                         /* This class that isn't /d can't match if we have /d
11838                          * dependencies */
11839                         if (has_runtime_dependency
11840                                                 & HAS_D_RUNTIME_DEPENDENCY)
11841                         {
11842                             continue;
11843                         }
11844                     }
11845                     else /* is /d */ if (! this_inverted) {
11846 
11847                         /* /d classes don't match anything non-ASCII below 256
11848                          * unconditionally (which cp_list contains) */
11849                         _invlist_intersection(cp_list, PL_UpperLatin1,
11850                                                        &intersection);
11851                         if (_invlist_len(intersection) != 0) {
11852                             continue;
11853                         }
11854 
11855                         SvREFCNT_dec(d_invlist);
11856                         d_invlist = invlist_clone(cp_list, NULL);
11857 
11858                         /* But under UTF-8 it turns into using /u rules.  Add
11859                          * the things it matches under these conditions so that
11860                          * we check below that these are identical to what the
11861                          * tested class should match */
11862                         if (upper_latin1_only_utf8_matches) {
11863                             _invlist_union(
11864                                         d_invlist,
11865                                         upper_latin1_only_utf8_matches,
11866                                         &d_invlist);
11867                         }
11868                         our_code_points = &d_invlist;
11869                     }
11870                     else {  /* POSIXD, inverted.  If this doesn't have this
11871                                flag set, it isn't /d. */
11872                         if (! ( *anyof_flags
11873                                & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11874                         {
11875                             continue;
11876                         }
11877 
11878                         our_code_points = &cp_list;
11879                     }
11880 
11881                     /* Here, have weeded out some things.  We want to see if
11882                      * the list of characters this node contains
11883                      * ('*our_code_points') precisely matches those of the
11884                      * class we are currently checking against
11885                      * ('*official_code_points'). */
11886                     if (_invlistEQ(*our_code_points,
11887                                    *official_code_points,
11888                                    try_inverted))
11889                     {
11890                         /* Here, they precisely match.  Optimize this ANYOF
11891                          * node into its equivalent POSIX one of the correct
11892                          * type, possibly inverted.
11893                          *
11894                          * Some of these nodes match a single range of
11895                          * characters (or [:alpha:] matches two parallel ranges
11896                          * on ASCII platforms).  The array lookup at execution
11897                          * time could be replaced by a range check for such
11898                          * nodes.  But regnodes are a finite resource, and the
11899                          * possible performance boost isn't large, so this
11900                          * hasn't been done.  An attempt to use just one node
11901                          * (and its inverse) to encompass all such cases was
11902                          * made in d62feba66bf43f35d092bb026694f927e9f94d38.
11903                          * But the shifting/masking it used ended up being
11904                          * slower than the array look up, so it was reverted */
11905                         op = (try_inverted)
11906                             ? type + NPOSIXA - POSIXA
11907                             : type;
11908                         *ret = reg_node(pRExC_state, op);
11909                         FLAGS(REGNODE_p(*ret)) = posix_class;
11910                         SvREFCNT_dec(d_invlist);
11911                         SvREFCNT_dec(intersection);
11912                         return op;
11913                     }
11914                 }
11915             }
11916         }
11917         SvREFCNT_dec(d_invlist);
11918         SvREFCNT_dec(intersection);
11919     }
11920 
11921     /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
11922      * in size and speed.  Currently, a 20 bit range base (smallest code point
11923      * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
11924      * This allows for using it on all of the Unicode code points except for
11925      * the highest plane, which is only for private use code points.  khw
11926      * doubts that a bigger delta is likely in real world applications */
11927     if (     single_range
11928         && ! has_runtime_dependency
11929         &&   *anyof_flags == 0
11930         &&   start[0] < (1 << ANYOFR_BASE_BITS)
11931         &&   end[0] - start[0]
11932                 < ((1U << (sizeof(ARG1u_LOC(NULL))
11933                                * CHARBITS - ANYOFR_BASE_BITS))))
11934 
11935     {
11936         U8 low_utf8[UTF8_MAXBYTES+1];
11937         U8 high_utf8[UTF8_MAXBYTES+1];
11938 
11939         op = ANYOFR;
11940         *ret = reg1node(pRExC_state, op,
11941                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
11942 
11943         /* Place the lowest UTF-8 start byte in the flags field, so as to allow
11944          * efficient ruling out at run time of many possible inputs.  */
11945         (void) uvchr_to_utf8(low_utf8, start[0]);
11946         (void) uvchr_to_utf8(high_utf8, end[0]);
11947 
11948         /* If all code points share the same first byte, this can be an
11949          * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
11950          * quickly rule out many inputs at run-time without having to compute
11951          * the code point from UTF-8.  For EBCDIC, we use I8, as not doing that
11952          * transformation would not rule out nearly so many things */
11953         if (low_utf8[0] == high_utf8[0]) {
11954             op = ANYOFRb;
11955             OP(REGNODE_p(*ret)) = op;
11956             ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
11957         }
11958         else {
11959             ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
11960         }
11961 
11962         return op;
11963     }
11964 
11965     /* If didn't find an optimization and there is no need for a bitmap,
11966      * of the lowest code points, optimize to indicate that */
11967     if (     lowest_cp >= NUM_ANYOF_CODE_POINTS
11968         && ! LOC
11969         && ! upper_latin1_only_utf8_matches
11970         &&   *anyof_flags == 0)
11971     {
11972         U8 low_utf8[UTF8_MAXBYTES+1];
11973         UV highest_cp = invlist_highest(cp_list);
11974 
11975         /* Currently the maximum allowed code point by the system is IV_MAX.
11976          * Higher ones are reserved for future internal use.  This particular
11977          * regnode can be used for higher ones, but we can't calculate the code
11978          * point of those.  IV_MAX suffices though, as it will be a large first
11979          * byte */
11980         Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
11981                        - low_utf8;
11982 
11983         /* We store the lowest possible first byte of the UTF-8 representation,
11984          * using the flags field.  This allows for quick ruling out of some
11985          * inputs without having to convert from UTF-8 to code point.  For
11986          * EBCDIC, we use I8, as not doing that transformation would not rule
11987          * out nearly so many things */
11988         *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
11989 
11990         op = ANYOFH;
11991 
11992         /* If the first UTF-8 start byte for the highest code point in the
11993          * range is suitably small, we may be able to get an upper bound as
11994          * well */
11995         if (highest_cp <= IV_MAX) {
11996             U8 high_utf8[UTF8_MAXBYTES+1];
11997             Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
11998 
11999             /* If the lowest and highest are the same, we can get an exact
12000              * first byte instead of a just minimum or even a sequence of exact
12001              * leading bytes.  We signal these with different regnodes */
12002             if (low_utf8[0] == high_utf8[0]) {
12003                 Size_t len = find_first_differing_byte_pos(low_utf8,
12004                                                            high_utf8,
12005                                                    MIN(low_len, high_len));
12006                 if (len == 1) {
12007 
12008                     /* No need to convert to I8 for EBCDIC as this is an exact
12009                      * match */
12010                     *anyof_flags = low_utf8[0];
12011 
12012                     if (high_len == 2) {
12013                         /* If the elements matched all have a 2-byte UTF-8
12014                          * representation, with the first byte being the same,
12015                          * we can use a compact, fast regnode. capable of
12016                          * matching any combination of continuation byte
12017                          * patterns.
12018                          *
12019                          * (A similar regnode could be created for the Latin1
12020                          * range; the complication being that it could match
12021                          * non-UTF8 targets.  The internal bitmap would serve
12022                          * both cases; with some extra code in regexec.c) */
12023                         op = ANYOFHbbm;
12024                         *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12025                         FILL_NODE(*ret, op);
12026                         FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12027 
12028                         /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12029                          * directly based on the continuation byte, without
12030                          * needing to convert to code point */
12031                         populate_bitmap_from_invlist(
12032                             cp_list,
12033 
12034                             /* The base code point is from the start byte */
12035                             TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12036                                                     UTF_CONTINUATION_MARK | 0),
12037 
12038                             ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12039                             REGNODE_BBM_BITMAP_LEN);
12040                         RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12041                         return op;
12042                     }
12043                     else {
12044                         op = ANYOFHb;
12045                     }
12046                 }
12047                 else {
12048                     op = ANYOFHs;
12049                     *ret = REGNODE_GUTS(pRExC_state, op,
12050                                        REGNODE_ARG_LEN(op) + STR_SZ(len));
12051                     FILL_NODE(*ret, op);
12052                     STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12053                                                                     = len;
12054                     Copy(low_utf8,  /* Add the common bytes */
12055                     ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12056                        len, U8);
12057                     RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12058                     set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12059                                               NULL, only_utf8_locale_list);
12060                     return op;
12061                 }
12062             }
12063             else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12064 
12065                 /* Here, the high byte is not the same as the low, but is small
12066                  * enough that its reasonable to have a loose upper bound,
12067                  * which is packed in with the strict lower bound.  See
12068                  * comments at the definition of MAX_ANYOF_HRx_BYTE.  On EBCDIC
12069                  * platforms, I8 is used.  On ASCII platforms I8 is the same
12070                  * thing as UTF-8 */
12071 
12072                 U8 bits = 0;
12073                 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12074                 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12075                             - *anyof_flags;
12076 
12077                 if (range_diff <= max_range_diff / 8) {
12078                     bits = 3;
12079                 }
12080                 else if (range_diff <= max_range_diff / 4) {
12081                     bits = 2;
12082                 }
12083                 else if (range_diff <= max_range_diff / 2) {
12084                     bits = 1;
12085                 }
12086                 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12087                 op = ANYOFHr;
12088             }
12089         }
12090     }
12091 
12092     return op;
12093 
12094   return_OPFAIL:
12095     op = OPFAIL;
12096     *ret = reg1node(pRExC_state, op, 0);
12097     return op;
12098 
12099   return_SANY:
12100     op = SANY;
12101     *ret = reg_node(pRExC_state, op);
12102     MARK_NAUGHTY(1);
12103     return op;
12104 }
12105 
12106 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12107 
12108 void
Perl_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)12109 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12110                 regnode* const node,
12111                 SV* const cp_list,
12112                 SV* const runtime_defns,
12113                 SV* const only_utf8_locale_list)
12114 {
12115     /* Sets the arg field of an ANYOF-type node 'node', using information about
12116      * the node passed-in.  If only the bitmap is needed to determine what
12117      * matches, the arg is set appropriately to either
12118      *      1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12119      *      2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12120      *
12121      * Otherwise, it sets the argument to the count returned by reg_add_data(),
12122      * having allocated and stored an array, av, as follows:
12123      *  av[0] stores the inversion list defining this class as far as known at
12124      *        this time, or PL_sv_undef if nothing definite is now known.
12125      *  av[1] stores the inversion list of code points that match only if the
12126      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
12127      *        av[2], or no entry otherwise.
12128      *  av[2] stores the list of user-defined properties whose subroutine
12129      *        definitions aren't known at this time, or no entry if none. */
12130 
12131     UV n;
12132 
12133     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12134 
12135     /* If this is set, the final disposition won't be known until runtime, so
12136      * we can't do any of the compile time optimizations */
12137     if (! runtime_defns) {
12138 
12139         /* On plain ANYOF nodes without the possibility of a runtime locale
12140          * making a difference, maybe there's no information to be gleaned
12141          * except for what's in the bitmap */
12142         if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12143 
12144             /* There are two such cases:
12145              *  1)  there is no list of code points matched outside the bitmap
12146              */
12147             if (! cp_list) {
12148                 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12149                 return;
12150             }
12151 
12152             /*  2)  the list indicates everything outside the bitmap matches */
12153             if (   invlist_highest(cp_list) == UV_MAX
12154                 && invlist_highest_range_start(cp_list)
12155                                                        <= NUM_ANYOF_CODE_POINTS)
12156             {
12157                 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12158                 return;
12159             }
12160 
12161             /* In all other cases there are things outside the bitmap that we
12162              * may need to check at runtime. */
12163         }
12164 
12165         /* Here, we have resolved all the possible run-time matches, and they
12166          * are stored in one or both of two possible lists.  (While some match
12167          * only under certain runtime circumstances, we know all the possible
12168          * ones for each such circumstance.)
12169          *
12170          * It may very well be that the pattern being compiled contains an
12171          * identical class, already encountered.  Reusing that class here saves
12172          * space.  Look through all classes so far encountered. */
12173         U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12174         for (unsigned int i = 0; i < existing_items; i++) {
12175 
12176             /* Only look at auxiliary data of this type */
12177             if (RExC_rxi->data->what[i] != 's') {
12178                 continue;
12179             }
12180 
12181             SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12182             AV * const av = MUTABLE_AV(SvRV(rv));
12183 
12184             /* If the already encountered class has data that won't be known
12185              * until runtime (stored in the final element of the array), we
12186              * can't share */
12187             if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12188                 continue;
12189             }
12190 
12191             SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12192                                                 false /* no lvalue */);
12193 
12194             /* The new and the existing one both have to have or both not
12195              * have this element, for this one to duplicate that one */
12196             if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12197                 continue;
12198             }
12199 
12200             /* If the inversion lists aren't equivalent, can't share */
12201             if (cp_list && ! _invlistEQ(cp_list,
12202                                         *stored_cp_list_ptr,
12203                                         FALSE /* don't complement */))
12204             {
12205                 continue;
12206             }
12207 
12208             /* Similarly for the other list */
12209             SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12210                                                 av,
12211                                                 ONLY_LOCALE_MATCHES_INDEX,
12212                                                 false /* no lvalue */);
12213             if (   cBOOL(only_utf8_locale_list)
12214                 != cBOOL(stored_only_utf8_locale_list_ptr))
12215             {
12216                 continue;
12217             }
12218 
12219             if (only_utf8_locale_list && ! _invlistEQ(
12220                                          only_utf8_locale_list,
12221                                          *stored_only_utf8_locale_list_ptr,
12222                                          FALSE /* don't complement */))
12223             {
12224                 continue;
12225             }
12226 
12227             /* Here, the existence and contents of both compile-time lists
12228              * are identical between the new and existing data.  Re-use the
12229              * existing one */
12230             ARG1u_SET(node, i);
12231             return;
12232         } /* end of loop through existing classes */
12233     }
12234 
12235     /* Here, we need to create a new auxiliary data element; either because
12236      * this doesn't duplicate an existing one, or we can't tell at this time if
12237      * it eventually will */
12238 
12239     AV * const av = newAV();
12240     SV *rv;
12241 
12242     if (cp_list) {
12243         av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12244     }
12245 
12246     /* (Note that if any of this changes, the size calculations in
12247      * S_optimize_regclass() might need to be updated.) */
12248 
12249     if (only_utf8_locale_list) {
12250         av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12251                                        SvREFCNT_inc_NN(only_utf8_locale_list));
12252     }
12253 
12254     if (runtime_defns) {
12255         av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12256                      SvREFCNT_inc_NN(runtime_defns));
12257     }
12258 
12259     rv = newRV_noinc(MUTABLE_SV(av));
12260     n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12261     RExC_rxi->data->data[n] = (void*)rv;
12262     ARG1u_SET(node, n);
12263 }
12264 
12265 SV *
12266 
12267 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_aux_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)12268 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12269 #else
12270 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12271 #endif
12272 
12273 {
12274     /* For internal core use only.
12275      * Returns the inversion list for the input 'node' in the regex 'prog'.
12276      * If <doinit> is 'true', will attempt to create the inversion list if not
12277      *    already done.  If it is created, it will add to the normal inversion
12278      *    list any that comes from user-defined properties.  It croaks if this
12279      *    is called before such a list is ready to be generated, that is when a
12280      *    user-defined property has been declared, buyt still not yet defined.
12281      * If <listsvp> is non-null, will return the printable contents of the
12282      *    property definition.  This can be used to get debugging information
12283      *    even before the inversion list exists, by calling this function with
12284      *    'doinit' set to false, in which case the components that will be used
12285      *    to eventually create the inversion list are returned  (in a printable
12286      *    form).
12287      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12288      *    store an inversion list of code points that should match only if the
12289      *    execution-time locale is a UTF-8 one.
12290      * If <output_invlist> is not NULL, it is where this routine is to store an
12291      *    inversion list of the code points that would be instead returned in
12292      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
12293      *    when this parameter is used, is just the non-code point data that
12294      *    will go into creating the inversion list.  This currently should be just
12295      *    user-defined properties whose definitions were not known at compile
12296      *    time.  Using this parameter allows for easier manipulation of the
12297      *    inversion list's data by the caller.  It is illegal to call this
12298      *    function with this parameter set, but not <listsvp>
12299      *
12300      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
12301      * that, in spite of this function's name, the inversion list it returns
12302      * may include the bitmap data as well */
12303 
12304     SV *si  = NULL;         /* Input initialization string */
12305     SV* invlist = NULL;
12306 
12307     RXi_GET_DECL_NULL(prog, progi);
12308     const struct reg_data * const data = prog ? progi->data : NULL;
12309 
12310 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12311     PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12312 #else
12313     PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12314 #endif
12315     assert(! output_invlist || listsvp);
12316 
12317     if (data && data->count) {
12318         const U32 n = ARG1u(node);
12319 
12320         if (data->what[n] == 's') {
12321             SV * const rv = MUTABLE_SV(data->data[n]);
12322             AV * const av = MUTABLE_AV(SvRV(rv));
12323             SV **const ary = AvARRAY(av);
12324 
12325             invlist = ary[INVLIST_INDEX];
12326 
12327             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12328                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12329             }
12330 
12331             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12332                 si = ary[DEFERRED_USER_DEFINED_INDEX];
12333             }
12334 
12335             if (doinit && (si || invlist)) {
12336                 if (si) {
12337                     bool user_defined;
12338                     SV * msg = newSVpvs_flags("", SVs_TEMP);
12339 
12340                     SV * prop_definition = handle_user_defined_property(
12341                             "", 0, FALSE,   /* There is no \p{}, \P{} */
12342                             SvPVX_const(si)[1] - '0',   /* /i or not has been
12343                                                            stored here for just
12344                                                            this occasion */
12345                             TRUE,           /* run time */
12346                             FALSE,          /* This call must find the defn */
12347                             si,             /* The property definition  */
12348                             &user_defined,
12349                             msg,
12350                             0               /* base level call */
12351                            );
12352 
12353                     if (SvCUR(msg)) {
12354                         assert(prop_definition == NULL);
12355 
12356                         Perl_croak(aTHX_ "%" UTF8f,
12357                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12358                     }
12359 
12360                     if (invlist) {
12361                         _invlist_union(invlist, prop_definition, &invlist);
12362                         SvREFCNT_dec_NN(prop_definition);
12363                     }
12364                     else {
12365                         invlist = prop_definition;
12366                     }
12367 
12368                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12369                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12370 
12371                     ary[INVLIST_INDEX] = invlist;
12372                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12373                                  ? ONLY_LOCALE_MATCHES_INDEX
12374                                  : INVLIST_INDEX);
12375                     si = NULL;
12376                 }
12377             }
12378         }
12379     }
12380 
12381     /* If requested, return a printable version of what this ANYOF node matches
12382      * */
12383     if (listsvp) {
12384         SV* matches_string = NULL;
12385 
12386         /* This function can be called at compile-time, before everything gets
12387          * resolved, in which case we return the currently best available
12388          * information, which is the string that will eventually be used to do
12389          * that resolving, 'si' */
12390         if (si) {
12391             /* Here, we only have 'si' (and possibly some passed-in data in
12392              * 'invlist', which is handled below)  If the caller only wants
12393              * 'si', use that.  */
12394             if (! output_invlist) {
12395                 matches_string = newSVsv(si);
12396             }
12397             else {
12398                 /* But if the caller wants an inversion list of the node, we
12399                  * need to parse 'si' and place as much as possible in the
12400                  * desired output inversion list, making 'matches_string' only
12401                  * contain the currently unresolvable things */
12402                 const char *si_string = SvPVX(si);
12403                 STRLEN remaining = SvCUR(si);
12404                 UV prev_cp = 0;
12405                 U8 count = 0;
12406 
12407                 /* Ignore everything before and including the first new-line */
12408                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12409                 assert (si_string != NULL);
12410                 si_string++;
12411                 remaining = SvPVX(si) + SvCUR(si) - si_string;
12412 
12413                 while (remaining > 0) {
12414 
12415                     /* The data consists of just strings defining user-defined
12416                      * property names, but in prior incarnations, and perhaps
12417                      * somehow from pluggable regex engines, it could still
12418                      * hold hex code point definitions, all of which should be
12419                      * legal (or it wouldn't have gotten this far).  Each
12420                      * component of a range would be separated by a tab, and
12421                      * each range by a new-line.  If these are found, instead
12422                      * add them to the inversion list */
12423                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
12424                                      |PERL_SCAN_SILENT_NON_PORTABLE;
12425                     STRLEN len = remaining;
12426                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12427 
12428                     /* If the hex decode routine found something, it should go
12429                      * up to the next \n */
12430                     if (   *(si_string + len) == '\n') {
12431                         if (count) {    /* 2nd code point on line */
12432                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12433                         }
12434                         else {
12435                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12436                         }
12437                         count = 0;
12438                         goto prepare_for_next_iteration;
12439                     }
12440 
12441                     /* If the hex decode was instead for the lower range limit,
12442                      * save it, and go parse the upper range limit */
12443                     if (*(si_string + len) == '\t') {
12444                         assert(count == 0);
12445 
12446                         prev_cp = cp;
12447                         count = 1;
12448                       prepare_for_next_iteration:
12449                         si_string += len + 1;
12450                         remaining -= len + 1;
12451                         continue;
12452                     }
12453 
12454                     /* Here, didn't find a legal hex number.  Just add the text
12455                      * from here up to the next \n, omitting any trailing
12456                      * markers. */
12457 
12458                     remaining -= len;
12459                     len = strcspn(si_string,
12460                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12461                     remaining -= len;
12462                     if (matches_string) {
12463                         sv_catpvn(matches_string, si_string, len);
12464                     }
12465                     else {
12466                         matches_string = newSVpvn(si_string, len);
12467                     }
12468                     sv_catpvs(matches_string, " ");
12469 
12470                     si_string += len;
12471                     if (   remaining
12472                         && UCHARAT(si_string)
12473                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12474                     {
12475                         si_string++;
12476                         remaining--;
12477                     }
12478                     if (remaining && UCHARAT(si_string) == '\n') {
12479                         si_string++;
12480                         remaining--;
12481                     }
12482                 } /* end of loop through the text */
12483 
12484                 assert(matches_string);
12485                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
12486                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12487                 }
12488             } /* end of has an 'si' */
12489         }
12490 
12491         /* Add the stuff that's already known */
12492         if (invlist) {
12493 
12494             /* Again, if the caller doesn't want the output inversion list, put
12495              * everything in 'matches-string' */
12496             if (! output_invlist) {
12497                 if ( ! matches_string) {
12498                     matches_string = newSVpvs("\n");
12499                 }
12500                 sv_catsv(matches_string, invlist_contents(invlist,
12501                                                   TRUE /* traditional style */
12502                                                   ));
12503             }
12504             else if (! *output_invlist) {
12505                 *output_invlist = invlist_clone(invlist, NULL);
12506             }
12507             else {
12508                 _invlist_union(*output_invlist, invlist, output_invlist);
12509             }
12510         }
12511 
12512         *listsvp = matches_string;
12513     }
12514 
12515     return invlist;
12516 }
12517 
12518 /* reg_skipcomment()
12519 
12520    Absorbs an /x style # comment from the input stream,
12521    returning a pointer to the first character beyond the comment, or if the
12522    comment terminates the pattern without anything following it, this returns
12523    one past the final character of the pattern (in other words, RExC_end) and
12524    sets the REG_RUN_ON_COMMENT_SEEN flag.
12525 
12526    Note it's the callers responsibility to ensure that we are
12527    actually in /x mode
12528 
12529 */
12530 
12531 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)12532 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12533 {
12534     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12535 
12536     assert(*p == '#');
12537 
12538     while (p < RExC_end) {
12539         if (*(++p) == '\n') {
12540             return p+1;
12541         }
12542     }
12543 
12544     /* we ran off the end of the pattern without ending the comment, so we have
12545      * to add an \n when wrapping */
12546     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12547     return p;
12548 }
12549 
12550 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)12551 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12552                                 char ** p,
12553                                 const bool force_to_xmod
12554                          )
12555 {
12556     /* If the text at the current parse position '*p' is a '(?#...)' comment,
12557      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12558      * is /x whitespace, advance '*p' so that on exit it points to the first
12559      * byte past all such white space and comments */
12560 
12561     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12562 
12563     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12564 
12565     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12566 
12567     for (;;) {
12568         if (RExC_end - (*p) >= 3
12569             && *(*p)     == '('
12570             && *(*p + 1) == '?'
12571             && *(*p + 2) == '#')
12572         {
12573             while (*(*p) != ')') {
12574                 if ((*p) == RExC_end)
12575                     FAIL("Sequence (?#... not terminated");
12576                 (*p)++;
12577             }
12578             (*p)++;
12579             continue;
12580         }
12581 
12582         if (use_xmod) {
12583             const char * save_p = *p;
12584             while ((*p) < RExC_end) {
12585                 STRLEN len;
12586                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12587                     (*p) += len;
12588                 }
12589                 else if (*(*p) == '#') {
12590                     (*p) = reg_skipcomment(pRExC_state, (*p));
12591                 }
12592                 else {
12593                     break;
12594                 }
12595             }
12596             if (*p != save_p) {
12597                 continue;
12598             }
12599         }
12600 
12601         break;
12602     }
12603 
12604     return;
12605 }
12606 
12607 /* nextchar()
12608 
12609    Advances the parse position by one byte, unless that byte is the beginning
12610    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
12611    those two cases, the parse position is advanced beyond all such comments and
12612    white space.
12613 
12614    This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12615 */
12616 
12617 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)12618 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12619 {
12620     PERL_ARGS_ASSERT_NEXTCHAR;
12621 
12622     if (RExC_parse < RExC_end) {
12623         assert(   ! UTF
12624                || UTF8_IS_INVARIANT(*RExC_parse)
12625                || UTF8_IS_START(*RExC_parse));
12626 
12627         RExC_parse_inc_safe();
12628 
12629         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12630                                 FALSE /* Don't force /x */ );
12631     }
12632 }
12633 
12634 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)12635 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12636 {
12637     /* 'size' is the delta number of smallest regnode equivalents to add or
12638      * subtract from the current memory allocated to the regex engine being
12639      * constructed. */
12640 
12641     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12642 
12643     RExC_size += size;
12644 
12645     Renewc(RExC_rxi,
12646            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12647                                                 /* +1 for REG_MAGIC */
12648            char,
12649            regexp_internal);
12650     if ( RExC_rxi == NULL )
12651         FAIL("Regexp out of space");
12652     RXi_SET(RExC_rx, RExC_rxi);
12653 
12654     RExC_emit_start = RExC_rxi->program;
12655     if (size > 0) {
12656         Zero(REGNODE_p(RExC_emit), size, regnode);
12657     }
12658 }
12659 
12660 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const STRLEN extra_size)12661 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12662 {
12663     /* Allocate a regnode that is (1 + extra_size) times as big as the
12664      * smallest regnode worth of space, and also aligns and increments
12665      * RExC_size appropriately.
12666      *
12667      * It returns the regnode's offset into the regex engine program */
12668 
12669     const regnode_offset ret = RExC_emit;
12670 
12671     PERL_ARGS_ASSERT_REGNODE_GUTS;
12672 
12673     SIZE_ALIGN(RExC_size);
12674     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12675     NODE_ALIGN_FILL(REGNODE_p(ret));
12676     return(ret);
12677 }
12678 
12679 #ifdef DEBUGGING
12680 
12681 STATIC regnode_offset
S_regnode_guts_debug(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size)12682 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12683     PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12684     assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12685     return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12686 }
12687 
12688 #endif
12689 
12690 
12691 
12692 /*
12693 - reg_node - emit a node
12694 */
12695 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)12696 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12697 {
12698     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12699     regnode_offset ptr = ret;
12700 
12701     PERL_ARGS_ASSERT_REG_NODE;
12702 
12703     assert(REGNODE_ARG_LEN(op) == 0);
12704 
12705     FILL_ADVANCE_NODE(ptr, op);
12706     RExC_emit = ptr;
12707     return(ret);
12708 }
12709 
12710 /*
12711 - reg1node - emit a node with an argument
12712 */
12713 STATIC regnode_offset /* Location. */
S_reg1node(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)12714 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12715 {
12716     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12717     regnode_offset ptr = ret;
12718 
12719     PERL_ARGS_ASSERT_REG1NODE;
12720 
12721     /* ANYOF are special cased to allow non-length 1 args */
12722     assert(REGNODE_ARG_LEN(op) == 1);
12723 
12724     FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12725     RExC_emit = ptr;
12726     return(ret);
12727 }
12728 
12729 /*
12730 - regpnode - emit a temporary node with a SV* argument
12731 */
12732 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)12733 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12734 {
12735     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12736     regnode_offset ptr = ret;
12737 
12738     PERL_ARGS_ASSERT_REGPNODE;
12739 
12740     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12741     RExC_emit = ptr;
12742     return(ret);
12743 }
12744 
12745 STATIC regnode_offset
S_reg2node(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)12746 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12747 {
12748     /* emit a node with U32 and I32 arguments */
12749 
12750     const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12751     regnode_offset ptr = ret;
12752 
12753     PERL_ARGS_ASSERT_REG2NODE;
12754 
12755     assert(REGNODE_ARG_LEN(op) == 2);
12756 
12757     FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12758     RExC_emit = ptr;
12759     return(ret);
12760 }
12761 
12762 /*
12763 - reginsert - insert an operator in front of already-emitted operand
12764 *
12765 * That means that on exit 'operand' is the offset of the newly inserted
12766 * operator, and the original operand has been relocated.
12767 *
12768 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
12769 * set up NEXT_OFF() of the inserted node if needed. Something like this:
12770 *
12771 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12772 *   NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12773 *
12774 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12775 */
12776 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)12777 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12778                   const regnode_offset operand, const U32 depth)
12779 {
12780     regnode *src;
12781     regnode *dst;
12782     regnode *place;
12783     const int offset = REGNODE_ARG_LEN((U8)op);
12784     const int size = NODE_STEP_REGNODE + offset;
12785     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12786 
12787     PERL_ARGS_ASSERT_REGINSERT;
12788     PERL_UNUSED_CONTEXT;
12789     PERL_UNUSED_ARG(depth);
12790     DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12791     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12792                                     studying. If this is wrong then we need to adjust RExC_recurse
12793                                     below like we do with RExC_open_parens/RExC_close_parens. */
12794     change_engine_size(pRExC_state, (Ptrdiff_t) size);
12795     src = REGNODE_p(RExC_emit);
12796     RExC_emit += size;
12797     dst = REGNODE_p(RExC_emit);
12798 
12799     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12800      * and [perl #133871] shows this can lead to problems, so skip this
12801      * realignment of parens until a later pass when they are reliable */
12802     if (! IN_PARENS_PASS && RExC_open_parens) {
12803         int paren;
12804         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12805         /* remember that RExC_npar is rex->nparens + 1,
12806          * iow it is 1 more than the number of parens seen in
12807          * the pattern so far. */
12808         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12809             /* note, RExC_open_parens[0] is the start of the
12810              * regex, it can't move. RExC_close_parens[0] is the end
12811              * of the regex, it *can* move. */
12812             if ( paren && RExC_open_parens[paren] >= operand ) {
12813                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12814                 RExC_open_parens[paren] += size;
12815             } else {
12816                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12817             }
12818             if ( RExC_close_parens[paren] >= operand ) {
12819                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12820                 RExC_close_parens[paren] += size;
12821             } else {
12822                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12823             }
12824         }
12825     }
12826     if (RExC_end_op)
12827         RExC_end_op += size;
12828 
12829     while (src > REGNODE_p(operand)) {
12830         StructCopy(--src, --dst, regnode);
12831     }
12832 
12833     place = REGNODE_p(operand);	/* Op node, where operand used to be. */
12834     src = place + 1; /* NOT REGNODE_AFTER! */
12835     FLAGS(place) = 0;
12836     FILL_NODE(operand, op);
12837 
12838     /* Zero out any arguments in the new node */
12839     Zero(src, offset, regnode);
12840 }
12841 
12842 /*
12843 - regtail - set the next-pointer at the end of a node chain of p to val.  If
12844             that value won't fit in the space available, instead returns FALSE.
12845             (Except asserts if we can't fit in the largest space the regex
12846             engine is designed for.)
12847 - SEE ALSO: regtail_study
12848 */
12849 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)12850 S_regtail(pTHX_ RExC_state_t * pRExC_state,
12851                 const regnode_offset p,
12852                 const regnode_offset val,
12853                 const U32 depth)
12854 {
12855     regnode_offset scan;
12856     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12857 
12858     PERL_ARGS_ASSERT_REGTAIL;
12859 #ifndef DEBUGGING
12860     PERL_UNUSED_ARG(depth);
12861 #endif
12862 
12863     /* The final node in the chain is the first one with a nonzero next pointer
12864      * */
12865     scan = (regnode_offset) p;
12866     for (;;) {
12867         regnode * const temp = regnext(REGNODE_p(scan));
12868         DEBUG_PARSE_r({
12869             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12870             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12871             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
12872                 SvPV_nolen_const(RExC_mysv), scan,
12873                     (temp == NULL ? "->" : ""),
12874                     (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12875             );
12876         });
12877         if (temp == NULL)
12878             break;
12879         scan = REGNODE_OFFSET(temp);
12880     }
12881 
12882     /* Populate this node's next pointer */
12883     assert(val >= scan);
12884     if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12885         assert((UV) (val - scan) <= U32_MAX);
12886         ARG1u_SET(REGNODE_p(scan), val - scan);
12887     }
12888     else {
12889         if (val - scan > U16_MAX) {
12890             /* Populate this with something that won't loop and will likely
12891              * lead to a crash if the caller ignores the failure return, and
12892              * execution continues */
12893             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12894             return FALSE;
12895         }
12896         NEXT_OFF(REGNODE_p(scan)) = val - scan;
12897     }
12898 
12899     return TRUE;
12900 }
12901 
12902 #ifdef DEBUGGING
12903 /*
12904 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12905 - Look for optimizable sequences at the same time.
12906 - currently only looks for EXACT chains.
12907 
12908 This is experimental code. The idea is to use this routine to perform
12909 in place optimizations on branches and groups as they are constructed,
12910 with the long term intention of removing optimization from study_chunk so
12911 that it is purely analytical.
12912 
12913 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12914 to control which is which.
12915 
12916 This used to return a value that was ignored.  It was a problem that it is
12917 #ifdef'd to be another function that didn't return a value.  khw has changed it
12918 so both currently return a pass/fail return.
12919 
12920 */
12921 /* TODO: All four parms should be const */
12922 
12923 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)12924 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
12925                       const regnode_offset val, U32 depth)
12926 {
12927     regnode_offset scan;
12928     U8 exact = PSEUDO;
12929 #ifdef EXPERIMENTAL_INPLACESCAN
12930     I32 min = 0;
12931 #endif
12932     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12933 
12934     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12935 
12936 
12937     /* Find last node. */
12938 
12939     scan = p;
12940     for (;;) {
12941         regnode * const temp = regnext(REGNODE_p(scan));
12942 #ifdef EXPERIMENTAL_INPLACESCAN
12943         if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12944             bool unfolded_multi_char;	/* Unexamined in this routine */
12945             if (join_exact(pRExC_state, scan, &min,
12946                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
12947                 return TRUE; /* Was return EXACT */
12948         }
12949 #endif
12950         if ( exact ) {
12951             if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12952                 if (exact == PSEUDO )
12953                     exact= OP(REGNODE_p(scan));
12954                 else if (exact != OP(REGNODE_p(scan)) )
12955                     exact= 0;
12956             }
12957             else if (OP(REGNODE_p(scan)) != NOTHING) {
12958                 exact= 0;
12959             }
12960         }
12961         DEBUG_PARSE_r({
12962             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12963             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12964             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
12965                 SvPV_nolen_const(RExC_mysv),
12966                 scan,
12967                 REGNODE_NAME(exact));
12968         });
12969         if (temp == NULL)
12970             break;
12971         scan = REGNODE_OFFSET(temp);
12972     }
12973     DEBUG_PARSE_r({
12974         DEBUG_PARSE_MSG("");
12975         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
12976         Perl_re_printf( aTHX_
12977                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
12978                       SvPV_nolen_const(RExC_mysv),
12979                       (IV)val,
12980                       (IV)(val - scan)
12981         );
12982     });
12983     if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12984         assert((UV) (val - scan) <= U32_MAX);
12985         ARG1u_SET(REGNODE_p(scan), val - scan);
12986     }
12987     else {
12988         if (val - scan > U16_MAX) {
12989             /* Populate this with something that won't loop and will likely
12990              * lead to a crash if the caller ignores the failure return, and
12991              * execution continues */
12992             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12993             return FALSE;
12994         }
12995         NEXT_OFF(REGNODE_p(scan)) = val - scan;
12996     }
12997 
12998     return TRUE; /* Was 'return exact' */
12999 }
13000 #endif
13001 
13002 SV*
Perl_get_ANYOFM_contents(pTHX_ const regnode * n)13003 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13004 
13005     /* Returns an inversion list of all the code points matched by the
13006      * ANYOFM/NANYOFM node 'n' */
13007 
13008     SV * cp_list = _new_invlist(-1);
13009     const U8 lowest = (U8) ARG1u(n);
13010     unsigned int i;
13011     U8 count = 0;
13012     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13013 
13014     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13015 
13016     /* Starting with the lowest code point, any code point that ANDed with the
13017      * mask yields the lowest code point is in the set */
13018     for (i = lowest; i <= 0xFF; i++) {
13019         if ((i & FLAGS(n)) == ARG1u(n)) {
13020             cp_list = add_cp_to_invlist(cp_list, i);
13021             count++;
13022 
13023             /* We know how many code points (a power of two) that are in the
13024              * set.  No use looking once we've got that number */
13025             if (count >= needed) break;
13026         }
13027     }
13028 
13029     if (OP(n) == NANYOFM) {
13030         _invlist_invert(cp_list);
13031     }
13032     return cp_list;
13033 }
13034 
13035 SV *
Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n)13036 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13037     PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13038 
13039     SV * cp_list = NULL;
13040     populate_invlist_from_bitmap(
13041               ((struct regnode_bbm *) n)->bitmap,
13042               REGNODE_BBM_BITMAP_LEN * CHARBITS,
13043               &cp_list,
13044 
13045               /* The base cp is from the start byte plus a zero continuation */
13046               TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13047                                       UTF_CONTINUATION_MARK | 0));
13048     return cp_list;
13049 }
13050 
13051 
13052 
13053 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)13054 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13055 {				/* Assume that RE_INTUIT is set */
13056     /* Returns an SV containing a string that must appear in the target for it
13057      * to match, or NULL if nothing is known that must match.
13058      *
13059      * CAUTION: the SV can be freed during execution of the regex engine */
13060 
13061     struct regexp *const prog = ReANY(r);
13062     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13063 
13064     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13065     PERL_UNUSED_CONTEXT;
13066 
13067     DEBUG_COMPILE_r(
13068         {
13069             if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13070                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13071                       ? prog->check_utf8 : prog->check_substr);
13072 
13073                 if (!PL_colorset) reginitcolors();
13074                 Perl_re_printf( aTHX_
13075                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13076                       PL_colors[4],
13077                       RX_UTF8(r) ? "utf8 " : "",
13078                       PL_colors[5], PL_colors[0],
13079                       s,
13080                       PL_colors[1],
13081                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13082             }
13083         } );
13084 
13085     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13086     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13087 }
13088 
13089 /*
13090    pregfree()
13091 
13092    handles refcounting and freeing the perl core regexp structure. When
13093    it is necessary to actually free the structure the first thing it
13094    does is call the 'free' method of the regexp_engine associated to
13095    the regexp, allowing the handling of the void *pprivate; member
13096    first. (This routine is not overridable by extensions, which is why
13097    the extensions free is called first.)
13098 
13099    See regdupe and regdupe_internal if you change anything here.
13100 */
13101 #ifndef PERL_IN_XSUB_RE
13102 void
Perl_pregfree(pTHX_ REGEXP * r)13103 Perl_pregfree(pTHX_ REGEXP *r)
13104 {
13105     SvREFCNT_dec(r);
13106 }
13107 
13108 void
Perl_pregfree2(pTHX_ REGEXP * rx)13109 Perl_pregfree2(pTHX_ REGEXP *rx)
13110 {
13111     struct regexp *const r = ReANY(rx);
13112     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13113 
13114     PERL_ARGS_ASSERT_PREGFREE2;
13115 
13116     if (! r)
13117         return;
13118 
13119     if (r->mother_re) {
13120         ReREFCNT_dec(r->mother_re);
13121     } else {
13122         CALLREGFREE_PVT(rx); /* free the private data */
13123         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13124     }
13125     if (r->substrs) {
13126         int i;
13127         for (i = 0; i < 2; i++) {
13128             SvREFCNT_dec(r->substrs->data[i].substr);
13129             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13130         }
13131         Safefree(r->substrs);
13132     }
13133     RX_MATCH_COPY_FREE(rx);
13134 #ifdef PERL_ANY_COW
13135     SvREFCNT_dec(r->saved_copy);
13136 #endif
13137     Safefree(RXp_OFFSp(r));
13138     if (r->logical_to_parno) {
13139         Safefree(r->logical_to_parno);
13140         Safefree(r->parno_to_logical);
13141         Safefree(r->parno_to_logical_next);
13142     }
13143 
13144     SvREFCNT_dec(r->qr_anoncv);
13145     if (r->recurse_locinput)
13146         Safefree(r->recurse_locinput);
13147 }
13148 
13149 
13150 /*  reg_temp_copy()
13151 
13152     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13153     except that dsv will be created if NULL.
13154 
13155     This function is used in two main ways. First to implement
13156         $r = qr/....; $s = $$r;
13157 
13158     Secondly, it is used as a hacky workaround to the structural issue of
13159     match results
13160     being stored in the regexp structure which is in turn stored in
13161     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13162     could be PL_curpm in multiple contexts, and could require multiple
13163     result sets being associated with the pattern simultaneously, such
13164     as when doing a recursive match with (??{$qr})
13165 
13166     The solution is to make a lightweight copy of the regexp structure
13167     when a qr// is returned from the code executed by (??{$qr}) this
13168     lightweight copy doesn't actually own any of its data except for
13169     the starp/end and the actual regexp structure itself.
13170 
13171 */
13172 
13173 
13174 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)13175 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13176 {
13177     struct regexp *drx;
13178     struct regexp *const srx = ReANY(ssv);
13179     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13180 
13181     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13182 
13183     if (!dsv)
13184         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13185     else {
13186         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13187 
13188         /* our only valid caller, sv_setsv_flags(), should have done
13189          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13190         assert(!SvOOK(dsv));
13191         assert(!SvIsCOW(dsv));
13192         assert(!SvROK(dsv));
13193 
13194         if (SvPVX_const(dsv)) {
13195             if (SvLEN(dsv))
13196                 Safefree(SvPVX(dsv));
13197             SvPVX(dsv) = NULL;
13198         }
13199         SvLEN_set(dsv, 0);
13200         SvCUR_set(dsv, 0);
13201         SvOK_off((SV *)dsv);
13202 
13203         if (islv) {
13204             /* For PVLVs, the head (sv_any) points to an XPVLV, while
13205              * the LV's xpvlenu_rx will point to a regexp body, which
13206              * we allocate here */
13207             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13208             assert(!SvPVX(dsv));
13209             /* We "steal" the body from the newly allocated SV temp, changing
13210              * the pointer in its HEAD to NULL. We then change its type to
13211              * SVt_NULL so that when we immediately release its only reference,
13212              * no memory deallocation happens.
13213              *
13214              * The body will eventually be freed (from the PVLV) either in
13215              * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13216              * the regexp body needs to be removed)
13217              * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13218              * the PVLV itself is deallocated). */
13219             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13220             temp->sv_any = NULL;
13221             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13222             SvREFCNT_dec_NN(temp);
13223             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13224                ing below will not set it. */
13225             SvCUR_set(dsv, SvCUR(ssv));
13226         }
13227     }
13228     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13229        sv_force_normal(sv) is called.  */
13230     SvFAKE_on(dsv);
13231     drx = ReANY(dsv);
13232 
13233     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13234     SvPV_set(dsv, RX_WRAPPED(ssv));
13235     /* We share the same string buffer as the original regexp, on which we
13236        hold a reference count, incremented when mother_re is set below.
13237        The string pointer is copied here, being part of the regexp struct.
13238      */
13239     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13240            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13241 
13242     if (!islv)
13243         SvLEN_set(dsv, 0);
13244     if (RXp_OFFSp(srx)) {
13245         const I32 npar = srx->nparens+1;
13246         NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13247     }
13248     if (srx->substrs) {
13249         int i;
13250         Newx(drx->substrs, 1, struct reg_substr_data);
13251         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13252 
13253         for (i = 0; i < 2; i++) {
13254             SvREFCNT_inc_void(drx->substrs->data[i].substr);
13255             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13256         }
13257 
13258         /* check_substr and check_utf8, if non-NULL, point to either their
13259            anchored or float namesakes, and don't hold a second reference.  */
13260     }
13261     if (srx->logical_to_parno) {
13262         NewCopy(srx->logical_to_parno,
13263                 drx->logical_to_parno,
13264                 srx->nparens+1, I32);
13265         NewCopy(srx->parno_to_logical,
13266                 drx->parno_to_logical,
13267                 srx->nparens+1, I32);
13268         NewCopy(srx->parno_to_logical_next,
13269                 drx->parno_to_logical_next,
13270                 srx->nparens+1, I32);
13271     } else {
13272         drx->logical_to_parno = NULL;
13273         drx->parno_to_logical = NULL;
13274         drx->parno_to_logical_next = NULL;
13275     }
13276     drx->logical_nparens = srx->logical_nparens;
13277 
13278     RX_MATCH_COPIED_off(dsv);
13279 #ifdef PERL_ANY_COW
13280     RXp_SAVED_COPY(drx) = NULL;
13281 #endif
13282     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13283     SvREFCNT_inc_void(drx->qr_anoncv);
13284     if (srx->recurse_locinput)
13285         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13286 
13287     return dsv;
13288 }
13289 #endif
13290 
13291 
13292 /* regfree_internal()
13293 
13294    Free the private data in a regexp. This is overloadable by
13295    extensions. Perl takes care of the regexp structure in pregfree(),
13296    this covers the *pprivate pointer which technically perl doesn't
13297    know about, however of course we have to handle the
13298    regexp_internal structure when no extension is in use.
13299 
13300    Note this is called before freeing anything in the regexp
13301    structure.
13302  */
13303 
13304 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)13305 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13306 {
13307     struct regexp *const r = ReANY(rx);
13308     RXi_GET_DECL(r, ri);
13309     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13310 
13311     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13312 
13313     if (! ri) {
13314         return;
13315     }
13316 
13317     DEBUG_COMPILE_r({
13318         if (!PL_colorset)
13319             reginitcolors();
13320         {
13321             SV *dsv= sv_newmortal();
13322             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13323                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13324             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13325                 PL_colors[4], PL_colors[5], s);
13326         }
13327     });
13328 
13329     if (ri->code_blocks)
13330         S_free_codeblocks(aTHX_ ri->code_blocks);
13331 
13332     if (ri->data) {
13333         int n = ri->data->count;
13334 
13335         while (--n >= 0) {
13336           /* If you add a ->what type here, update the comment in regcomp.h */
13337             switch (ri->data->what[n]) {
13338             case 'a':
13339             case 'r':
13340             case 's':
13341             case 'S':
13342             case 'u':
13343                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13344                 break;
13345             case 'f':
13346                 Safefree(ri->data->data[n]);
13347                 break;
13348             case 'l':
13349             case 'L':
13350                 break;
13351             case 'T':
13352                 { /* Aho Corasick add-on structure for a trie node.
13353                      Used in stclass optimization only */
13354                     U32 refcount;
13355                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13356                     OP_REFCNT_LOCK;
13357                     refcount = --aho->refcount;
13358                     OP_REFCNT_UNLOCK;
13359                     if ( !refcount ) {
13360                         PerlMemShared_free(aho->states);
13361                         PerlMemShared_free(aho->fail);
13362                          /* do this last!!!! */
13363                         PerlMemShared_free(ri->data->data[n]);
13364                         /* we should only ever get called once, so
13365                          * assert as much, and also guard the free
13366                          * which /might/ happen twice. At the least
13367                          * it will make code anlyzers happy and it
13368                          * doesn't cost much. - Yves */
13369                         assert(ri->regstclass);
13370                         if (ri->regstclass) {
13371                             PerlMemShared_free(ri->regstclass);
13372                             ri->regstclass = 0;
13373                         }
13374                     }
13375                 }
13376                 break;
13377             case 't':
13378                 {
13379                     /* trie structure. */
13380                     U32 refcount;
13381                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13382                     OP_REFCNT_LOCK;
13383                     refcount = --trie->refcount;
13384                     OP_REFCNT_UNLOCK;
13385                     if ( !refcount ) {
13386                         PerlMemShared_free(trie->charmap);
13387                         PerlMemShared_free(trie->states);
13388                         PerlMemShared_free(trie->trans);
13389                         if (trie->bitmap)
13390                             PerlMemShared_free(trie->bitmap);
13391                         if (trie->jump)
13392                             PerlMemShared_free(trie->jump);
13393                         if (trie->j_before_paren)
13394                             PerlMemShared_free(trie->j_before_paren);
13395                         if (trie->j_after_paren)
13396                             PerlMemShared_free(trie->j_after_paren);
13397                         PerlMemShared_free(trie->wordinfo);
13398                         /* do this last!!!! */
13399                         PerlMemShared_free(ri->data->data[n]);
13400                     }
13401                 }
13402                 break;
13403             case '%':
13404                 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13405                  * always returns non-zero, this should only ever happen in the
13406                  * 0 index */
13407                 assert(n==0);
13408                 break;
13409             default:
13410                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13411                                                     ri->data->what[n]);
13412             }
13413         }
13414         Safefree(ri->data->what);
13415         Safefree(ri->data);
13416     }
13417 
13418     Safefree(ri);
13419 }
13420 
13421 #define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
13422 
13423 /*
13424 =for apidoc re_dup_guts
13425 Duplicate a regexp.
13426 
13427 This routine is expected to clone a given regexp structure. It is only
13428 compiled under USE_ITHREADS.
13429 
13430 After all of the core data stored in struct regexp is duplicated
13431 the C<regexp_engine.dupe> method is used to copy any private data
13432 stored in the *pprivate pointer. This allows extensions to handle
13433 any duplication they need to do.
13434 
13435 =cut
13436 
13437    See pregfree() and regfree_internal() if you change anything here.
13438 */
13439 #if defined(USE_ITHREADS)
13440 #ifndef PERL_IN_XSUB_RE
13441 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)13442 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13443 {
13444     I32 npar;
13445     const struct regexp *r = ReANY(sstr);
13446     struct regexp *ret = ReANY(dstr);
13447 
13448     PERL_ARGS_ASSERT_RE_DUP_GUTS;
13449 
13450     npar = r->nparens+1;
13451     NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13452 
13453     if (ret->substrs) {
13454         /* Do it this way to avoid reading from *r after the StructCopy().
13455            That way, if any of the sv_dup_inc()s dislodge *r from the L1
13456            cache, it doesn't matter.  */
13457         int i;
13458         const bool anchored = r->check_substr
13459             ? r->check_substr == r->substrs->data[0].substr
13460             : r->check_utf8   == r->substrs->data[0].utf8_substr;
13461         Newx(ret->substrs, 1, struct reg_substr_data);
13462         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13463 
13464         for (i = 0; i < 2; i++) {
13465             ret->substrs->data[i].substr =
13466                         sv_dup_inc(ret->substrs->data[i].substr, param);
13467             ret->substrs->data[i].utf8_substr =
13468                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13469         }
13470 
13471         /* check_substr and check_utf8, if non-NULL, point to either their
13472            anchored or float namesakes, and don't hold a second reference.  */
13473 
13474         if (ret->check_substr) {
13475             if (anchored) {
13476                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13477 
13478                 ret->check_substr = ret->substrs->data[0].substr;
13479                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
13480             } else {
13481                 assert(r->check_substr == r->substrs->data[1].substr);
13482                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
13483 
13484                 ret->check_substr = ret->substrs->data[1].substr;
13485                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
13486             }
13487         } else if (ret->check_utf8) {
13488             if (anchored) {
13489                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13490             } else {
13491                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13492             }
13493         }
13494     }
13495 
13496     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13497     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13498     if (r->recurse_locinput)
13499         Newx(ret->recurse_locinput, r->nparens + 1, char *);
13500 
13501     if (ret->pprivate)
13502         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13503 
13504     if (RX_MATCH_COPIED(dstr))
13505         RXp_SUBBEG(ret)  = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13506     else
13507         RXp_SUBBEG(ret) = NULL;
13508 #ifdef PERL_ANY_COW
13509     RXp_SAVED_COPY(ret) = NULL;
13510 #endif
13511 
13512     if (r->logical_to_parno) {
13513         /* we use total_parens for all three just for symmetry */
13514         ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13515         ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13516         ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13517     } else {
13518         ret->logical_to_parno = NULL;
13519         ret->parno_to_logical = NULL;
13520         ret->parno_to_logical_next = NULL;
13521     }
13522 
13523     ret->logical_nparens = r->logical_nparens;
13524 
13525     /* Whether mother_re be set or no, we need to copy the string.  We
13526        cannot refrain from copying it when the storage points directly to
13527        our mother regexp, because that's
13528                1: a buffer in a different thread
13529                2: something we no longer hold a reference on
13530                so we need to copy it locally.  */
13531     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13532     /* set malloced length to a non-zero value so it will be freed
13533      * (otherwise in combination with SVf_FAKE it looks like an alien
13534      * buffer). It doesn't have to be the actual malloced size, since it
13535      * should never be grown */
13536     SvLEN_set(dstr, SvCUR(sstr)+1);
13537     ret->mother_re   = NULL;
13538 }
13539 #endif /* PERL_IN_XSUB_RE */
13540 
13541 /*
13542    regdupe_internal()
13543 
13544    This is the internal complement to regdupe() which is used to copy
13545    the structure pointed to by the *pprivate pointer in the regexp.
13546    This is the core version of the extension overridable cloning hook.
13547    The regexp structure being duplicated will be copied by perl prior
13548    to this and will be provided as the regexp *r argument, however
13549    with the /old/ structures pprivate pointer value. Thus this routine
13550    may override any copying normally done by perl.
13551 
13552    It returns a pointer to the new regexp_internal structure.
13553 */
13554 
13555 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)13556 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13557 {
13558     struct regexp *const r = ReANY(rx);
13559     regexp_internal *reti;
13560     int len;
13561     RXi_GET_DECL(r, ri);
13562 
13563     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13564 
13565     len = ProgLen(ri);
13566 
13567     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13568           char, regexp_internal);
13569     Copy(ri->program, reti->program, len+1, regnode);
13570 
13571 
13572     if (ri->code_blocks) {
13573         int n;
13574         Newx(reti->code_blocks, 1, struct reg_code_blocks);
13575         Newx(reti->code_blocks->cb, ri->code_blocks->count,
13576                     struct reg_code_block);
13577         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13578              ri->code_blocks->count, struct reg_code_block);
13579         for (n = 0; n < ri->code_blocks->count; n++)
13580              reti->code_blocks->cb[n].src_regex = (REGEXP*)
13581                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13582         reti->code_blocks->count = ri->code_blocks->count;
13583         reti->code_blocks->refcnt = 1;
13584     }
13585     else
13586         reti->code_blocks = NULL;
13587 
13588     reti->regstclass = NULL;
13589 
13590     if (ri->data) {
13591         struct reg_data *d;
13592         const int count = ri->data->count;
13593         int i;
13594 
13595         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13596                 char, struct reg_data);
13597         Newx(d->what, count, U8);
13598 
13599         d->count = count;
13600         for (i = 0; i < count; i++) {
13601             d->what[i] = ri->data->what[i];
13602             switch (d->what[i]) {
13603                 /* see also regcomp.h and regfree_internal() */
13604             case 'a': /* actually an AV, but the dup function is identical.
13605                          values seem to be "plain sv's" generally. */
13606             case 'r': /* a compiled regex (but still just another SV) */
13607             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13608                          this use case should go away, the code could have used
13609                          'a' instead - see S_set_ANYOF_arg() for array contents. */
13610             case 'S': /* actually an SV, but the dup function is identical.  */
13611             case 'u': /* actually an HV, but the dup function is identical.
13612                          values are "plain sv's" */
13613                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13614                 break;
13615             case 'f':
13616                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13617                  * patterns which could start with several different things. Pre-TRIE
13618                  * this was more important than it is now, however this still helps
13619                  * in some places, for instance /x?a+/ might produce a SSC equivalent
13620                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13621                  * in regexec.c
13622                  */
13623                 /* This is cheating. */
13624                 Newx(d->data[i], 1, regnode_ssc);
13625                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13626                 reti->regstclass = (regnode*)d->data[i];
13627                 break;
13628             case 'T':
13629                 /* AHO-CORASICK fail table */
13630                 /* Trie stclasses are readonly and can thus be shared
13631                  * without duplication. We free the stclass in pregfree
13632                  * when the corresponding reg_ac_data struct is freed.
13633                  */
13634                 reti->regstclass= ri->regstclass;
13635                 /* FALLTHROUGH */
13636             case 't':
13637                 /* TRIE transition table */
13638                 OP_REFCNT_LOCK;
13639                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13640                 OP_REFCNT_UNLOCK;
13641                 /* FALLTHROUGH */
13642             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13643             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13644                          is not from another regexp */
13645                 d->data[i] = ri->data->data[i];
13646                 break;
13647             case '%':
13648                 /* this is a placeholder type, it exists purely so that
13649                  * reg_add_data always returns a non-zero value, this type of
13650                  * entry should ONLY be present in the 0 slot of the array */
13651                 assert(i == 0);
13652                 d->data[i]= ri->data->data[i];
13653                 break;
13654             default:
13655                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13656                                                            ri->data->what[i]);
13657             }
13658         }
13659 
13660         reti->data = d;
13661     }
13662     else
13663         reti->data = NULL;
13664 
13665     if (ri->regstclass && !reti->regstclass) {
13666         /* Assume that the regstclass is a regnode which is inside of the
13667          * program which we have to copy over */
13668         regnode *node= ri->regstclass;
13669         assert(node >= ri->program && (node - ri->program) < len);
13670         reti->regstclass = reti->program + (node - ri->program);
13671     }
13672 
13673 
13674     reti->name_list_idx = ri->name_list_idx;
13675 
13676     SetProgLen(reti, len);
13677 
13678     return (void*)reti;
13679 }
13680 
13681 #endif    /* USE_ITHREADS */
13682 
13683 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)13684 S_re_croak(pTHX_ bool utf8, const char* pat,...)
13685 {
13686     va_list args;
13687     STRLEN len = strlen(pat);
13688     char buf[512];
13689     SV *msv;
13690     const char *message;
13691 
13692     PERL_ARGS_ASSERT_RE_CROAK;
13693 
13694     if (len > 510)
13695         len = 510;
13696     Copy(pat, buf, len , char);
13697     buf[len] = '\n';
13698     buf[len + 1] = '\0';
13699     va_start(args, pat);
13700     msv = vmess(buf, &args);
13701     va_end(args);
13702     message = SvPV_const(msv, len);
13703     if (len > 512)
13704         len = 512;
13705     Copy(message, buf, len , char);
13706     /* len-1 to avoid \n */
13707     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13708 }
13709 
13710 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13711 
13712 #ifndef PERL_IN_XSUB_RE
13713 void
Perl_save_re_context(pTHX)13714 Perl_save_re_context(pTHX)
13715 {
13716     I32 nparens = -1;
13717     I32 i;
13718 
13719     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13720 
13721     if (PL_curpm) {
13722         const REGEXP * const rx = PM_GETRE(PL_curpm);
13723         if (rx)
13724             nparens = RX_NPARENS(rx);
13725     }
13726 
13727     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13728      * that PL_curpm will be null, but that utf8.pm and the modules it
13729      * loads will only use $1..$3.
13730      * The t/porting/re_context.t test file checks this assumption.
13731      */
13732     if (nparens == -1)
13733         nparens = 3;
13734 
13735     for (i = 1; i <= nparens; i++) {
13736         char digits[TYPE_CHARS(long)];
13737         const STRLEN len = my_snprintf(digits, sizeof(digits),
13738                                        "%lu", (long)i);
13739         GV *const *const gvp
13740             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13741 
13742         if (gvp) {
13743             GV * const gv = *gvp;
13744             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13745                 save_scalar(gv);
13746         }
13747     }
13748 }
13749 #endif
13750 
13751 #ifndef PERL_IN_XSUB_RE
13752 
13753 #  include "uni_keywords.h"
13754 
13755 void
Perl_init_uniprops(pTHX)13756 Perl_init_uniprops(pTHX)
13757 {
13758 
13759 #  ifdef DEBUGGING
13760     char * dump_len_string;
13761 
13762     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13763     if (   ! dump_len_string
13764         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13765     {
13766         PL_dump_re_max_len = 60;    /* A reasonable default */
13767     }
13768 #  endif
13769 
13770     PL_user_def_props = newHV();
13771 
13772 #  ifdef USE_ITHREADS
13773 
13774     HvSHAREKEYS_off(PL_user_def_props);
13775     PL_user_def_props_aTHX = aTHX;
13776 
13777 #  endif
13778 
13779     /* Set up the inversion list interpreter-level variables */
13780 
13781     PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13782     PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13783     PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13784     PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13785     PL_XPosix_ptrs[CC_CASED_] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13786     PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13787     PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13788     PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13789     PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13790     PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13791     PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13792     PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13793     PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13794     PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13795     PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13796     PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13797 
13798     PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13799     PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13800     PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13801     PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13802     PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13803     PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13804     PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13805     PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13806     PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13807     PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13808     PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13809     PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13810     PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13811     PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13812     PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13813     PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13814 
13815     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13816     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13817     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13818     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13819     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13820 
13821     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13822     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13823     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13824     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13825 
13826     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13827 
13828     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13829     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13830 
13831     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13832     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13833 
13834     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13835     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13836                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13837     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13838                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13839     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13840     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13841     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13842     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13843     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13844     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13845     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13846     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13847     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13848 
13849 #  ifdef UNI_XIDC
13850     /* The below are used only by deprecated functions.  They could be removed */
13851     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13852     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13853     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13854 #  endif
13855 }
13856 
13857 /* These four functions are compiled only in regcomp.c, where they have access
13858  * to the data they return.  They are a way for re_comp.c to get access to that
13859  * data without having to compile the whole data structures. */
13860 
13861 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)13862 Perl_do_uniprop_match(const char * const key, const U16 key_len)
13863 {
13864     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13865 
13866     return match_uniprop((U8 *) key, key_len);
13867 }
13868 
13869 SV *
Perl_get_prop_definition(pTHX_ const int table_index)13870 Perl_get_prop_definition(pTHX_ const int table_index)
13871 {
13872     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13873 
13874     /* Create and return the inversion list */
13875     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13876 }
13877 
13878 const char * const *
Perl_get_prop_values(const int table_index)13879 Perl_get_prop_values(const int table_index)
13880 {
13881     PERL_ARGS_ASSERT_GET_PROP_VALUES;
13882 
13883     return UNI_prop_value_ptrs[table_index];
13884 }
13885 
13886 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)13887 Perl_get_deprecated_property_msg(const Size_t warning_offset)
13888 {
13889     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13890 
13891     return deprecated_property_msgs[warning_offset];
13892 }
13893 
13894 #  if 0
13895 
13896 This code was mainly added for backcompat to give a warning for non-portable
13897 code points in user-defined properties.  But experiments showed that the
13898 warning in earlier perls were only omitted on overflow, which should be an
13899 error, so there really isnt a backcompat issue, and actually adding the
13900 warning when none was present before might cause breakage, for little gain.  So
13901 khw left this code in, but not enabled.  Tests were never added.
13902 
13903 embed.fnc entry:
13904 Ei	|const char *|get_extended_utf8_msg|const UV cp
13905 
13906 PERL_STATIC_INLINE const char *
13907 S_get_extended_utf8_msg(pTHX_ const UV cp)
13908 {
13909     U8 dummy[UTF8_MAXBYTES + 1];
13910     HV *msgs;
13911     SV **msg;
13912 
13913     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
13914                              &msgs);
13915 
13916     msg = hv_fetchs(msgs, "text", 0);
13917     assert(msg);
13918 
13919     (void) sv_2mortal((SV *) msgs);
13920 
13921     return SvPVX(*msg);
13922 }
13923 
13924 #  endif
13925 #endif /* end of ! PERL_IN_XSUB_RE */
13926 
13927 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)13928 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
13929                          const bool ignore_case)
13930 {
13931     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
13932      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
13933      * because nothing outside of ASCII will match.  Use /m because the input
13934      * string may be a bunch of lines strung together.
13935      *
13936      * Also sets up the debugging info */
13937 
13938     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
13939     U32 rx_flags;
13940     SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
13941     REGEXP * subpattern_re;
13942     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13943 
13944     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
13945 
13946     if (ignore_case) {
13947         flags |= PMf_FOLD;
13948     }
13949     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
13950 
13951     /* Like in op.c, we copy the compile time pm flags to the rx ones */
13952     rx_flags = flags & RXf_PMf_COMPILETIME;
13953 
13954 #ifndef PERL_IN_XSUB_RE
13955     /* Use the core engine if this file is regcomp.c.  That means no
13956      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
13957     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13958                                              &PL_core_reg_engine,
13959                                              NULL, NULL,
13960                                              rx_flags, flags);
13961 #else
13962     if (isDEBUG_WILDCARD) {
13963         /* Use the special debugging engine if this file is re_comp.c and wants
13964          * to output the wildcard matching.  This uses whatever
13965          * 'use re "Debug ..." is in effect */
13966         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13967                                                  &my_reg_engine,
13968                                                  NULL, NULL,
13969                                                  rx_flags, flags);
13970     }
13971     else {
13972         /* Use the special wildcard engine if this file is re_comp.c and
13973          * doesn't want to output the wildcard matching.  This uses whatever
13974          * 'use re "Debug ..." is in effect for compilation, but this engine
13975          * structure has been set up so that it uses the core engine for
13976          * execution, so no execution debugging as a result of re.pm will be
13977          * displayed. */
13978         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13979                                                  &wild_reg_engine,
13980                                                  NULL, NULL,
13981                                                  rx_flags, flags);
13982         /* XXX The above has the effect that any user-supplied regex engine
13983          * won't be called for matching wildcards.  That might be good, or bad.
13984          * It could be changed in several ways.  The reason it is done the
13985          * current way is to avoid having to save and restore
13986          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
13987          * could be used.  Another suggestion is to keep the authoritative
13988          * value of the debug flags in a thread-local variable and add set/get
13989          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
13990          * Still another is to pass a flag, say in the engine's intflags that
13991          * would be checked each time before doing the debug output */
13992     }
13993 #endif
13994 
13995     assert(subpattern_re);  /* Should have died if didn't compile successfully */
13996     return subpattern_re;
13997 }
13998 
13999 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)14000 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14001          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14002 {
14003     I32 result;
14004     DECLARE_AND_GET_RE_DEBUG_FLAGS;
14005 
14006     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14007 
14008     ENTER;
14009 
14010     /* The compilation has set things up so that if the program doesn't want to
14011      * see the wildcard matching procedure, it will get the core execution
14012      * engine, which is subject only to -Dr.  So we have to turn that off
14013      * around this procedure */
14014     if (! isDEBUG_WILDCARD) {
14015         /* Note! Casts away 'volatile' */
14016         SAVEI32(PL_debug);
14017         PL_debug &= ~ DEBUG_r_FLAG;
14018     }
14019 
14020     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14021                          NULL, nosave);
14022     LEAVE;
14023 
14024     return result;
14025 }
14026 
14027 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)14028 S_handle_user_defined_property(pTHX_
14029 
14030     /* Parses the contents of a user-defined property definition; returning the
14031      * expanded definition if possible.  If so, the return is an inversion
14032      * list.
14033      *
14034      * If there are subroutines that are part of the expansion and which aren't
14035      * known at the time of the call to this function, this returns what
14036      * parse_uniprop_string() returned for the first one encountered.
14037      *
14038      * If an error was found, NULL is returned, and 'msg' gets a suitable
14039      * message appended to it.  (Appending allows the back trace of how we got
14040      * to the faulty definition to be displayed through nested calls of
14041      * user-defined subs.)
14042      *
14043      * The caller IS responsible for freeing any returned SV.
14044      *
14045      * The syntax of the contents is pretty much described in perlunicode.pod,
14046      * but we also allow comments on each line */
14047 
14048     const char * name,          /* Name of property */
14049     const STRLEN name_len,      /* The name's length in bytes */
14050     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14051     const bool to_fold,         /* ? Is this under /i */
14052     const bool runtime,         /* ? Are we in compile- or run-time */
14053     const bool deferrable,      /* Is it ok for this property's full definition
14054                                    to be deferred until later? */
14055     SV* contents,               /* The property's definition */
14056     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
14057                                    getting called unless this is thought to be
14058                                    a user-defined property */
14059     SV * msg,                   /* Any error or warning msg(s) are appended to
14060                                    this */
14061     const STRLEN level)         /* Recursion level of this call */
14062 {
14063     STRLEN len;
14064     const char * string         = SvPV_const(contents, len);
14065     const char * const e        = string + len;
14066     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14067     const STRLEN msgs_length_on_entry = SvCUR(msg);
14068 
14069     const char * s0 = string;   /* Points to first byte in the current line
14070                                    being parsed in 'string' */
14071     const char overflow_msg[] = "Code point too large in \"";
14072     SV* running_definition = NULL;
14073 
14074     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14075 
14076     *user_defined_ptr = TRUE;
14077 
14078     /* Look at each line */
14079     while (s0 < e) {
14080         const char * s;     /* Current byte */
14081         char op = '+';      /* Default operation is 'union' */
14082         IV   min = 0;       /* range begin code point */
14083         IV   max = -1;      /* and range end */
14084         SV* this_definition;
14085 
14086         /* Skip comment lines */
14087         if (*s0 == '#') {
14088             s0 = strchr(s0, '\n');
14089             if (s0 == NULL) {
14090                 break;
14091             }
14092             s0++;
14093             continue;
14094         }
14095 
14096         /* For backcompat, allow an empty first line */
14097         if (*s0 == '\n') {
14098             s0++;
14099             continue;
14100         }
14101 
14102         /* First character in the line may optionally be the operation */
14103         if (   *s0 == '+'
14104             || *s0 == '!'
14105             || *s0 == '-'
14106             || *s0 == '&')
14107         {
14108             op = *s0++;
14109         }
14110 
14111         /* If the line is one or two hex digits separated by blank space, its
14112          * a range; otherwise it is either another user-defined property or an
14113          * error */
14114 
14115         s = s0;
14116 
14117         if (! isXDIGIT(*s)) {
14118             goto check_if_property;
14119         }
14120 
14121         do { /* Each new hex digit will add 4 bits. */
14122             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14123                 s = strchr(s, '\n');
14124                 if (s == NULL) {
14125                     s = e;
14126                 }
14127                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14128                 sv_catpv(msg, overflow_msg);
14129                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14130                                      UTF8fARG(is_contents_utf8, s - s0, s0));
14131                 sv_catpvs(msg, "\"");
14132                 goto return_failure;
14133             }
14134 
14135             /* Accumulate this digit into the value */
14136             min = (min << 4) + READ_XDIGIT(s);
14137         } while (isXDIGIT(*s));
14138 
14139         while (isBLANK(*s)) { s++; }
14140 
14141         /* We allow comments at the end of the line */
14142         if (*s == '#') {
14143             s = strchr(s, '\n');
14144             if (s == NULL) {
14145                 s = e;
14146             }
14147             s++;
14148         }
14149         else if (s < e && *s != '\n') {
14150             if (! isXDIGIT(*s)) {
14151                 goto check_if_property;
14152             }
14153 
14154             /* Look for the high point of the range */
14155             max = 0;
14156             do {
14157                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14158                     s = strchr(s, '\n');
14159                     if (s == NULL) {
14160                         s = e;
14161                     }
14162                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14163                     sv_catpv(msg, overflow_msg);
14164                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14165                                       UTF8fARG(is_contents_utf8, s - s0, s0));
14166                     sv_catpvs(msg, "\"");
14167                     goto return_failure;
14168                 }
14169 
14170                 max = (max << 4) + READ_XDIGIT(s);
14171             } while (isXDIGIT(*s));
14172 
14173             while (isBLANK(*s)) { s++; }
14174 
14175             if (*s == '#') {
14176                 s = strchr(s, '\n');
14177                 if (s == NULL) {
14178                     s = e;
14179                 }
14180             }
14181             else if (s < e && *s != '\n') {
14182                 goto check_if_property;
14183             }
14184         }
14185 
14186         if (max == -1) {    /* The line only had one entry */
14187             max = min;
14188         }
14189         else if (max < min) {
14190             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14191             sv_catpvs(msg, "Illegal range in \"");
14192             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14193                                 UTF8fARG(is_contents_utf8, s - s0, s0));
14194             sv_catpvs(msg, "\"");
14195             goto return_failure;
14196         }
14197 
14198 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
14199 
14200         if (   UNICODE_IS_PERL_EXTENDED(min)
14201             || UNICODE_IS_PERL_EXTENDED(max))
14202         {
14203             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14204 
14205             /* If both code points are non-portable, warn only on the lower
14206              * one. */
14207             sv_catpv(msg, get_extended_utf8_msg(
14208                                             (UNICODE_IS_PERL_EXTENDED(min))
14209                                             ? min : max));
14210             sv_catpvs(msg, " in \"");
14211             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14212                                  UTF8fARG(is_contents_utf8, s - s0, s0));
14213             sv_catpvs(msg, "\"");
14214         }
14215 
14216 #  endif
14217 
14218         /* Here, this line contains a legal range */
14219         this_definition = sv_2mortal(_new_invlist(2));
14220         this_definition = _add_range_to_invlist(this_definition, min, max);
14221         goto calculate;
14222 
14223       check_if_property:
14224 
14225         /* Here it isn't a legal range line.  See if it is a legal property
14226          * line.  First find the end of the meat of the line */
14227         s = strpbrk(s, "#\n");
14228         if (s == NULL) {
14229             s = e;
14230         }
14231 
14232         /* Ignore trailing blanks in keeping with the requirements of
14233          * parse_uniprop_string() */
14234         s--;
14235         while (s > s0 && isBLANK_A(*s)) {
14236             s--;
14237         }
14238         s++;
14239 
14240         this_definition = parse_uniprop_string(s0, s - s0,
14241                                                is_utf8, to_fold, runtime,
14242                                                deferrable,
14243                                                NULL,
14244                                                user_defined_ptr, msg,
14245                                                (name_len == 0)
14246                                                 ? level /* Don't increase level
14247                                                            if input is empty */
14248                                                 : level + 1
14249                                               );
14250         if (this_definition == NULL) {
14251             goto return_failure;    /* 'msg' should have had the reason
14252                                        appended to it by the above call */
14253         }
14254 
14255         if (! is_invlist(this_definition)) {    /* Unknown at this time */
14256             return newSVsv(this_definition);
14257         }
14258 
14259         if (*s != '\n') {
14260             s = strchr(s, '\n');
14261             if (s == NULL) {
14262                 s = e;
14263             }
14264         }
14265 
14266       calculate:
14267 
14268         switch (op) {
14269             case '+':
14270                 _invlist_union(running_definition, this_definition,
14271                                                         &running_definition);
14272                 break;
14273             case '-':
14274                 _invlist_subtract(running_definition, this_definition,
14275                                                         &running_definition);
14276                 break;
14277             case '&':
14278                 _invlist_intersection(running_definition, this_definition,
14279                                                         &running_definition);
14280                 break;
14281             case '!':
14282                 _invlist_union_complement_2nd(running_definition,
14283                                         this_definition, &running_definition);
14284                 break;
14285             default:
14286                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14287                                  __FILE__, __LINE__, op);
14288                 break;
14289         }
14290 
14291         /* Position past the '\n' */
14292         s0 = s + 1;
14293     }   /* End of loop through the lines of 'contents' */
14294 
14295     /* Here, we processed all the lines in 'contents' without error.  If we
14296      * didn't add any warnings, simply return success */
14297     if (msgs_length_on_entry == SvCUR(msg)) {
14298 
14299         /* If the expansion was empty, the answer isn't nothing: its an empty
14300          * inversion list */
14301         if (running_definition == NULL) {
14302             running_definition = _new_invlist(1);
14303         }
14304 
14305         return running_definition;
14306     }
14307 
14308     /* Otherwise, add some explanatory text, but we will return success */
14309     goto return_msg;
14310 
14311   return_failure:
14312     running_definition = NULL;
14313 
14314   return_msg:
14315 
14316     if (name_len > 0) {
14317         sv_catpvs(msg, " in expansion of ");
14318         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14319     }
14320 
14321     return running_definition;
14322 }
14323 
14324 /* As explained below, certain operations need to take place in the first
14325  * thread created.  These macros switch contexts */
14326 #  ifdef USE_ITHREADS
14327 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
14328                                         PerlInterpreter * save_aTHX = aTHX;
14329 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
14330                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14331 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
14332 #    define CUR_CONTEXT      aTHX
14333 #    define ORIGINAL_CONTEXT save_aTHX
14334 #  else
14335 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
14336 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
14337 #    define RESTORE_CONTEXT                   NOOP
14338 #    define CUR_CONTEXT                       NULL
14339 #    define ORIGINAL_CONTEXT                  NULL
14340 #  endif
14341 
14342 STATIC void
S_delete_recursion_entry(pTHX_ void * key)14343 S_delete_recursion_entry(pTHX_ void *key)
14344 {
14345     /* Deletes the entry used to detect recursion when expanding user-defined
14346      * properties.  This is a function so it can be set up to be called even if
14347      * the program unexpectedly quits */
14348 
14349     SV ** current_entry;
14350     const STRLEN key_len = strlen((const char *) key);
14351     DECLARATION_FOR_GLOBAL_CONTEXT;
14352 
14353     SWITCH_TO_GLOBAL_CONTEXT;
14354 
14355     /* If the entry is one of these types, it is a permanent entry, and not the
14356      * one used to detect recursions.  This function should delete only the
14357      * recursion entry */
14358     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14359     if (     current_entry
14360         && ! is_invlist(*current_entry)
14361         && ! SvPOK(*current_entry))
14362     {
14363         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14364                                                                     G_DISCARD);
14365     }
14366 
14367     RESTORE_CONTEXT;
14368 }
14369 
14370 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)14371 S_get_fq_name(pTHX_
14372               const char * const name,    /* The first non-blank in the \p{}, \P{} */
14373               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
14374               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14375               const bool has_colon_colon
14376              )
14377 {
14378     /* Returns a mortal SV containing the fully qualified version of the input
14379      * name */
14380 
14381     SV * fq_name;
14382 
14383     fq_name = newSVpvs_flags("", SVs_TEMP);
14384 
14385     /* Use the current package if it wasn't included in our input */
14386     if (! has_colon_colon) {
14387         const HV * pkg = (IN_PERL_COMPILETIME)
14388                          ? PL_curstash
14389                          : CopSTASH(PL_curcop);
14390         const char* pkgname = HvNAME(pkg);
14391 
14392         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14393                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14394         sv_catpvs(fq_name, "::");
14395     }
14396 
14397     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14398                          UTF8fARG(is_utf8, name_len, name));
14399     return fq_name;
14400 }
14401 
14402 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)14403 S_parse_uniprop_string(pTHX_
14404 
14405     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
14406      * now.  If so, the return is an inversion list.
14407      *
14408      * If the property is user-defined, it is a subroutine, which in turn
14409      * may call other subroutines.  This function will call the whole nest of
14410      * them to get the definition they return; if some aren't known at the time
14411      * of the call to this function, the fully qualified name of the highest
14412      * level sub is returned.  It is an error to call this function at runtime
14413      * without every sub defined.
14414      *
14415      * If an error was found, NULL is returned, and 'msg' gets a suitable
14416      * message appended to it.  (Appending allows the back trace of how we got
14417      * to the faulty definition to be displayed through nested calls of
14418      * user-defined subs.)
14419      *
14420      * The caller should NOT try to free any returned inversion list.
14421      *
14422      * Other parameters will be set on return as described below */
14423 
14424     const char * const name,    /* The first non-blank in the \p{}, \P{} */
14425     Size_t name_len,            /* Its length in bytes, not including any
14426                                    trailing space */
14427     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14428     const bool to_fold,         /* ? Is this under /i */
14429     const bool runtime,         /* TRUE if this is being called at run time */
14430     const bool deferrable,      /* TRUE if it's ok for the definition to not be
14431                                    known at this call */
14432     AV ** strings,              /* To return string property values, like named
14433                                    sequences */
14434     bool *user_defined_ptr,     /* Upon return from this function it will be
14435                                    set to TRUE if any component is a
14436                                    user-defined property */
14437     SV * msg,                   /* Any error or warning msg(s) are appended to
14438                                    this */
14439     const STRLEN level)         /* Recursion level of this call */
14440 {
14441     char* lookup_name;          /* normalized name for lookup in our tables */
14442     unsigned lookup_len;        /* Its length */
14443     enum { Not_Strict = 0,      /* Some properties have stricter name */
14444            Strict,              /* normalization rules, which we decide */
14445            As_Is                /* upon based on parsing */
14446          } stricter = Not_Strict;
14447 
14448     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14449      * (though it requires extra effort to download them from Unicode and
14450      * compile perl to know about them) */
14451     bool is_nv_type = FALSE;
14452 
14453     unsigned int i = 0, i_zero = 0, j = 0;
14454     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
14455     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
14456     int table_index = 0;    /* The entry number for this property in the table
14457                                of all Unicode property names */
14458     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
14459     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
14460                                    the normalized name in certain situations */
14461     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
14462                                    part of a package name */
14463     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
14464     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
14465                                              property rather than a Unicode
14466                                              one. */
14467     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
14468                                      if an error.  If it is an inversion list,
14469                                      it is the definition.  Otherwise it is a
14470                                      string containing the fully qualified sub
14471                                      name of 'name' */
14472     SV * fq_name = NULL;        /* For user-defined properties, the fully
14473                                    qualified name */
14474     bool invert_return = FALSE; /* ? Do we need to complement the result before
14475                                      returning it */
14476     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14477                                        explicit utf8:: package that we strip
14478                                        off  */
14479     /* The expansion of properties that could be either user-defined or
14480      * official unicode ones is deferred until runtime, including a marker for
14481      * those that might be in the latter category.  This boolean indicates if
14482      * we've seen that marker.  If not, what we're parsing can't be such an
14483      * official Unicode property whose expansion was deferred */
14484     bool could_be_deferred_official = FALSE;
14485 
14486     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14487 
14488     /* The input will be normalized into 'lookup_name' */
14489     Newx(lookup_name, name_len, char);
14490     SAVEFREEPV(lookup_name);
14491 
14492     /* Parse the input. */
14493     for (i = 0; i < name_len; i++) {
14494         char cur = name[i];
14495 
14496         /* Most of the characters in the input will be of this ilk, being parts
14497          * of a name */
14498         if (isIDCONT_A(cur)) {
14499 
14500             /* Case differences are ignored.  Our lookup routine assumes
14501              * everything is lowercase, so normalize to that */
14502             if (isUPPER_A(cur)) {
14503                 lookup_name[j++] = toLOWER_A(cur);
14504                 continue;
14505             }
14506 
14507             if (cur == '_') { /* Don't include these in the normalized name */
14508                 continue;
14509             }
14510 
14511             lookup_name[j++] = cur;
14512 
14513             /* The first character in a user-defined name must be of this type.
14514              * */
14515             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14516                 could_be_user_defined = FALSE;
14517             }
14518 
14519             continue;
14520         }
14521 
14522         /* Here, the character is not something typically in a name,  But these
14523          * two types of characters (and the '_' above) can be freely ignored in
14524          * most situations.  Later it may turn out we shouldn't have ignored
14525          * them, and we have to reparse, but we don't have enough information
14526          * yet to make that decision */
14527         if (cur == '-' || isSPACE_A(cur)) {
14528             could_be_user_defined = FALSE;
14529             continue;
14530         }
14531 
14532         /* An equals sign or single colon mark the end of the first part of
14533          * the property name */
14534         if (    cur == '='
14535             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14536         {
14537             lookup_name[j++] = '='; /* Treat the colon as an '=' */
14538             equals_pos = j; /* Note where it occurred in the input */
14539             could_be_user_defined = FALSE;
14540             break;
14541         }
14542 
14543         /* If this looks like it is a marker we inserted at compile time,
14544          * set a flag and otherwise ignore it.  If it isn't in the final
14545          * position, keep it as it would have been user input. */
14546         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14547             && ! deferrable
14548             &&   could_be_user_defined
14549             &&   i == name_len - 1)
14550         {
14551             name_len--;
14552             could_be_deferred_official = TRUE;
14553             continue;
14554         }
14555 
14556         /* Otherwise, this character is part of the name. */
14557         lookup_name[j++] = cur;
14558 
14559         /* Here it isn't a single colon, so if it is a colon, it must be a
14560          * double colon */
14561         if (cur == ':') {
14562 
14563             /* A double colon should be a package qualifier.  We note its
14564              * position and continue.  Note that one could have
14565              *      pkg1::pkg2::...::foo
14566              * so that the position at the end of the loop will be just after
14567              * the final qualifier */
14568 
14569             i++;
14570             non_pkg_begin = i + 1;
14571             lookup_name[j++] = ':';
14572             lun_non_pkg_begin = j;
14573         }
14574         else { /* Only word chars (and '::') can be in a user-defined name */
14575             could_be_user_defined = FALSE;
14576         }
14577     } /* End of parsing through the lhs of the property name (or all of it if
14578          no rhs) */
14579 
14580     /* If there is a single package name 'utf8::', it is ambiguous.  It could
14581      * be for a user-defined property, or it could be a Unicode property, as
14582      * all of them are considered to be for that package.  For the purposes of
14583      * parsing the rest of the property, strip it off */
14584     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14585         lookup_name += STRLENs("utf8::");
14586         j           -= STRLENs("utf8::");
14587         equals_pos  -= STRLENs("utf8::");
14588         i_zero       = STRLENs("utf8::");   /* When resetting 'i' to reparse
14589                                                from the beginning, it has to be
14590                                                set past what we're stripping
14591                                                off */
14592         stripped_utf8_pkg = TRUE;
14593     }
14594 
14595     /* Here, we are either done with the whole property name, if it was simple;
14596      * or are positioned just after the '=' if it is compound. */
14597 
14598     if (equals_pos >= 0) {
14599         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14600 
14601         /* Space immediately after the '=' is ignored */
14602         i++;
14603         for (; i < name_len; i++) {
14604             if (! isSPACE_A(name[i])) {
14605                 break;
14606             }
14607         }
14608 
14609         /* Most punctuation after the equals indicates a subpattern, like
14610          * \p{foo=/bar/} */
14611         if (   isPUNCT_A(name[i])
14612             &&  name[i] != '-'
14613             &&  name[i] != '+'
14614             &&  name[i] != '_'
14615             &&  name[i] != '{'
14616                 /* A backslash means the real delimiter is the next character,
14617                  * but it must be punctuation */
14618             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14619         {
14620             bool special_property = memEQs(lookup_name, j - 1, "name")
14621                                  || memEQs(lookup_name, j - 1, "na");
14622             if (! special_property) {
14623                 /* Find the property.  The table includes the equals sign, so
14624                  * we use 'j' as-is */
14625                 table_index = do_uniprop_match(lookup_name, j);
14626             }
14627             if (special_property || table_index) {
14628                 REGEXP * subpattern_re;
14629                 char open = name[i++];
14630                 char close;
14631                 const char * pos_in_brackets;
14632                 const char * const * prop_values;
14633                 bool escaped = 0;
14634 
14635                 /* Backslash => delimiter is the character following.  We
14636                  * already checked that it is punctuation */
14637                 if (open == '\\') {
14638                     open = name[i++];
14639                     escaped = 1;
14640                 }
14641 
14642                 /* This data structure is constructed so that the matching
14643                  * closing bracket is 3 past its matching opening.  The second
14644                  * set of closing is so that if the opening is something like
14645                  * ']', the closing will be that as well.  Something similar is
14646                  * done in toke.c */
14647                 pos_in_brackets = memCHRs("([<)]>)]>", open);
14648                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14649 
14650                 if (    i >= name_len
14651                     ||  name[name_len-1] != close
14652                     || (escaped && name[name_len-2] != '\\')
14653                         /* Also make sure that there are enough characters.
14654                          * e.g., '\\\' would show up incorrectly as legal even
14655                          * though it is too short */
14656                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
14657                 {
14658                     sv_catpvs(msg, "Unicode property wildcard not terminated");
14659                     goto append_name_to_msg;
14660                 }
14661 
14662                 Perl_ck_warner_d(aTHX_
14663                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14664                     "The Unicode property wildcards feature is experimental");
14665 
14666                 if (special_property) {
14667                     const char * error_msg;
14668                     const char * revised_name = name + i;
14669                     Size_t revised_name_len = name_len - (i + 1 + escaped);
14670 
14671                     /* Currently, the only 'special_property' is name, which we
14672                      * lookup in _charnames.pm */
14673 
14674                     if (! load_charnames(newSVpvs("placeholder"),
14675                                          revised_name, revised_name_len,
14676                                          &error_msg))
14677                     {
14678                         sv_catpv(msg, error_msg);
14679                         goto append_name_to_msg;
14680                     }
14681 
14682                     /* Farm this out to a function just to make the current
14683                      * function less unwieldy */
14684                     if (handle_names_wildcard(revised_name, revised_name_len,
14685                                               &prop_definition,
14686                                               strings))
14687                     {
14688                         return prop_definition;
14689                     }
14690 
14691                     goto failed;
14692                 }
14693 
14694                 prop_values = get_prop_values(table_index);
14695 
14696                 /* Now create and compile the wildcard subpattern.  Use /i
14697                  * because the property values are supposed to match with case
14698                  * ignored. */
14699                 subpattern_re = compile_wildcard(name + i,
14700                                                  name_len - i - 1 - escaped,
14701                                                  TRUE /* /i */
14702                                                 );
14703 
14704                 /* For each legal property value, see if the supplied pattern
14705                  * matches it. */
14706                 while (*prop_values) {
14707                     const char * const entry = *prop_values;
14708                     const Size_t len = strlen(entry);
14709                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14710 
14711                     if (execute_wildcard(subpattern_re,
14712                                  (char *) entry,
14713                                  (char *) entry + len,
14714                                  (char *) entry, 0,
14715                                  entry_sv,
14716                                  0))
14717                     { /* Here, matched.  Add to the returned list */
14718                         Size_t total_len = j + len;
14719                         SV * sub_invlist = NULL;
14720                         char * this_string;
14721 
14722                         /* We know this is a legal \p{property=value}.  Call
14723                          * the function to return the list of code points that
14724                          * match it */
14725                         Newxz(this_string, total_len + 1, char);
14726                         Copy(lookup_name, this_string, j, char);
14727                         my_strlcat(this_string, entry, total_len + 1);
14728                         SAVEFREEPV(this_string);
14729                         sub_invlist = parse_uniprop_string(this_string,
14730                                                            total_len,
14731                                                            is_utf8,
14732                                                            to_fold,
14733                                                            runtime,
14734                                                            deferrable,
14735                                                            NULL,
14736                                                            user_defined_ptr,
14737                                                            msg,
14738                                                            level + 1);
14739                         _invlist_union(prop_definition, sub_invlist,
14740                                        &prop_definition);
14741                     }
14742 
14743                     prop_values++;  /* Next iteration, look at next propvalue */
14744                 } /* End of looking through property values; (the data
14745                      structure is terminated by a NULL ptr) */
14746 
14747                 SvREFCNT_dec_NN(subpattern_re);
14748 
14749                 if (prop_definition) {
14750                     return prop_definition;
14751                 }
14752 
14753                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14754                 goto append_name_to_msg;
14755             }
14756 
14757             /* Here's how khw thinks we should proceed to handle the properties
14758              * not yet done:    Bidi Mirroring Glyph        can map to ""
14759                                 Bidi Paired Bracket         can map to ""
14760                                 Case Folding  (both full and simple)
14761                                             Shouldn't /i be good enough for Full
14762                                 Decomposition Mapping
14763                                 Equivalent Unified Ideograph    can map to ""
14764                                 Lowercase Mapping  (both full and simple)
14765                                 NFKC Case Fold                  can map to ""
14766                                 Titlecase Mapping  (both full and simple)
14767                                 Uppercase Mapping  (both full and simple)
14768              * Handle these the same way Name is done, using say, _wild.pm, but
14769              * having both loose and full, like in charclass_invlists.h.
14770              * Perhaps move block and script to that as they are somewhat large
14771              * in charclass_invlists.h.
14772              * For properties where the default is the code point itself, such
14773              * as any of the case changing mappings, the string would otherwise
14774              * consist of all Unicode code points in UTF-8 strung together.
14775              * This would be impractical.  So instead, examine their compiled
14776              * pattern, looking at the ssc.  If none, reject the pattern as an
14777              * error.  Otherwise run the pattern against every code point in
14778              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
14779              * And it might be good to create an API to return the ssc.
14780              * Or handle them like the algorithmic names are done
14781              */
14782         } /* End of is a wildcard subppattern */
14783 
14784         /* \p{name=...} is handled specially.  Instead of using the normal
14785          * mechanism involving charclass_invlists.h, it uses _charnames.pm
14786          * which has the necessary (huge) data accessible to it, and which
14787          * doesn't get loaded unless necessary.  The legal syntax for names is
14788          * somewhat different than other properties due both to the vagaries of
14789          * a few outlier official names, and the fact that only a few ASCII
14790          * characters are permitted in them */
14791         if (   memEQs(lookup_name, j - 1, "name")
14792             || memEQs(lookup_name, j - 1, "na"))
14793         {
14794             dSP;
14795             HV * table;
14796             SV * character;
14797             const char * error_msg;
14798             CV* lookup_loose;
14799             SV * character_name;
14800             STRLEN character_len;
14801             UV cp;
14802 
14803             stricter = As_Is;
14804 
14805             /* Since the RHS (after skipping initial space) is passed unchanged
14806              * to charnames, and there are different criteria for what are
14807              * legal characters in the name, just parse it here.  A character
14808              * name must begin with an ASCII alphabetic */
14809             if (! isALPHA(name[i])) {
14810                 goto failed;
14811             }
14812             lookup_name[j++] = name[i];
14813 
14814             for (++i; i < name_len; i++) {
14815                 /* Official names can only be in the ASCII range, and only
14816                  * certain characters */
14817                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14818                     goto failed;
14819                 }
14820                 lookup_name[j++] = name[i];
14821             }
14822 
14823             /* Finished parsing, save the name into an SV */
14824             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14825 
14826             /* Make sure _charnames is loaded.  (The parameters give context
14827              * for any errors generated */
14828             table = load_charnames(character_name, name, name_len, &error_msg);
14829             if (table == NULL) {
14830                 sv_catpv(msg, error_msg);
14831                 goto append_name_to_msg;
14832             }
14833 
14834             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14835             if (! lookup_loose) {
14836                 Perl_croak(aTHX_
14837                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
14838             }
14839 
14840             PUSHSTACKi(PERLSI_REGCOMP);
14841             ENTER ;
14842             SAVETMPS;
14843             save_re_context();
14844 
14845             PUSHMARK(SP) ;
14846             XPUSHs(character_name);
14847             PUTBACK;
14848             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14849 
14850             SPAGAIN ;
14851 
14852             character = POPs;
14853             SvREFCNT_inc_simple_void_NN(character);
14854 
14855             PUTBACK ;
14856             FREETMPS ;
14857             LEAVE ;
14858             POPSTACK;
14859 
14860             if (! SvOK(character)) {
14861                 goto failed;
14862             }
14863 
14864             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14865             if (character_len == SvCUR(character)) {
14866                 prop_definition = add_cp_to_invlist(NULL, cp);
14867             }
14868             else {
14869                 AV * this_string;
14870 
14871                 /* First of the remaining characters in the string. */
14872                 char * remaining = SvPVX(character) + character_len;
14873 
14874                 if (strings == NULL) {
14875                     goto failed;    /* XXX Perhaps a specific msg instead, like
14876                                        'not available here' */
14877                 }
14878 
14879                 if (*strings == NULL) {
14880                     *strings = newAV();
14881                 }
14882 
14883                 this_string = newAV();
14884                 av_push_simple(this_string, newSVuv(cp));
14885 
14886                 do {
14887                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14888                     av_push_simple(this_string, newSVuv(cp));
14889                     remaining += character_len;
14890                 } while (remaining < SvEND(character));
14891 
14892                 av_push_simple(*strings, (SV *) this_string);
14893             }
14894 
14895             return prop_definition;
14896         }
14897 
14898         /* Certain properties whose values are numeric need special handling.
14899          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
14900          * purposes of checking if this is one of those properties */
14901         if (memBEGINPs(lookup_name, j, "is")) {
14902             lookup_offset = 2;
14903         }
14904 
14905         /* Then check if it is one of these specially-handled properties.  The
14906          * possibilities are hard-coded because easier this way, and the list
14907          * is unlikely to change.
14908          *
14909          * All numeric value type properties are of this ilk, and are also
14910          * special in a different way later on.  So find those first.  There
14911          * are several numeric value type properties in the Unihan DB (which is
14912          * unlikely to be compiled with perl, but we handle it here in case it
14913          * does get compiled).  They all end with 'numeric'.  The interiors
14914          * aren't checked for the precise property.  This would stop working if
14915          * a cjk property were to be created that ended with 'numeric' and
14916          * wasn't a numeric type */
14917         is_nv_type = memEQs(lookup_name + lookup_offset,
14918                        j - 1 - lookup_offset, "numericvalue")
14919                   || memEQs(lookup_name + lookup_offset,
14920                       j - 1 - lookup_offset, "nv")
14921                   || (   memENDPs(lookup_name + lookup_offset,
14922                             j - 1 - lookup_offset, "numeric")
14923                       && (   memBEGINPs(lookup_name + lookup_offset,
14924                                       j - 1 - lookup_offset, "cjk")
14925                           || memBEGINPs(lookup_name + lookup_offset,
14926                                       j - 1 - lookup_offset, "k")));
14927         if (   is_nv_type
14928             || memEQs(lookup_name + lookup_offset,
14929                       j - 1 - lookup_offset, "canonicalcombiningclass")
14930             || memEQs(lookup_name + lookup_offset,
14931                       j - 1 - lookup_offset, "ccc")
14932             || memEQs(lookup_name + lookup_offset,
14933                       j - 1 - lookup_offset, "age")
14934             || memEQs(lookup_name + lookup_offset,
14935                       j - 1 - lookup_offset, "in")
14936             || memEQs(lookup_name + lookup_offset,
14937                       j - 1 - lookup_offset, "presentin"))
14938         {
14939             unsigned int k;
14940 
14941             /* Since the stuff after the '=' is a number, we can't throw away
14942              * '-' willy-nilly, as those could be a minus sign.  Other stricter
14943              * rules also apply.  However, these properties all can have the
14944              * rhs not be a number, in which case they contain at least one
14945              * alphabetic.  In those cases, the stricter rules don't apply.
14946              * But the numeric type properties can have the alphas [Ee] to
14947              * signify an exponent, and it is still a number with stricter
14948              * rules.  So look for an alpha that signifies not-strict */
14949             stricter = Strict;
14950             for (k = i; k < name_len; k++) {
14951                 if (   isALPHA_A(name[k])
14952                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
14953                 {
14954                     stricter = Not_Strict;
14955                     break;
14956                 }
14957             }
14958         }
14959 
14960         if (stricter) {
14961 
14962             /* A number may have a leading '+' or '-'.  The latter is retained
14963              * */
14964             if (name[i] == '+') {
14965                 i++;
14966             }
14967             else if (name[i] == '-') {
14968                 lookup_name[j++] = '-';
14969                 i++;
14970             }
14971 
14972             /* Skip leading zeros including single underscores separating the
14973              * zeros, or between the final leading zero and the first other
14974              * digit */
14975             for (; i < name_len - 1; i++) {
14976                 if (    name[i] != '0'
14977                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
14978                 {
14979                     break;
14980                 }
14981             }
14982 
14983             /* Turn nv=-0 into nv=0.  These should be equivalent, but vary by
14984              * underling libc implementation. */
14985             if (   i == name_len - 1
14986                 && name[name_len-1] == '0'
14987                 && lookup_name[j-1] == '-')
14988             {
14989                 j--;
14990             }
14991         }
14992     }
14993     else {  /* No '=' */
14994 
14995        /* Only a few properties without an '=' should be parsed with stricter
14996         * rules.  The list is unlikely to change. */
14997         if (   memBEGINPs(lookup_name, j, "perl")
14998             && memNEs(lookup_name + 4, j - 4, "space")
14999             && memNEs(lookup_name + 4, j - 4, "word"))
15000         {
15001             stricter = Strict;
15002 
15003             /* We set the inputs back to 0 and the code below will reparse,
15004              * using strict */
15005             i = i_zero;
15006             j = 0;
15007         }
15008     }
15009 
15010     /* Here, we have either finished the property, or are positioned to parse
15011      * the remainder, and we know if stricter rules apply.  Finish out, if not
15012      * already done */
15013     for (; i < name_len; i++) {
15014         char cur = name[i];
15015 
15016         /* In all instances, case differences are ignored, and we normalize to
15017          * lowercase */
15018         if (isUPPER_A(cur)) {
15019             lookup_name[j++] = toLOWER(cur);
15020             continue;
15021         }
15022 
15023         /* An underscore is skipped, but not under strict rules unless it
15024          * separates two digits */
15025         if (cur == '_') {
15026             if (    stricter
15027                 && (   i == i_zero || (int) i == equals_pos || i == name_len- 1
15028                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15029             {
15030                 lookup_name[j++] = '_';
15031             }
15032             continue;
15033         }
15034 
15035         /* Hyphens are skipped except under strict */
15036         if (cur == '-' && ! stricter) {
15037             continue;
15038         }
15039 
15040         /* XXX Bug in documentation.  It says white space skipped adjacent to
15041          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
15042          * in a number */
15043         if (isSPACE_A(cur) && ! stricter) {
15044             continue;
15045         }
15046 
15047         lookup_name[j++] = cur;
15048 
15049         /* Unless this is a non-trailing slash, we are done with it */
15050         if (i >= name_len - 1 || cur != '/') {
15051             continue;
15052         }
15053 
15054         slash_pos = j;
15055 
15056         /* A slash in the 'numeric value' property indicates that what follows
15057          * is a denominator.  It can have a leading '+' and '0's that should be
15058          * skipped.  But we have never allowed a negative denominator, so treat
15059          * a minus like every other character.  (No need to rule out a second
15060          * '/', as that won't match anything anyway */
15061         if (is_nv_type) {
15062             i++;
15063             if (i < name_len && name[i] == '+') {
15064                 i++;
15065             }
15066 
15067             /* Skip leading zeros including underscores separating digits */
15068             for (; i < name_len - 1; i++) {
15069                 if (   name[i] != '0'
15070                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15071                 {
15072                     break;
15073                 }
15074             }
15075 
15076             /* Store the first real character in the denominator */
15077             if (i < name_len) {
15078                 lookup_name[j++] = name[i];
15079             }
15080         }
15081     }
15082 
15083     /* Here are completely done parsing the input 'name', and 'lookup_name'
15084      * contains a copy, normalized.
15085      *
15086      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15087      * different from without the underscores.  */
15088     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
15089            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15090         && UNLIKELY(name[name_len-1] == '_'))
15091     {
15092         lookup_name[j++] = '&';
15093     }
15094 
15095     /* If the original input began with 'In' or 'Is', it could be a subroutine
15096      * call to a user-defined property instead of a Unicode property name. */
15097     if (    name_len - non_pkg_begin > 2
15098         &&  name[non_pkg_begin+0] == 'I'
15099         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15100     {
15101         /* Names that start with In have different characteristics than those
15102          * that start with Is */
15103         if (name[non_pkg_begin+1] == 's') {
15104             starts_with_Is = TRUE;
15105         }
15106     }
15107     else {
15108         could_be_user_defined = FALSE;
15109     }
15110 
15111     if (could_be_user_defined) {
15112         CV* user_sub;
15113 
15114         /* If the user defined property returns the empty string, it could
15115          * easily be because the pattern is being compiled before the data it
15116          * actually needs to compile is available.  This could be argued to be
15117          * a bug in the perl code, but this is a change of behavior for Perl,
15118          * so we handle it.  This means that intentionally returning nothing
15119          * will not be resolved until runtime */
15120         bool empty_return = FALSE;
15121 
15122         /* Here, the name could be for a user defined property, which are
15123          * implemented as subs. */
15124         user_sub = get_cvn_flags(name, name_len, 0);
15125         if (! user_sub) {
15126 
15127             /* Here, the property name could be a user-defined one, but there
15128              * is no subroutine to handle it (as of now).   Defer handling it
15129              * until runtime.  Otherwise, a block defined by Unicode in a later
15130              * release would get the synonym InFoo added for it, and existing
15131              * code that used that name would suddenly break if it referred to
15132              * the property before the sub was declared.  See [perl #134146] */
15133             if (deferrable) {
15134                 goto definition_deferred;
15135             }
15136 
15137             /* Here, we are at runtime, and didn't find the user property.  It
15138              * could be an official property, but only if no package was
15139              * specified, or just the utf8:: package. */
15140             if (could_be_deferred_official) {
15141                 lookup_name += lun_non_pkg_begin;
15142                 j -= lun_non_pkg_begin;
15143             }
15144             else if (! stripped_utf8_pkg) {
15145                 goto unknown_user_defined;
15146             }
15147 
15148             /* Drop down to look up in the official properties */
15149         }
15150         else {
15151             const char insecure[] = "Insecure user-defined property";
15152 
15153             /* Here, there is a sub by the correct name.  Normally we call it
15154              * to get the property definition */
15155             dSP;
15156             SV * user_sub_sv = MUTABLE_SV(user_sub);
15157             SV * error;     /* Any error returned by calling 'user_sub' */
15158             SV * key;       /* The key into the hash of user defined sub names
15159                              */
15160             SV * placeholder;
15161             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
15162 
15163             /* How many times to retry when another thread is in the middle of
15164              * expanding the same definition we want */
15165             PERL_INT_FAST8_T retry_countdown = 10;
15166 
15167             DECLARATION_FOR_GLOBAL_CONTEXT;
15168 
15169             /* If we get here, we know this property is user-defined */
15170             *user_defined_ptr = TRUE;
15171 
15172             /* We refuse to call a potentially tainted subroutine; returning an
15173              * error instead */
15174             if (TAINT_get) {
15175                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15176                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15177                 goto append_name_to_msg;
15178             }
15179 
15180             /* In principal, we only call each subroutine property definition
15181              * once during the life of the program.  This guarantees that the
15182              * property definition never changes.  The results of the single
15183              * sub call are stored in a hash, which is used instead for future
15184              * references to this property.  The property definition is thus
15185              * immutable.  But, to allow the user to have a /i-dependent
15186              * definition, we call the sub once for non-/i, and once for /i,
15187              * should the need arise, passing the /i status as a parameter.
15188              *
15189              * We start by constructing the hash key name, consisting of the
15190              * fully qualified subroutine name, preceded by the /i status, so
15191              * that there is a key for /i and a different key for non-/i */
15192             key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15193             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15194                                           non_pkg_begin != 0);
15195             sv_catsv(key, fq_name);
15196 
15197             /* We only call the sub once throughout the life of the program
15198              * (with the /i, non-/i exception noted above).  That means the
15199              * hash must be global and accessible to all threads.  It is
15200              * created at program start-up, before any threads are created, so
15201              * is accessible to all children.  But this creates some
15202              * complications.
15203              *
15204              * 1) The keys can't be shared, or else problems arise; sharing is
15205              *    turned off at hash creation time
15206              * 2) All SVs in it are there for the remainder of the life of the
15207              *    program, and must be created in the same interpreter context
15208              *    as the hash, or else they will be freed from the wrong pool
15209              *    at global destruction time.  This is handled by switching to
15210              *    the hash's context to create each SV going into it, and then
15211              *    immediately switching back
15212              * 3) All accesses to the hash must be controlled by a mutex, to
15213              *    prevent two threads from getting an unstable state should
15214              *    they simultaneously be accessing it.  The code below is
15215              *    crafted so that the mutex is locked whenever there is an
15216              *    access and unlocked only when the next stable state is
15217              *    achieved.
15218              *
15219              * The hash stores either the definition of the property if it was
15220              * valid, or, if invalid, the error message that was raised.  We
15221              * use the type of SV to distinguish.
15222              *
15223              * There's also the need to guard against the definition expansion
15224              * from infinitely recursing.  This is handled by storing the aTHX
15225              * of the expanding thread during the expansion.  Again the SV type
15226              * is used to distinguish this from the other two cases.  If we
15227              * come to here and the hash entry for this property is our aTHX,
15228              * it means we have recursed, and the code assumes that we would
15229              * infinitely recurse, so instead stops and raises an error.
15230              * (Any recursion has always been treated as infinite recursion in
15231              * this feature.)
15232              *
15233              * If instead, the entry is for a different aTHX, it means that
15234              * that thread has gotten here first, and hasn't finished expanding
15235              * the definition yet.  We just have to wait until it is done.  We
15236              * sleep and retry a few times, returning an error if the other
15237              * thread doesn't complete. */
15238 
15239           re_fetch:
15240             USER_PROP_MUTEX_LOCK;
15241 
15242             /* If we have an entry for this key, the subroutine has already
15243              * been called once with this /i status. */
15244             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15245                                                    SvPVX(key), SvCUR(key), 0);
15246             if (saved_user_prop_ptr) {
15247 
15248                 /* If the saved result is an inversion list, it is the valid
15249                  * definition of this property */
15250                 if (is_invlist(*saved_user_prop_ptr)) {
15251                     prop_definition = *saved_user_prop_ptr;
15252 
15253                     /* The SV in the hash won't be removed until global
15254                      * destruction, so it is stable and we can unlock */
15255                     USER_PROP_MUTEX_UNLOCK;
15256 
15257                     /* The caller shouldn't try to free this SV */
15258                     return prop_definition;
15259                 }
15260 
15261                 /* Otherwise, if it is a string, it is the error message
15262                  * that was returned when we first tried to evaluate this
15263                  * property.  Fail, and append the message */
15264                 if (SvPOK(*saved_user_prop_ptr)) {
15265                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15266                     sv_catsv(msg, *saved_user_prop_ptr);
15267 
15268                     /* The SV in the hash won't be removed until global
15269                      * destruction, so it is stable and we can unlock */
15270                     USER_PROP_MUTEX_UNLOCK;
15271 
15272                     return NULL;
15273                 }
15274 
15275                 assert(SvIOK(*saved_user_prop_ptr));
15276 
15277                 /* Here, we have an unstable entry in the hash.  Either another
15278                  * thread is in the middle of expanding the property's
15279                  * definition, or we are ourselves recursing.  We use the aTHX
15280                  * in it to distinguish */
15281                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15282 
15283                     /* Here, it's another thread doing the expanding.  We've
15284                      * looked as much as we are going to at the contents of the
15285                      * hash entry.  It's safe to unlock. */
15286                     USER_PROP_MUTEX_UNLOCK;
15287 
15288                     /* Retry a few times */
15289                     if (retry_countdown-- > 0) {
15290                         PerlProc_sleep(1);
15291                         goto re_fetch;
15292                     }
15293 
15294                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15295                     sv_catpvs(msg, "Timeout waiting for another thread to "
15296                                    "define");
15297                     goto append_name_to_msg;
15298                 }
15299 
15300                 /* Here, we are recursing; don't dig any deeper */
15301                 USER_PROP_MUTEX_UNLOCK;
15302 
15303                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15304                 sv_catpvs(msg,
15305                           "Infinite recursion in user-defined property");
15306                 goto append_name_to_msg;
15307             }
15308 
15309             /* Here, this thread has exclusive control, and there is no entry
15310              * for this property in the hash.  So we have the go ahead to
15311              * expand the definition ourselves. */
15312 
15313             PUSHSTACKi(PERLSI_REGCOMP);
15314             ENTER;
15315 
15316             /* Create a temporary placeholder in the hash to detect recursion
15317              * */
15318             SWITCH_TO_GLOBAL_CONTEXT;
15319             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15320             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15321             RESTORE_CONTEXT;
15322 
15323             /* Now that we have a placeholder, we can let other threads
15324              * continue */
15325             USER_PROP_MUTEX_UNLOCK;
15326 
15327             /* Make sure the placeholder always gets destroyed */
15328             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15329 
15330             PUSHMARK(SP);
15331             SAVETMPS;
15332 
15333             /* Call the user's function, with the /i status as a parameter.
15334              * Note that we have gone to a lot of trouble to keep this call
15335              * from being within the locked mutex region. */
15336             XPUSHs(boolSV(to_fold));
15337             PUTBACK;
15338 
15339             /* The following block was taken from swash_init().  Presumably
15340              * they apply to here as well, though we no longer use a swash --
15341              * khw */
15342             SAVEHINTS();
15343             save_re_context();
15344             /* We might get here via a subroutine signature which uses a utf8
15345              * parameter name, at which point PL_subname will have been set
15346              * but not yet used. */
15347             save_item(PL_subname);
15348 
15349             /* G_SCALAR guarantees a single return value */
15350             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15351 
15352             SPAGAIN;
15353 
15354             error = ERRSV;
15355             if (TAINT_get || SvTRUE(error)) {
15356                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15357                 if (SvTRUE(error)) {
15358                     sv_catpvs(msg, "Error \"");
15359                     sv_catsv(msg, error);
15360                     sv_catpvs(msg, "\"");
15361                 }
15362                 if (TAINT_get) {
15363                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
15364                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15365                 }
15366 
15367                 if (name_len > 0) {
15368                     sv_catpvs(msg, " in expansion of ");
15369                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15370                                                                   name_len,
15371                                                                   name));
15372                 }
15373 
15374                 (void) POPs;
15375                 prop_definition = NULL;
15376             }
15377             else {
15378                 SV * contents = POPs;
15379 
15380                 /* The contents is supposed to be the expansion of the property
15381                  * definition.  If the definition is deferrable, and we got an
15382                  * empty string back, set a flag to later defer it (after clean
15383                  * up below). */
15384                 if (      deferrable
15385                     && (! SvPOK(contents) || SvCUR(contents) == 0))
15386                 {
15387                         empty_return = TRUE;
15388                 }
15389                 else { /* Otherwise, call a function to check for valid syntax,
15390                           and handle it */
15391 
15392                     prop_definition = handle_user_defined_property(
15393                                                     name, name_len,
15394                                                     is_utf8, to_fold, runtime,
15395                                                     deferrable,
15396                                                     contents, user_defined_ptr,
15397                                                     msg,
15398                                                     level);
15399                 }
15400             }
15401 
15402             /* Here, we have the results of the expansion.  Delete the
15403              * placeholder, and if the definition is now known, replace it with
15404              * that definition.  We need exclusive access to the hash, and we
15405              * can't let anyone else in, between when we delete the placeholder
15406              * and add the permanent entry */
15407             USER_PROP_MUTEX_LOCK;
15408 
15409             S_delete_recursion_entry(aTHX_ SvPVX(key));
15410 
15411             if (    ! empty_return
15412                 && (! prop_definition || is_invlist(prop_definition)))
15413             {
15414                 /* If we got success we use the inversion list defining the
15415                  * property; otherwise use the error message */
15416                 SWITCH_TO_GLOBAL_CONTEXT;
15417                 (void) hv_store_ent(PL_user_def_props,
15418                                     key,
15419                                     ((prop_definition)
15420                                      ? newSVsv(prop_definition)
15421                                      : newSVsv(msg)),
15422                                     0);
15423                 RESTORE_CONTEXT;
15424             }
15425 
15426             /* All done, and the hash now has a permanent entry for this
15427              * property.  Give up exclusive control */
15428             USER_PROP_MUTEX_UNLOCK;
15429 
15430             FREETMPS;
15431             LEAVE;
15432             POPSTACK;
15433 
15434             if (empty_return) {
15435                 goto definition_deferred;
15436             }
15437 
15438             if (prop_definition) {
15439 
15440                 /* If the definition is for something not known at this time,
15441                  * we toss it, and go return the main property name, as that's
15442                  * the one the user will be aware of */
15443                 if (! is_invlist(prop_definition)) {
15444                     SvREFCNT_dec_NN(prop_definition);
15445                     goto definition_deferred;
15446                 }
15447 
15448                 sv_2mortal(prop_definition);
15449             }
15450 
15451             /* And return */
15452             return prop_definition;
15453 
15454         }   /* End of calling the subroutine for the user-defined property */
15455     }       /* End of it could be a user-defined property */
15456 
15457     /* Here it wasn't a user-defined property that is known at this time.  See
15458      * if it is a Unicode property */
15459 
15460     lookup_len = j;     /* This is a more mnemonic name than 'j' */
15461 
15462     /* Get the index into our pointer table of the inversion list corresponding
15463      * to the property */
15464     table_index = do_uniprop_match(lookup_name, lookup_len);
15465 
15466     /* If it didn't find the property ... */
15467     if (table_index == 0) {
15468 
15469         /* Try again stripping off any initial 'Is'.  This is because we
15470          * promise that an initial Is is optional.  The same isn't true of
15471          * names that start with 'In'.  Those can match only blocks, and the
15472          * lookup table already has those accounted for.  The lookup table also
15473          * has already accounted for Perl extensions (without and = sign)
15474          * starting with 'i's'. */
15475         if (starts_with_Is && equals_pos >= 0) {
15476             lookup_name += 2;
15477             lookup_len -= 2;
15478             equals_pos -= 2;
15479             slash_pos -= 2;
15480 
15481             table_index = do_uniprop_match(lookup_name, lookup_len);
15482         }
15483 
15484         if (table_index == 0) {
15485             char * canonical;
15486 
15487             /* Here, we didn't find it.  If not a numeric type property, and
15488              * can't be a user-defined one, it isn't a legal property */
15489             if (! is_nv_type) {
15490                 if (! could_be_user_defined) {
15491                     goto failed;
15492                 }
15493 
15494                 /* Here, the property name is legal as a user-defined one.   At
15495                  * compile time, it might just be that the subroutine for that
15496                  * property hasn't been encountered yet, but at runtime, it's
15497                  * an error to try to use an undefined one */
15498                 if (! deferrable) {
15499                     goto unknown_user_defined;;
15500                 }
15501 
15502                 goto definition_deferred;
15503             } /* End of isn't a numeric type property */
15504 
15505             /* The numeric type properties need more work to decide.  What we
15506              * do is make sure we have the number in canonical form and look
15507              * that up. */
15508 
15509             if (slash_pos < 0) {    /* No slash */
15510 
15511                 /* When it isn't a rational, take the input, convert it to a
15512                  * NV, then create a canonical string representation of that
15513                  * NV. */
15514 
15515                 NV value;
15516                 SSize_t value_len = lookup_len - equals_pos;
15517 
15518                 /* Get the value */
15519                 if (   value_len <= 0
15520                     || my_atof3(lookup_name + equals_pos, &value,
15521                                 value_len)
15522                           != lookup_name + lookup_len)
15523                 {
15524                     goto failed;
15525                 }
15526 
15527                 /* If the value is an integer, the canonical value is integral
15528                  * */
15529                 if (Perl_ceil(value) == value) {
15530                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15531                                             equals_pos, lookup_name, value);
15532                 }
15533                 else {  /* Otherwise, it is %e with a known precision */
15534                     char * exp_ptr;
15535 
15536                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15537                                                 equals_pos, lookup_name,
15538                                                 PL_E_FORMAT_PRECISION, value);
15539 
15540                     /* The exponent generated is expecting two digits, whereas
15541                      * %e on some systems will generate three.  Remove leading
15542                      * zeros in excess of 2 from the exponent.  We start
15543                      * looking for them after the '=' */
15544                     exp_ptr = strchr(canonical + equals_pos, 'e');
15545                     if (exp_ptr) {
15546                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15547                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15548 
15549                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15550 
15551                         if (excess_exponent_len > 0) {
15552                             SSize_t leading_zeros = strspn(cur_ptr, "0");
15553                             SSize_t excess_leading_zeros
15554                                     = MIN(leading_zeros, excess_exponent_len);
15555                             if (excess_leading_zeros > 0) {
15556                                 Move(cur_ptr + excess_leading_zeros,
15557                                      cur_ptr,
15558                                      strlen(cur_ptr) - excess_leading_zeros
15559                                        + 1,  /* Copy the NUL as well */
15560                                      char);
15561                             }
15562                         }
15563                     }
15564                 }
15565             }
15566             else {  /* Has a slash.  Create a rational in canonical form  */
15567                 UV numerator, denominator, gcd, trial;
15568                 const char * end_ptr;
15569                 const char * sign = "";
15570 
15571                 /* We can't just find the numerator, denominator, and do the
15572                  * division, then use the method above, because that is
15573                  * inexact.  And the input could be a rational that is within
15574                  * epsilon (given our precision) of a valid rational, and would
15575                  * then incorrectly compare valid.
15576                  *
15577                  * We're only interested in the part after the '=' */
15578                 const char * this_lookup_name = lookup_name + equals_pos;
15579                 lookup_len -= equals_pos;
15580                 slash_pos -= equals_pos;
15581 
15582                 /* Handle any leading minus */
15583                 if (this_lookup_name[0] == '-') {
15584                     sign = "-";
15585                     this_lookup_name++;
15586                     lookup_len--;
15587                     slash_pos--;
15588                 }
15589 
15590                 /* Convert the numerator to numeric */
15591                 end_ptr = this_lookup_name + slash_pos;
15592                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15593                     goto failed;
15594                 }
15595 
15596                 /* It better have included all characters before the slash */
15597                 if (*end_ptr != '/') {
15598                     goto failed;
15599                 }
15600 
15601                 /* Set to look at just the denominator */
15602                 this_lookup_name += slash_pos;
15603                 lookup_len -= slash_pos;
15604                 end_ptr = this_lookup_name + lookup_len;
15605 
15606                 /* Convert the denominator to numeric */
15607                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15608                     goto failed;
15609                 }
15610 
15611                 /* It better be the rest of the characters, and don't divide by
15612                  * 0 */
15613                 if (   end_ptr != this_lookup_name + lookup_len
15614                     || denominator == 0)
15615                 {
15616                     goto failed;
15617                 }
15618 
15619                 /* Get the greatest common denominator using
15620                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
15621                 gcd = numerator;
15622                 trial = denominator;
15623                 while (trial != 0) {
15624                     UV temp = trial;
15625                     trial = gcd % trial;
15626                     gcd = temp;
15627                 }
15628 
15629                 /* If already in lowest possible terms, we have already tried
15630                  * looking this up */
15631                 if (gcd == 1) {
15632                     goto failed;
15633                 }
15634 
15635                 /* Reduce the rational, which should put it in canonical form
15636                  * */
15637                 numerator /= gcd;
15638                 denominator /= gcd;
15639 
15640                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15641                         equals_pos, lookup_name, sign, numerator, denominator);
15642             }
15643 
15644             /* Here, we have the number in canonical form.  Try that */
15645             table_index = do_uniprop_match(canonical, strlen(canonical));
15646             if (table_index == 0) {
15647                 goto failed;
15648             }
15649         }   /* End of still didn't find the property in our table */
15650     }       /* End of       didn't find the property in our table */
15651 
15652     /* Here, we have a non-zero return, which is an index into a table of ptrs.
15653      * A negative return signifies that the real index is the absolute value,
15654      * but the result needs to be inverted */
15655     if (table_index < 0) {
15656         invert_return = TRUE;
15657         table_index = -table_index;
15658     }
15659 
15660     /* Out-of band indices indicate a deprecated property.  The proper index is
15661      * modulo it with the table size.  And dividing by the table size yields
15662      * an offset into a table constructed by regen/mk_invlists.pl to contain
15663      * the corresponding warning message */
15664     if (table_index > MAX_UNI_KEYWORD_INDEX) {
15665         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15666         table_index %= MAX_UNI_KEYWORD_INDEX;
15667         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15668                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15669                 (int) name_len, name,
15670                 get_deprecated_property_msg(warning_offset));
15671     }
15672 
15673     /* In a few properties, a different property is used under /i.  These are
15674      * unlikely to change, so are hard-coded here. */
15675     if (to_fold) {
15676         if (   table_index == UNI_XPOSIXUPPER
15677             || table_index == UNI_XPOSIXLOWER
15678             || table_index == UNI_TITLE)
15679         {
15680             table_index = UNI_CASED;
15681         }
15682         else if (   table_index == UNI_UPPERCASELETTER
15683                  || table_index == UNI_LOWERCASELETTER
15684 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
15685                  || table_index == UNI_TITLECASELETTER
15686 #  endif
15687         ) {
15688             table_index = UNI_CASEDLETTER;
15689         }
15690         else if (  table_index == UNI_POSIXUPPER
15691                 || table_index == UNI_POSIXLOWER)
15692         {
15693             table_index = UNI_POSIXALPHA;
15694         }
15695     }
15696 
15697     /* Create and return the inversion list */
15698     prop_definition = get_prop_definition(table_index);
15699     sv_2mortal(prop_definition);
15700 
15701     /* See if there is a private use override to add to this definition */
15702     {
15703         COPHH * hinthash = (IN_PERL_COMPILETIME)
15704                            ? CopHINTHASH_get(&PL_compiling)
15705                            : CopHINTHASH_get(PL_curcop);
15706         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15707 
15708         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15709 
15710             /* See if there is an element in the hints hash for this table */
15711             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15712             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15713 
15714             if (pos) {
15715                 bool dummy;
15716                 SV * pu_definition;
15717                 SV * pu_invlist;
15718                 SV * expanded_prop_definition =
15719                             sv_2mortal(invlist_clone(prop_definition, NULL));
15720 
15721                 /* If so, it's definition is the string from here to the next
15722                  * \a character.  And its format is the same as a user-defined
15723                  * property */
15724                 pos += SvCUR(pu_lookup);
15725                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15726                 pu_invlist = handle_user_defined_property(lookup_name,
15727                                                           lookup_len,
15728                                                           0, /* Not UTF-8 */
15729                                                           0, /* Not folded */
15730                                                           runtime,
15731                                                           deferrable,
15732                                                           pu_definition,
15733                                                           &dummy,
15734                                                           msg,
15735                                                           level);
15736                 if (TAINT_get) {
15737                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15738                     sv_catpvs(msg, "Insecure private-use override");
15739                     goto append_name_to_msg;
15740                 }
15741 
15742                 /* For now, as a safety measure, make sure that it doesn't
15743                  * override non-private use code points */
15744                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15745 
15746                 /* Add it to the list to be returned */
15747                 _invlist_union(prop_definition, pu_invlist,
15748                                &expanded_prop_definition);
15749                 prop_definition = expanded_prop_definition;
15750                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15751             }
15752         }
15753     }
15754 
15755     if (invert_return) {
15756         _invlist_invert(prop_definition);
15757     }
15758     return prop_definition;
15759 
15760   unknown_user_defined:
15761     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15762     sv_catpvs(msg, "Unknown user-defined property name");
15763     goto append_name_to_msg;
15764 
15765   failed:
15766     if (non_pkg_begin != 0) {
15767         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15768         sv_catpvs(msg, "Illegal user-defined property name");
15769     }
15770     else {
15771         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15772         sv_catpvs(msg, "Can't find Unicode property definition");
15773     }
15774     /* FALLTHROUGH */
15775 
15776   append_name_to_msg:
15777     {
15778         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
15779         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
15780 
15781         sv_catpv(msg, prefix);
15782         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15783         sv_catpv(msg, suffix);
15784     }
15785 
15786     return NULL;
15787 
15788   definition_deferred:
15789 
15790     {
15791         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
15792 
15793         /* Here it could yet to be defined, so defer evaluation of this until
15794          * its needed at runtime.  We need the fully qualified property name to
15795          * avoid ambiguity */
15796         if (! fq_name) {
15797             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15798                                                                 is_qualified);
15799         }
15800 
15801         /* If it didn't come with a package, or the package is utf8::, this
15802          * actually could be an official Unicode property whose inclusion we
15803          * are deferring until runtime to make sure that it isn't overridden by
15804          * a user-defined property of the same name (which we haven't
15805          * encountered yet).  Add a marker to indicate this possibility, for
15806          * use at such time when we first need the definition during pattern
15807          * matching execution */
15808         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15809             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15810         }
15811 
15812         /* We also need a trailing newline */
15813         sv_catpvs(fq_name, "\n");
15814 
15815         *user_defined_ptr = TRUE;
15816         return fq_name;
15817     }
15818 }
15819 
15820 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)15821 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15822                               const STRLEN wname_len, /* Its length */
15823                               SV ** prop_definition,
15824                               AV ** strings)
15825 {
15826     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15827      * any matches, adding them to prop_definition */
15828 
15829     dSP;
15830 
15831     CV * get_names_info;        /* entry to charnames.pm to get info we need */
15832     SV * names_string;          /* Contains all character names, except algo */
15833     SV * algorithmic_names;     /* Contains info about algorithmically
15834                                    generated character names */
15835     REGEXP * subpattern_re;     /* The user's pattern to match with */
15836     struct regexp * prog;       /* The compiled pattern */
15837     char * all_names_start;     /* lib/unicore/Name.pl string of every
15838                                    (non-algorithmic) character name */
15839     char * cur_pos;             /* We match, effectively using /gc; this is
15840                                    where we are now */
15841     bool found_matches = FALSE; /* Did any name match so far? */
15842     SV * empty;                 /* For matching zero length names */
15843     SV * must_sv;               /* Contains the substring, if any, that must be
15844                                    in a name for the subpattern to match */
15845     const char * must;          /* The PV of 'must' */
15846     STRLEN must_len;            /* And its length */
15847     SV * syllable_name = NULL;  /* For Hangul syllables */
15848     const char hangul_prefix[] = "HANGUL SYLLABLE ";
15849     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15850 
15851     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15852      * syllable name, and these are immutable and guaranteed by the Unicode
15853      * standard to never be extended */
15854     const STRLEN syl_max_len = hangul_prefix_len + 7;
15855 
15856     IV i;
15857 
15858     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15859 
15860     /* Make sure _charnames is loaded.  (The parameters give context
15861      * for any errors generated */
15862     get_names_info = get_cv("_charnames::_get_names_info", 0);
15863     if (! get_names_info) {
15864         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15865     }
15866 
15867     /* Get the charnames data */
15868     PUSHSTACKi(PERLSI_REGCOMP);
15869     ENTER ;
15870     SAVETMPS;
15871     save_re_context();
15872 
15873     PUSHMARK(SP) ;
15874     PUTBACK;
15875 
15876     /* Special _charnames entry point that returns the info this routine
15877      * requires */
15878     call_sv(MUTABLE_SV(get_names_info), G_LIST);
15879 
15880     SPAGAIN ;
15881 
15882     /* Data structure for names which end in their very own code points */
15883     algorithmic_names = POPs;
15884     SvREFCNT_inc_simple_void_NN(algorithmic_names);
15885 
15886     /* The lib/unicore/Name.pl string */
15887     names_string = POPs;
15888     SvREFCNT_inc_simple_void_NN(names_string);
15889 
15890     PUTBACK ;
15891     FREETMPS ;
15892     LEAVE ;
15893     POPSTACK;
15894 
15895     if (   ! SvROK(names_string)
15896         || ! SvROK(algorithmic_names))
15897     {   /* Perhaps should panic instead XXX */
15898         SvREFCNT_dec(names_string);
15899         SvREFCNT_dec(algorithmic_names);
15900         return FALSE;
15901     }
15902 
15903     names_string = sv_2mortal(SvRV(names_string));
15904     all_names_start = SvPVX(names_string);
15905     cur_pos = all_names_start;
15906 
15907     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
15908 
15909     /* Compile the subpattern consisting of the name being looked for */
15910     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
15911 
15912     must_sv = re_intuit_string(subpattern_re);
15913     if (must_sv) {
15914         /* regexec.c can free the re_intuit_string() return. GH #17734 */
15915         must_sv = sv_2mortal(newSVsv(must_sv));
15916         must = SvPV(must_sv, must_len);
15917     }
15918     else {
15919         must = "";
15920         must_len = 0;
15921     }
15922 
15923     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
15924      * This works because the NUL causes the function to return early, thus
15925      * showing that there are characters in it other than the acceptable ones,
15926      * which is our desired result.) */
15927 
15928     prog = ReANY(subpattern_re);
15929 
15930     /* If only nothing is matched, skip to where empty names are looked for */
15931     if (prog->maxlen == 0) {
15932         goto check_empty;
15933     }
15934 
15935     /* And match against the string of all names /gc.  Don't even try if it
15936      * must match a character not found in any name. */
15937     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
15938     {
15939         while (execute_wildcard(subpattern_re,
15940                                 cur_pos,
15941                                 SvEND(names_string),
15942                                 all_names_start, 0,
15943                                 names_string,
15944                                 0))
15945         { /* Here, matched. */
15946 
15947             /* Note the string entries look like
15948              *      00001\nSTART OF HEADING\n\n
15949              * so we could match anywhere in that string.  We have to rule out
15950              * matching a code point line */
15951             char * this_name_start = all_names_start
15952                                                 + RX_OFFS_START(subpattern_re,0);
15953             char * this_name_end   = all_names_start
15954                                                 + RX_OFFS_END(subpattern_re,0);
15955             char * cp_start;
15956             char * cp_end;
15957             UV cp = 0;      /* Silences some compilers */
15958             AV * this_string = NULL;
15959             bool is_multi = FALSE;
15960 
15961             /* If matched nothing, advance to next possible match */
15962             if (this_name_start == this_name_end) {
15963                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
15964                                           SvEND(names_string) - this_name_end);
15965                 if (cur_pos == NULL) {
15966                     break;
15967                 }
15968             }
15969             else {
15970                 /* Position the next match to start beyond the current returned
15971                  * entry */
15972                 cur_pos = (char *) memchr(this_name_end, '\n',
15973                                           SvEND(names_string) - this_name_end);
15974             }
15975 
15976             /* Back up to the \n just before the beginning of the character. */
15977             cp_end = (char *) my_memrchr(all_names_start,
15978                                          '\n',
15979                                          this_name_start - all_names_start);
15980 
15981             /* If we didn't find a \n, it means it matched somewhere in the
15982              * initial '00000' in the string, so isn't a real match */
15983             if (cp_end == NULL) {
15984                 continue;
15985             }
15986 
15987             this_name_start = cp_end + 1;   /* The name starts just after */
15988             cp_end--;                       /* the \n, and the code point */
15989                                             /* ends just before it */
15990 
15991             /* All code points are 5 digits long */
15992             cp_start = cp_end - 4;
15993 
15994             /* This shouldn't happen, as we found a \n, and the first \n is
15995              * further along than what we subtracted */
15996             assert(cp_start >= all_names_start);
15997 
15998             if (cp_start == all_names_start) {
15999                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16000                 continue;
16001             }
16002 
16003             /* If the character is a blank, we either have a named sequence, or
16004              * something is wrong */
16005             if (*(cp_start - 1) == ' ') {
16006                 cp_start = (char *) my_memrchr(all_names_start,
16007                                                '\n',
16008                                                cp_start - all_names_start);
16009                 cp_start++;
16010             }
16011 
16012             assert(cp_start != NULL && cp_start >= all_names_start + 2);
16013 
16014             /* Except for the first line in the string, the sequence before the
16015              * code point is \n\n.  If that isn't the case here, we didn't
16016              * match the name of a character.  (We could have matched a named
16017              * sequence, not currently handled */
16018             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16019                 continue;
16020             }
16021 
16022             /* We matched!  Add this to the list */
16023             found_matches = TRUE;
16024 
16025             /* Loop through all the code points in the sequence */
16026             while (cp_start < cp_end) {
16027 
16028                 /* Calculate this code point from its 5 digits */
16029                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16030                    + (XDIGIT_VALUE(cp_start[1]) << 12)
16031                    + (XDIGIT_VALUE(cp_start[2]) << 8)
16032                    + (XDIGIT_VALUE(cp_start[3]) << 4)
16033                    +  XDIGIT_VALUE(cp_start[4]);
16034 
16035                 cp_start += 6;  /* Go past any blank */
16036 
16037                 if (cp_start < cp_end || is_multi) {
16038                     if (this_string == NULL) {
16039                         this_string = newAV();
16040                     }
16041 
16042                     is_multi = TRUE;
16043                     av_push_simple(this_string, newSVuv(cp));
16044                 }
16045             }
16046 
16047             if (is_multi) { /* Was more than one code point */
16048                 if (*strings == NULL) {
16049                     *strings = newAV();
16050                 }
16051 
16052                 av_push_simple(*strings, (SV *) this_string);
16053             }
16054             else {  /* Only a single code point */
16055                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16056             }
16057         } /* End of loop through the non-algorithmic names string */
16058     }
16059 
16060     /* There are also character names not in 'names_string'.  These are
16061      * algorithmically generatable.  Try this pattern on each possible one.
16062      * (khw originally planned to leave this out given the large number of
16063      * matches attempted; but the speed turned out to be quite acceptable
16064      *
16065      * There are plenty of opportunities to optimize to skip many of the tests.
16066      * beyond the rudimentary ones already here */
16067 
16068     /* First see if the subpattern matches any of the algorithmic generatable
16069      * Hangul syllable names.
16070      *
16071      * We know none of these syllable names will match if the input pattern
16072      * requires more bytes than any syllable has, or if the input pattern only
16073      * matches an empty name, or if the pattern has something it must match and
16074      * one of the characters in that isn't in any Hangul syllable. */
16075     if (    prog->minlen <= (SSize_t) syl_max_len
16076         &&  prog->maxlen > 0
16077         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16078     {
16079         /* These constants, names, values, and algorithm are adapted from the
16080          * Unicode standard, version 5.1, section 3.12, and should never
16081          * change. */
16082         const char * JamoL[] = {
16083             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16084             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16085         };
16086         const int LCount = C_ARRAY_LENGTH(JamoL);
16087 
16088         const char * JamoV[] = {
16089             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16090             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16091             "I"
16092         };
16093         const int VCount = C_ARRAY_LENGTH(JamoV);
16094 
16095         const char * JamoT[] = {
16096             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16097             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16098             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16099         };
16100         const int TCount = C_ARRAY_LENGTH(JamoT);
16101 
16102         int L, V, T;
16103 
16104         /* This is the initial Hangul syllable code point; each time through the
16105          * inner loop, it maps to the next higher code point.  For more info,
16106          * see the Hangul syllable section of the Unicode standard. */
16107         int cp = 0xAC00;
16108 
16109         syllable_name = sv_2mortal(newSV(syl_max_len));
16110         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16111 
16112         for (L = 0; L < LCount; L++) {
16113             for (V = 0; V < VCount; V++) {
16114                 for (T = 0; T < TCount; T++) {
16115 
16116                     /* Truncate back to the prefix, which is unvarying */
16117                     SvCUR_set(syllable_name, hangul_prefix_len);
16118 
16119                     sv_catpv(syllable_name, JamoL[L]);
16120                     sv_catpv(syllable_name, JamoV[V]);
16121                     sv_catpv(syllable_name, JamoT[T]);
16122 
16123                     if (execute_wildcard(subpattern_re,
16124                                 SvPVX(syllable_name),
16125                                 SvEND(syllable_name),
16126                                 SvPVX(syllable_name), 0,
16127                                 syllable_name,
16128                                 0))
16129                     {
16130                         *prop_definition = add_cp_to_invlist(*prop_definition,
16131                                                              cp);
16132                         found_matches = TRUE;
16133                     }
16134 
16135                     cp++;
16136                 }
16137             }
16138         }
16139     }
16140 
16141     /* The rest of the algorithmically generatable names are of the form
16142      * "PREFIX-code_point".  The prefixes and the code point limits of each
16143      * were returned to us in the array 'algorithmic_names' from data in
16144      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
16145     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16146         IV j;
16147 
16148         /* Each element of the array is a hash, giving the details for the
16149          * series of names it covers.  There is the base name of the characters
16150          * in the series, and the low and high code points in the series.  And,
16151          * for optimization purposes a string containing all the legal
16152          * characters that could possibly be in a name in this series. */
16153         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16154         SV * prefix = * hv_fetchs(this_series, "name", 0);
16155         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16156         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16157         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16158 
16159         /* Pre-allocate an SV with enough space */
16160         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16161                                                         SvPVX(prefix)));
16162         if (high >= 0x10000) {
16163             sv_catpvs(algo_name, "0");
16164         }
16165 
16166         /* This series can be skipped entirely if the pattern requires
16167          * something longer than any name in the series, or can only match an
16168          * empty name, or contains a character not found in any name in the
16169          * series */
16170         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
16171             &&  prog->maxlen > 0
16172             && (strspn(must, legal) == must_len))
16173         {
16174             for (j = low; j <= high; j++) { /* For each code point in the series */
16175 
16176                 /* Get its name, and see if it matches the subpattern */
16177                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16178                                      (unsigned) j);
16179 
16180                 if (execute_wildcard(subpattern_re,
16181                                     SvPVX(algo_name),
16182                                     SvEND(algo_name),
16183                                     SvPVX(algo_name), 0,
16184                                     algo_name,
16185                                     0))
16186                 {
16187                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
16188                     found_matches = TRUE;
16189                 }
16190             }
16191         }
16192     }
16193 
16194   check_empty:
16195     /* Finally, see if the subpattern matches an empty string */
16196     empty = newSVpvs("");
16197     if (execute_wildcard(subpattern_re,
16198                          SvPVX(empty),
16199                          SvEND(empty),
16200                          SvPVX(empty), 0,
16201                          empty,
16202                          0))
16203     {
16204         /* Many code points have empty names.  Currently these are the \p{GC=C}
16205          * ones, minus CC and CF */
16206 
16207         SV * empty_names_ref = get_prop_definition(UNI_C);
16208         SV * empty_names = invlist_clone(empty_names_ref, NULL);
16209 
16210         SV * subtract = get_prop_definition(UNI_CC);
16211 
16212         _invlist_subtract(empty_names, subtract, &empty_names);
16213         SvREFCNT_dec_NN(empty_names_ref);
16214         SvREFCNT_dec_NN(subtract);
16215 
16216         subtract = get_prop_definition(UNI_CF);
16217         _invlist_subtract(empty_names, subtract, &empty_names);
16218         SvREFCNT_dec_NN(subtract);
16219 
16220         _invlist_union(*prop_definition, empty_names, prop_definition);
16221         found_matches = TRUE;
16222         SvREFCNT_dec_NN(empty_names);
16223     }
16224     SvREFCNT_dec_NN(empty);
16225 
16226 #if 0
16227     /* If we ever were to accept aliases for, say private use names, we would
16228      * need to do something fancier to find empty names.  The code below works
16229      * (at the time it was written), and is slower than the above */
16230     const char empties_pat[] = "^.";
16231     if (strNE(name, empties_pat)) {
16232         SV * empty = newSVpvs("");
16233         if (execute_wildcard(subpattern_re,
16234                     SvPVX(empty),
16235                     SvEND(empty),
16236                     SvPVX(empty), 0,
16237                     empty,
16238                     0))
16239         {
16240             SV * empties = NULL;
16241 
16242             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16243 
16244             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16245             SvREFCNT_dec_NN(empties);
16246 
16247             found_matches = TRUE;
16248         }
16249         SvREFCNT_dec_NN(empty);
16250     }
16251 #endif
16252 
16253     SvREFCNT_dec_NN(subpattern_re);
16254     return found_matches;
16255 }
16256 
16257 /*
16258  * ex: set ts=8 sts=4 sw=4 et:
16259  */
16260