1 /* regcomp.c
2 */
3
4 /*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32 */
33
34 /*
35 * pregcomp and pregexec -- regsub and regerror are not used in perl
36 *
37 * Copyright (c) 1986 by University of Toronto.
38 * Written by Henry Spencer. Not derived from licensed software.
39 *
40 * Permission is granted to anyone to use this software for any
41 * purpose on any computer system, and to redistribute it freely,
42 * subject to the following restrictions:
43 *
44 * 1. The author is not responsible for the consequences of use of
45 * this software, no matter how awful, even if they arise
46 * from defects in it.
47 *
48 * 2. The origin of this software must not be misrepresented, either
49 * by explicit claim or by omission.
50 *
51 * 3. Altered versions must be plainly marked as such, and must not
52 * be misrepresented as being the original software.
53 *
54 *
55 **** Alterations to Henry's code are...
56 ****
57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59 **** by Larry Wall and others
60 ****
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
63
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69
70 /* Note on debug output:
71 *
72 * This is set up so that -Dr turns on debugging like all other flags that are
73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
74 * all regular expressions encountered in a program, and gives a huge amount of
75 * output for all but the shortest programs.
76 *
77 * The ability to output pattern debugging information lexically, and with much
78 * finer grained control was added, with 'use re qw(Debug ....);' available even
79 * in non-DEBUGGING builds. This is accomplished by copying the contents of
80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81 * Those files are compiled and linked into the perl executable, and they are
82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83 * to re.pm.
84 *
85 * That would normally mean linking errors when two functions of the same name
86 * are attempted to be placed into the same executable. That is solved in one
87 * of four ways:
88 * 1) Static functions aren't known outside the file they are in, so for the
89 * many functions of that type in this file, it just isn't a problem.
90 * 2) Most externally known functions are enclosed in
91 * #ifndef PERL_IN_XSUB_RE
92 * ...
93 * #endif
94 * blocks, so there is only one definition for them in the whole
95 * executable, the one in regcomp.c (or regexec.c). The implication of
96 * that is any debugging info that comes from them is controlled only by
97 * -Dr. Further, any static function they call will also be the version
98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
100 * have different names, so that what gets loaded in the executable is
101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104 * versions and their callees are under control of re.pm. The catch is
105 * that references to all these go through the regexp_engine structure,
106 * which is initialized in regcomp.h to the Perl_foo versions, and
107 * substituted out in lexical scopes where 'use re' is in effect to the
108 * 'my_foo' ones. That structure is public API, so it would be a hard
109 * sell to add any additional members.
110 * 4) For functions in regcomp.c and re_comp.c that are called only from,
111 * respectively, regexec.c and re_exec.c, they can have two different
112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113 * embed.fnc.
114 *
115 * The bottom line is that if you add code to one of the public functions
116 * listed in ext/re/re_top.h, debugging automagically works. But if you write
117 * a new function that needs to do debugging or there is a chain of calls from
118 * it that need to do debugging, all functions in the chain should use options
119 * 2) or 4) above.
120 *
121 * A function may have to be split so that debugging stuff is static, but it
122 * calls out to some other function that only gets compiled in regcomp.c to
123 * access data that we don't want to duplicate.
124 */
125
126 #ifdef PERL_EXT_RE_BUILD
127 #include "re_top.h"
128 #endif
129
130 #include "EXTERN.h"
131 #define PERL_IN_REGEX_ENGINE
132 #define PERL_IN_REGCOMP_ANY
133 #define PERL_IN_REGCOMP_C
134 #include "perl.h"
135
136 #ifdef PERL_IN_XSUB_RE
137 # include "re_comp.h"
138 EXTERN_C const struct regexp_engine my_reg_engine;
139 EXTERN_C const struct regexp_engine wild_reg_engine;
140 #else
141 # include "regcomp.h"
142 #endif
143
144 #include "invlist_inline.h"
145 #include "unicode_constants.h"
146 #include "regcomp_internal.h"
147
148 /* =========================================================
149 * BEGIN edit_distance stuff.
150 *
151 * This calculates how many single character changes of any type are needed to
152 * transform a string into another one. It is taken from version 3.1 of
153 *
154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155 */
156
157 /* Our unsorted dictionary linked list. */
158 /* Note we use UVs, not chars. */
159
160 struct dictionary{
161 UV key;
162 UV value;
163 struct dictionary* next;
164 };
165 typedef struct dictionary item;
166
167
168 PERL_STATIC_INLINE item*
push(UV key,item * curr)169 push(UV key, item* curr)
170 {
171 item* head;
172 Newx(head, 1, item);
173 head->key = key;
174 head->value = 0;
175 head->next = curr;
176 return head;
177 }
178
179
180 PERL_STATIC_INLINE item*
find(item * head,UV key)181 find(item* head, UV key)
182 {
183 item* iterator = head;
184 while (iterator){
185 if (iterator->key == key){
186 return iterator;
187 }
188 iterator = iterator->next;
189 }
190
191 return NULL;
192 }
193
194 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)195 uniquePush(item* head, UV key)
196 {
197 item* iterator = head;
198
199 while (iterator){
200 if (iterator->key == key) {
201 return head;
202 }
203 iterator = iterator->next;
204 }
205
206 return push(key, head);
207 }
208
209 PERL_STATIC_INLINE void
dict_free(item * head)210 dict_free(item* head)
211 {
212 item* iterator = head;
213
214 while (iterator) {
215 item* temp = iterator;
216 iterator = iterator->next;
217 Safefree(temp);
218 }
219
220 head = NULL;
221 }
222
223 /* End of Dictionary Stuff */
224
225 /* All calculations/work are done here */
226 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)227 S_edit_distance(const UV* src,
228 const UV* tgt,
229 const STRLEN x, /* length of src[] */
230 const STRLEN y, /* length of tgt[] */
231 const SSize_t maxDistance
232 )
233 {
234 item *head = NULL;
235 UV swapCount, swapScore, targetCharCount, i, j;
236 UV *scores;
237 UV score_ceil = x + y;
238
239 PERL_ARGS_ASSERT_EDIT_DISTANCE;
240
241 /* initialize matrix start values */
242 Newx(scores, ( (x + 2) * (y + 2)), UV);
243 scores[0] = score_ceil;
244 scores[1 * (y + 2) + 0] = score_ceil;
245 scores[0 * (y + 2) + 1] = score_ceil;
246 scores[1 * (y + 2) + 1] = 0;
247 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248
249 /* work loops */
250 /* i = src index */
251 /* j = tgt index */
252 for (i=1;i<=x;i++) {
253 if (i < x)
254 head = uniquePush(head, src[i]);
255 scores[(i+1) * (y + 2) + 1] = i;
256 scores[(i+1) * (y + 2) + 0] = score_ceil;
257 swapCount = 0;
258
259 for (j=1;j<=y;j++) {
260 if (i == 1) {
261 if(j < y)
262 head = uniquePush(head, tgt[j]);
263 scores[1 * (y + 2) + (j + 1)] = j;
264 scores[0 * (y + 2) + (j + 1)] = score_ceil;
265 }
266
267 targetCharCount = find(head, tgt[j-1])->value;
268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269
270 if (src[i-1] != tgt[j-1]){
271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272 }
273 else {
274 swapCount = j;
275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276 }
277 }
278
279 find(head, src[i-1])->value = i;
280 }
281
282 {
283 IV score = scores[(x+1) * (y + 2) + (y + 1)];
284 dict_free(head);
285 Safefree(scores);
286 return (maxDistance != 0 && maxDistance < score)?(-1):score;
287 }
288 }
289
290 /* END of edit_distance() stuff
291 * ========================================================= */
292
293 #ifdef PERL_RE_BUILD_AUX
294 /* add a data member to the struct reg_data attached to this regex, it should
295 * always return a non-zero return. the 's' argument is the type of the items
296 * being added and the n is the number of items. The length of 's' should match
297 * the number of items. */
298 U32
Perl_reg_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)299 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
300 {
301 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
302
303 PERL_ARGS_ASSERT_REG_ADD_DATA;
304
305 /* in the below expression we have (count + n - 1), the minus one is there
306 * because the struct that we allocate already contains a slot for 1 data
307 * item, so we do not need to allocate it the first time. IOW, the
308 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
309 * to allocate. See struct reg_data in regcomp.h
310 */
311 Renewc(RExC_rxi->data,
312 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
313 char, struct reg_data);
314 /* however in the data->what expression we use (count + n) and do not
315 * subtract one from the result because the data structure contains a
316 * pointer to an array, and does not allocate the first element as part of
317 * the data struct. */
318 if (count > 1)
319 Renew(RExC_rxi->data->what, (count + n), U8);
320 else {
321 /* when count == 1 it means we have not initialized anything.
322 * we always fill the 0 slot of the data array with a '%' entry, which
323 * means "zero" (all the other types are letters) which exists purely
324 * so the return from reg_add_data is ALWAYS true, so we can tell it apart
325 * from a "no value" idx=0 in places where we would return an index
326 * into reg_add_data. This is particularly important with the new "single
327 * pass, usually, but not always" strategy that we use, where the code
328 * will use a 0 to represent "not able to compute this yet".
329 */
330 Newx(RExC_rxi->data->what, n+1, U8);
331 /* fill in the placeholder slot of 0 with a what of '%', we use
332 * this because it sorta looks like a zero (0/0) and it is not a letter
333 * like any of the other "whats", this type should never be created
334 * any other way but here. '%' happens to also not appear in this
335 * file for any other reason (at the time of writing this comment)*/
336 RExC_rxi->data->what[0]= '%';
337 RExC_rxi->data->data[0]= NULL;
338 }
339 RExC_rxi->data->count = count + n;
340 Copy(s, RExC_rxi->data->what + count, n, U8);
341 assert(count>0);
342 return count;
343 }
344 #endif /* PERL_RE_BUILD_AUX */
345
346 /*XXX: todo make this not included in a non debugging perl, but appears to be
347 * used anyway there, in 'use re' */
348 #ifndef PERL_IN_XSUB_RE
349 void
Perl_reginitcolors(pTHX)350 Perl_reginitcolors(pTHX)
351 {
352 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
353 if (s) {
354 char *t = savepv(s);
355 int i = 0;
356 PL_colors[0] = t;
357 while (++i < 6) {
358 t = strchr(t, '\t');
359 if (t) {
360 *t = '\0';
361 PL_colors[i] = ++t;
362 }
363 else
364 PL_colors[i] = t = (char *)"";
365 }
366 } else {
367 int i = 0;
368 while (i < 6)
369 PL_colors[i++] = (char *)"";
370 }
371 PL_colorset = 1;
372 }
373 #endif
374
375
376 #ifdef TRIE_STUDY_OPT
377 /* search for "restudy" in this file for a detailed explanation */
378 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
379 STMT_START { \
380 if ( \
381 (data.flags & SCF_TRIE_RESTUDY) \
382 && ! restudied++ \
383 ) { \
384 dOsomething; \
385 goto reStudy; \
386 } \
387 } STMT_END
388 #else
389 #define CHECK_RESTUDY_GOTO_butfirst
390 #endif
391
392 #ifndef PERL_IN_XSUB_RE
393
394 /* return the currently in-scope regex engine (or the default if none) */
395 regexp_engine const *
Perl_current_re_engine(pTHX)396 Perl_current_re_engine(pTHX)
397 {
398 if (IN_PERL_COMPILETIME) {
399 HV * const table = GvHV(PL_hintgv);
400 SV **ptr;
401
402 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
403 return &PL_core_reg_engine;
404 ptr = hv_fetchs(table, "regcomp", FALSE);
405 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
406 return &PL_core_reg_engine;
407 return INT2PTR(regexp_engine*, SvIV(*ptr));
408 }
409 else {
410 SV *ptr;
411 if (!PL_curcop->cop_hints_hash)
412 return &PL_core_reg_engine;
413 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
414 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
415 return &PL_core_reg_engine;
416 return INT2PTR(regexp_engine*, SvIV(ptr));
417 }
418 }
419
420
421 /*
422 * pregcomp - compile a regular expression into internal code
423 *
424 * Decides which engine's compiler to call based on the hint currently in
425 * scope
426 */
427
428 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)429 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
430 {
431 regexp_engine const *eng = current_re_engine();
432 DECLARE_AND_GET_RE_DEBUG_FLAGS;
433
434 PERL_ARGS_ASSERT_PREGCOMP;
435
436 /* Dispatch a request to compile a regexp to correct regexp engine. */
437 DEBUG_COMPILE_r({
438 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
439 PTR2UV(eng));
440 });
441 return CALLREGCOMP_ENG(eng, pattern, flags);
442 }
443 #endif
444
445 /*
446 =for apidoc re_compile
447
448 Compile the regular expression pattern C<pattern>, returning a pointer to the
449 compiled object for later matching with the internal regex engine.
450
451 This function is typically used by a custom regexp engine C<.comp()> function
452 to hand off to the core regexp engine those patterns it doesn't want to handle
453 itself (typically passing through the same flags it was called with). In
454 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
455 to compile using the currently active regexp engine.
456
457 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
458 pointer to the input. Otherwise the PV is extracted and treated like a string
459 representing a pattern. See L<perlre>.
460
461 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
462 all begin with C<RXf_>.
463
464 =cut
465
466 * public entry point for the perl core's own regex compiling code.
467 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
468 * pattern rather than a list of OPs, and uses the internal engine rather
469 * than the current one */
470
471 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)472 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
473 {
474 SV *pat = pattern; /* defeat constness! */
475
476 PERL_ARGS_ASSERT_RE_COMPILE;
477
478 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
479 #ifdef PERL_IN_XSUB_RE
480 &my_reg_engine,
481 #else
482 &PL_core_reg_engine,
483 #endif
484 NULL, NULL, rx_flags, 0);
485 }
486
487 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)488 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
489 {
490 int n;
491
492 if (--cbs->refcnt > 0)
493 return;
494 for (n = 0; n < cbs->count; n++) {
495 REGEXP *rx = cbs->cb[n].src_regex;
496 if (rx) {
497 cbs->cb[n].src_regex = NULL;
498 SvREFCNT_dec_NN(rx);
499 }
500 }
501 Safefree(cbs->cb);
502 Safefree(cbs);
503 }
504
505
506 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)507 S_alloc_code_blocks(pTHX_ int ncode)
508 {
509 struct reg_code_blocks *cbs;
510 Newx(cbs, 1, struct reg_code_blocks);
511 cbs->count = ncode;
512 cbs->refcnt = 1;
513 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
514 if (ncode)
515 Newx(cbs->cb, ncode, struct reg_code_block);
516 else
517 cbs->cb = NULL;
518 return cbs;
519 }
520
521
522 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
523 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
524 * point to the realloced string and length.
525 *
526 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
527 * stuff added */
528
529 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)530 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
531 char **pat_p, STRLEN *plen_p, int num_code_blocks)
532 {
533 U8 *const src = (U8*)*pat_p;
534 U8 *dst, *d;
535 int n=0;
536 STRLEN s = 0;
537 bool do_end = 0;
538 DECLARE_AND_GET_RE_DEBUG_FLAGS;
539
540 DEBUG_PARSE_r(Perl_re_printf( aTHX_
541 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
542
543 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
544 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
545 d = dst;
546
547 while (s < *plen_p) {
548 append_utf8_from_native_byte(src[s], &d);
549
550 if (n < num_code_blocks) {
551 assert(pRExC_state->code_blocks);
552 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
553 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
554 assert(*(d - 1) == '(');
555 do_end = 1;
556 }
557 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
558 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
559 assert(*(d - 1) == ')');
560 do_end = 0;
561 n++;
562 }
563 }
564 s++;
565 }
566 *d = '\0';
567 *plen_p = d - dst;
568 *pat_p = (char*) dst;
569 SAVEFREEPV(*pat_p);
570 RExC_orig_utf8 = RExC_utf8 = 1;
571 }
572
573
574
575 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
576 * while recording any code block indices, and handling overloading,
577 * nested qr// objects etc. If pat is null, it will allocate a new
578 * string, or just return the first arg, if there's only one.
579 *
580 * Returns the malloced/updated pat.
581 * patternp and pat_count is the array of SVs to be concatted;
582 * oplist is the optional list of ops that generated the SVs;
583 * recompile_p is a pointer to a boolean that will be set if
584 * the regex will need to be recompiled.
585 * delim, if non-null is an SV that will be inserted between each element
586 */
587
588 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)589 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
590 SV *pat, SV ** const patternp, int pat_count,
591 OP *oplist, bool *recompile_p, SV *delim)
592 {
593 SV **svp;
594 int n = 0;
595 bool use_delim = FALSE;
596 bool alloced = FALSE;
597
598 /* if we know we have at least two args, create an empty string,
599 * then concatenate args to that. For no args, return an empty string */
600 if (!pat && pat_count != 1) {
601 pat = newSVpvs("");
602 SAVEFREESV(pat);
603 alloced = TRUE;
604 }
605
606 for (svp = patternp; svp < patternp + pat_count; svp++) {
607 SV *sv;
608 SV *rx = NULL;
609 STRLEN orig_patlen = 0;
610 bool code = 0;
611 SV *msv = use_delim ? delim : *svp;
612 if (!msv) msv = &PL_sv_undef;
613
614 /* if we've got a delimiter, we go round the loop twice for each
615 * svp slot (except the last), using the delimiter the second
616 * time round */
617 if (use_delim) {
618 svp--;
619 use_delim = FALSE;
620 }
621 else if (delim)
622 use_delim = TRUE;
623
624 if (SvTYPE(msv) == SVt_PVAV) {
625 /* we've encountered an interpolated array within
626 * the pattern, e.g. /...@a..../. Expand the list of elements,
627 * then recursively append elements.
628 * The code in this block is based on S_pushav() */
629
630 AV *const av = (AV*)msv;
631 const SSize_t maxarg = AvFILL(av) + 1;
632 SV **array;
633
634 if (oplist) {
635 assert(oplist->op_type == OP_PADAV
636 || oplist->op_type == OP_RV2AV);
637 oplist = OpSIBLING(oplist);
638 }
639
640 if (SvRMAGICAL(av)) {
641 SSize_t i;
642
643 Newx(array, maxarg, SV*);
644 SAVEFREEPV(array);
645 for (i=0; i < maxarg; i++) {
646 SV ** const svp = av_fetch(av, i, FALSE);
647 array[i] = svp ? *svp : &PL_sv_undef;
648 }
649 }
650 else
651 array = AvARRAY(av);
652
653 if (maxarg > 0) {
654 pat = S_concat_pat(aTHX_ pRExC_state, pat,
655 array, maxarg, NULL, recompile_p,
656 /* $" */
657 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
658 }
659 else if (!pat) {
660 pat = newSVpvs_flags("", SVs_TEMP);
661 }
662
663 continue;
664 }
665
666
667 /* we make the assumption here that each op in the list of
668 * op_siblings maps to one SV pushed onto the stack,
669 * except for code blocks, with have both an OP_NULL and
670 * an OP_CONST.
671 * This allows us to match up the list of SVs against the
672 * list of OPs to find the next code block.
673 *
674 * Note that PUSHMARK PADSV PADSV ..
675 * is optimised to
676 * PADRANGE PADSV PADSV ..
677 * so the alignment still works. */
678
679 if (oplist) {
680 if (oplist->op_type == OP_NULL
681 && (oplist->op_flags & OPf_SPECIAL))
682 {
683 assert(n < pRExC_state->code_blocks->count);
684 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
685 pRExC_state->code_blocks->cb[n].block = oplist;
686 pRExC_state->code_blocks->cb[n].src_regex = NULL;
687 n++;
688 code = 1;
689 oplist = OpSIBLING(oplist); /* skip CONST */
690 assert(oplist);
691 }
692 oplist = OpSIBLING(oplist);
693 }
694
695 /* apply magic and QR overloading to arg */
696
697 SvGETMAGIC(msv);
698 if (SvROK(msv) && SvAMAGIC(msv)) {
699 SV *sv = AMG_CALLunary(msv, regexp_amg);
700 if (sv) {
701 if (SvROK(sv))
702 sv = SvRV(sv);
703 if (SvTYPE(sv) != SVt_REGEXP)
704 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
705 msv = sv;
706 }
707 }
708
709 /* try concatenation overload ... */
710 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
711 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
712 {
713 sv_setsv(pat, sv);
714 /* overloading involved: all bets are off over literal
715 * code. Pretend we haven't seen it */
716 if (n)
717 pRExC_state->code_blocks->count -= n;
718 n = 0;
719 }
720 else {
721 /* ... or failing that, try "" overload */
722 while (SvAMAGIC(msv)
723 && (sv = AMG_CALLunary(msv, string_amg))
724 && sv != msv
725 && !( SvROK(msv)
726 && SvROK(sv)
727 && SvRV(msv) == SvRV(sv))
728 ) {
729 msv = sv;
730 SvGETMAGIC(msv);
731 }
732 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
733 msv = SvRV(msv);
734
735 if (pat) {
736 /* this is a partially unrolled
737 * sv_catsv_nomg(pat, msv);
738 * that allows us to adjust code block indices if
739 * needed */
740 STRLEN dlen;
741 char *dst = SvPV_force_nomg(pat, dlen);
742 orig_patlen = dlen;
743 if (SvUTF8(msv) && !SvUTF8(pat)) {
744 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
745 sv_setpvn(pat, dst, dlen);
746 SvUTF8_on(pat);
747 }
748 sv_catsv_nomg(pat, msv);
749 rx = msv;
750 }
751 else {
752 /* We have only one SV to process, but we need to verify
753 * it is properly null terminated or we will fail asserts
754 * later. In theory we probably shouldn't get such SV's,
755 * but if we do we should handle it gracefully. */
756 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
757 /* not a string, or a string with a trailing null */
758 pat = msv;
759 } else {
760 /* a string with no trailing null, we need to copy it
761 * so it has a trailing null */
762 pat = sv_2mortal(newSVsv(msv));
763 }
764 }
765
766 if (code)
767 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
768 }
769
770 /* extract any code blocks within any embedded qr//'s */
771 if (rx && SvTYPE(rx) == SVt_REGEXP
772 && RX_ENGINE((REGEXP*)rx)->op_comp)
773 {
774
775 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
776 if (ri->code_blocks && ri->code_blocks->count) {
777 int i;
778 /* the presence of an embedded qr// with code means
779 * we should always recompile: the text of the
780 * qr// may not have changed, but it may be a
781 * different closure than last time */
782 *recompile_p = 1;
783 if (pRExC_state->code_blocks) {
784 int new_count = pRExC_state->code_blocks->count
785 + ri->code_blocks->count;
786 Renew(pRExC_state->code_blocks->cb,
787 new_count, struct reg_code_block);
788 pRExC_state->code_blocks->count = new_count;
789 }
790 else
791 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
792 ri->code_blocks->count);
793
794 for (i=0; i < ri->code_blocks->count; i++) {
795 struct reg_code_block *src, *dst;
796 STRLEN offset = orig_patlen
797 + ReANY((REGEXP *)rx)->pre_prefix;
798 assert(n < pRExC_state->code_blocks->count);
799 src = &ri->code_blocks->cb[i];
800 dst = &pRExC_state->code_blocks->cb[n];
801 dst->start = src->start + offset;
802 dst->end = src->end + offset;
803 dst->block = src->block;
804 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
805 src->src_regex
806 ? src->src_regex
807 : (REGEXP*)rx);
808 n++;
809 }
810 }
811 }
812 }
813 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
814 if (alloced)
815 SvSETMAGIC(pat);
816
817 return pat;
818 }
819
820
821
822 /* see if there are any run-time code blocks in the pattern.
823 * False positives are allowed */
824
825 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)826 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
827 char *pat, STRLEN plen)
828 {
829 int n = 0;
830 STRLEN s;
831
832 PERL_UNUSED_CONTEXT;
833
834 for (s = 0; s < plen; s++) {
835 if ( pRExC_state->code_blocks
836 && n < pRExC_state->code_blocks->count
837 && s == pRExC_state->code_blocks->cb[n].start)
838 {
839 s = pRExC_state->code_blocks->cb[n].end;
840 n++;
841 continue;
842 }
843 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
844 * positives here */
845 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
846 (pat[s+2] == '{'
847 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
848 )
849 return 1;
850 }
851 return 0;
852 }
853
854 /* Handle run-time code blocks. We will already have compiled any direct
855 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
856 * copy of it, but with any literal code blocks blanked out and
857 * appropriate chars escaped; then feed it into
858 *
859 * eval "qr'modified_pattern'"
860 *
861 * For example,
862 *
863 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
864 *
865 * becomes
866 *
867 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
868 *
869 * After eval_sv()-ing that, grab any new code blocks from the returned qr
870 * and merge them with any code blocks of the original regexp.
871 *
872 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
873 * instead, just save the qr and return FALSE; this tells our caller that
874 * the original pattern needs upgrading to utf8.
875 */
876
877 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)878 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
879 char *pat, STRLEN plen)
880 {
881 SV *qr;
882
883 DECLARE_AND_GET_RE_DEBUG_FLAGS;
884
885 if (pRExC_state->runtime_code_qr) {
886 /* this is the second time we've been called; this should
887 * only happen if the main pattern got upgraded to utf8
888 * during compilation; re-use the qr we compiled first time
889 * round (which should be utf8 too)
890 */
891 qr = pRExC_state->runtime_code_qr;
892 pRExC_state->runtime_code_qr = NULL;
893 assert(RExC_utf8 && SvUTF8(qr));
894 }
895 else {
896 int n = 0;
897 STRLEN s;
898 char *p, *newpat;
899 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
900 SV *sv, *qr_ref;
901 dSP;
902
903 /* determine how many extra chars we need for ' and \ escaping */
904 for (s = 0; s < plen; s++) {
905 if (pat[s] == '\'' || pat[s] == '\\')
906 newlen++;
907 }
908
909 Newx(newpat, newlen, char);
910 p = newpat;
911 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
912
913 for (s = 0; s < plen; s++) {
914 if ( pRExC_state->code_blocks
915 && n < pRExC_state->code_blocks->count
916 && s == pRExC_state->code_blocks->cb[n].start)
917 {
918 /* blank out literal code block so that they aren't
919 * recompiled: eg change from/to:
920 * /(?{xyz})/
921 * /(?=====)/
922 * and
923 * /(??{xyz})/
924 * /(?======)/
925 * and
926 * /(?(?{xyz}))/
927 * /(?(?=====))/
928 */
929 assert(pat[s] == '(');
930 assert(pat[s+1] == '?');
931 *p++ = '(';
932 *p++ = '?';
933 s += 2;
934 while (s < pRExC_state->code_blocks->cb[n].end) {
935 *p++ = '=';
936 s++;
937 }
938 *p++ = ')';
939 n++;
940 continue;
941 }
942 if (pat[s] == '\'' || pat[s] == '\\')
943 *p++ = '\\';
944 *p++ = pat[s];
945 }
946 *p++ = '\'';
947 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
948 *p++ = 'x';
949 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
950 *p++ = 'x';
951 }
952 }
953 *p++ = '\0';
954 DEBUG_COMPILE_r({
955 Perl_re_printf( aTHX_
956 "%sre-parsing pattern for runtime code:%s %s\n",
957 PL_colors[4], PL_colors[5], newpat);
958 });
959
960 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
961 Safefree(newpat);
962
963 ENTER;
964 SAVETMPS;
965 save_re_context();
966 PUSHSTACKi(PERLSI_REQUIRE);
967 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
968 * parsing qr''; normally only q'' does this. It also alters
969 * hints handling */
970 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
971 SvREFCNT_dec_NN(sv);
972 SPAGAIN;
973 qr_ref = POPs;
974 PUTBACK;
975 {
976 SV * const errsv = ERRSV;
977 if (SvTRUE_NN(errsv))
978 /* use croak_sv ? */
979 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
980 }
981 assert(SvROK(qr_ref));
982 qr = SvRV(qr_ref);
983 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
984 /* the leaving below frees the tmp qr_ref.
985 * Give qr a life of its own */
986 SvREFCNT_inc(qr);
987 POPSTACK;
988 FREETMPS;
989 LEAVE;
990
991 }
992
993 if (!RExC_utf8 && SvUTF8(qr)) {
994 /* first time through; the pattern got upgraded; save the
995 * qr for the next time through */
996 assert(!pRExC_state->runtime_code_qr);
997 pRExC_state->runtime_code_qr = qr;
998 return 0;
999 }
1000
1001
1002 /* extract any code blocks within the returned qr// */
1003
1004
1005 /* merge the main (r1) and run-time (r2) code blocks into one */
1006 {
1007 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1008 struct reg_code_block *new_block, *dst;
1009 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1010 int i1 = 0, i2 = 0;
1011 int r1c, r2c;
1012
1013 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1014 {
1015 SvREFCNT_dec_NN(qr);
1016 return 1;
1017 }
1018
1019 if (!r1->code_blocks)
1020 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1021
1022 r1c = r1->code_blocks->count;
1023 r2c = r2->code_blocks->count;
1024
1025 Newx(new_block, r1c + r2c, struct reg_code_block);
1026
1027 dst = new_block;
1028
1029 while (i1 < r1c || i2 < r2c) {
1030 struct reg_code_block *src;
1031 bool is_qr = 0;
1032
1033 if (i1 == r1c) {
1034 src = &r2->code_blocks->cb[i2++];
1035 is_qr = 1;
1036 }
1037 else if (i2 == r2c)
1038 src = &r1->code_blocks->cb[i1++];
1039 else if ( r1->code_blocks->cb[i1].start
1040 < r2->code_blocks->cb[i2].start)
1041 {
1042 src = &r1->code_blocks->cb[i1++];
1043 assert(src->end < r2->code_blocks->cb[i2].start);
1044 }
1045 else {
1046 assert( r1->code_blocks->cb[i1].start
1047 > r2->code_blocks->cb[i2].start);
1048 src = &r2->code_blocks->cb[i2++];
1049 is_qr = 1;
1050 assert(src->end < r1->code_blocks->cb[i1].start);
1051 }
1052
1053 assert(pat[src->start] == '(');
1054 assert(pat[src->end] == ')');
1055 dst->start = src->start;
1056 dst->end = src->end;
1057 dst->block = src->block;
1058 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1059 : src->src_regex;
1060 dst++;
1061 }
1062 r1->code_blocks->count += r2c;
1063 Safefree(r1->code_blocks->cb);
1064 r1->code_blocks->cb = new_block;
1065 }
1066
1067 SvREFCNT_dec_NN(qr);
1068 return 1;
1069 }
1070
1071
1072 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)1073 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1074 struct reg_substr_datum *rsd,
1075 struct scan_data_substrs *sub,
1076 STRLEN longest_length)
1077 {
1078 /* This is the common code for setting up the floating and fixed length
1079 * string data extracted from Perl_re_op_compile() below. Returns a boolean
1080 * as to whether succeeded or not */
1081
1082 I32 t;
1083 SSize_t ml;
1084 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
1085 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1086
1087 if (! (longest_length
1088 || (eol /* Can't have SEOL and MULTI */
1089 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1090 )
1091 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1092 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1093 {
1094 return FALSE;
1095 }
1096
1097 /* copy the information about the longest from the reg_scan_data
1098 over to the program. */
1099 if (SvUTF8(sub->str)) {
1100 rsd->substr = NULL;
1101 rsd->utf8_substr = sub->str;
1102 } else {
1103 rsd->substr = sub->str;
1104 rsd->utf8_substr = NULL;
1105 }
1106 /* end_shift is how many chars that must be matched that
1107 follow this item. We calculate it ahead of time as once the
1108 lookbehind offset is added in we lose the ability to correctly
1109 calculate it.*/
1110 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1111 rsd->end_shift = ml - sub->min_offset
1112 - longest_length
1113 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1114 * intead? - DAPM
1115 + (SvTAIL(sub->str) != 0)
1116 */
1117 + sub->lookbehind;
1118
1119 t = (eol/* Can't have SEOL and MULTI */
1120 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1121 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1122
1123 return TRUE;
1124 }
1125
1126 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)1127 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1128 {
1129 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1130 * properly wrapped with the right modifiers */
1131
1132 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1133 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1134 != REGEX_DEPENDS_CHARSET);
1135
1136 /* The caret is output if there are any defaults: if not all the STD
1137 * flags are set, or if no character set specifier is needed */
1138 bool has_default =
1139 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1140 || ! has_charset);
1141 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1142 == REG_RUN_ON_COMMENT_SEEN);
1143 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1144 >> RXf_PMf_STD_PMMOD_SHIFT);
1145 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
1146 char *p;
1147 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1148
1149 /* We output all the necessary flags; we never output a minus, as all
1150 * those are defaults, so are
1151 * covered by the caret */
1152 const STRLEN wraplen = pat_len + has_p + has_runon
1153 + has_default /* If needs a caret */
1154 + PL_bitcount[reganch] /* 1 char for each set standard flag */
1155
1156 /* If needs a character set specifier */
1157 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1158 + (sizeof("(?:)") - 1);
1159
1160 PERL_ARGS_ASSERT_SET_REGEX_PV;
1161
1162 /* make sure PL_bitcount bounds not exceeded */
1163 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1164
1165 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1166 SvPOK_on(Rx);
1167 if (RExC_utf8)
1168 SvFLAGS(Rx) |= SVf_UTF8;
1169 *p++='('; *p++='?';
1170
1171 /* If a default, cover it using the caret */
1172 if (has_default) {
1173 *p++= DEFAULT_PAT_MOD;
1174 }
1175 if (has_charset) {
1176 STRLEN len;
1177 const char* name;
1178
1179 name = get_regex_charset_name(RExC_rx->extflags, &len);
1180 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
1181 assert(RExC_utf8);
1182 name = UNICODE_PAT_MODS;
1183 len = sizeof(UNICODE_PAT_MODS) - 1;
1184 }
1185 Copy(name, p, len, char);
1186 p += len;
1187 }
1188 if (has_p)
1189 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1190 {
1191 char ch;
1192 while((ch = *fptr++)) {
1193 if(reganch & 1)
1194 *p++ = ch;
1195 reganch >>= 1;
1196 }
1197 }
1198
1199 *p++ = ':';
1200 Copy(RExC_precomp, p, pat_len, char);
1201 assert ((RX_WRAPPED(Rx) - p) < 16);
1202 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1203 p += pat_len;
1204
1205 /* Adding a trailing \n causes this to compile properly:
1206 my $R = qr / A B C # D E/x; /($R)/
1207 Otherwise the parens are considered part of the comment */
1208 if (has_runon)
1209 *p++ = '\n';
1210 *p++ = ')';
1211 *p = 0;
1212 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1213 }
1214
1215 STATIC void
S_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)1216 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1217 {
1218 /* The inversion list in the SSC is marked mortal; now we need a more
1219 * permanent copy, which is stored the same way that is done in a regular
1220 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1221 * map */
1222
1223 SV* invlist = invlist_clone(ssc->invlist, NULL);
1224
1225 PERL_ARGS_ASSERT_SSC_FINALIZE;
1226
1227 assert(is_ANYOF_SYNTHETIC(ssc));
1228
1229 /* The code in this file assumes that all but these flags aren't relevant
1230 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1231 * by the time we reach here */
1232 assert(! (ANYOF_FLAGS(ssc)
1233 & ~( ANYOF_COMMON_FLAGS
1234 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
1235 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
1236
1237 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
1238
1239 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
1240 SvREFCNT_dec(invlist);
1241
1242 /* Make sure is clone-safe */
1243 ssc->invlist = NULL;
1244
1245 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1246 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1247 OP(ssc) = ANYOFPOSIXL;
1248 }
1249 else if (RExC_contains_locale) {
1250 OP(ssc) = ANYOFL;
1251 }
1252
1253 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1254 }
1255
1256 STATIC bool
S_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)1257 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1258 {
1259 /* The synthetic start class is used to hopefully quickly winnow down
1260 * places where a pattern could start a match in the target string. If it
1261 * doesn't really narrow things down that much, there isn't much point to
1262 * having the overhead of using it. This function uses some very crude
1263 * heuristics to decide if to use the ssc or not.
1264 *
1265 * It returns TRUE if 'ssc' rules out more than half what it considers to
1266 * be the "likely" possible matches, but of course it doesn't know what the
1267 * actual things being matched are going to be; these are only guesses
1268 *
1269 * For /l matches, it assumes that the only likely matches are going to be
1270 * in the 0-255 range, uniformly distributed, so half of that is 127
1271 * For /a and /d matches, it assumes that the likely matches will be just
1272 * the ASCII range, so half of that is 63
1273 * For /u and there isn't anything matching above the Latin1 range, it
1274 * assumes that that is the only range likely to be matched, and uses
1275 * half that as the cut-off: 127. If anything matches above Latin1,
1276 * it assumes that all of Unicode could match (uniformly), except for
1277 * non-Unicode code points and things in the General Category "Other"
1278 * (unassigned, private use, surrogates, controls and formats). This
1279 * is a much large number. */
1280
1281 U32 count = 0; /* Running total of number of code points matched by
1282 'ssc' */
1283 UV start, end; /* Start and end points of current range in inversion
1284 XXX outdated. UTF-8 locales are common, what about invert? list */
1285 const U32 max_code_points = (LOC)
1286 ? 256
1287 : (( ! UNI_SEMANTICS
1288 || invlist_highest(ssc->invlist) < 256)
1289 ? 128
1290 : NON_OTHER_COUNT);
1291 const U32 max_match = max_code_points / 2;
1292
1293 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1294
1295 invlist_iterinit(ssc->invlist);
1296 while (invlist_iternext(ssc->invlist, &start, &end)) {
1297 if (start >= max_code_points) {
1298 break;
1299 }
1300 end = MIN(end, max_code_points - 1);
1301 count += end - start + 1;
1302 if (count >= max_match) {
1303 invlist_iterfinish(ssc->invlist);
1304 return FALSE;
1305 }
1306 }
1307
1308 return TRUE;
1309 }
1310
1311 static void
release_RExC_state(pTHX_ void * vstate)1312 release_RExC_state(pTHX_ void *vstate) {
1313 RExC_state_t *pRExC_state = (RExC_state_t *)vstate;
1314
1315 /* Any or all of these might be NULL.
1316
1317 There's no point in setting them to NULL after the free, since
1318 pRExC_state is about to be released.
1319 */
1320 SvREFCNT_dec(RExC_rx_sv);
1321 Safefree(RExC_open_parens);
1322 Safefree(RExC_close_parens);
1323 Safefree(RExC_logical_to_parno);
1324 Safefree(RExC_parno_to_logical);
1325
1326 Safefree(pRExC_state);
1327 }
1328
1329 /*
1330 * Perl_re_op_compile - the perl internal RE engine's function to compile a
1331 * regular expression into internal code.
1332 * The pattern may be passed either as:
1333 * a list of SVs (patternp plus pat_count)
1334 * a list of OPs (expr)
1335 * If both are passed, the SV list is used, but the OP list indicates
1336 * which SVs are actually pre-compiled code blocks
1337 *
1338 * The SVs in the list have magic and qr overloading applied to them (and
1339 * the list may be modified in-place with replacement SVs in the latter
1340 * case).
1341 *
1342 * If the pattern hasn't changed from old_re, then old_re will be
1343 * returned.
1344 *
1345 * eng is the current engine. If that engine has an op_comp method, then
1346 * handle directly (i.e. we assume that op_comp was us); otherwise, just
1347 * do the initial concatenation of arguments and pass on to the external
1348 * engine.
1349 *
1350 * If is_bare_re is not null, set it to a boolean indicating whether the
1351 * arg list reduced (after overloading) to a single bare regex which has
1352 * been returned (i.e. /$qr/).
1353 *
1354 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1355 *
1356 * pm_flags contains the PMf_* flags, typically based on those from the
1357 * pm_flags field of the related PMOP. Currently we're only interested in
1358 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1359 *
1360 * For many years this code had an initial sizing pass that calculated
1361 * (sometimes incorrectly, leading to security holes) the size needed for the
1362 * compiled pattern. That was changed by commit
1363 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1364 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
1365 * references to this sizing pass.
1366 *
1367 * Now, an initial crude guess as to the size needed is made, based on the
1368 * length of the pattern. Patches welcome to improve that guess. That amount
1369 * of space is malloc'd and then immediately freed, and then clawed back node
1370 * by node. This design is to minimize, to the extent possible, memory churn
1371 * when doing the reallocs.
1372 *
1373 * A separate parentheses counting pass may be needed in some cases.
1374 * (Previously the sizing pass did this.) Patches welcome to reduce the number
1375 * of these cases.
1376 *
1377 * The existence of a sizing pass necessitated design decisions that are no
1378 * longer needed. There are potential areas of simplification.
1379 *
1380 * Beware that the optimization-preparation code in here knows about some
1381 * of the structure of the compiled regexp. [I'll say.]
1382 */
1383
1384 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)1385 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1386 OP *expr, const regexp_engine* eng, REGEXP *old_re,
1387 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1388 {
1389 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
1390 STRLEN plen;
1391 char *exp;
1392 regnode *scan;
1393 I32 flags;
1394 SSize_t minlen = 0;
1395 U32 rx_flags;
1396 SV *pat;
1397 SV** new_patternp = patternp;
1398
1399 /* these are all flags - maybe they should be turned
1400 * into a single int with different bit masks */
1401 I32 sawlookahead = 0;
1402 I32 sawplus = 0;
1403 I32 sawopen = 0;
1404 I32 sawminmod = 0;
1405
1406 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1407 bool recompile = 0;
1408 bool runtime_code = 0;
1409 scan_data_t data;
1410
1411 #ifdef TRIE_STUDY_OPT
1412 /* search for "restudy" in this file for a detailed explanation */
1413 int restudied = 0;
1414 RExC_state_t copyRExC_state;
1415 #endif
1416 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1417
1418 PERL_ARGS_ASSERT_RE_OP_COMPILE;
1419
1420 DEBUG_r(if (!PL_colorset) reginitcolors());
1421
1422 RExC_state_t *pRExC_state = NULL;
1423 /* Ensure that all members of the pRExC_state is initialized to 0
1424 * at the start of regex compilation. Historically we have had issues
1425 * with people remembering to zero specific members or zeroing them
1426 * too late, etc. Doing it in one place is saner and avoid oversight
1427 * or error. */
1428 Newxz(pRExC_state, 1, RExC_state_t);
1429
1430 SAVEDESTRUCTOR_X(release_RExC_state, pRExC_state);
1431
1432 DEBUG_r({
1433 /* and then initialize RExC_mysv1 and RExC_mysv2 early so if
1434 * something calls regprop we don't have issues. These variables
1435 * not being set up properly motivated the use of Newxz() to initalize
1436 * the pRExC_state structure, as there were codepaths under -Uusedl
1437 * that left these unitialized, and non-null as well. */
1438 RExC_mysv1 = sv_newmortal();
1439 RExC_mysv2 = sv_newmortal();
1440 });
1441
1442 if (is_bare_re)
1443 *is_bare_re = FALSE;
1444
1445 if (expr && (expr->op_type == OP_LIST ||
1446 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1447 /* allocate code_blocks if needed */
1448 OP *o;
1449 int ncode = 0;
1450
1451 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1452 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1453 ncode++; /* count of DO blocks */
1454
1455 if (ncode)
1456 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1457 }
1458
1459 if (!pat_count) {
1460 /* compile-time pattern with just OP_CONSTs and DO blocks */
1461
1462 int n;
1463 OP *o;
1464
1465 /* find how many CONSTs there are */
1466 assert(expr);
1467 n = 0;
1468 if (expr->op_type == OP_CONST)
1469 n = 1;
1470 else
1471 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1472 if (o->op_type == OP_CONST)
1473 n++;
1474 }
1475
1476 /* fake up an SV array */
1477
1478 assert(!new_patternp);
1479 Newx(new_patternp, n, SV*);
1480 SAVEFREEPV(new_patternp);
1481 pat_count = n;
1482
1483 n = 0;
1484 if (expr->op_type == OP_CONST)
1485 new_patternp[n] = cSVOPx_sv(expr);
1486 else
1487 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1488 if (o->op_type == OP_CONST)
1489 new_patternp[n++] = cSVOPo_sv;
1490 }
1491
1492 }
1493
1494 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1495 "Assembling pattern from %d elements%s\n", pat_count,
1496 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1497
1498 /* set expr to the first arg op */
1499
1500 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1501 && expr->op_type != OP_CONST)
1502 {
1503 expr = cLISTOPx(expr)->op_first;
1504 assert( expr->op_type == OP_PUSHMARK
1505 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1506 || expr->op_type == OP_PADRANGE);
1507 expr = OpSIBLING(expr);
1508 }
1509
1510 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1511 expr, &recompile, NULL);
1512
1513 /* handle bare (possibly after overloading) regex: foo =~ $re */
1514 {
1515 SV *re = pat;
1516 if (SvROK(re))
1517 re = SvRV(re);
1518 if (SvTYPE(re) == SVt_REGEXP) {
1519 if (is_bare_re)
1520 *is_bare_re = TRUE;
1521 SvREFCNT_inc(re);
1522 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1523 "Precompiled pattern%s\n",
1524 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1525
1526 return (REGEXP*)re;
1527 }
1528 }
1529
1530 exp = SvPV_nomg(pat, plen);
1531
1532 if (!eng->op_comp) {
1533 if ((SvUTF8(pat) && IN_BYTES)
1534 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1535 {
1536 /* make a temporary copy; either to convert to bytes,
1537 * or to avoid repeating get-magic / overloaded stringify */
1538 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1539 (IN_BYTES ? 0 : SvUTF8(pat)));
1540 }
1541 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1542 }
1543
1544 /* ignore the utf8ness if the pattern is 0 length */
1545 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1546 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1547
1548
1549 DEBUG_COMPILE_r({
1550 RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len);
1551 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
1552 PL_colors[4], PL_colors[5], s);
1553 });
1554
1555 /* we jump here if we have to recompile, e.g., from upgrading the pattern
1556 * to utf8 */
1557
1558 if ((pm_flags & PMf_USE_RE_EVAL)
1559 /* this second condition covers the non-regex literal case,
1560 * i.e. $foo =~ '(?{})'. */
1561 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1562 )
1563 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1564
1565 redo_parse:
1566 /* return old regex if pattern hasn't changed */
1567 /* XXX: note in the below we have to check the flags as well as the
1568 * pattern.
1569 *
1570 * Things get a touch tricky as we have to compare the utf8 flag
1571 * independently from the compile flags.
1572 *
1573 * ALSO NOTE: After this point we may need to zero members of pRExC_state
1574 * explicitly. Prior to this point they should all be zeroed as part of
1575 * a struct wide Zero instruction.
1576 */
1577
1578 if ( old_re
1579 && !recompile
1580 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1581 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1582 && RX_PRELEN(old_re) == plen
1583 && memEQ(RX_PRECOMP(old_re), exp, plen)
1584 && !runtime_code /* with runtime code, always recompile */ )
1585 {
1586 DEBUG_COMPILE_r({
1587 RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len);
1588 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
1589 PL_colors[4], PL_colors[5], s);
1590 });
1591 return old_re;
1592 }
1593
1594 /* Allocate the pattern's SV */
1595 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1596 RExC_rx = ReANY(Rx);
1597 if ( RExC_rx == NULL )
1598 FAIL("Regexp out of space");
1599
1600 rx_flags = orig_rx_flags;
1601 if (rx_flags & RXf_SPLIT)
1602 rx_flags &= ~(RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE);
1603
1604 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
1605 && initial_charset == REGEX_DEPENDS_CHARSET)
1606 {
1607
1608 /* Set to use unicode semantics if the pattern is in utf8 and has the
1609 * 'depends' charset specified, as it means unicode when utf8 */
1610 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1611 RExC_uni_semantics = 1;
1612 }
1613
1614 RExC_pm_flags = pm_flags;
1615
1616 if (runtime_code) {
1617 assert(TAINTING_get || !TAINT_get);
1618 if (TAINT_get)
1619 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1620
1621 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1622 /* whoops, we have a non-utf8 pattern, whilst run-time code
1623 * got compiled as utf8. Try again with a utf8 pattern */
1624 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1625 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1626 goto redo_parse;
1627 }
1628 }
1629 assert(!pRExC_state->runtime_code_qr);
1630
1631 RExC_sawback = 0;
1632
1633 RExC_seen = 0;
1634 RExC_maxlen = 0;
1635 RExC_in_lookaround = 0;
1636 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1637 RExC_recode_x_to_native = 0;
1638 RExC_in_multi_char_class = 0;
1639
1640 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1641 RExC_precomp_end = RExC_end = exp + plen;
1642 RExC_nestroot = 0;
1643 RExC_whilem_seen = 0;
1644 RExC_end_op = NULL;
1645 RExC_recurse = NULL;
1646 RExC_study_chunk_recursed = NULL;
1647 RExC_study_chunk_recursed_bytes= 0;
1648 RExC_recurse_count = 0;
1649 RExC_sets_depth = 0;
1650 pRExC_state->code_index = 0;
1651
1652 /* Initialize the string in the compiled pattern. This is so that there is
1653 * something to output if necessary */
1654 set_regex_pv(pRExC_state, Rx);
1655
1656 DEBUG_PARSE_r({
1657 Perl_re_printf( aTHX_
1658 "Starting parse and generation\n");
1659 RExC_lastnum=0;
1660 RExC_lastparse=NULL;
1661 });
1662
1663 /* Allocate space and zero-initialize. Note, the two step process
1664 of zeroing when in debug mode, thus anything assigned has to
1665 happen after that */
1666 if (! RExC_size) {
1667
1668 /* On the first pass of the parse, we guess how big this will be. Then
1669 * we grow in one operation to that amount and then give it back. As
1670 * we go along, we re-allocate what we need.
1671 *
1672 * XXX Currently the guess is essentially that the pattern will be an
1673 * EXACT node with one byte input, one byte output. This is crude, and
1674 * better heuristics are welcome.
1675 *
1676 * On any subsequent passes, we guess what we actually computed in the
1677 * latest earlier pass. Such a pass probably didn't complete so is
1678 * missing stuff. We could improve those guesses by knowing where the
1679 * parse stopped, and use the length so far plus apply the above
1680 * assumption to what's left. */
1681 RExC_size = STR_SZ(RExC_end - RExC_start);
1682 }
1683
1684 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1685 if ( RExC_rxi == NULL )
1686 FAIL("Regexp out of space");
1687
1688 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1689 RXi_SET( RExC_rx, RExC_rxi );
1690
1691 /* We start from 0 (over from 0 in the case this is a reparse. The first
1692 * node parsed will give back any excess memory we have allocated so far).
1693 * */
1694 RExC_size = 0;
1695
1696 /* non-zero initialization begins here */
1697 RExC_rx->engine= eng;
1698 RExC_rx->extflags = rx_flags;
1699 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1700
1701 if (pm_flags & PMf_IS_QR) {
1702 RExC_rxi->code_blocks = pRExC_state->code_blocks;
1703 if (RExC_rxi->code_blocks) {
1704 RExC_rxi->code_blocks->refcnt++;
1705 }
1706 }
1707
1708 RExC_rx->intflags = 0;
1709
1710 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
1711 RExC_parse_set(exp);
1712
1713 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1714 * code makes sure the final byte is an uncounted NUL. But should this
1715 * ever not be the case, lots of things could read beyond the end of the
1716 * buffer: loops like
1717 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1718 * strchr(RExC_parse, "foo");
1719 * etc. So it is worth noting. */
1720 assert(*RExC_end == '\0');
1721
1722 RExC_naughty = 0;
1723 RExC_npar = 1;
1724 RExC_logical_npar = 1;
1725 RExC_parens_buf_size = 0;
1726 RExC_emit_start = RExC_rxi->program;
1727 pRExC_state->code_index = 0;
1728
1729 *((char*) RExC_emit_start) = (char) REG_MAGIC;
1730 RExC_emit = NODE_STEP_REGNODE;
1731
1732 /* Do the parse */
1733 if (reg(pRExC_state, 0, &flags, 1)) {
1734
1735 /* Success!, But we may need to redo the parse knowing how many parens
1736 * there actually are */
1737 if (IN_PARENS_PASS) {
1738 flags |= RESTART_PARSE;
1739 }
1740
1741 /* We have that number in RExC_npar */
1742 RExC_total_parens = RExC_npar;
1743 RExC_logical_total_parens = RExC_logical_npar;
1744 }
1745 else if (! MUST_RESTART(flags)) {
1746 ReREFCNT_dec(Rx);
1747 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1748 }
1749
1750 /* Here, we either have success, or we have to redo the parse for some reason */
1751 if (MUST_RESTART(flags)) {
1752
1753 /* It's possible to write a regexp in ascii that represents Unicode
1754 codepoints outside of the byte range, such as via \x{100}. If we
1755 detect such a sequence we have to convert the entire pattern to utf8
1756 and then recompile, as our sizing calculation will have been based
1757 on 1 byte == 1 character, but we will need to use utf8 to encode
1758 at least some part of the pattern, and therefore must convert the whole
1759 thing.
1760 -- dmq */
1761 if (flags & NEED_UTF8) {
1762
1763 /* We have stored the offset of the final warning output so far.
1764 * That must be adjusted. Any variant characters between the start
1765 * of the pattern and this warning count for 2 bytes in the final,
1766 * so just add them again */
1767 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1768 RExC_latest_warn_offset +=
1769 variant_under_utf8_count((U8 *) exp, (U8 *) exp
1770 + RExC_latest_warn_offset);
1771 }
1772 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1773 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1774 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1775 }
1776 else {
1777 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1778 }
1779
1780 if (ALL_PARENS_COUNTED) {
1781 /* Make enough room for all the known parens, and zero it */
1782 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1783 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1784 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
1785
1786 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1787 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1788 /* we do NOT reinitialize RExC_logical_to_parno and
1789 * RExC_parno_to_logical here. We need their data on the second
1790 * pass */
1791 }
1792 else { /* Parse did not complete. Reinitialize the parentheses
1793 structures */
1794 RExC_total_parens = 0;
1795 if (RExC_open_parens) {
1796 Safefree(RExC_open_parens);
1797 RExC_open_parens = NULL;
1798 }
1799 if (RExC_close_parens) {
1800 Safefree(RExC_close_parens);
1801 RExC_close_parens = NULL;
1802 }
1803 if (RExC_logical_to_parno) {
1804 Safefree(RExC_logical_to_parno);
1805 RExC_logical_to_parno = NULL;
1806 }
1807 if (RExC_parno_to_logical) {
1808 Safefree(RExC_parno_to_logical);
1809 RExC_parno_to_logical = NULL;
1810 }
1811 }
1812
1813 /* Clean up what we did in this parse */
1814 SvREFCNT_dec_NN(RExC_rx_sv);
1815 RExC_rx_sv = NULL;
1816
1817 goto redo_parse;
1818 }
1819
1820 /* Here, we have successfully parsed and generated the pattern's program
1821 * for the regex engine. We are ready to finish things up and look for
1822 * optimizations. */
1823
1824 /* Update the string to compile, with correct modifiers, etc */
1825 set_regex_pv(pRExC_state, Rx);
1826
1827 RExC_rx->nparens = RExC_total_parens - 1;
1828 RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1829
1830 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1831 if (RExC_whilem_seen > 15)
1832 RExC_whilem_seen = 15;
1833
1834 DEBUG_PARSE_r({
1835 Perl_re_printf( aTHX_
1836 "Required size %" IVdf " nodes\n", (IV)RExC_size);
1837 RExC_lastnum=0;
1838 RExC_lastparse=NULL;
1839 });
1840
1841 SetProgLen(RExC_rxi,RExC_size);
1842
1843 DEBUG_DUMP_PRE_OPTIMIZE_r({
1844 SV * const sv = sv_newmortal(); /* can this use RExC_mysv? */
1845 RXi_GET_DECL(RExC_rx, ri);
1846 DEBUG_RExC_seen();
1847 Perl_re_printf( aTHX_ "Program before optimization:\n");
1848
1849 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1850 sv, 0, 0);
1851 });
1852
1853 DEBUG_OPTIMISE_r(
1854 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
1855 );
1856
1857 /* XXXX To minimize changes to RE engine we always allocate
1858 3-units-long substrs field. */
1859 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1860 if (RExC_recurse_count) {
1861 Newx(RExC_recurse, RExC_recurse_count, regnode *);
1862 SAVEFREEPV(RExC_recurse);
1863 }
1864
1865 if (RExC_seen & REG_RECURSE_SEEN) {
1866 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1867 * So its 1 if there are no parens. */
1868 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1869 ((RExC_total_parens & 0x07) != 0);
1870 Newx(RExC_study_chunk_recursed,
1871 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1872 SAVEFREEPV(RExC_study_chunk_recursed);
1873 }
1874
1875 reStudy:
1876 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1877 DEBUG_r(
1878 RExC_study_chunk_recursed_count= 0;
1879 );
1880 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1881 if (RExC_study_chunk_recursed) {
1882 Zero(RExC_study_chunk_recursed,
1883 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1884 }
1885
1886
1887 #ifdef TRIE_STUDY_OPT
1888 /* search for "restudy" in this file for a detailed explanation */
1889 if (!restudied) {
1890 StructCopy(&zero_scan_data, &data, scan_data_t);
1891 copyRExC_state = *pRExC_state;
1892 } else {
1893 U32 seen=RExC_seen;
1894 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1895
1896 *pRExC_state = copyRExC_state;
1897 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1898 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1899 else
1900 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1901 StructCopy(&zero_scan_data, &data, scan_data_t);
1902 }
1903 #else
1904 StructCopy(&zero_scan_data, &data, scan_data_t);
1905 #endif
1906
1907 /* Dig out information for optimizations. */
1908 RExC_rx->extflags = RExC_flags; /* was pm_op */
1909 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1910
1911 if (UTF)
1912 SvUTF8_on(Rx); /* Unicode in it? */
1913 RExC_rxi->regstclass = NULL;
1914 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
1915 RExC_rx->intflags |= PREGf_NAUGHTY;
1916 scan = RExC_rxi->program + 1; /* First BRANCH. */
1917
1918 /* testing for BRANCH here tells us whether there is "must appear"
1919 data in the pattern. If there is then we can use it for optimisations */
1920 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
1921 */
1922 SSize_t fake_deltap;
1923 STRLEN longest_length[2];
1924 regnode_ssc ch_class; /* pointed to by data */
1925 int stclass_flag;
1926 SSize_t last_close = 0; /* pointed to by data */
1927 regnode *first= scan;
1928 regnode *first_next= regnext(first);
1929 regnode *last_close_op= NULL;
1930 int i;
1931
1932 /*
1933 * Skip introductions and multiplicators >= 1
1934 * so that we can extract the 'meat' of the pattern that must
1935 * match in the large if() sequence following.
1936 * NOTE that EXACT is NOT covered here, as it is normally
1937 * picked up by the optimiser separately.
1938 *
1939 * This is unfortunate as the optimiser isnt handling lookahead
1940 * properly currently.
1941 *
1942 */
1943 while (1)
1944 {
1945 if (OP(first) == OPEN)
1946 sawopen = 1;
1947 else
1948 if (OP(first) == IFMATCH && !FLAGS(first))
1949 /* for now we can't handle lookbehind IFMATCH */
1950 sawlookahead = 1;
1951 else
1952 if (OP(first) == PLUS)
1953 sawplus = 1;
1954 else
1955 if (OP(first) == MINMOD)
1956 sawminmod = 1;
1957 else
1958 if (!(
1959 /* An OR of *one* alternative - should not happen now. */
1960 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1961 /* An {n,m} with n>0 */
1962 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1963 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1964 )){
1965 break;
1966 }
1967
1968 first = REGNODE_AFTER(first);
1969 first_next= regnext(first);
1970 }
1971
1972 /* Starting-point info. */
1973 again:
1974 DEBUG_PEEP("first:", first, 0, 0);
1975 /* Ignore EXACT as we deal with it later. */
1976 if (REGNODE_TYPE(OP(first)) == EXACT) {
1977 if (! isEXACTFish(OP(first))) {
1978 NOOP; /* Empty, get anchored substr later. */
1979 }
1980 else
1981 RExC_rxi->regstclass = first;
1982 }
1983 #ifdef TRIE_STCLASS
1984 else if (REGNODE_TYPE(OP(first)) == TRIE &&
1985 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1986 {
1987 /* this can happen only on restudy
1988 * Search for "restudy" in this file to find
1989 * a comment with details. */
1990 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1991 }
1992 #endif
1993 else if (REGNODE_SIMPLE(OP(first)))
1994 RExC_rxi->regstclass = first;
1995 else if (REGNODE_TYPE(OP(first)) == BOUND ||
1996 REGNODE_TYPE(OP(first)) == NBOUND)
1997 RExC_rxi->regstclass = first;
1998 else if (REGNODE_TYPE(OP(first)) == BOL) {
1999 RExC_rx->intflags |= (OP(first) == MBOL
2000 ? PREGf_ANCH_MBOL
2001 : PREGf_ANCH_SBOL);
2002 first = REGNODE_AFTER(first);
2003 goto again;
2004 }
2005 else if (OP(first) == GPOS) {
2006 RExC_rx->intflags |= PREGf_ANCH_GPOS;
2007 first = REGNODE_AFTER_type(first,tregnode_GPOS);
2008 goto again;
2009 }
2010 else if ((!sawopen || !RExC_sawback) &&
2011 !sawlookahead &&
2012 (OP(first) == STAR &&
2013 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
2014 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
2015 {
2016 /* turn .* into ^.* with an implied $*=1 */
2017 const int type =
2018 (OP(REGNODE_AFTER(first)) == REG_ANY)
2019 ? PREGf_ANCH_MBOL
2020 : PREGf_ANCH_SBOL;
2021 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
2022 first = REGNODE_AFTER(first);
2023 goto again;
2024 }
2025 if (sawplus && !sawminmod && !sawlookahead
2026 && (!sawopen || !RExC_sawback)
2027 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
2028 /* x+ must match at the 1st pos of run of x's */
2029 RExC_rx->intflags |= PREGf_SKIP;
2030
2031 /* Scan is after the zeroth branch, first is atomic matcher. */
2032 #ifdef TRIE_STUDY_OPT
2033 /* search for "restudy" in this file for a detailed explanation */
2034 DEBUG_PARSE_r(
2035 if (!restudied)
2036 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
2037 (IV)(first - scan + 1))
2038 );
2039 #else
2040 DEBUG_PARSE_r(
2041 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
2042 (IV)(first - scan + 1))
2043 );
2044 #endif
2045
2046
2047 /*
2048 * If there's something expensive in the r.e., find the
2049 * longest literal string that must appear and make it the
2050 * regmust. Resolve ties in favor of later strings, since
2051 * the regstart check works with the beginning of the r.e.
2052 * and avoiding duplication strengthens checking. Not a
2053 * strong reason, but sufficient in the absence of others.
2054 * [Now we resolve ties in favor of the earlier string if
2055 * it happens that c_offset_min has been invalidated, since the
2056 * earlier string may buy us something the later one won't.]
2057 */
2058
2059 data.substrs[0].str = newSVpvs("");
2060 data.substrs[1].str = newSVpvs("");
2061 data.last_found = newSVpvs("");
2062 data.cur_is_floating = 0; /* initially any found substring is fixed */
2063 ENTER_with_name("study_chunk");
2064 SAVEFREESV(data.substrs[0].str);
2065 SAVEFREESV(data.substrs[1].str);
2066 SAVEFREESV(data.last_found);
2067 first = scan;
2068 if (!RExC_rxi->regstclass) {
2069 ssc_init(pRExC_state, &ch_class);
2070 data.start_class = &ch_class;
2071 stclass_flag = SCF_DO_STCLASS_AND;
2072 } else /* XXXX Check for BOUND? */
2073 stclass_flag = 0;
2074 data.last_closep = &last_close;
2075 data.last_close_opp = &last_close_op;
2076
2077 DEBUG_RExC_seen();
2078 /*
2079 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
2080 * (NO top level branches)
2081 */
2082 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
2083 scan + RExC_size, /* Up to end */
2084 &data, -1, 0, NULL,
2085 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
2086 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
2087 0, TRUE);
2088 /* search for "restudy" in this file for a detailed explanation
2089 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2090
2091
2092 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
2093
2094
2095 if ( RExC_total_parens == 1 && !data.cur_is_floating
2096 && data.last_start_min == 0 && data.last_end > 0
2097 && !RExC_seen_zerolen
2098 && !(RExC_seen & REG_VERBARG_SEEN)
2099 && !(RExC_seen & REG_GPOS_SEEN)
2100 ){
2101 RExC_rx->extflags |= RXf_CHECK_ALL;
2102 }
2103 scan_commit(pRExC_state, &data,&minlen, 0);
2104
2105
2106 /* XXX this is done in reverse order because that's the way the
2107 * code was before it was parameterised. Don't know whether it
2108 * actually needs doing in reverse order. DAPM */
2109 for (i = 1; i >= 0; i--) {
2110 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2111
2112 if ( !( i
2113 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
2114 && data.substrs[0].min_offset
2115 == data.substrs[1].min_offset
2116 && SvCUR(data.substrs[0].str)
2117 == SvCUR(data.substrs[1].str)
2118 )
2119 && S_setup_longest (aTHX_ pRExC_state,
2120 &(RExC_rx->substrs->data[i]),
2121 &(data.substrs[i]),
2122 longest_length[i]))
2123 {
2124 RExC_rx->substrs->data[i].min_offset =
2125 data.substrs[i].min_offset - data.substrs[i].lookbehind;
2126
2127 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2128 /* Don't offset infinity */
2129 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2130 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2131 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2132 }
2133 else {
2134 RExC_rx->substrs->data[i].substr = NULL;
2135 RExC_rx->substrs->data[i].utf8_substr = NULL;
2136 longest_length[i] = 0;
2137 }
2138 }
2139
2140 LEAVE_with_name("study_chunk");
2141
2142 if (RExC_rxi->regstclass
2143 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2144 RExC_rxi->regstclass = NULL;
2145
2146 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2147 || RExC_rx->substrs->data[0].min_offset)
2148 && stclass_flag
2149 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2150 && is_ssc_worth_it(pRExC_state, data.start_class))
2151 {
2152 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2153
2154 ssc_finalize(pRExC_state, data.start_class);
2155
2156 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2157 StructCopy(data.start_class,
2158 (regnode_ssc*)RExC_rxi->data->data[n],
2159 regnode_ssc);
2160 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2161 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2162 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2163 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2164 Perl_re_printf( aTHX_
2165 "synthetic stclass \"%s\".\n",
2166 SvPVX_const(sv));});
2167 data.start_class = NULL;
2168 }
2169
2170 /* A temporary algorithm prefers floated substr to fixed one of
2171 * same length to dig more info. */
2172 i = (longest_length[0] <= longest_length[1]);
2173 RExC_rx->substrs->check_ix = i;
2174 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
2175 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
2176 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
2177 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2178 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2179 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2180 RExC_rx->intflags |= PREGf_NOSCAN;
2181
2182 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2183 RExC_rx->extflags |= RXf_USE_INTUIT;
2184 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2185 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2186 }
2187
2188 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2189 if ( (STRLEN)minlen < longest_length[1] )
2190 minlen= longest_length[1];
2191 if ( (STRLEN)minlen < longest_length[0] )
2192 minlen= longest_length[0];
2193 */
2194 }
2195 else {
2196 /* Several toplevels. Best we can is to set minlen. */
2197 SSize_t fake_deltap;
2198 regnode_ssc ch_class;
2199 SSize_t last_close = 0;
2200 regnode *last_close_op = NULL;
2201
2202 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
2203
2204 scan = RExC_rxi->program + 1;
2205 ssc_init(pRExC_state, &ch_class);
2206 data.start_class = &ch_class;
2207 data.last_closep = &last_close;
2208 data.last_close_opp = &last_close_op;
2209
2210 DEBUG_RExC_seen();
2211 /*
2212 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2213 * (patterns WITH top level branches)
2214 */
2215 minlen = study_chunk(pRExC_state,
2216 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2217 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2218 ? SCF_TRIE_DOING_RESTUDY
2219 : 0),
2220 0, TRUE);
2221 /* search for "restudy" in this file for a detailed explanation
2222 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2223
2224 CHECK_RESTUDY_GOTO_butfirst(NOOP);
2225
2226 RExC_rx->check_substr = NULL;
2227 RExC_rx->check_utf8 = NULL;
2228 RExC_rx->substrs->data[0].substr = NULL;
2229 RExC_rx->substrs->data[0].utf8_substr = NULL;
2230 RExC_rx->substrs->data[1].substr = NULL;
2231 RExC_rx->substrs->data[1].utf8_substr = NULL;
2232
2233 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2234 && is_ssc_worth_it(pRExC_state, data.start_class))
2235 {
2236 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2237
2238 ssc_finalize(pRExC_state, data.start_class);
2239
2240 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2241 StructCopy(data.start_class,
2242 (regnode_ssc*)RExC_rxi->data->data[n],
2243 regnode_ssc);
2244 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2245 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2246 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2247 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2248 Perl_re_printf( aTHX_
2249 "synthetic stclass \"%s\".\n",
2250 SvPVX_const(sv));});
2251 data.start_class = NULL;
2252 }
2253 }
2254
2255 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2256 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2257 RExC_rx->maxlen = REG_INFTY;
2258 }
2259 else {
2260 RExC_rx->maxlen = RExC_maxlen;
2261 }
2262
2263 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2264 the "real" pattern. */
2265 DEBUG_OPTIMISE_r({
2266 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2267 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2268 });
2269 RExC_rx->minlenret = minlen;
2270 if (RExC_rx->minlen < minlen)
2271 RExC_rx->minlen = minlen;
2272
2273 if (RExC_seen & REG_RECURSE_SEEN ) {
2274 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2275 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2276 }
2277 if (RExC_seen & REG_GPOS_SEEN)
2278 RExC_rx->intflags |= PREGf_GPOS_SEEN;
2279
2280 if (RExC_seen & REG_PESSIMIZE_SEEN)
2281 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2282
2283 if (RExC_seen & REG_LOOKBEHIND_SEEN)
2284 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2285 lookbehind */
2286 if (pRExC_state->code_blocks)
2287 RExC_rx->extflags |= RXf_EVAL_SEEN;
2288
2289 if (RExC_seen & REG_VERBARG_SEEN) {
2290 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2291 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2292 }
2293
2294 if (RExC_seen & REG_CUTGROUP_SEEN)
2295 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2296
2297 if (pm_flags & PMf_USE_RE_EVAL)
2298 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2299
2300 if (RExC_paren_names)
2301 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2302 else
2303 RXp_PAREN_NAMES(RExC_rx) = NULL;
2304
2305 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2306 * so it can be used in pp.c */
2307 if (RExC_rx->intflags & PREGf_ANCH)
2308 RExC_rx->extflags |= RXf_IS_ANCHORED;
2309
2310
2311 {
2312 /* this is used to identify "special" patterns that might result
2313 * in Perl NOT calling the regex engine and instead doing the match "itself",
2314 * particularly special cases in split//. By having the regex compiler
2315 * do this pattern matching at a regop level (instead of by inspecting the pattern)
2316 * we avoid weird issues with equivalent patterns resulting in different behavior,
2317 * AND we allow non Perl engines to get the same optimizations by the setting the
2318 * flags appropriately - Yves */
2319 regnode *first = RExC_rxi->program + 1;
2320 U8 fop = OP(first);
2321 regnode *next = NULL;
2322 U8 nop = 0;
2323 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2324 next = REGNODE_AFTER(first);
2325 nop = OP(next);
2326 }
2327 /* It's safe to read through *next only if OP(first) is a regop of
2328 * the right type (not EXACT, for example).
2329 */
2330 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2331 RExC_rx->extflags |= RXf_NULL;
2332 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2333 /* when fop is SBOL first->flags will be true only when it was
2334 * produced by parsing /\A/, and not when parsing /^/. This is
2335 * very important for the split code as there we want to
2336 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2337 * See rt #122761 for more details. -- Yves */
2338 RExC_rx->extflags |= RXf_START_ONLY;
2339 else if (fop == PLUS
2340 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2341 && OP(regnext(first)) == END)
2342 RExC_rx->extflags |= RXf_WHITE;
2343 else if ( RExC_rx->extflags & RXf_SPLIT
2344 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2345 && STR_LEN(first) == 1
2346 && *(STRING(first)) == ' '
2347 && OP(regnext(first)) == END )
2348 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2349
2350 }
2351
2352 if (RExC_contains_locale) {
2353 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2354 }
2355
2356 #ifdef DEBUGGING
2357 if (RExC_paren_names) {
2358 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2359 RExC_rxi->data->data[RExC_rxi->name_list_idx]
2360 = (void*)SvREFCNT_inc(RExC_paren_name_list);
2361 } else
2362 #endif
2363 RExC_rxi->name_list_idx = 0;
2364
2365 while ( RExC_recurse_count > 0 ) {
2366 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2367 /*
2368 * This data structure is set up in study_chunk() and is used
2369 * to calculate the distance between a GOSUB regopcode and
2370 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2371 * it refers to.
2372 *
2373 * If for some reason someone writes code that optimises
2374 * away a GOSUB opcode then the assert should be changed to
2375 * an if(scan) to guard the ARG2i_SET() - Yves
2376 *
2377 */
2378 assert(scan && OP(scan) == GOSUB);
2379 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2380 }
2381 if (RExC_logical_total_parens != RExC_total_parens) {
2382 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2383 /* we rebuild this below */
2384 Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2385 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2386 int logical_parno= RExC_parno_to_logical[parno];
2387 assert(logical_parno);
2388 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2389 RExC_logical_to_parno[logical_parno] = parno;
2390 }
2391 RExC_rx->logical_to_parno = RExC_logical_to_parno;
2392 RExC_rx->parno_to_logical = RExC_parno_to_logical;
2393 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2394 RExC_logical_to_parno = NULL;
2395 RExC_parno_to_logical = NULL;
2396 RExC_parno_to_logical_next = NULL;
2397 } else {
2398 RExC_rx->logical_to_parno = NULL;
2399 RExC_rx->parno_to_logical = NULL;
2400 RExC_rx->parno_to_logical_next = NULL;
2401 }
2402
2403 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2404 /* assume we don't need to swap parens around before we match */
2405 DEBUG_TEST_r({
2406 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2407 (unsigned long)RExC_study_chunk_recursed_count);
2408 });
2409 DEBUG_DUMP_r({
2410 DEBUG_RExC_seen();
2411 Perl_re_printf( aTHX_ "Final program:\n");
2412 regdump(RExC_rx);
2413 });
2414
2415 /* we're returning ownership of the SV to the caller, ensure the cleanup
2416 * doesn't release it
2417 */
2418 RExC_rx_sv = NULL;
2419
2420 #ifdef USE_ITHREADS
2421 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2422 * by setting the regexp SV to readonly-only instead. If the
2423 * pattern's been recompiled, the USEDness should remain. */
2424 if (old_re && SvREADONLY(old_re))
2425 SvREADONLY_on(Rx);
2426 #endif
2427 return Rx;
2428 }
2429
2430
2431
2432 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)2433 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2434 {
2435 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2436 PERL_UNUSED_ARG(rx);
2437 if (0)
2438 return NULL;
2439 else
2440 return newSVpvs("Regexp");
2441 }
2442
2443 /* Scans the name of a named buffer from the pattern.
2444 * If flags is REG_RSN_RETURN_NULL returns null.
2445 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2446 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2447 * to the parsed name as looked up in the RExC_paren_names hash.
2448 * If there is an error throws a vFAIL().. type exception.
2449 */
2450
2451 #define REG_RSN_RETURN_NULL 0
2452 #define REG_RSN_RETURN_NAME 1
2453 #define REG_RSN_RETURN_DATA 2
2454
2455 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)2456 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2457 {
2458 char *name_start = RExC_parse;
2459 SV* sv_name;
2460
2461 PERL_ARGS_ASSERT_REG_SCAN_NAME;
2462
2463 assert (RExC_parse <= RExC_end);
2464 if (RExC_parse == RExC_end) NOOP;
2465 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2466 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
2467 * using do...while */
2468 if (UTF)
2469 do {
2470 RExC_parse_inc_utf8();
2471 } while ( RExC_parse < RExC_end
2472 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2473 else
2474 do {
2475 RExC_parse_inc_by(1);
2476 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2477 } else {
2478 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2479 character */
2480 vFAIL("Group name must start with a non-digit word character");
2481 }
2482 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2483 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2484 if ( flags == REG_RSN_RETURN_NAME)
2485 return sv_name;
2486 else if (flags==REG_RSN_RETURN_DATA) {
2487 HE *he_str = NULL;
2488 SV *sv_dat = NULL;
2489 if ( ! sv_name ) /* should not happen*/
2490 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2491 if (RExC_paren_names)
2492 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2493 if ( he_str )
2494 sv_dat = HeVAL(he_str);
2495 if ( ! sv_dat ) { /* Didn't find group */
2496
2497 /* It might be a forward reference; we can't fail until we
2498 * know, by completing the parse to get all the groups, and
2499 * then reparsing */
2500 if (ALL_PARENS_COUNTED) {
2501 vFAIL("Reference to nonexistent named group");
2502 }
2503 else {
2504 REQUIRE_PARENS_PASS;
2505 }
2506 }
2507 return sv_dat;
2508 }
2509
2510 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2511 (unsigned long) flags);
2512 }
2513
2514 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
2515 if (RExC_lastparse!=RExC_parse) { \
2516 Perl_re_printf( aTHX_ "%s", \
2517 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
2518 RExC_end - RExC_parse, 16, \
2519 "", "", \
2520 PERL_PV_ESCAPE_UNI_DETECT | \
2521 PERL_PV_PRETTY_ELLIPSES | \
2522 PERL_PV_PRETTY_LTGT | \
2523 PERL_PV_ESCAPE_RE | \
2524 PERL_PV_PRETTY_EXACTSIZE \
2525 ) \
2526 ); \
2527 } else \
2528 Perl_re_printf( aTHX_ "%16s",""); \
2529 \
2530 if (RExC_lastnum!=RExC_emit) \
2531 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
2532 else \
2533 Perl_re_printf( aTHX_ "|%4s",""); \
2534 Perl_re_printf( aTHX_ "|%*s%-4s", \
2535 (int)((depth*2)), "", \
2536 (funcname) \
2537 ); \
2538 RExC_lastnum=RExC_emit; \
2539 RExC_lastparse=RExC_parse; \
2540 })
2541
2542
2543
2544 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
2545 DEBUG_PARSE_MSG((funcname)); \
2546 Perl_re_printf( aTHX_ "%4s","\n"); \
2547 })
2548 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
2549 DEBUG_PARSE_MSG((funcname)); \
2550 Perl_re_printf( aTHX_ fmt "\n",args); \
2551 })
2552
2553
2554 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)2555 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2556 {
2557 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2558 * constructs, and updates RExC_flags with them. On input, RExC_parse
2559 * should point to the first flag; it is updated on output to point to the
2560 * final ')' or ':'. There needs to be at least one flag, or this will
2561 * abort */
2562
2563 /* for (?g), (?gc), and (?o) warnings; warning
2564 about (?c) will warn about (?g) -- japhy */
2565
2566 #define WASTED_O 0x01
2567 #define WASTED_G 0x02
2568 #define WASTED_C 0x04
2569 #define WASTED_GC (WASTED_G|WASTED_C)
2570 I32 wastedflags = 0x00;
2571 U32 posflags = 0, negflags = 0;
2572 U32 *flagsp = &posflags;
2573 char has_charset_modifier = '\0';
2574 regex_charset cs;
2575 bool has_use_defaults = FALSE;
2576 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2577 int x_mod_count = 0;
2578
2579 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2580
2581 /* '^' as an initial flag sets certain defaults */
2582 if (UCHARAT(RExC_parse) == '^') {
2583 RExC_parse_inc_by(1);
2584 has_use_defaults = TRUE;
2585 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2586 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2587 ? REGEX_UNICODE_CHARSET
2588 : REGEX_DEPENDS_CHARSET;
2589 set_regex_charset(&RExC_flags, cs);
2590 }
2591 else {
2592 cs = get_regex_charset(RExC_flags);
2593 if ( cs == REGEX_DEPENDS_CHARSET
2594 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2595 {
2596 cs = REGEX_UNICODE_CHARSET;
2597 }
2598 }
2599
2600 while (RExC_parse < RExC_end) {
2601 /* && memCHRs("iogcmsx", *RExC_parse) */
2602 /* (?g), (?gc) and (?o) are useless here
2603 and must be globally applied -- japhy */
2604 if ((RExC_pm_flags & PMf_WILDCARD)) {
2605 if (flagsp == & negflags) {
2606 if (*RExC_parse == 'm') {
2607 RExC_parse_inc_by(1);
2608 /* diag_listed_as: Use of %s is not allowed in Unicode
2609 property wildcard subpatterns in regex; marked by <--
2610 HERE in m/%s/ */
2611 vFAIL("Use of modifier '-m' is not allowed in Unicode"
2612 " property wildcard subpatterns");
2613 }
2614 }
2615 else {
2616 if (*RExC_parse == 's') {
2617 goto modifier_illegal_in_wildcard;
2618 }
2619 }
2620 }
2621
2622 switch (*RExC_parse) {
2623
2624 /* Code for the imsxn flags */
2625 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2626
2627 case LOCALE_PAT_MOD:
2628 if (has_charset_modifier) {
2629 goto excess_modifier;
2630 }
2631 else if (flagsp == &negflags) {
2632 goto neg_modifier;
2633 }
2634 cs = REGEX_LOCALE_CHARSET;
2635 has_charset_modifier = LOCALE_PAT_MOD;
2636 break;
2637 case UNICODE_PAT_MOD:
2638 if (has_charset_modifier) {
2639 goto excess_modifier;
2640 }
2641 else if (flagsp == &negflags) {
2642 goto neg_modifier;
2643 }
2644 cs = REGEX_UNICODE_CHARSET;
2645 has_charset_modifier = UNICODE_PAT_MOD;
2646 break;
2647 case ASCII_RESTRICT_PAT_MOD:
2648 if (flagsp == &negflags) {
2649 goto neg_modifier;
2650 }
2651 if (has_charset_modifier) {
2652 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2653 goto excess_modifier;
2654 }
2655 /* Doubled modifier implies more restricted */
2656 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2657 }
2658 else {
2659 cs = REGEX_ASCII_RESTRICTED_CHARSET;
2660 }
2661 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2662 break;
2663 case DEPENDS_PAT_MOD:
2664 if (has_use_defaults) {
2665 goto fail_modifiers;
2666 }
2667 else if (flagsp == &negflags) {
2668 goto neg_modifier;
2669 }
2670 else if (has_charset_modifier) {
2671 goto excess_modifier;
2672 }
2673
2674 /* The dual charset means unicode semantics if the
2675 * pattern (or target, not known until runtime) are
2676 * utf8, or something in the pattern indicates unicode
2677 * semantics */
2678 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2679 ? REGEX_UNICODE_CHARSET
2680 : REGEX_DEPENDS_CHARSET;
2681 has_charset_modifier = DEPENDS_PAT_MOD;
2682 break;
2683 excess_modifier:
2684 RExC_parse_inc_by(1);
2685 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2686 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2687 }
2688 else if (has_charset_modifier == *(RExC_parse - 1)) {
2689 vFAIL2("Regexp modifier \"%c\" may not appear twice",
2690 *(RExC_parse - 1));
2691 }
2692 else {
2693 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2694 }
2695 NOT_REACHED; /*NOTREACHED*/
2696 neg_modifier:
2697 RExC_parse_inc_by(1);
2698 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2699 *(RExC_parse - 1));
2700 NOT_REACHED; /*NOTREACHED*/
2701 case GLOBAL_PAT_MOD: /* 'g' */
2702 if (RExC_pm_flags & PMf_WILDCARD) {
2703 goto modifier_illegal_in_wildcard;
2704 }
2705 /*FALLTHROUGH*/
2706 case ONCE_PAT_MOD: /* 'o' */
2707 if (ckWARN(WARN_REGEXP)) {
2708 const I32 wflagbit = *RExC_parse == 'o'
2709 ? WASTED_O
2710 : WASTED_G;
2711 if (! (wastedflags & wflagbit) ) {
2712 wastedflags |= wflagbit;
2713 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2714 vWARN5(
2715 RExC_parse + 1,
2716 "Useless (%s%c) - %suse /%c modifier",
2717 flagsp == &negflags ? "?-" : "?",
2718 *RExC_parse,
2719 flagsp == &negflags ? "don't " : "",
2720 *RExC_parse
2721 );
2722 }
2723 }
2724 break;
2725
2726 case CONTINUE_PAT_MOD: /* 'c' */
2727 if (RExC_pm_flags & PMf_WILDCARD) {
2728 goto modifier_illegal_in_wildcard;
2729 }
2730 if (ckWARN(WARN_REGEXP)) {
2731 if (! (wastedflags & WASTED_C) ) {
2732 wastedflags |= WASTED_GC;
2733 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2734 vWARN3(
2735 RExC_parse + 1,
2736 "Useless (%sc) - %suse /gc modifier",
2737 flagsp == &negflags ? "?-" : "?",
2738 flagsp == &negflags ? "don't " : ""
2739 );
2740 }
2741 }
2742 break;
2743 case KEEPCOPY_PAT_MOD: /* 'p' */
2744 if (RExC_pm_flags & PMf_WILDCARD) {
2745 goto modifier_illegal_in_wildcard;
2746 }
2747 if (flagsp == &negflags) {
2748 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2749 } else {
2750 *flagsp |= RXf_PMf_KEEPCOPY;
2751 }
2752 break;
2753 case '-':
2754 /* A flag is a default iff it is following a minus, so
2755 * if there is a minus, it means will be trying to
2756 * re-specify a default which is an error */
2757 if (has_use_defaults || flagsp == &negflags) {
2758 goto fail_modifiers;
2759 }
2760 flagsp = &negflags;
2761 wastedflags = 0; /* reset so (?g-c) warns twice */
2762 x_mod_count = 0;
2763 break;
2764 case ':':
2765 case ')':
2766
2767 if ( (RExC_pm_flags & PMf_WILDCARD)
2768 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2769 {
2770 RExC_parse_inc_by(1);
2771 /* diag_listed_as: Use of %s is not allowed in Unicode
2772 property wildcard subpatterns in regex; marked by <--
2773 HERE in m/%s/ */
2774 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2775 " property wildcard subpatterns",
2776 has_charset_modifier);
2777 }
2778
2779 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2780 negflags |= RXf_PMf_EXTENDED_MORE;
2781 }
2782 RExC_flags |= posflags;
2783
2784 if (negflags & RXf_PMf_EXTENDED) {
2785 negflags |= RXf_PMf_EXTENDED_MORE;
2786 }
2787 RExC_flags &= ~negflags;
2788 set_regex_charset(&RExC_flags, cs);
2789
2790 return;
2791 default:
2792 fail_modifiers:
2793 RExC_parse_inc_if_char();
2794 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2795 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2796 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2797 NOT_REACHED; /*NOTREACHED*/
2798 }
2799
2800 RExC_parse_inc();
2801 }
2802
2803 vFAIL("Sequence (?... not terminated");
2804
2805 modifier_illegal_in_wildcard:
2806 RExC_parse_inc_by(1);
2807 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2808 subpatterns in regex; marked by <-- HERE in m/%s/ */
2809 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2810 " subpatterns", *(RExC_parse - 1));
2811 }
2812
2813 /*
2814 - reg - regular expression, i.e. main body or parenthesized thing
2815 *
2816 * Caller must absorb opening parenthesis.
2817 *
2818 * Combining parenthesis handling with the base level of regular expression
2819 * is a trifle forced, but the need to tie the tails of the branches to what
2820 * follows makes it hard to avoid.
2821 */
2822
2823 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * backref_parse_start,char ch)2824 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2825 I32 *flagp,
2826 char * backref_parse_start,
2827 char ch
2828 )
2829 {
2830 regnode_offset ret;
2831 char* name_start = RExC_parse;
2832 U32 num = 0;
2833 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2834 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2835
2836 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2837
2838 if (RExC_parse != name_start && ch == '}') {
2839 while (isBLANK(*RExC_parse)) {
2840 RExC_parse_inc_by(1);
2841 }
2842 }
2843 if (RExC_parse == name_start || *RExC_parse != ch) {
2844 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2845 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2846 }
2847
2848 if (sv_dat) {
2849 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2850 RExC_rxi->data->data[num]=(void*)sv_dat;
2851 SvREFCNT_inc_simple_void_NN(sv_dat);
2852 }
2853 RExC_sawback = 1;
2854 ret = reg2node(pRExC_state,
2855 ((! FOLD)
2856 ? REFN
2857 : (ASCII_FOLD_RESTRICTED)
2858 ? REFFAN
2859 : (AT_LEAST_UNI_SEMANTICS)
2860 ? REFFUN
2861 : (LOC)
2862 ? REFFLN
2863 : REFFN),
2864 num, RExC_nestroot);
2865 if (RExC_nestroot && num >= (U32)RExC_nestroot)
2866 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2867 *flagp |= HASWIDTH;
2868
2869 nextchar(pRExC_state);
2870 return ret;
2871 }
2872
2873 /* reg_la_NOTHING()
2874 *
2875 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2876 * NOTHING regop when the construct is empty.
2877 *
2878 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2879 *
2880 * Checks for unterminated constructs and throws a "not terminated" error
2881 * with the appropriate type if necessary
2882 *
2883 * Assuming it does not throw an exception increments RExC_seen_zerolen.
2884 *
2885 * If the construct is empty generates a NOTHING op and returns its
2886 * regnode_offset, which the caller would then return to its caller.
2887 *
2888 * If the construct is not empty increments RExC_in_lookaround, and turns
2889 * on any flags provided in RExC_seen, and then returns 0 to signify
2890 * that parsing should continue.
2891 *
2892 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2893 * any use of it would have had to be broken onto multiple lines, hence
2894 * the abbreviation.
2895 */
2896 STATIC regnode_offset
S_reg_la_NOTHING(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2897 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2898 const char *type)
2899 {
2900
2901 PERL_ARGS_ASSERT_REG_LA_NOTHING;
2902
2903 /* false below so we do not force /x */
2904 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2905
2906 if (RExC_parse >= RExC_end)
2907 vFAIL2("Sequence (%s... not terminated", type);
2908
2909 /* Always increment as NOTHING regops are zerolen */
2910 RExC_seen_zerolen++;
2911
2912 if (*RExC_parse == ')') {
2913 regnode_offset ret= reg_node(pRExC_state, NOTHING);
2914 nextchar(pRExC_state);
2915 return ret;
2916 }
2917
2918 RExC_seen |= flags;
2919 RExC_in_lookaround++;
2920 return 0; /* keep parsing! */
2921 }
2922
2923 /* reg_la_OPFAIL()
2924 *
2925 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2926 * OPFAIL regop when the construct is empty.
2927 *
2928 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2929 *
2930 * Checks for unterminated constructs and throws a "not terminated" error
2931 * if necessary.
2932 *
2933 * If the construct is empty generates an OPFAIL op and returns its
2934 * regnode_offset which the caller should then return to its caller.
2935 *
2936 * If the construct is not empty increments RExC_in_lookaround, and also
2937 * increments RExC_seen_zerolen, and turns on the flags provided in
2938 * RExC_seen, and then returns 0 to signify that parsing should continue.
2939 *
2940 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2941 * any use of it would have had to be broken onto multiple lines, hence
2942 * the abbreviation.
2943 */
2944
2945 STATIC regnode_offset
S_reg_la_OPFAIL(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2946 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2947 const char *type)
2948 {
2949
2950 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2951
2952 /* FALSE so we don't force to /x below */;
2953 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2954
2955 if (RExC_parse >= RExC_end)
2956 vFAIL2("Sequence (%s... not terminated", type);
2957
2958 if (*RExC_parse == ')') {
2959 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2960 nextchar(pRExC_state);
2961 return ret; /* return produced regop */
2962 }
2963
2964 /* only increment zerolen *after* we check if we produce an OPFAIL
2965 * as an OPFAIL does not match a zero length construct, as it
2966 * does not match ever. */
2967 RExC_seen_zerolen++;
2968 RExC_seen |= flags;
2969 RExC_in_lookaround++;
2970 return 0; /* keep parsing! */
2971 }
2972
2973 /* Below are the main parsing routines.
2974 *
2975 * S_reg() parses a whole pattern or subpattern. It itself handles things
2976 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2977 * alternation '|' in the '...' pattern.
2978 * S_regbranch() effectively implements the concatenation operator, handling
2979 * one alternative of '|', repeatedly calling S_regpiece on each
2980 * segment of the input.
2981 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2982 * and then adds any quantifier for that chunk.
2983 * S_regatom() parses the next chunk of the input, returning when it
2984 * determines it has found a complete atomic chunk. The chunk may
2985 * be a nested subpattern, in which case S_reg is called
2986 * recursively
2987 *
2988 * The functions generate regnodes as they go along, appending each to the
2989 * pattern data structure so far. They return the offset of the current final
2990 * node into that structure, or 0 on failure.
2991 *
2992 * There are three parameters common to all of them:
2993 * pRExC_state is a structure with much information about the current
2994 * state of the parse. It's easy to add new elements to
2995 * convey new information, but beware that an error return may
2996 * require clearing the element.
2997 * flagp is a pointer to bit flags set in a lower level to pass up
2998 * to higher levels information, such as the cause of a
2999 * failure, or some characteristic about the generated node
3000 * depth is roughly the recursion depth, mostly unused except for
3001 * pretty printing debugging info.
3002 *
3003 * There are ancillary functions that these may farm work out to, using the
3004 * same parameters.
3005 *
3006 * The protocol for handling flags is that each function will, before
3007 * returning, add into *flagp the flags it needs to pass up. Each function has
3008 * a second flags variable, typically named 'flags', which it sets and clears
3009 * at will. Flag bits in it are used in that function, and it calls the next
3010 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
3011 * 'flags' will contain whatever it had before the call, plus whatever that
3012 * function passed up. If it wants to pass any of these up to its caller, it
3013 * has to add them to its *flagp. This means that it takes extra steps to keep
3014 * passing a flag upwards, and otherwise the flag bit is cleared for higher
3015 * functions.
3016 */
3017
3018 /* On success, returns the offset at which any next node should be placed into
3019 * the regex engine program being compiled.
3020 *
3021 * Returns 0 otherwise, with *flagp set to indicate why:
3022 * TRYAGAIN at the end of (?) that only sets flags.
3023 * RESTART_PARSE if the parse needs to be restarted, or'd with
3024 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
3025 * Otherwise would only return 0 if regbranch() returns 0, which cannot
3026 * happen. */
3027 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)3028 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
3029 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
3030 * 2 is like 1, but indicates that nextchar() has been called to advance
3031 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
3032 * this flag alerts us to the need to check for that */
3033 {
3034 regnode_offset ret = 0; /* Will be the head of the group. */
3035 regnode_offset br;
3036 regnode_offset lastbr;
3037 regnode_offset ender = 0;
3038 I32 logical_parno = 0;
3039 I32 parno = 0;
3040 I32 flags;
3041 U32 oregflags = RExC_flags;
3042 bool have_branch = 0;
3043 bool is_open = 0;
3044 I32 freeze_paren = 0;
3045 I32 after_freeze = 0;
3046 I32 num; /* numeric backreferences */
3047 SV * max_open; /* Max number of unclosed parens */
3048 I32 was_in_lookaround = RExC_in_lookaround;
3049 I32 fake_eval = 0; /* matches paren */
3050
3051 /* The difference between the following variables can be seen with *
3052 * the broken pattern /(?:foo/ where segment_parse_start will point *
3053 * at the 'f', and reg_parse_start will point at the '(' */
3054
3055 /* the following is used for unmatched '(' errors */
3056 char * const reg_parse_start = RExC_parse;
3057
3058 /* the following is used to track where various segments of
3059 * the pattern that we parse out started. */
3060 char * segment_parse_start = RExC_parse;
3061
3062 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3063
3064 PERL_ARGS_ASSERT_REG;
3065 DEBUG_PARSE("reg ");
3066
3067 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
3068 assert(max_open);
3069 if (!SvIOK(max_open)) {
3070 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
3071 }
3072 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
3073 open paren */
3074 vFAIL("Too many nested open parens");
3075 }
3076
3077 *flagp = 0; /* Initialize. */
3078
3079 /* Having this true makes it feasible to have a lot fewer tests for the
3080 * parse pointer being in scope. For example, we can write
3081 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
3082 * instead of
3083 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
3084 */
3085 assert(*RExC_end == '\0');
3086
3087 /* Make an OPEN node, if parenthesized. */
3088 if (paren) {
3089
3090 /* Under /x, space and comments can be gobbled up between the '(' and
3091 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
3092 * intervening space, as the sequence is a token, and a token should be
3093 * indivisible */
3094 bool has_intervening_patws = (paren == 2)
3095 && *(RExC_parse - 1) != '(';
3096
3097 if (RExC_parse >= RExC_end) {
3098 vFAIL("Unmatched (");
3099 }
3100
3101 if (paren == 'r') { /* Atomic script run */
3102 paren = '>';
3103 goto parse_rest;
3104 }
3105 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3106 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3107 fake_eval = '{';
3108 goto handle_qmark;
3109 }
3110
3111 char *start_verb = RExC_parse + 1;
3112 STRLEN verb_len;
3113 char *start_arg = NULL;
3114 unsigned char op = 0;
3115 int arg_required = 0;
3116 int internal_argval = -1; /* if > -1 no argument allowed */
3117 bool has_upper = FALSE;
3118 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3119
3120 if (has_intervening_patws) {
3121 RExC_parse_inc_by(1); /* past the '*' */
3122
3123 /* For strict backwards compatibility, don't change the message
3124 * now that we also have lowercase operands */
3125 if (isUPPER(*RExC_parse)) {
3126 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3127 }
3128 else {
3129 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3130 }
3131 }
3132 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3133 if ( *RExC_parse == ':' ) {
3134 start_arg = RExC_parse + 1;
3135 break;
3136 }
3137 else if (! UTF) {
3138 if (isUPPER(*RExC_parse)) {
3139 has_upper = TRUE;
3140 }
3141 RExC_parse_inc_by(1);
3142 }
3143 else {
3144 RExC_parse_inc_utf8();
3145 }
3146 }
3147 verb_len = RExC_parse - start_verb;
3148 if ( start_arg ) {
3149 if (RExC_parse >= RExC_end) {
3150 goto unterminated_verb_pattern;
3151 }
3152
3153 RExC_parse_inc();
3154 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3155 RExC_parse_inc();
3156 }
3157 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3158 unterminated_verb_pattern:
3159 if (has_upper) {
3160 vFAIL("Unterminated verb pattern argument");
3161 }
3162 else {
3163 vFAIL("Unterminated '(*...' argument");
3164 }
3165 }
3166 } else {
3167 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3168 if (has_upper) {
3169 vFAIL("Unterminated verb pattern");
3170 }
3171 else {
3172 vFAIL("Unterminated '(*...' construct");
3173 }
3174 }
3175 }
3176
3177 /* Here, we know that RExC_parse < RExC_end */
3178
3179 switch ( *start_verb ) {
3180 case 'A': /* (*ACCEPT) */
3181 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3182 op = ACCEPT;
3183 internal_argval = RExC_nestroot;
3184 }
3185 break;
3186 case 'C': /* (*COMMIT) */
3187 if ( memEQs(start_verb, verb_len,"COMMIT") )
3188 op = COMMIT;
3189 break;
3190 case 'F': /* (*FAIL) */
3191 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3192 op = OPFAIL;
3193 }
3194 break;
3195 case ':': /* (*:NAME) */
3196 case 'M': /* (*MARK:NAME) */
3197 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3198 op = MARKPOINT;
3199 arg_required = 1;
3200 }
3201 break;
3202 case 'P': /* (*PRUNE) */
3203 if ( memEQs(start_verb, verb_len,"PRUNE") )
3204 op = PRUNE;
3205 break;
3206 case 'S': /* (*SKIP) */
3207 if ( memEQs(start_verb, verb_len,"SKIP") )
3208 op = SKIP;
3209 break;
3210 case 'T': /* (*THEN) */
3211 /* [19:06] <TimToady> :: is then */
3212 if ( memEQs(start_verb, verb_len,"THEN") ) {
3213 op = CUTGROUP;
3214 RExC_seen |= REG_CUTGROUP_SEEN;
3215 }
3216 break;
3217 case 'a':
3218 if ( memEQs(start_verb, verb_len, "asr")
3219 || memEQs(start_verb, verb_len, "atomic_script_run"))
3220 {
3221 paren = 'r'; /* Mnemonic: recursed run */
3222 goto script_run;
3223 }
3224 else if (memEQs(start_verb, verb_len, "atomic")) {
3225 paren = 't'; /* AtOMIC */
3226 goto alpha_assertions;
3227 }
3228 break;
3229 case 'p':
3230 if ( memEQs(start_verb, verb_len, "plb")
3231 || memEQs(start_verb, verb_len, "positive_lookbehind"))
3232 {
3233 paren = 'b';
3234 goto lookbehind_alpha_assertions;
3235 }
3236 else if ( memEQs(start_verb, verb_len, "pla")
3237 || memEQs(start_verb, verb_len, "positive_lookahead"))
3238 {
3239 paren = 'a';
3240 goto alpha_assertions;
3241 }
3242 break;
3243 case 'n':
3244 if ( memEQs(start_verb, verb_len, "nlb")
3245 || memEQs(start_verb, verb_len, "negative_lookbehind"))
3246 {
3247 paren = 'B';
3248 goto lookbehind_alpha_assertions;
3249 }
3250 else if ( memEQs(start_verb, verb_len, "nla")
3251 || memEQs(start_verb, verb_len, "negative_lookahead"))
3252 {
3253 paren = 'A';
3254 goto alpha_assertions;
3255 }
3256 break;
3257 case 's':
3258 if ( memEQs(start_verb, verb_len, "sr")
3259 || memEQs(start_verb, verb_len, "script_run"))
3260 {
3261 regnode_offset atomic;
3262
3263 paren = 's';
3264
3265 script_run:
3266
3267 /* This indicates Unicode rules. */
3268 REQUIRE_UNI_RULES(flagp, 0);
3269
3270 if (! start_arg) {
3271 goto no_colon;
3272 }
3273
3274 RExC_parse_set(start_arg);
3275
3276 if (RExC_in_script_run) {
3277
3278 /* Nested script runs are treated as no-ops, because
3279 * if the nested one fails, the outer one must as
3280 * well. It could fail sooner, and avoid (??{} with
3281 * side effects, but that is explicitly documented as
3282 * undefined behavior. */
3283
3284 ret = 0;
3285
3286 if (paren == 's') {
3287 paren = ':';
3288 goto parse_rest;
3289 }
3290
3291 /* But, the atomic part of a nested atomic script run
3292 * isn't a no-op, but can be treated just like a '(?>'
3293 * */
3294 paren = '>';
3295 goto parse_rest;
3296 }
3297
3298 if (paren == 's') {
3299 /* Here, we're starting a new regular script run */
3300 ret = reg_node(pRExC_state, SROPEN);
3301 RExC_in_script_run = 1;
3302 is_open = 1;
3303 goto parse_rest;
3304 }
3305
3306 /* Here, we are starting an atomic script run. This is
3307 * handled by recursing to deal with the atomic portion
3308 * separately, enclosed in SROPEN ... SRCLOSE nodes */
3309
3310 ret = reg_node(pRExC_state, SROPEN);
3311
3312 RExC_in_script_run = 1;
3313
3314 atomic = reg(pRExC_state, 'r', &flags, depth);
3315 if (flags & (RESTART_PARSE|NEED_UTF8)) {
3316 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3317 return 0;
3318 }
3319
3320 if (! REGTAIL(pRExC_state, ret, atomic)) {
3321 REQUIRE_BRANCHJ(flagp, 0);
3322 }
3323
3324 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3325 SRCLOSE)))
3326 {
3327 REQUIRE_BRANCHJ(flagp, 0);
3328 }
3329
3330 RExC_in_script_run = 0;
3331 return ret;
3332 }
3333
3334 break;
3335
3336 lookbehind_alpha_assertions:
3337 seen_flag_set = REG_LOOKBEHIND_SEEN;
3338 /*FALLTHROUGH*/
3339
3340 alpha_assertions:
3341
3342 if ( !start_arg ) {
3343 goto no_colon;
3344 }
3345
3346 if ( RExC_parse == start_arg ) {
3347 if ( paren == 'A' || paren == 'B' ) {
3348 /* An empty negative lookaround assertion is failure.
3349 * See also: S_reg_la_OPFAIL() */
3350
3351 /* Note: OPFAIL is *not* zerolen. */
3352 ret = reg1node(pRExC_state, OPFAIL, 0);
3353 nextchar(pRExC_state);
3354 return ret;
3355 }
3356 else
3357 if ( paren == 'a' || paren == 'b' ) {
3358 /* An empty positive lookaround assertion is success.
3359 * See also: S_reg_la_NOTHING() */
3360
3361 /* Note: NOTHING is zerolen, so increment here */
3362 RExC_seen_zerolen++;
3363 ret = reg_node(pRExC_state, NOTHING);
3364 nextchar(pRExC_state);
3365 return ret;
3366 }
3367 }
3368
3369 RExC_seen_zerolen++;
3370 RExC_in_lookaround++;
3371 RExC_seen |= seen_flag_set;
3372
3373 RExC_parse_set(start_arg);
3374 goto parse_rest;
3375
3376 no_colon:
3377 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3378 UTF8fARG(UTF, verb_len, start_verb));
3379 NOT_REACHED; /*NOTREACHED*/
3380
3381 } /* End of switch */
3382 if ( ! op ) {
3383 RExC_parse_inc_safe();
3384 if (has_upper || verb_len == 0) {
3385 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3386 UTF8fARG(UTF, verb_len, start_verb));
3387 }
3388 else {
3389 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3390 UTF8fARG(UTF, verb_len, start_verb));
3391 }
3392 }
3393 if ( RExC_parse == start_arg ) {
3394 start_arg = NULL;
3395 }
3396 if ( arg_required && !start_arg ) {
3397 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3398 (int) verb_len, start_verb);
3399 }
3400 if (internal_argval == -1) {
3401 ret = reg1node(pRExC_state, op, 0);
3402 } else {
3403 ret = reg2node(pRExC_state, op, 0, internal_argval);
3404 }
3405 RExC_seen |= REG_VERBARG_SEEN;
3406 if (start_arg) {
3407 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3408 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3409 STR_WITH_LEN("S"));
3410 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3411 FLAGS(REGNODE_p(ret)) = 1;
3412 } else {
3413 FLAGS(REGNODE_p(ret)) = 0;
3414 }
3415 if ( internal_argval != -1 )
3416 ARG2i_SET(REGNODE_p(ret), internal_argval);
3417 nextchar(pRExC_state);
3418 return ret;
3419 }
3420 else if (*RExC_parse == '?') { /* (?...) */
3421 handle_qmark:
3422 ; /* make sure the label has a statement associated with it*/
3423 bool is_logical = 0, is_optimistic = 0;
3424 const char * const seqstart = RExC_parse;
3425 const char * endptr;
3426 const char non_existent_group_msg[]
3427 = "Reference to nonexistent group";
3428 const char impossible_group[] = "Invalid reference to group";
3429
3430 if (has_intervening_patws) {
3431 RExC_parse_inc_by(1);
3432 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3433 }
3434
3435 RExC_parse_inc_by(1); /* past the '?' */
3436 if (!fake_eval) {
3437 paren = *RExC_parse; /* might be a trailing NUL, if not
3438 well-formed */
3439 is_optimistic = 0;
3440 } else {
3441 is_optimistic = 1;
3442 paren = fake_eval;
3443 }
3444 RExC_parse_inc();
3445 if (RExC_parse > RExC_end) {
3446 paren = '\0';
3447 }
3448 ret = 0; /* For look-ahead/behind. */
3449 switch (paren) {
3450
3451 case 'P': /* (?P...) variants for those used to PCRE/Python */
3452 paren = *RExC_parse;
3453 if ( paren == '<') { /* (?P<...>) named capture */
3454 RExC_parse_inc_by(1);
3455 if (RExC_parse >= RExC_end) {
3456 vFAIL("Sequence (?P<... not terminated");
3457 }
3458 goto named_capture;
3459 }
3460 else if (paren == '>') { /* (?P>name) named recursion */
3461 RExC_parse_inc_by(1);
3462 if (RExC_parse >= RExC_end) {
3463 vFAIL("Sequence (?P>... not terminated");
3464 }
3465 goto named_recursion;
3466 }
3467 else if (paren == '=') { /* (?P=...) named backref */
3468 RExC_parse_inc_by(1);
3469 return handle_named_backref(pRExC_state, flagp,
3470 segment_parse_start, ')');
3471 }
3472 RExC_parse_inc_if_char();
3473 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3474 vFAIL3("Sequence (%.*s...) not recognized",
3475 (int) (RExC_parse - seqstart), seqstart);
3476 NOT_REACHED; /*NOTREACHED*/
3477 case '<': /* (?<...) */
3478 /* If you want to support (?<*...), first reconcile with GH #17363 */
3479 if (*RExC_parse == '!') {
3480 paren = ','; /* negative lookbehind (?<! ... ) */
3481 RExC_parse_inc_by(1);
3482 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3483 return ret;
3484 break;
3485 }
3486 else
3487 if (*RExC_parse == '=') {
3488 /* paren = '<' - negative lookahead (?<= ... ) */
3489 RExC_parse_inc_by(1);
3490 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3491 return ret;
3492 break;
3493 }
3494 else
3495 named_capture:
3496 { /* (?<...>) */
3497 char *name_start;
3498 SV *svname;
3499 paren= '>';
3500 /* FALLTHROUGH */
3501 case '\'': /* (?'...') */
3502 name_start = RExC_parse;
3503 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3504 if ( RExC_parse == name_start
3505 || RExC_parse >= RExC_end
3506 || *RExC_parse != paren)
3507 {
3508 vFAIL2("Sequence (?%c... not terminated",
3509 paren=='>' ? '<' : (char) paren);
3510 }
3511 {
3512 HE *he_str;
3513 SV *sv_dat = NULL;
3514 if (!svname) /* shouldn't happen */
3515 Perl_croak(aTHX_
3516 "panic: reg_scan_name returned NULL");
3517 if (!RExC_paren_names) {
3518 RExC_paren_names= newHV();
3519 sv_2mortal(MUTABLE_SV(RExC_paren_names));
3520 #ifdef DEBUGGING
3521 RExC_paren_name_list= newAV();
3522 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3523 #endif
3524 }
3525 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3526 if ( he_str )
3527 sv_dat = HeVAL(he_str);
3528 if ( ! sv_dat ) {
3529 /* croak baby croak */
3530 Perl_croak(aTHX_
3531 "panic: paren_name hash element allocation failed");
3532 } else if ( SvPOK(sv_dat) ) {
3533 /* (?|...) can mean we have dupes so scan to check
3534 its already been stored. Maybe a flag indicating
3535 we are inside such a construct would be useful,
3536 but the arrays are likely to be quite small, so
3537 for now we punt -- dmq */
3538 IV count = SvIV(sv_dat);
3539 I32 *pv = (I32*)SvPVX(sv_dat);
3540 IV i;
3541 for ( i = 0 ; i < count ; i++ ) {
3542 if ( pv[i] == RExC_npar ) {
3543 count = 0;
3544 break;
3545 }
3546 }
3547 if ( count ) {
3548 pv = (I32*)SvGROW(sv_dat,
3549 SvCUR(sv_dat) + sizeof(I32)+1);
3550 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3551 pv[count] = RExC_npar;
3552 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3553 }
3554 } else {
3555 (void)SvUPGRADE(sv_dat, SVt_PVNV);
3556 sv_setpvn(sv_dat, (char *)&(RExC_npar),
3557 sizeof(I32));
3558 SvIOK_on(sv_dat);
3559 SvIV_set(sv_dat, 1);
3560 }
3561 #ifdef DEBUGGING
3562 /* No, this does not cause a memory leak under
3563 * debugging. RExC_paren_name_list is freed later
3564 * on in the dump process. - Yves
3565 */
3566 if (!av_store(RExC_paren_name_list,
3567 RExC_npar, SvREFCNT_inc_NN(svname)))
3568 SvREFCNT_dec_NN(svname);
3569 #endif
3570
3571 }
3572 nextchar(pRExC_state);
3573 paren = 1;
3574 goto capturing_parens;
3575 }
3576 NOT_REACHED; /*NOTREACHED*/
3577 case '=': /* (?=...) */
3578 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3579 return ret;
3580 break;
3581 case '!': /* (?!...) */
3582 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3583 return ret;
3584 break;
3585 case '|': /* (?|...) */
3586 /* branch reset, behave like a (?:...) except that
3587 buffers in alternations share the same numbers */
3588 paren = ':';
3589 after_freeze = freeze_paren = RExC_logical_npar;
3590
3591 /* XXX This construct currently requires an extra pass.
3592 * Investigation would be required to see if that could be
3593 * changed */
3594 REQUIRE_PARENS_PASS;
3595 break;
3596 case ':': /* (?:...) */
3597 case '>': /* (?>...) */
3598 break;
3599 case '$': /* (?$...) */
3600 case '@': /* (?@...) */
3601 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3602 break;
3603 case '0' : /* (?0) */
3604 case 'R' : /* (?R) */
3605 if (RExC_parse == RExC_end || *RExC_parse != ')')
3606 FAIL("Sequence (?R) not terminated");
3607 num = 0;
3608 RExC_seen |= REG_RECURSE_SEEN;
3609
3610 /* XXX These constructs currently require an extra pass.
3611 * It probably could be changed */
3612 REQUIRE_PARENS_PASS;
3613
3614 *flagp |= POSTPONED;
3615 goto gen_recurse_regop;
3616 /*notreached*/
3617 /* named and numeric backreferences */
3618 case '&': /* (?&NAME) */
3619 segment_parse_start = RExC_parse - 1;
3620 named_recursion:
3621 {
3622 SV *sv_dat = reg_scan_name(pRExC_state,
3623 REG_RSN_RETURN_DATA);
3624 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3625 }
3626 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3627 vFAIL("Sequence (?&... not terminated");
3628 goto gen_recurse_regop;
3629 /* NOTREACHED */
3630 case '+':
3631 if (! inRANGE(RExC_parse[0], '1', '9')) {
3632 RExC_parse_inc_by(1);
3633 vFAIL("Illegal pattern");
3634 }
3635 goto parse_recursion;
3636 /* NOTREACHED*/
3637 case '-': /* (?-1) */
3638 if (! inRANGE(RExC_parse[0], '1', '9')) {
3639 RExC_parse--; /* rewind to let it be handled later */
3640 goto parse_flags;
3641 }
3642 /* FALLTHROUGH */
3643 case '1': case '2': case '3': case '4': /* (?1) */
3644 case '5': case '6': case '7': case '8': case '9':
3645 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
3646 parse_recursion:
3647 {
3648 bool is_neg = FALSE;
3649 UV unum;
3650 segment_parse_start = RExC_parse - 1;
3651 if (*RExC_parse == '-') {
3652 RExC_parse_inc_by(1);
3653 is_neg = TRUE;
3654 }
3655 endptr = RExC_end;
3656 if (grok_atoUV(RExC_parse, &unum, &endptr)
3657 && unum <= I32_MAX
3658 ) {
3659 num = (I32)unum;
3660 RExC_parse_set((char*)endptr);
3661 }
3662 else { /* Overflow, or something like that. Position
3663 beyond all digits for the message */
3664 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
3665 RExC_parse_inc_by(1);
3666 }
3667 vFAIL(impossible_group);
3668 }
3669 if (is_neg) {
3670 /* -num is always representable on 1 and 2's complement
3671 * machines */
3672 num = -num;
3673 }
3674 }
3675 if (*RExC_parse!=')')
3676 vFAIL("Expecting close bracket");
3677
3678 if (paren == '-' || paren == '+') {
3679
3680 /* Don't overflow */
3681 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3682 RExC_parse_inc_by(1);
3683 vFAIL(impossible_group);
3684 }
3685
3686 /*
3687 Diagram of capture buffer numbering.
3688 Top line is the normal capture buffer numbers
3689 Bottom line is the negative indexing as from
3690 the X (the (?-2))
3691
3692 1 2 3 4 5 X Y 6 7
3693 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3694 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3695 - 5 4 3 2 1 X Y x x
3696
3697 Resolve to absolute group. Recall that RExC_npar is +1 of
3698 the actual parenthesis group number. For lookahead, we
3699 have to compensate for that. Using the above example, when
3700 we get to Y in the parse, num is 2 and RExC_npar is 6. We
3701 want 7 for +2, and 4 for -2.
3702 */
3703 if ( paren == '+' ) {
3704 num--;
3705 }
3706
3707 num += RExC_npar;
3708
3709 if (paren == '-' && num < 1) {
3710 RExC_parse_inc_by(1);
3711 vFAIL(non_existent_group_msg);
3712 }
3713 }
3714 else
3715 if (num && num < RExC_logical_npar) {
3716 num = RExC_logical_to_parno[num];
3717 }
3718 else
3719 if (ALL_PARENS_COUNTED) {
3720 if (num < RExC_logical_total_parens) {
3721 num = RExC_logical_to_parno[num];
3722 }
3723 else {
3724 RExC_parse_inc_by(1);
3725 vFAIL(non_existent_group_msg);
3726 }
3727 }
3728 else {
3729 REQUIRE_PARENS_PASS;
3730 }
3731
3732
3733 gen_recurse_regop:
3734 if (num >= RExC_npar) {
3735
3736 /* It might be a forward reference; we can't fail until we
3737 * know, by completing the parse to get all the groups, and
3738 * then reparsing */
3739 if (ALL_PARENS_COUNTED) {
3740 if (num >= RExC_total_parens) {
3741 RExC_parse_inc_by(1);
3742 vFAIL(non_existent_group_msg);
3743 }
3744 }
3745 else {
3746 REQUIRE_PARENS_PASS;
3747 }
3748 }
3749
3750 /* We keep track how many GOSUB items we have produced.
3751 To start off the ARG2i() of the GOSUB holds its "id",
3752 which is used later in conjunction with RExC_recurse
3753 to calculate the offset we need to jump for the GOSUB,
3754 which it will store in the final representation.
3755 We have to defer the actual calculation until much later
3756 as the regop may move.
3757 */
3758 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3759 RExC_recurse_count++;
3760 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3761 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3762 22, "| |", (int)(depth * 2 + 1), "",
3763 (UV)ARG1u(REGNODE_p(ret)),
3764 (IV)ARG2i(REGNODE_p(ret))));
3765 RExC_seen |= REG_RECURSE_SEEN;
3766
3767 *flagp |= POSTPONED;
3768 assert(*RExC_parse == ')');
3769 nextchar(pRExC_state);
3770 return ret;
3771
3772 /* NOTREACHED */
3773
3774 case '?': /* (??...) */
3775 is_logical = 1;
3776 if (*RExC_parse != '{') {
3777 RExC_parse_inc_if_char();
3778 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3779 vFAIL2utf8f(
3780 "Sequence (%" UTF8f "...) not recognized",
3781 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3782 NOT_REACHED; /*NOTREACHED*/
3783 }
3784 *flagp |= POSTPONED;
3785 paren = '{';
3786 RExC_parse_inc_by(1);
3787 /* FALLTHROUGH */
3788 case '{': /* (?{...}) */
3789 {
3790 U32 n = 0;
3791 struct reg_code_block *cb;
3792 OP * o;
3793
3794 RExC_seen_zerolen++;
3795
3796 if ( !pRExC_state->code_blocks
3797 || pRExC_state->code_index
3798 >= pRExC_state->code_blocks->count
3799 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3800 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3801 - RExC_start)
3802 ) {
3803 if (RExC_pm_flags & PMf_USE_RE_EVAL)
3804 FAIL("panic: Sequence (?{...}): no code block found\n");
3805 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3806 }
3807 /* this is a pre-compiled code block (?{...}) */
3808 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3809 RExC_parse_set(RExC_start + cb->end);
3810 o = cb->block;
3811 if (cb->src_regex) {
3812 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3813 RExC_rxi->data->data[n] =
3814 (void*)SvREFCNT_inc((SV*)cb->src_regex);
3815 RExC_rxi->data->data[n+1] = (void*)o;
3816 }
3817 else {
3818 n = reg_add_data(pRExC_state,
3819 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3820 RExC_rxi->data->data[n] = (void*)o;
3821 }
3822 pRExC_state->code_index++;
3823 nextchar(pRExC_state);
3824 if (!is_optimistic)
3825 RExC_seen |= REG_PESSIMIZE_SEEN;
3826
3827 if (is_logical) {
3828 regnode_offset eval;
3829 ret = reg_node(pRExC_state, LOGICAL);
3830 FLAGS(REGNODE_p(ret)) = 2;
3831
3832 eval = reg2node(pRExC_state, EVAL,
3833 n,
3834
3835 /* for later propagation into (??{})
3836 * return value */
3837 RExC_flags & RXf_PMf_COMPILETIME
3838 );
3839 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3840 if (! REGTAIL(pRExC_state, ret, eval)) {
3841 REQUIRE_BRANCHJ(flagp, 0);
3842 }
3843 return ret;
3844 }
3845 ret = reg2node(pRExC_state, EVAL, n, 0);
3846 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3847
3848 return ret;
3849 }
3850 case '(': /* (?(?{...})...) and (?(?=...)...) */
3851 {
3852 int is_define= 0;
3853 const int DEFINE_len = sizeof("DEFINE") - 1;
3854 if ( RExC_parse < RExC_end - 1
3855 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
3856 && ( RExC_parse[1] == '='
3857 || RExC_parse[1] == '!'
3858 || RExC_parse[1] == '<'
3859 || RExC_parse[1] == '{'))
3860 || ( RExC_parse[0] == '*' /* (?(*...)) */
3861 && ( RExC_parse[1] == '{'
3862 || ( memBEGINs(RExC_parse + 1,
3863 (Size_t) (RExC_end - (RExC_parse + 1)),
3864 "pla:")
3865 || memBEGINs(RExC_parse + 1,
3866 (Size_t) (RExC_end - (RExC_parse + 1)),
3867 "plb:")
3868 || memBEGINs(RExC_parse + 1,
3869 (Size_t) (RExC_end - (RExC_parse + 1)),
3870 "nla:")
3871 || memBEGINs(RExC_parse + 1,
3872 (Size_t) (RExC_end - (RExC_parse + 1)),
3873 "nlb:")
3874 || memBEGINs(RExC_parse + 1,
3875 (Size_t) (RExC_end - (RExC_parse + 1)),
3876 "positive_lookahead:")
3877 || memBEGINs(RExC_parse + 1,
3878 (Size_t) (RExC_end - (RExC_parse + 1)),
3879 "positive_lookbehind:")
3880 || memBEGINs(RExC_parse + 1,
3881 (Size_t) (RExC_end - (RExC_parse + 1)),
3882 "negative_lookahead:")
3883 || memBEGINs(RExC_parse + 1,
3884 (Size_t) (RExC_end - (RExC_parse + 1)),
3885 "negative_lookbehind:")))))
3886 ) { /* Lookahead or eval. */
3887 I32 flag;
3888 regnode_offset tail;
3889
3890 ret = reg_node(pRExC_state, LOGICAL);
3891 FLAGS(REGNODE_p(ret)) = 1;
3892
3893 tail = reg(pRExC_state, 1, &flag, depth+1);
3894 RETURN_FAIL_ON_RESTART(flag, flagp);
3895 if (! REGTAIL(pRExC_state, ret, tail)) {
3896 REQUIRE_BRANCHJ(flagp, 0);
3897 }
3898 goto insert_if;
3899 }
3900 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
3901 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3902 {
3903 char ch = RExC_parse[0] == '<' ? '>' : '\'';
3904 char *name_start= RExC_parse;
3905 RExC_parse_inc_by(1);
3906 U32 num = 0;
3907 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3908 if ( RExC_parse == name_start
3909 || RExC_parse >= RExC_end
3910 || *RExC_parse != ch)
3911 {
3912 vFAIL2("Sequence (?(%c... not terminated",
3913 (ch == '>' ? '<' : ch));
3914 }
3915 RExC_parse_inc_by(1);
3916 if (sv_dat) {
3917 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3918 RExC_rxi->data->data[num]=(void*)sv_dat;
3919 SvREFCNT_inc_simple_void_NN(sv_dat);
3920 }
3921 ret = reg1node(pRExC_state, GROUPPN, num);
3922 goto insert_if_check_paren;
3923 }
3924 else if (memBEGINs(RExC_parse,
3925 (STRLEN) (RExC_end - RExC_parse),
3926 "DEFINE"))
3927 {
3928 ret = reg1node(pRExC_state, DEFINEP, 0);
3929 RExC_parse_inc_by(DEFINE_len);
3930 is_define = 1;
3931 goto insert_if_check_paren;
3932 }
3933 else if (RExC_parse[0] == 'R') {
3934 RExC_parse_inc_by(1);
3935 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
3936 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3937 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3938 */
3939 parno = 0;
3940 if (RExC_parse[0] == '0') {
3941 parno = 1;
3942 RExC_parse_inc_by(1);
3943 }
3944 else if (inRANGE(RExC_parse[0], '1', '9')) {
3945 UV uv;
3946 endptr = RExC_end;
3947 if (grok_atoUV(RExC_parse, &uv, &endptr)
3948 && uv <= I32_MAX
3949 ) {
3950 parno = (I32)uv + 1;
3951 RExC_parse_set((char*)endptr);
3952 }
3953 /* else "Switch condition not recognized" below */
3954 } else if (RExC_parse[0] == '&') {
3955 SV *sv_dat;
3956 RExC_parse_inc_by(1);
3957 sv_dat = reg_scan_name(pRExC_state,
3958 REG_RSN_RETURN_DATA);
3959 if (sv_dat)
3960 parno = 1 + *((I32 *)SvPVX(sv_dat));
3961 }
3962 ret = reg1node(pRExC_state, INSUBP, parno);
3963 goto insert_if_check_paren;
3964 }
3965 else if (inRANGE(RExC_parse[0], '1', '9')) {
3966 /* (?(1)...) */
3967 char c;
3968 UV uv;
3969 endptr = RExC_end;
3970 if (grok_atoUV(RExC_parse, &uv, &endptr)
3971 && uv <= I32_MAX
3972 ) {
3973 parno = (I32)uv;
3974 RExC_parse_set((char*)endptr);
3975 }
3976 else {
3977 vFAIL("panic: grok_atoUV returned FALSE");
3978 }
3979 ret = reg1node(pRExC_state, GROUPP, parno);
3980
3981 insert_if_check_paren:
3982 if (UCHARAT(RExC_parse) != ')') {
3983 RExC_parse_inc_safe();
3984 vFAIL("Switch condition not recognized");
3985 }
3986 nextchar(pRExC_state);
3987 insert_if:
3988 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3989 IFTHEN, 0)))
3990 {
3991 REQUIRE_BRANCHJ(flagp, 0);
3992 }
3993 br = regbranch(pRExC_state, &flags, 1, depth+1);
3994 if (br == 0) {
3995 RETURN_FAIL_ON_RESTART(flags,flagp);
3996 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3997 (UV) flags);
3998 } else
3999 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
4000 LONGJMP, 0)))
4001 {
4002 REQUIRE_BRANCHJ(flagp, 0);
4003 }
4004 c = UCHARAT(RExC_parse);
4005 nextchar(pRExC_state);
4006 if (flags&HASWIDTH)
4007 *flagp |= HASWIDTH;
4008 if (c == '|') {
4009 if (is_define)
4010 vFAIL("(?(DEFINE)....) does not allow branches");
4011
4012 /* Fake one for optimizer. */
4013 lastbr = reg1node(pRExC_state, IFTHEN, 0);
4014
4015 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
4016 RETURN_FAIL_ON_RESTART(flags, flagp);
4017 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
4018 (UV) flags);
4019 }
4020 if (! REGTAIL(pRExC_state, ret, lastbr)) {
4021 REQUIRE_BRANCHJ(flagp, 0);
4022 }
4023 if (flags&HASWIDTH)
4024 *flagp |= HASWIDTH;
4025 c = UCHARAT(RExC_parse);
4026 nextchar(pRExC_state);
4027 }
4028 else
4029 lastbr = 0;
4030 if (c != ')') {
4031 if (RExC_parse >= RExC_end)
4032 vFAIL("Switch (?(condition)... not terminated");
4033 else
4034 vFAIL("Switch (?(condition)... contains too many branches");
4035 }
4036 ender = reg_node(pRExC_state, TAIL);
4037 if (! REGTAIL(pRExC_state, br, ender)) {
4038 REQUIRE_BRANCHJ(flagp, 0);
4039 }
4040 if (lastbr) {
4041 if (! REGTAIL(pRExC_state, lastbr, ender)) {
4042 REQUIRE_BRANCHJ(flagp, 0);
4043 }
4044 if (! REGTAIL(pRExC_state,
4045 REGNODE_OFFSET(
4046 REGNODE_AFTER(REGNODE_p(lastbr))),
4047 ender))
4048 {
4049 REQUIRE_BRANCHJ(flagp, 0);
4050 }
4051 }
4052 else
4053 if (! REGTAIL(pRExC_state, ret, ender)) {
4054 REQUIRE_BRANCHJ(flagp, 0);
4055 }
4056 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
4057 RExC_size++; /* XXX WHY do we need this?!!
4058 For large programs it seems to be required
4059 but I can't figure out why. -- dmq*/
4060 #endif
4061 return ret;
4062 }
4063 RExC_parse_inc_safe();
4064 vFAIL("Unknown switch condition (?(...))");
4065 }
4066 case '[': /* (?[ ... ]) */
4067 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
4068 case 0: /* A NUL */
4069 RExC_parse--; /* for vFAIL to print correctly */
4070 vFAIL("Sequence (? incomplete");
4071 break;
4072
4073 case ')':
4074 if (RExC_strict) { /* [perl #132851] */
4075 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
4076 }
4077 /* FALLTHROUGH */
4078 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
4079 /* FALLTHROUGH */
4080 default: /* e.g., (?i) */
4081 RExC_parse_set((char *) seqstart + 1);
4082 parse_flags:
4083 parse_lparen_question_flags(pRExC_state);
4084 if (UCHARAT(RExC_parse) != ':') {
4085 if (RExC_parse < RExC_end)
4086 nextchar(pRExC_state);
4087 *flagp = TRYAGAIN;
4088 return 0;
4089 }
4090 paren = ':';
4091 nextchar(pRExC_state);
4092 ret = 0;
4093 goto parse_rest;
4094 } /* end switch */
4095 }
4096 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
4097 capturing_parens:
4098 parno = RExC_npar;
4099 RExC_npar++;
4100 if (RExC_npar >= U16_MAX)
4101 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4102
4103 logical_parno = RExC_logical_npar;
4104 RExC_logical_npar++;
4105 if (! ALL_PARENS_COUNTED) {
4106 /* If we are in our first pass through (and maybe only pass),
4107 * we need to allocate memory for the capturing parentheses
4108 * data structures.
4109 */
4110
4111 if (!RExC_parens_buf_size) {
4112 /* first guess at number of parens we might encounter */
4113 RExC_parens_buf_size = 10;
4114
4115 /* setup RExC_open_parens, which holds the address of each
4116 * OPEN tag, and to make things simpler for the 0 index the
4117 * start of the program - this is used later for offsets */
4118 Newxz(RExC_open_parens, RExC_parens_buf_size,
4119 regnode_offset);
4120 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
4121
4122 /* setup RExC_close_parens, which holds the address of each
4123 * CLOSE tag, and to make things simpler for the 0 index
4124 * the end of the program - this is used later for offsets
4125 * */
4126 Newxz(RExC_close_parens, RExC_parens_buf_size,
4127 regnode_offset);
4128 /* we don't know where end op starts yet, so we don't need to
4129 * set RExC_close_parens[0] like we do RExC_open_parens[0]
4130 * above */
4131
4132 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4133 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4134 }
4135 else if (RExC_npar > RExC_parens_buf_size) {
4136 I32 old_size = RExC_parens_buf_size;
4137
4138 RExC_parens_buf_size *= 2;
4139
4140 Renew(RExC_open_parens, RExC_parens_buf_size,
4141 regnode_offset);
4142 Zero(RExC_open_parens + old_size,
4143 RExC_parens_buf_size - old_size, regnode_offset);
4144
4145 Renew(RExC_close_parens, RExC_parens_buf_size,
4146 regnode_offset);
4147 Zero(RExC_close_parens + old_size,
4148 RExC_parens_buf_size - old_size, regnode_offset);
4149
4150 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4151 Zero(RExC_logical_to_parno + old_size,
4152 RExC_parens_buf_size - old_size, I32);
4153
4154 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4155 Zero(RExC_parno_to_logical + old_size,
4156 RExC_parens_buf_size - old_size, I32);
4157 }
4158 }
4159
4160 ret = reg1node(pRExC_state, OPEN, parno);
4161 if (!RExC_nestroot)
4162 RExC_nestroot = parno;
4163 if (RExC_open_parens && !RExC_open_parens[parno])
4164 {
4165 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4166 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4167 22, "| |", (int)(depth * 2 + 1), "",
4168 (IV)parno, ret));
4169 RExC_open_parens[parno]= ret;
4170 }
4171 if (RExC_parno_to_logical) {
4172 RExC_parno_to_logical[parno] = logical_parno;
4173 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4174 RExC_logical_to_parno[logical_parno] = parno;
4175 }
4176 is_open = 1;
4177 } else {
4178 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4179 paren = ':';
4180 ret = 0;
4181 }
4182 }
4183 else /* ! paren */
4184 ret = 0;
4185
4186 parse_rest:
4187 /* Pick up the branches, linking them together. */
4188 segment_parse_start = RExC_parse;
4189 I32 npar_before_regbranch = RExC_npar - 1;
4190 br = regbranch(pRExC_state, &flags, 1, depth+1);
4191
4192 /* branch_len = (paren != 0); */
4193
4194 if (br == 0) {
4195 RETURN_FAIL_ON_RESTART(flags, flagp);
4196 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4197 }
4198 if (*RExC_parse == '|') {
4199 if (RExC_use_BRANCHJ) {
4200 reginsert(pRExC_state, BRANCHJ, br, depth+1);
4201 ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4202 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4203 }
4204 else {
4205 reginsert(pRExC_state, BRANCH, br, depth+1);
4206 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4207 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4208 }
4209 have_branch = 1;
4210 }
4211 else if (paren == ':') {
4212 *flagp |= flags&SIMPLE;
4213 }
4214 if (is_open) { /* Starts with OPEN. */
4215 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
4216 REQUIRE_BRANCHJ(flagp, 0);
4217 }
4218 }
4219 else if (paren != '?') /* Not Conditional */
4220 ret = br;
4221 *flagp |= flags & (HASWIDTH | POSTPONED);
4222 lastbr = br;
4223 while (*RExC_parse == '|') {
4224 if (RExC_use_BRANCHJ) {
4225 bool shut_gcc_up;
4226
4227 ender = reg1node(pRExC_state, LONGJMP, 0);
4228
4229 /* Append to the previous. */
4230 shut_gcc_up = REGTAIL(pRExC_state,
4231 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4232 ender);
4233 PERL_UNUSED_VAR(shut_gcc_up);
4234 }
4235 nextchar(pRExC_state);
4236 if (freeze_paren) {
4237 if (RExC_logical_npar > after_freeze)
4238 after_freeze = RExC_logical_npar;
4239 RExC_logical_npar = freeze_paren;
4240 }
4241 br = regbranch(pRExC_state, &flags, 0, depth+1);
4242
4243 if (br == 0) {
4244 RETURN_FAIL_ON_RESTART(flags, flagp);
4245 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4246 }
4247 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
4248 REQUIRE_BRANCHJ(flagp, 0);
4249 }
4250 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4251 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4252 if (OP(REGNODE_p(br)) == BRANCH) {
4253 if (OP(REGNODE_p(lastbr)) == BRANCH)
4254 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4255 else
4256 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4257 }
4258 else
4259 if (OP(REGNODE_p(br)) == BRANCHJ) {
4260 if (OP(REGNODE_p(lastbr)) == BRANCH)
4261 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4262 else
4263 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4264 }
4265
4266 lastbr = br;
4267 *flagp |= flags & (HASWIDTH | POSTPONED);
4268 }
4269
4270 if (have_branch || paren != ':') {
4271 regnode * br;
4272
4273 /* Make a closing node, and hook it on the end. */
4274 switch (paren) {
4275 case ':':
4276 ender = reg_node(pRExC_state, TAIL);
4277 break;
4278 case 1: case 2:
4279 ender = reg1node(pRExC_state, CLOSE, parno);
4280 if ( RExC_close_parens ) {
4281 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4282 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4283 22, "| |", (int)(depth * 2 + 1), "",
4284 (IV)parno, ender));
4285 RExC_close_parens[parno]= ender;
4286 if (RExC_nestroot == parno)
4287 RExC_nestroot = 0;
4288 }
4289 break;
4290 case 's':
4291 ender = reg_node(pRExC_state, SRCLOSE);
4292 RExC_in_script_run = 0;
4293 break;
4294 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4295 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4296 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4297 case '<': /* (?<= ... ) */
4298 case ',': /* (?<! ... ) */
4299 *flagp &= ~HASWIDTH;
4300 ender = reg_node(pRExC_state, LOOKBEHIND_END);
4301 break;
4302 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4303 case 'a':
4304 case 'A':
4305 case '=':
4306 case '!':
4307 *flagp &= ~HASWIDTH;
4308 /* FALLTHROUGH */
4309 case 't': /* aTomic */
4310 case '>':
4311 ender = reg_node(pRExC_state, SUCCEED);
4312 break;
4313 case 0:
4314 ender = reg_node(pRExC_state, END);
4315 assert(!RExC_end_op); /* there can only be one! */
4316 RExC_end_op = REGNODE_p(ender);
4317 if (RExC_close_parens) {
4318 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4319 "%*s%*s Setting close paren #0 (END) to %zu\n",
4320 22, "| |", (int)(depth * 2 + 1), "",
4321 ender));
4322
4323 RExC_close_parens[0]= ender;
4324 }
4325 break;
4326 }
4327 DEBUG_PARSE_r({
4328 DEBUG_PARSE_MSG("lsbr");
4329 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4330 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4331 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4332 SvPV_nolen_const(RExC_mysv1),
4333 (IV)lastbr,
4334 SvPV_nolen_const(RExC_mysv2),
4335 (IV)ender,
4336 (IV)(ender - lastbr)
4337 );
4338 });
4339 if (OP(REGNODE_p(lastbr)) == BRANCH) {
4340 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4341 }
4342 else
4343 if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4344 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4345 }
4346
4347 if (! REGTAIL(pRExC_state, lastbr, ender)) {
4348 REQUIRE_BRANCHJ(flagp, 0);
4349 }
4350
4351 if (have_branch) {
4352 char is_nothing= 1;
4353 if (depth==1)
4354 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4355
4356 /* Hook the tails of the branches to the closing node. */
4357 for (br = REGNODE_p(ret); br; br = regnext(br)) {
4358 const U8 op = REGNODE_TYPE(OP(br));
4359 regnode *nextoper = REGNODE_AFTER(br);
4360 if (op == BRANCH) {
4361 if (! REGTAIL_STUDY(pRExC_state,
4362 REGNODE_OFFSET(nextoper),
4363 ender))
4364 {
4365 REQUIRE_BRANCHJ(flagp, 0);
4366 }
4367 if ( OP(nextoper) != NOTHING
4368 || regnext(nextoper) != REGNODE_p(ender))
4369 is_nothing= 0;
4370 }
4371 else if (op == BRANCHJ) {
4372 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4373 REGNODE_OFFSET(nextoper),
4374 ender);
4375 PERL_UNUSED_VAR(shut_gcc_up);
4376 /* for now we always disable this optimisation * /
4377 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4378 if ( OP(nopr) != NOTHING
4379 || regnext(nopr) != REGNODE_p(ender))
4380 */
4381 is_nothing= 0;
4382 }
4383 }
4384 if (is_nothing) {
4385 regnode * ret_as_regnode = REGNODE_p(ret);
4386 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4387 ? regnext(ret_as_regnode)
4388 : ret_as_regnode;
4389 DEBUG_PARSE_r({
4390 DEBUG_PARSE_MSG("NADA");
4391 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4392 NULL, pRExC_state);
4393 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4394 NULL, pRExC_state);
4395 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4396 SvPV_nolen_const(RExC_mysv1),
4397 (IV)REG_NODE_NUM(ret_as_regnode),
4398 SvPV_nolen_const(RExC_mysv2),
4399 (IV)ender,
4400 (IV)(ender - ret)
4401 );
4402 });
4403 OP(br)= NOTHING;
4404 if (OP(REGNODE_p(ender)) == TAIL) {
4405 NEXT_OFF(br)= 0;
4406 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4407 } else {
4408 regnode *opt;
4409 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4410 OP(opt)= OPTIMIZED;
4411 NEXT_OFF(br)= REGNODE_p(ender) - br;
4412 }
4413 }
4414 }
4415 }
4416
4417 {
4418 const char *p;
4419 /* Even/odd or x=don't care: 010101x10x */
4420 static const char parens[] = "=!aA<,>Bbt";
4421 /* flag below is set to 0 up through 'A'; 1 for larger */
4422
4423 if (paren && (p = strchr(parens, paren))) {
4424 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4425 int flag = (p - parens) > 3;
4426
4427 if (paren == '>' || paren == 't') {
4428 node = SUSPEND, flag = 0;
4429 }
4430
4431 reginsert(pRExC_state, node, ret, depth+1);
4432 FLAGS(REGNODE_p(ret)) = flag;
4433 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4434 {
4435 REQUIRE_BRANCHJ(flagp, 0);
4436 }
4437 }
4438 }
4439
4440 /* Check for proper termination. */
4441 if (paren) {
4442 /* restore original flags, but keep (?p) and, if we've encountered
4443 * something in the parse that changes /d rules into /u, keep the /u */
4444 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4445 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4446 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4447 }
4448 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4449 RExC_parse_set(reg_parse_start);
4450 vFAIL("Unmatched (");
4451 }
4452 nextchar(pRExC_state);
4453 }
4454 else if (!paren && RExC_parse < RExC_end) {
4455 if (*RExC_parse == ')') {
4456 RExC_parse_inc_by(1);
4457 vFAIL("Unmatched )");
4458 }
4459 else
4460 FAIL("Junk on end of regexp"); /* "Can't happen". */
4461 NOT_REACHED; /* NOTREACHED */
4462 }
4463
4464 if (after_freeze > RExC_logical_npar)
4465 RExC_logical_npar = after_freeze;
4466
4467 RExC_in_lookaround = was_in_lookaround;
4468
4469 return(ret);
4470 }
4471
4472 /*
4473 - regbranch - one alternative of an | operator
4474 *
4475 * Implements the concatenation operator.
4476 *
4477 * On success, returns the offset at which any next node should be placed into
4478 * the regex engine program being compiled.
4479 *
4480 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4481 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4482 * UTF-8
4483 */
4484 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)4485 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4486 {
4487 regnode_offset ret;
4488 regnode_offset chain = 0;
4489 regnode_offset latest;
4490 regnode *branch_node = NULL;
4491 I32 flags = 0, c = 0;
4492 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4493
4494 PERL_ARGS_ASSERT_REGBRANCH;
4495
4496 DEBUG_PARSE("brnc");
4497
4498 if (first)
4499 ret = 0;
4500 else {
4501 if (RExC_use_BRANCHJ) {
4502 ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4503 branch_node = REGNODE_p(ret);
4504 ARG2a_SET(branch_node, (U16)RExC_npar-1);
4505 } else {
4506 ret = reg1node(pRExC_state, BRANCH, 0);
4507 branch_node = REGNODE_p(ret);
4508 ARG1a_SET(branch_node, (U16)RExC_npar-1);
4509 }
4510 }
4511
4512 *flagp = 0; /* Initialize. */
4513
4514 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4515 FALSE /* Don't force to /x */ );
4516 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4517 flags &= ~TRYAGAIN;
4518 latest = regpiece(pRExC_state, &flags, depth+1);
4519 if (latest == 0) {
4520 if (flags & TRYAGAIN)
4521 continue;
4522 RETURN_FAIL_ON_RESTART(flags, flagp);
4523 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4524 }
4525 else if (ret == 0)
4526 ret = latest;
4527 *flagp |= flags&(HASWIDTH|POSTPONED);
4528 if (chain != 0) {
4529 /* FIXME adding one for every branch after the first is probably
4530 * excessive now we have TRIE support. (hv) */
4531 MARK_NAUGHTY(1);
4532 if (! REGTAIL(pRExC_state, chain, latest)) {
4533 /* XXX We could just redo this branch, but figuring out what
4534 * bookkeeping needs to be reset is a pain, and it's likely
4535 * that other branches that goto END will also be too large */
4536 REQUIRE_BRANCHJ(flagp, 0);
4537 }
4538 }
4539 chain = latest;
4540 c++;
4541 }
4542 if (chain == 0) { /* Loop ran zero times. */
4543 chain = reg_node(pRExC_state, NOTHING);
4544 if (ret == 0)
4545 ret = chain;
4546 }
4547 if (c == 1) {
4548 *flagp |= flags & SIMPLE;
4549 }
4550 return ret;
4551 }
4552
4553 #define RBRACE 0
4554 #define MIN_S 1
4555 #define MIN_E 2
4556 #define MAX_S 3
4557 #define MAX_E 4
4558
4559 #ifndef PERL_IN_XSUB_RE
4560 bool
Perl_regcurly(const char * s,const char * e,const char * result[5])4561 Perl_regcurly(const char *s, const char *e, const char * result[5])
4562 {
4563 /* This function matches a {m,n} quantifier. When called with a NULL final
4564 * argument, it simply parses the input from 's' up through 'e-1', and
4565 * returns a boolean as to whether or not this input is syntactically a
4566 * {m,n} quantifier.
4567 *
4568 * When called with a non-NULL final parameter, and when the function
4569 * returns TRUE, it additionally stores information into the array
4570 * specified by that parameter about what it found in the parse. The
4571 * parameter must be a pointer into a 5 element array of 'const char *'
4572 * elements. The returned information is as follows:
4573 * result[RBRACE] points to the closing brace
4574 * result[MIN_S] points to the first byte of the lower bound
4575 * result[MIN_E] points to one beyond the final byte of the lower bound
4576 * result[MAX_S] points to the first byte of the upper bound
4577 * result[MAX_E] points to one beyond the final byte of the upper bound
4578 *
4579 * If the quantifier is of the form {m,} (meaning an infinite upper
4580 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4581 * to is irrelevant, just that it's the same place
4582 *
4583 * If instead the quantifier is of the form {m} there is actually only
4584 * one bound, and both the upper and lower result[] elements are set to
4585 * point to it.
4586 *
4587 * This function checks only for syntactic validity; it leaves checking for
4588 * semantic validity and raising any diagnostics to the caller. This
4589 * function is called in multiple places to check for syntax, but only from
4590 * one for semantics. It makes it as simple as possible for the
4591 * syntax-only callers, while furnishing just enough information for the
4592 * semantic caller.
4593 */
4594
4595 const char * min_start = NULL;
4596 const char * max_start = NULL;
4597 const char * min_end = NULL;
4598 const char * max_end = NULL;
4599
4600 bool has_comma = FALSE;
4601
4602 PERL_ARGS_ASSERT_REGCURLY;
4603
4604 if (s >= e || *s++ != '{')
4605 return FALSE;
4606
4607 while (s < e && isBLANK(*s)) {
4608 s++;
4609 }
4610
4611 if isDIGIT(*s) {
4612 min_start = s;
4613 do {
4614 s++;
4615 } while (s < e && isDIGIT(*s));
4616 min_end = s;
4617 }
4618
4619 while (s < e && isBLANK(*s)) {
4620 s++;
4621 }
4622
4623 if (*s == ',') {
4624 has_comma = TRUE;
4625 s++;
4626
4627 while (s < e && isBLANK(*s)) {
4628 s++;
4629 }
4630
4631 if isDIGIT(*s) {
4632 max_start = s;
4633 do {
4634 s++;
4635 } while (s < e && isDIGIT(*s));
4636 max_end = s;
4637 }
4638 }
4639
4640 while (s < e && isBLANK(*s)) {
4641 s++;
4642 }
4643 /* Need at least one number */
4644 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4645 return FALSE;
4646 }
4647
4648 if (result) {
4649
4650 result[RBRACE] = s;
4651
4652 result[MIN_S] = min_start;
4653 result[MIN_E] = min_end;
4654 if (has_comma) {
4655 if (max_start) {
4656 result[MAX_S] = max_start;
4657 result[MAX_E] = max_end;
4658 }
4659 else {
4660 /* Having no value after the comma is signalled by setting
4661 * start and end to the same value. What that value is isn't
4662 * relevant; NULL is chosen simply because it will fail if the
4663 * caller mistakenly uses it */
4664 result[MAX_S] = result[MAX_E] = NULL;
4665 }
4666 }
4667 else { /* No comma means lower and upper bounds are the same */
4668 result[MAX_S] = min_start;
4669 result[MAX_E] = min_end;
4670 }
4671 }
4672
4673 return TRUE;
4674 }
4675 #endif
4676
4677 U32
S_get_quantifier_value(pTHX_ RExC_state_t * pRExC_state,const char * start,const char * end)4678 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4679 const char * start, const char * end)
4680 {
4681 /* This is a helper function for regpiece() to compute, given the
4682 * quantifier {m,n}, the value of either m or n, based on the starting
4683 * position 'start' in the string, through the byte 'end-1', returning it
4684 * if valid, and failing appropriately if not. It knows the restrictions
4685 * imposed on quantifier values */
4686
4687 UV uv;
4688 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4689
4690 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4691
4692 if (grok_atoUV(start, &uv, &end)) {
4693 if (uv < REG_INFTY) { /* A valid, small-enough number */
4694 return (U32) uv;
4695 }
4696 }
4697 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4698 leading zeros or overflow */
4699 RExC_parse_set((char * ) end);
4700
4701 /* Perhaps too generic a msg for what is only failure from having
4702 * leading zeros, but this is how it's always behaved. */
4703 vFAIL("Invalid quantifier in {,}");
4704 NOT_REACHED; /*NOTREACHED*/
4705 }
4706
4707 /* Here, found a quantifier, but was too large; either it overflowed or was
4708 * too big a legal number */
4709 RExC_parse_set((char * ) end);
4710 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4711
4712 NOT_REACHED; /*NOTREACHED*/
4713 return U32_MAX; /* Perhaps some compilers will be expecting a return */
4714 }
4715
4716 /*
4717 - regpiece - something followed by possible quantifier * + ? {n,m}
4718 *
4719 * Note that the branching code sequences used for ? and the general cases
4720 * of * and + are somewhat optimized: they use the same NOTHING node as
4721 * both the endmarker for their branch list and the body of the last branch.
4722 * It might seem that this node could be dispensed with entirely, but the
4723 * endmarker role is not redundant.
4724 *
4725 * On success, returns the offset at which any next node should be placed into
4726 * the regex engine program being compiled.
4727 *
4728 * Returns 0 otherwise, with *flagp set to indicate why:
4729 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
4730 * RESTART_PARSE if the parse needs to be restarted, or'd with
4731 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4732 */
4733 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)4734 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4735 {
4736 regnode_offset ret;
4737 char op;
4738 I32 flags;
4739 const char * const origparse = RExC_parse;
4740 I32 min;
4741 I32 max = REG_INFTY;
4742 I32 npar_before = RExC_npar-1;
4743
4744 /* Save the original in case we change the emitted regop to a FAIL. */
4745 const regnode_offset orig_emit = RExC_emit;
4746
4747 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4748
4749 PERL_ARGS_ASSERT_REGPIECE;
4750
4751 DEBUG_PARSE("piec");
4752
4753 ret = regatom(pRExC_state, &flags, depth+1);
4754 if (ret == 0) {
4755 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4756 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4757 }
4758 I32 npar_after = RExC_npar-1;
4759
4760 op = *RExC_parse;
4761 switch (op) {
4762 const char * regcurly_return[5];
4763
4764 case '*':
4765 nextchar(pRExC_state);
4766 min = 0;
4767 break;
4768
4769 case '+':
4770 nextchar(pRExC_state);
4771 min = 1;
4772 break;
4773
4774 case '?':
4775 nextchar(pRExC_state);
4776 min = 0; max = 1;
4777 break;
4778
4779 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
4780 to determine which */
4781 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4782 const char * min_start = regcurly_return[MIN_S];
4783 const char * min_end = regcurly_return[MIN_E];
4784 const char * max_start = regcurly_return[MAX_S];
4785 const char * max_end = regcurly_return[MAX_E];
4786
4787 if (min_start) {
4788 min = get_quantifier_value(pRExC_state, min_start, min_end);
4789 }
4790 else {
4791 min = 0;
4792 }
4793
4794 if (max_start == max_end) { /* Was of the form {m,} */
4795 max = REG_INFTY;
4796 }
4797 else if (max_start == min_start) { /* Was of the form {m} */
4798 max = min;
4799 }
4800 else { /* Was of the form {m,n} */
4801 assert(max_end >= max_start);
4802
4803 max = get_quantifier_value(pRExC_state, max_start, max_end);
4804 }
4805
4806 RExC_parse_set((char *) regcurly_return[RBRACE]);
4807 nextchar(pRExC_state);
4808
4809 if (max < min) { /* If can't match, warn and optimize to fail
4810 unconditionally */
4811 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4812 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4813 NEXT_OFF(REGNODE_p(orig_emit)) =
4814 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4815 return ret;
4816 }
4817 else if (min == max && *RExC_parse == '?') {
4818 ckWARN2reg(RExC_parse + 1,
4819 "Useless use of greediness modifier '%c'",
4820 *RExC_parse);
4821 }
4822
4823 break;
4824 } /* End of is {m,n} */
4825
4826 /* Here was a '{', but what followed it didn't form a quantifier. */
4827 /* FALLTHROUGH */
4828
4829 default:
4830 *flagp = flags;
4831 return(ret);
4832 NOT_REACHED; /*NOTREACHED*/
4833 }
4834
4835 /* Here we have a quantifier, and have calculated 'min' and 'max'.
4836 *
4837 * Check and possibly adjust a zero width operand */
4838 if (! (flags & (HASWIDTH|POSTPONED))) {
4839 if (max > REG_INFTY/3) {
4840 ckWARN2reg(RExC_parse,
4841 "%" UTF8f " matches null string many times",
4842 UTF8fARG(UTF, (RExC_parse >= origparse
4843 ? RExC_parse - origparse
4844 : 0),
4845 origparse));
4846 }
4847
4848 /* There's no point in trying to match something 0 length more than
4849 * once except for extra side effects, which we don't have here since
4850 * not POSTPONED */
4851 if (max > 1) {
4852 max = 1;
4853 if (min > max) {
4854 min = max;
4855 }
4856 }
4857 }
4858
4859 /* If this is a code block pass it up */
4860 *flagp |= (flags & POSTPONED);
4861
4862 if (max > 0) {
4863 *flagp |= (flags & HASWIDTH);
4864 if (max == REG_INFTY)
4865 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4866 }
4867
4868 /* 'SIMPLE' operands don't require full generality */
4869 if ((flags&SIMPLE)) {
4870 if (max == REG_INFTY) {
4871 if (min == 0) {
4872 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4873 goto min0_maxINF_wildcard_forbidden;
4874 }
4875
4876 reginsert(pRExC_state, STAR, ret, depth+1);
4877 MARK_NAUGHTY(4);
4878 goto done_main_op;
4879 }
4880 else if (min == 1) {
4881 reginsert(pRExC_state, PLUS, ret, depth+1);
4882 MARK_NAUGHTY(3);
4883 goto done_main_op;
4884 }
4885 }
4886
4887 /* Here, SIMPLE, but not the '*' and '+' special cases */
4888
4889 MARK_NAUGHTY_EXP(2, 2);
4890 reginsert(pRExC_state, CURLY, ret, depth+1);
4891 }
4892 else { /* not SIMPLE */
4893 const regnode_offset w = reg_node(pRExC_state, WHILEM);
4894
4895 FLAGS(REGNODE_p(w)) = 0;
4896 if (! REGTAIL(pRExC_state, ret, w)) {
4897 REQUIRE_BRANCHJ(flagp, 0);
4898 }
4899 if (RExC_use_BRANCHJ) {
4900 reginsert(pRExC_state, LONGJMP, ret, depth+1);
4901 reginsert(pRExC_state, NOTHING, ret, depth+1);
4902 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4903 }
4904 reginsert(pRExC_state, CURLYX, ret, depth+1);
4905 if (RExC_use_BRANCHJ)
4906 /* Go over NOTHING to LONGJMP. */
4907 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4908
4909 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4910 NOTHING)))
4911 {
4912 REQUIRE_BRANCHJ(flagp, 0);
4913 }
4914 RExC_whilem_seen++;
4915 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
4916 }
4917
4918 /* Finish up the CURLY/CURLYX case */
4919 FLAGS(REGNODE_p(ret)) = 0;
4920
4921 ARG1i_SET(REGNODE_p(ret), min);
4922 ARG2i_SET(REGNODE_p(ret), max);
4923
4924 /* if we had a npar_after then we need to increment npar_before,
4925 * we want to track the range of parens we need to reset each iteration
4926 */
4927 if (npar_after!=npar_before) {
4928 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4929 ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4930 } else {
4931 ARG3a_SET(REGNODE_p(ret), 0);
4932 ARG3b_SET(REGNODE_p(ret), 0);
4933 }
4934
4935 done_main_op:
4936
4937 /* Process any greediness modifiers */
4938 if (*RExC_parse == '?') {
4939 nextchar(pRExC_state);
4940 reginsert(pRExC_state, MINMOD, ret, depth+1);
4941 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4942 REQUIRE_BRANCHJ(flagp, 0);
4943 }
4944 }
4945 else if (*RExC_parse == '+') {
4946 regnode_offset ender;
4947 nextchar(pRExC_state);
4948 ender = reg_node(pRExC_state, SUCCEED);
4949 if (! REGTAIL(pRExC_state, ret, ender)) {
4950 REQUIRE_BRANCHJ(flagp, 0);
4951 }
4952 reginsert(pRExC_state, SUSPEND, ret, depth+1);
4953 ender = reg_node(pRExC_state, TAIL);
4954 if (! REGTAIL(pRExC_state, ret, ender)) {
4955 REQUIRE_BRANCHJ(flagp, 0);
4956 }
4957 }
4958
4959 /* Forbid extra quantifiers */
4960 if (isQUANTIFIER(RExC_parse, RExC_end)) {
4961 RExC_parse_inc_by(1);
4962 vFAIL("Nested quantifiers");
4963 }
4964
4965 return(ret);
4966
4967 min0_maxINF_wildcard_forbidden:
4968
4969 /* Here we are in a wildcard match, and the minimum match length is 0, and
4970 * the max could be infinity. This is currently forbidden. The only
4971 * reason is to make it harder to write patterns that take a long long time
4972 * to halt, and because the use of this construct isn't necessary in
4973 * matching Unicode property values */
4974 RExC_parse_inc_by(1);
4975 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4976 subpatterns in regex; marked by <-- HERE in m/%s/
4977 */
4978 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4979 " subpatterns");
4980
4981 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4982 * legal at all in wildcards, so can't get this far */
4983
4984 NOT_REACHED; /*NOTREACHED*/
4985 }
4986
4987 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)4988 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4989 regnode_offset * node_p,
4990 UV * code_point_p,
4991 int * cp_count,
4992 I32 * flagp,
4993 const bool strict,
4994 const U32 depth
4995 )
4996 {
4997 /* This routine teases apart the various meanings of \N and returns
4998 * accordingly. The input parameters constrain which meaning(s) is/are valid
4999 * in the current context.
5000 *
5001 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
5002 *
5003 * If <code_point_p> is not NULL, the context is expecting the result to be a
5004 * single code point. If this \N instance turns out to a single code point,
5005 * the function returns TRUE and sets *code_point_p to that code point.
5006 *
5007 * If <node_p> is not NULL, the context is expecting the result to be one of
5008 * the things representable by a regnode. If this \N instance turns out to be
5009 * one such, the function generates the regnode, returns TRUE and sets *node_p
5010 * to point to the offset of that regnode into the regex engine program being
5011 * compiled.
5012 *
5013 * If this instance of \N isn't legal in any context, this function will
5014 * generate a fatal error and not return.
5015 *
5016 * On input, RExC_parse should point to the first char following the \N at the
5017 * time of the call. On successful return, RExC_parse will have been updated
5018 * to point to just after the sequence identified by this routine. Also
5019 * *flagp has been updated as needed.
5020 *
5021 * When there is some problem with the current context and this \N instance,
5022 * the function returns FALSE, without advancing RExC_parse, nor setting
5023 * *node_p, nor *code_point_p, nor *flagp.
5024 *
5025 * If <cp_count> is not NULL, the caller wants to know the length (in code
5026 * points) that this \N sequence matches. This is set, and the input is
5027 * parsed for errors, even if the function returns FALSE, as detailed below.
5028 *
5029 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
5030 *
5031 * Probably the most common case is for the \N to specify a single code point.
5032 * *cp_count will be set to 1, and *code_point_p will be set to that code
5033 * point.
5034 *
5035 * Another possibility is for the input to be an empty \N{}. This is no
5036 * longer accepted, and will generate a fatal error.
5037 *
5038 * Another possibility is for a custom charnames handler to be in effect which
5039 * translates the input name to an empty string. *cp_count will be set to 0.
5040 * *node_p will be set to a generated NOTHING node.
5041 *
5042 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
5043 * set to 0. *node_p will be set to a generated REG_ANY node.
5044 *
5045 * The fifth possibility is that \N resolves to a sequence of more than one
5046 * code points. *cp_count will be set to the number of code points in the
5047 * sequence. *node_p will be set to a generated node returned by this
5048 * function calling S_reg().
5049 *
5050 * The sixth and final possibility is that it is premature to be calling this
5051 * function; the parse needs to be restarted. This can happen when this
5052 * changes from /d to /u rules, or when the pattern needs to be upgraded to
5053 * UTF-8. The latter occurs only when the fifth possibility would otherwise
5054 * be in effect, and is because one of those code points requires the pattern
5055 * to be recompiled as UTF-8. The function returns FALSE, and sets the
5056 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
5057 * happens, the caller needs to desist from continuing parsing, and return
5058 * this information to its caller. This is not set for when there is only one
5059 * code point, as this can be called as part of an ANYOF node, and they can
5060 * store above-Latin1 code points without the pattern having to be in UTF-8.
5061 *
5062 * For non-single-quoted regexes, the tokenizer has resolved character and
5063 * sequence names inside \N{...} into their Unicode values, normalizing the
5064 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
5065 * hex-represented code points in the sequence. This is done there because
5066 * the names can vary based on what charnames pragma is in scope at the time,
5067 * so we need a way to take a snapshot of what they resolve to at the time of
5068 * the original parse. [perl #56444].
5069 *
5070 * That parsing is skipped for single-quoted regexes, so here we may get
5071 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
5072 * like '\N{U+41}', that code point is Unicode, and has to be translated into
5073 * the native character set for non-ASCII platforms. The other possibilities
5074 * are already native, so no translation is done. */
5075
5076 char * endbrace; /* points to '}' following the name */
5077 char * e; /* points to final non-blank before endbrace */
5078 char* p = RExC_parse; /* Temporary */
5079
5080 SV * substitute_parse = NULL;
5081 char *orig_end;
5082 char *save_start;
5083 I32 flags;
5084
5085 DECLARE_AND_GET_RE_DEBUG_FLAGS;
5086
5087 PERL_ARGS_ASSERT_GROK_BSLASH_N;
5088
5089 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
5090 assert(! (node_p && cp_count)); /* At most 1 should be set */
5091
5092 if (cp_count) { /* Initialize return for the most common case */
5093 *cp_count = 1;
5094 }
5095
5096 /* The [^\n] meaning of \N ignores spaces and comments under the /x
5097 * modifier. The other meanings do not (except blanks adjacent to and
5098 * within the braces), so use a temporary until we find out which we are
5099 * being called with */
5100 skip_to_be_ignored_text(pRExC_state, &p,
5101 FALSE /* Don't force to /x */ );
5102
5103 /* Disambiguate between \N meaning a named character versus \N meaning
5104 * [^\n]. The latter is assumed when the {...} following the \N is a legal
5105 * quantifier, or if there is no '{' at all */
5106 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5107 RExC_parse_set(p);
5108 if (cp_count) {
5109 *cp_count = -1;
5110 }
5111
5112 if (! node_p) {
5113 return FALSE;
5114 }
5115
5116 *node_p = reg_node(pRExC_state, REG_ANY);
5117 *flagp |= HASWIDTH|SIMPLE;
5118 MARK_NAUGHTY(1);
5119 return TRUE;
5120 }
5121
5122 /* The test above made sure that the next real character is a '{', but
5123 * under the /x modifier, it could be separated by space (or a comment and
5124 * \n) and this is not allowed (for consistency with \x{...} and the
5125 * tokenizer handling of \N{NAME}). */
5126 if (*RExC_parse != '{') {
5127 vFAIL("Missing braces on \\N{}");
5128 }
5129
5130 RExC_parse_inc_by(1); /* Skip past the '{' */
5131
5132 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5133 if (! endbrace) { /* no trailing brace */
5134 vFAIL2("Missing right brace on \\%c{}", 'N');
5135 }
5136
5137 /* Here, we have decided it should be a named character or sequence. These
5138 * imply Unicode semantics */
5139 REQUIRE_UNI_RULES(flagp, FALSE);
5140
5141 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5142 * nothing at all (not allowed under strict) */
5143 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5144 RExC_parse_set(endbrace);
5145 if (strict) {
5146 RExC_parse_inc_by(1); /* Position after the "}" */
5147 vFAIL("Zero length \\N{}");
5148 }
5149
5150 if (cp_count) {
5151 *cp_count = 0;
5152 }
5153 nextchar(pRExC_state);
5154 if (! node_p) {
5155 return FALSE;
5156 }
5157
5158 *node_p = reg_node(pRExC_state, NOTHING);
5159 return TRUE;
5160 }
5161
5162 while (isBLANK(*RExC_parse)) {
5163 RExC_parse_inc_by(1);
5164 }
5165
5166 e = endbrace;
5167 while (RExC_parse < e && isBLANK(*(e-1))) {
5168 e--;
5169 }
5170
5171 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5172
5173 /* Here, the name isn't of the form U+.... This can happen if the
5174 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
5175 * is the time to find out what the name means */
5176
5177 const STRLEN name_len = e - RExC_parse;
5178 SV * value_sv; /* What does this name evaluate to */
5179 SV ** value_svp;
5180 const U8 * value; /* string of name's value */
5181 STRLEN value_len; /* and its length */
5182
5183 /* RExC_unlexed_names is a hash of names that weren't evaluated by
5184 * toke.c, and their values. Make sure is initialized */
5185 if (! RExC_unlexed_names) {
5186 RExC_unlexed_names = newHV();
5187 }
5188
5189 /* If we have already seen this name in this pattern, use that. This
5190 * allows us to only call the charnames handler once per name per
5191 * pattern. A broken or malicious handler could return something
5192 * different each time, which could cause the results to vary depending
5193 * on if something gets added or subtracted from the pattern that
5194 * causes the number of passes to change, for example */
5195 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5196 name_len, 0)))
5197 {
5198 value_sv = *value_svp;
5199 }
5200 else { /* Otherwise we have to go out and get the name */
5201 const char * error_msg = NULL;
5202 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5203 UTF,
5204 &error_msg);
5205 if (error_msg) {
5206 RExC_parse_set(endbrace);
5207 vFAIL(error_msg);
5208 }
5209
5210 /* If no error message, should have gotten a valid return */
5211 assert (value_sv);
5212
5213 /* Save the name's meaning for later use */
5214 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5215 value_sv, 0))
5216 {
5217 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5218 }
5219 }
5220
5221 /* Here, we have the value the name evaluates to in 'value_sv' */
5222 value = (U8 *) SvPV(value_sv, value_len);
5223
5224 /* See if the result is one code point vs 0 or multiple */
5225 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5226 ? UTF8SKIP(value)
5227 : 1)))
5228 {
5229 /* Here, exactly one code point. If that isn't what is wanted,
5230 * fail */
5231 if (! code_point_p) {
5232 RExC_parse_set(p);
5233 return FALSE;
5234 }
5235
5236 /* Convert from string to numeric code point */
5237 *code_point_p = (SvUTF8(value_sv))
5238 ? valid_utf8_to_uvchr(value, NULL)
5239 : *value;
5240
5241 /* Have parsed this entire single code point \N{...}. *cp_count
5242 * has already been set to 1, so don't do it again. */
5243 RExC_parse_set(endbrace);
5244 nextchar(pRExC_state);
5245 return TRUE;
5246 } /* End of is a single code point */
5247
5248 /* Count the code points, if caller desires. The API says to do this
5249 * even if we will later return FALSE */
5250 if (cp_count) {
5251 *cp_count = 0;
5252
5253 *cp_count = (SvUTF8(value_sv))
5254 ? utf8_length(value, value + value_len)
5255 : value_len;
5256 }
5257
5258 /* Fail if caller doesn't want to handle a multi-code-point sequence.
5259 * But don't back the pointer up if the caller wants to know how many
5260 * code points there are (they need to handle it themselves in this
5261 * case). */
5262 if (! node_p) {
5263 if (! cp_count) {
5264 RExC_parse_set(p);
5265 }
5266 return FALSE;
5267 }
5268
5269 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5270 * reg recursively to parse it. That way, it retains its atomicness,
5271 * while not having to worry about any special handling that some code
5272 * points may have. */
5273
5274 substitute_parse = newSVpvs("?:");
5275 sv_catsv(substitute_parse, value_sv);
5276 sv_catpv(substitute_parse, ")");
5277
5278 /* The value should already be native, so no need to convert on EBCDIC
5279 * platforms.*/
5280 assert(! RExC_recode_x_to_native);
5281
5282 }
5283 else { /* \N{U+...} */
5284 Size_t count = 0; /* code point count kept internally */
5285
5286 /* We can get to here when the input is \N{U+...} or when toke.c has
5287 * converted a name to the \N{U+...} form. This include changing a
5288 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5289
5290 RExC_parse_inc_by(2); /* Skip past the 'U+' */
5291
5292 /* Code points are separated by dots. The '}' terminates the whole
5293 * thing. */
5294
5295 do { /* Loop until the ending brace */
5296 I32 flags = PERL_SCAN_SILENT_OVERFLOW
5297 | PERL_SCAN_SILENT_ILLDIGIT
5298 | PERL_SCAN_NOTIFY_ILLDIGIT
5299 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5300 | PERL_SCAN_DISALLOW_PREFIX;
5301 STRLEN len = e - RExC_parse;
5302 NV overflow_value;
5303 char * start_digit = RExC_parse;
5304 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5305
5306 if (len == 0) {
5307 RExC_parse_inc_by(1);
5308 bad_NU:
5309 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5310 }
5311
5312 RExC_parse_inc_by(len);
5313
5314 if (cp > MAX_LEGAL_CP) {
5315 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5316 }
5317
5318 if (RExC_parse >= e) { /* Got to the closing '}' */
5319 if (count) {
5320 goto do_concat;
5321 }
5322
5323 /* Here, is a single code point; fail if doesn't want that */
5324 if (! code_point_p) {
5325 RExC_parse_set(p);
5326 return FALSE;
5327 }
5328
5329 /* A single code point is easy to handle; just return it */
5330 *code_point_p = UNI_TO_NATIVE(cp);
5331 RExC_parse_set(endbrace);
5332 nextchar(pRExC_state);
5333 return TRUE;
5334 }
5335
5336 /* Here, the parse stopped bfore the ending brace. This is legal
5337 * only if that character is a dot separating code points, like a
5338 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5339 * So the next character must be a dot (and the one after that
5340 * can't be the ending brace, or we'd have something like
5341 * \N{U+100.} )
5342 * */
5343 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5344 /*point to after 1st invalid */
5345 RExC_parse_incf(RExC_orig_utf8);
5346 /*Guard against malformed utf8*/
5347 RExC_parse_set(MIN(e, RExC_parse));
5348 goto bad_NU;
5349 }
5350
5351 /* Here, looks like its really a multiple character sequence. Fail
5352 * if that's not what the caller wants. But continue with counting
5353 * and error checking if they still want a count */
5354 if (! node_p && ! cp_count) {
5355 return FALSE;
5356 }
5357
5358 /* What is done here is to convert this to a sub-pattern of the
5359 * form \x{char1}\x{char2}... and then call reg recursively to
5360 * parse it (enclosing in "(?: ... )" ). That way, it retains its
5361 * atomicness, while not having to worry about special handling
5362 * that some code points may have. We don't create a subpattern,
5363 * but go through the motions of code point counting and error
5364 * checking, if the caller doesn't want a node returned. */
5365
5366 if (node_p && ! substitute_parse) {
5367 substitute_parse = newSVpvs("?:");
5368 }
5369
5370 do_concat:
5371
5372 if (node_p) {
5373 /* Convert to notation the rest of the code understands */
5374 sv_catpvs(substitute_parse, "\\x{");
5375 sv_catpvn(substitute_parse, start_digit,
5376 RExC_parse - start_digit);
5377 sv_catpvs(substitute_parse, "}");
5378 }
5379
5380 /* Move to after the dot (or ending brace the final time through.)
5381 * */
5382 RExC_parse_inc_by(1);
5383 count++;
5384
5385 } while (RExC_parse < e);
5386
5387 if (! node_p) { /* Doesn't want the node */
5388 assert (cp_count);
5389
5390 *cp_count = count;
5391 return FALSE;
5392 }
5393
5394 sv_catpvs(substitute_parse, ")");
5395
5396 /* The values are Unicode, and therefore have to be converted to native
5397 * on a non-Unicode (meaning non-ASCII) platform. */
5398 SET_recode_x_to_native(1);
5399 }
5400
5401 /* Here, we have the string the name evaluates to, ready to be parsed,
5402 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5403 * constructs. This can be called from within a substitute parse already.
5404 * The error reporting mechanism doesn't work for 2 levels of this, but the
5405 * code above has validated this new construct, so there should be no
5406 * errors generated by the below. And this isn't an exact copy, so the
5407 * mechanism to seamlessly deal with this won't work, so turn off warnings
5408 * during it */
5409 save_start = RExC_start;
5410 orig_end = RExC_end;
5411
5412 RExC_start = SvPVX(substitute_parse);
5413 RExC_parse_set(RExC_start);
5414 RExC_end = RExC_parse + SvCUR(substitute_parse);
5415 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5416
5417 *node_p = reg(pRExC_state, 1, &flags, depth+1);
5418
5419 /* Restore the saved values */
5420 RESTORE_WARNINGS;
5421 RExC_start = save_start;
5422 RExC_parse_set(endbrace);
5423 RExC_end = orig_end;
5424 SET_recode_x_to_native(0);
5425
5426 SvREFCNT_dec_NN(substitute_parse);
5427
5428 if (! *node_p) {
5429 RETURN_FAIL_ON_RESTART(flags, flagp);
5430 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5431 (UV) flags);
5432 }
5433 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5434
5435 nextchar(pRExC_state);
5436
5437 return TRUE;
5438 }
5439
5440
5441 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)5442 S_compute_EXACTish(RExC_state_t *pRExC_state)
5443 {
5444 U8 op;
5445
5446 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5447
5448 if (! FOLD) {
5449 return (LOC)
5450 ? EXACTL
5451 : EXACT;
5452 }
5453
5454 op = get_regex_charset(RExC_flags);
5455 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5456 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5457 been, so there is no hole */
5458 }
5459
5460 return op + EXACTF;
5461 }
5462
5463 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
5464 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5465
5466 static I32
S_backref_value(char * p,char * e)5467 S_backref_value(char *p, char *e)
5468 {
5469 const char* endptr = e;
5470 UV val;
5471 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5472 return (I32)val;
5473 return I32_MAX;
5474 }
5475
5476
5477 /*
5478 - regatom - the lowest level
5479
5480 Try to identify anything special at the start of the current parse position.
5481 If there is, then handle it as required. This may involve generating a
5482 single regop, such as for an assertion; or it may involve recursing, such as
5483 to handle a () structure.
5484
5485 If the string doesn't start with something special then we gobble up
5486 as much literal text as we can. If we encounter a quantifier, we have to
5487 back off the final literal character, as that quantifier applies to just it
5488 and not to the whole string of literals.
5489
5490 Once we have been able to handle whatever type of thing started the
5491 sequence, we return the offset into the regex engine program being compiled
5492 at which any next regnode should be placed.
5493
5494 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5495 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5496 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5497 Otherwise does not return 0.
5498
5499 Note: we have to be careful with escapes, as they can be both literal
5500 and special, and in the case of \10 and friends, context determines which.
5501
5502 A summary of the code structure is:
5503
5504 switch (first_byte) {
5505 cases for each special:
5506 handle this special;
5507 break;
5508 case '\\':
5509 switch (2nd byte) {
5510 cases for each unambiguous special:
5511 handle this special;
5512 break;
5513 cases for each ambiguous special/literal:
5514 disambiguate;
5515 if (special) handle here
5516 else goto defchar;
5517 default: // unambiguously literal:
5518 goto defchar;
5519 }
5520 default: // is a literal char
5521 // FALL THROUGH
5522 defchar:
5523 create EXACTish node for literal;
5524 while (more input and node isn't full) {
5525 switch (input_byte) {
5526 cases for each special;
5527 make sure parse pointer is set so that the next call to
5528 regatom will see this special first
5529 goto loopdone; // EXACTish node terminated by prev. char
5530 default:
5531 append char to EXACTISH node;
5532 }
5533 get next input byte;
5534 }
5535 loopdone:
5536 }
5537 return the generated node;
5538
5539 Specifically there are two separate switches for handling
5540 escape sequences, with the one for handling literal escapes requiring
5541 a dummy entry for all of the special escapes that are actually handled
5542 by the other.
5543
5544 */
5545
5546 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)5547 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5548 {
5549 regnode_offset ret = 0;
5550 I32 flags = 0;
5551 char *atom_parse_start;
5552 U8 op;
5553 int invert = 0;
5554
5555 DECLARE_AND_GET_RE_DEBUG_FLAGS;
5556
5557 *flagp = 0; /* Initialize. */
5558
5559 DEBUG_PARSE("atom");
5560
5561 PERL_ARGS_ASSERT_REGATOM;
5562
5563 tryagain:
5564 atom_parse_start = RExC_parse;
5565 assert(RExC_parse < RExC_end);
5566 switch ((U8)*RExC_parse) {
5567 case '^':
5568 RExC_seen_zerolen++;
5569 nextchar(pRExC_state);
5570 if (RExC_flags & RXf_PMf_MULTILINE)
5571 ret = reg_node(pRExC_state, MBOL);
5572 else
5573 ret = reg_node(pRExC_state, SBOL);
5574 break;
5575 case '$':
5576 nextchar(pRExC_state);
5577 if (*RExC_parse)
5578 RExC_seen_zerolen++;
5579 if (RExC_flags & RXf_PMf_MULTILINE)
5580 ret = reg_node(pRExC_state, MEOL);
5581 else
5582 ret = reg_node(pRExC_state, SEOL);
5583 break;
5584 case '.':
5585 nextchar(pRExC_state);
5586 if (RExC_flags & RXf_PMf_SINGLELINE)
5587 ret = reg_node(pRExC_state, SANY);
5588 else
5589 ret = reg_node(pRExC_state, REG_ANY);
5590 *flagp |= HASWIDTH|SIMPLE;
5591 MARK_NAUGHTY(1);
5592 break;
5593 case '[':
5594 {
5595 char * const cc_parse_start = ++RExC_parse;
5596 ret = regclass(pRExC_state, flagp, depth+1,
5597 FALSE, /* means parse the whole char class */
5598 TRUE, /* allow multi-char folds */
5599 FALSE, /* don't silence non-portable warnings. */
5600 (bool) RExC_strict,
5601 TRUE, /* Allow an optimized regnode result */
5602 NULL);
5603 if (ret == 0) {
5604 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5605 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5606 (UV) *flagp);
5607 }
5608 if (*RExC_parse != ']') {
5609 RExC_parse_set(cc_parse_start);
5610 vFAIL("Unmatched [");
5611 }
5612 nextchar(pRExC_state);
5613 break;
5614 }
5615 case '(':
5616 nextchar(pRExC_state);
5617 ret = reg(pRExC_state, 2, &flags, depth+1);
5618 if (ret == 0) {
5619 if (flags & TRYAGAIN) {
5620 if (RExC_parse >= RExC_end) {
5621 /* Make parent create an empty node if needed. */
5622 *flagp |= TRYAGAIN;
5623 return(0);
5624 }
5625 goto tryagain;
5626 }
5627 RETURN_FAIL_ON_RESTART(flags, flagp);
5628 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5629 (UV) flags);
5630 }
5631 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5632 break;
5633 case '|':
5634 case ')':
5635 if (flags & TRYAGAIN) {
5636 *flagp |= TRYAGAIN;
5637 return 0;
5638 }
5639 vFAIL("Internal urp");
5640 /* Supposed to be caught earlier. */
5641 break;
5642 case '?':
5643 case '+':
5644 case '*':
5645 RExC_parse_inc_by(1);
5646 vFAIL("Quantifier follows nothing");
5647 break;
5648 case '\\':
5649 /* Special Escapes
5650
5651 This switch handles escape sequences that resolve to some kind
5652 of special regop and not to literal text. Escape sequences that
5653 resolve to literal text are handled below in the switch marked
5654 "Literal Escapes".
5655
5656 Every entry in this switch *must* have a corresponding entry
5657 in the literal escape switch. However, the opposite is not
5658 required, as the default for this switch is to jump to the
5659 literal text handling code.
5660 */
5661 RExC_parse_inc_by(1);
5662 switch ((U8)*RExC_parse) {
5663 /* Special Escapes */
5664 case 'A':
5665 RExC_seen_zerolen++;
5666 /* Under wildcards, this is changed to match \n; should be
5667 * invisible to the user, as they have to compile under /m */
5668 if (RExC_pm_flags & PMf_WILDCARD) {
5669 ret = reg_node(pRExC_state, MBOL);
5670 }
5671 else {
5672 ret = reg_node(pRExC_state, SBOL);
5673 /* SBOL is shared with /^/ so we set the flags so we can tell
5674 * /\A/ from /^/ in split. */
5675 FLAGS(REGNODE_p(ret)) = 1;
5676 }
5677 goto finish_meta_pat;
5678 case 'G':
5679 if (RExC_pm_flags & PMf_WILDCARD) {
5680 RExC_parse_inc_by(1);
5681 /* diag_listed_as: Use of %s is not allowed in Unicode property
5682 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5683 */
5684 vFAIL("Use of '\\G' is not allowed in Unicode property"
5685 " wildcard subpatterns");
5686 }
5687 ret = reg_node(pRExC_state, GPOS);
5688 RExC_seen |= REG_GPOS_SEEN;
5689 goto finish_meta_pat;
5690 case 'K':
5691 if (!RExC_in_lookaround) {
5692 RExC_seen_zerolen++;
5693 ret = reg_node(pRExC_state, KEEPS);
5694 /* XXX:dmq : disabling in-place substitution seems to
5695 * be necessary here to avoid cases of memory corruption, as
5696 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5697 */
5698 RExC_seen |= REG_LOOKBEHIND_SEEN;
5699 goto finish_meta_pat;
5700 }
5701 else {
5702 ++RExC_parse; /* advance past the 'K' */
5703 vFAIL("\\K not permitted in lookahead/lookbehind");
5704 }
5705 case 'Z':
5706 if (RExC_pm_flags & PMf_WILDCARD) {
5707 /* See comment under \A above */
5708 ret = reg_node(pRExC_state, MEOL);
5709 }
5710 else {
5711 ret = reg_node(pRExC_state, SEOL);
5712 }
5713 RExC_seen_zerolen++; /* Do not optimize RE away */
5714 goto finish_meta_pat;
5715 case 'z':
5716 if (RExC_pm_flags & PMf_WILDCARD) {
5717 /* See comment under \A above */
5718 ret = reg_node(pRExC_state, MEOL);
5719 }
5720 else {
5721 ret = reg_node(pRExC_state, EOS);
5722 }
5723 RExC_seen_zerolen++; /* Do not optimize RE away */
5724 goto finish_meta_pat;
5725 case 'C':
5726 vFAIL("\\C no longer supported");
5727 case 'X':
5728 ret = reg_node(pRExC_state, CLUMP);
5729 *flagp |= HASWIDTH;
5730 goto finish_meta_pat;
5731
5732 case 'B':
5733 invert = 1;
5734 /* FALLTHROUGH */
5735 case 'b':
5736 {
5737 U8 flags = 0;
5738 regex_charset charset = get_regex_charset(RExC_flags);
5739
5740 RExC_seen_zerolen++;
5741 RExC_seen |= REG_LOOKBEHIND_SEEN;
5742 op = BOUND + charset;
5743
5744 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5745 flags = TRADITIONAL_BOUND;
5746 if (op > BOUNDA) { /* /aa is same as /a */
5747 op = BOUNDA;
5748 }
5749 }
5750 else {
5751 STRLEN length;
5752 char name = *RExC_parse;
5753 char * endbrace = (char *) memchr(RExC_parse, '}',
5754 RExC_end - RExC_parse);
5755 char * e = endbrace;
5756
5757 RExC_parse_inc_by(2);
5758
5759 if (! endbrace) {
5760 vFAIL2("Missing right brace on \\%c{}", name);
5761 }
5762
5763 while (isBLANK(*RExC_parse)) {
5764 RExC_parse_inc_by(1);
5765 }
5766
5767 while (RExC_parse < e && isBLANK(*(e - 1))) {
5768 e--;
5769 }
5770
5771 if (e == RExC_parse) {
5772 RExC_parse_set(endbrace + 1); /* After the '}' */
5773 vFAIL2("Empty \\%c{}", name);
5774 }
5775
5776 length = e - RExC_parse;
5777
5778 switch (*RExC_parse) {
5779 case 'g':
5780 if ( length != 1
5781 && (memNEs(RExC_parse + 1, length - 1, "cb")))
5782 {
5783 goto bad_bound_type;
5784 }
5785 flags = GCB_BOUND;
5786 break;
5787 case 'l':
5788 if (length != 2 || *(RExC_parse + 1) != 'b') {
5789 goto bad_bound_type;
5790 }
5791 flags = LB_BOUND;
5792 break;
5793 case 's':
5794 if (length != 2 || *(RExC_parse + 1) != 'b') {
5795 goto bad_bound_type;
5796 }
5797 flags = SB_BOUND;
5798 break;
5799 case 'w':
5800 if (length != 2 || *(RExC_parse + 1) != 'b') {
5801 goto bad_bound_type;
5802 }
5803 flags = WB_BOUND;
5804 break;
5805 default:
5806 bad_bound_type:
5807 RExC_parse_set(e);
5808 vFAIL2utf8f(
5809 "'%" UTF8f "' is an unknown bound type",
5810 UTF8fARG(UTF, length, e - length));
5811 NOT_REACHED; /*NOTREACHED*/
5812 }
5813 RExC_parse_set(endbrace);
5814 REQUIRE_UNI_RULES(flagp, 0);
5815
5816 if (op == BOUND) {
5817 op = BOUNDU;
5818 }
5819 else if (op >= BOUNDA) { /* /aa is same as /a */
5820 op = BOUNDU;
5821 length += 4;
5822
5823 /* Don't have to worry about UTF-8, in this message because
5824 * to get here the contents of the \b must be ASCII */
5825 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
5826 "Using /u for '%.*s' instead of /%s",
5827 (unsigned) length,
5828 endbrace - length + 1,
5829 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5830 ? ASCII_RESTRICT_PAT_MODS
5831 : ASCII_MORE_RESTRICT_PAT_MODS);
5832 }
5833 }
5834
5835 if (op == BOUND) {
5836 RExC_seen_d_op = TRUE;
5837 }
5838 else if (op == BOUNDL) {
5839 RExC_contains_locale = 1;
5840 }
5841
5842 if (invert) {
5843 op += NBOUND - BOUND;
5844 }
5845
5846 ret = reg_node(pRExC_state, op);
5847 FLAGS(REGNODE_p(ret)) = flags;
5848
5849 goto finish_meta_pat;
5850 }
5851
5852 case 'R':
5853 ret = reg_node(pRExC_state, LNBREAK);
5854 *flagp |= HASWIDTH|SIMPLE;
5855 goto finish_meta_pat;
5856
5857 case 'd':
5858 case 'D':
5859 case 'h':
5860 case 'H':
5861 case 'p':
5862 case 'P':
5863 case 's':
5864 case 'S':
5865 case 'v':
5866 case 'V':
5867 case 'w':
5868 case 'W':
5869 /* These all have the same meaning inside [brackets], and it knows
5870 * how to do the best optimizations for them. So, pretend we found
5871 * these within brackets, and let it do the work */
5872 RExC_parse--;
5873
5874 ret = regclass(pRExC_state, flagp, depth+1,
5875 TRUE, /* means just parse this element */
5876 FALSE, /* don't allow multi-char folds */
5877 FALSE, /* don't silence non-portable warnings. It
5878 would be a bug if these returned
5879 non-portables */
5880 (bool) RExC_strict,
5881 TRUE, /* Allow an optimized regnode result */
5882 NULL);
5883 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5884 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5885 * multi-char folds are allowed. */
5886 if (!ret)
5887 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5888 (UV) *flagp);
5889
5890 RExC_parse--; /* regclass() leaves this one too far ahead */
5891
5892 finish_meta_pat:
5893 /* The escapes above that don't take a parameter can't be
5894 * followed by a '{'. But 'pX', 'p{foo}' and
5895 * correspondingly 'P' can be */
5896 if ( RExC_parse - atom_parse_start == 1
5897 && UCHARAT(RExC_parse + 1) == '{'
5898 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5899 {
5900 RExC_parse_inc_by(2);
5901 vFAIL("Unescaped left brace in regex is illegal here");
5902 }
5903 nextchar(pRExC_state);
5904 break;
5905 case 'N':
5906 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5907 * \N{...} evaluates to a sequence of more than one code points).
5908 * The function call below returns a regnode, which is our result.
5909 * The parameters cause it to fail if the \N{} evaluates to a
5910 * single code point; we handle those like any other literal. The
5911 * reason that the multicharacter case is handled here and not as
5912 * part of the EXACtish code is because of quantifiers. In
5913 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5914 * this way makes that Just Happen. dmq.
5915 * join_exact() will join this up with adjacent EXACTish nodes
5916 * later on, if appropriate. */
5917 ++RExC_parse;
5918 if (grok_bslash_N(pRExC_state,
5919 &ret, /* Want a regnode returned */
5920 NULL, /* Fail if evaluates to a single code
5921 point */
5922 NULL, /* Don't need a count of how many code
5923 points */
5924 flagp,
5925 RExC_strict,
5926 depth)
5927 ) {
5928 break;
5929 }
5930
5931 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5932
5933 /* Here, evaluates to a single code point. Go get that */
5934 RExC_parse_set(atom_parse_start);
5935 goto defchar;
5936
5937 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5938 parse_named_seq: /* Also handle non-numeric \g{...} */
5939 {
5940 char ch;
5941 if ( RExC_parse >= RExC_end - 1
5942 || (( ch = RExC_parse[1]) != '<'
5943 && ch != '\''
5944 && ch != '{'))
5945 {
5946 RExC_parse_inc_by(1);
5947 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5948 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5949 } else {
5950 RExC_parse_inc_by(2);
5951 if (ch == '{') {
5952 while (isBLANK(*RExC_parse)) {
5953 RExC_parse_inc_by(1);
5954 }
5955 }
5956 ret = handle_named_backref(pRExC_state,
5957 flagp,
5958 atom_parse_start,
5959 (ch == '<')
5960 ? '>'
5961 : (ch == '{')
5962 ? '}'
5963 : '\'');
5964 }
5965 break;
5966 }
5967 case 'g':
5968 case '1': case '2': case '3': case '4':
5969 case '5': case '6': case '7': case '8': case '9':
5970 {
5971 I32 num;
5972 char * endbrace = NULL;
5973 char * s = RExC_parse;
5974 char * e = RExC_end;
5975
5976 if (*s == 'g') {
5977 bool isrel = 0;
5978
5979 s++;
5980 if (*s == '{') {
5981 endbrace = (char *) memchr(s, '}', RExC_end - s);
5982 if (! endbrace ) {
5983
5984 /* Missing '}'. Position after the number to give
5985 * a better indication to the user of where the
5986 * problem is. */
5987 s++;
5988 if (*s == '-') {
5989 s++;
5990 }
5991
5992 /* If it looks to be a name and not a number, go
5993 * handle it there */
5994 if (! isDIGIT(*s)) {
5995 goto parse_named_seq;
5996 }
5997
5998 do {
5999 s++;
6000 } while isDIGIT(*s);
6001
6002 RExC_parse_set(s);
6003 vFAIL("Unterminated \\g{...} pattern");
6004 }
6005
6006 s++; /* Past the '{' */
6007
6008 while (isBLANK(*s)) {
6009 s++;
6010 }
6011
6012 /* Ignore trailing blanks */
6013 e = endbrace;
6014 while (s < e && isBLANK(*(e - 1))) {
6015 e--;
6016 }
6017 }
6018
6019 /* Here, have isolated the meat of the construct from any
6020 * surrounding braces */
6021
6022 if (*s == '-') {
6023 isrel = 1;
6024 s++;
6025 }
6026
6027 if (endbrace && !isDIGIT(*s)) {
6028 goto parse_named_seq;
6029 }
6030
6031 RExC_parse_set(s);
6032 num = S_backref_value(RExC_parse, RExC_end);
6033 if (num == 0)
6034 vFAIL("Reference to invalid group 0");
6035 else if (num == I32_MAX) {
6036 if (isDIGIT(*RExC_parse))
6037 vFAIL("Reference to nonexistent group");
6038 else
6039 vFAIL("Unterminated \\g... pattern");
6040 }
6041
6042 if (isrel) {
6043 num = RExC_npar - num;
6044 if (num < 1)
6045 vFAIL("Reference to nonexistent or unclosed group");
6046 }
6047 else
6048 if (num < RExC_logical_npar) {
6049 num = RExC_logical_to_parno[num];
6050 }
6051 else
6052 if (ALL_PARENS_COUNTED) {
6053 if (num < RExC_logical_total_parens)
6054 num = RExC_logical_to_parno[num];
6055 else {
6056 num = -1;
6057 }
6058 }
6059 else{
6060 REQUIRE_PARENS_PASS;
6061 }
6062 }
6063 else {
6064 num = S_backref_value(RExC_parse, RExC_end);
6065 /* bare \NNN might be backref or octal - if it is larger
6066 * than or equal RExC_npar then it is assumed to be an
6067 * octal escape. Note RExC_npar is +1 from the actual
6068 * number of parens. */
6069 /* Note we do NOT check if num == I32_MAX here, as that is
6070 * handled by the RExC_npar check */
6071
6072 if ( /* any numeric escape < 10 is always a backref */
6073 num > 9
6074 /* any numeric escape < RExC_npar is a backref */
6075 && num >= RExC_logical_npar
6076 /* cannot be an octal escape if it starts with [89]
6077 * */
6078 && ! inRANGE(*RExC_parse, '8', '9')
6079 ) {
6080 /* Probably not meant to be a backref, instead likely
6081 * to be an octal character escape, e.g. \35 or \777.
6082 * The above logic should make it obvious why using
6083 * octal escapes in patterns is problematic. - Yves */
6084 RExC_parse_set(atom_parse_start);
6085 goto defchar;
6086 }
6087 if (num < RExC_logical_npar) {
6088 num = RExC_logical_to_parno[num];
6089 }
6090 else
6091 if (ALL_PARENS_COUNTED) {
6092 if (num < RExC_logical_total_parens) {
6093 num = RExC_logical_to_parno[num];
6094 } else {
6095 num = -1;
6096 }
6097 } else {
6098 REQUIRE_PARENS_PASS;
6099 }
6100 }
6101
6102 /* At this point RExC_parse points at a numeric escape like
6103 * \12 or \88 or the digits in \g{34} or \g34 or something
6104 * similar, which we should NOT treat as an octal escape. It
6105 * may or may not be a valid backref escape. For instance
6106 * \88888888 is unlikely to be a valid backref.
6107 *
6108 * We've already figured out what value the digits represent.
6109 * Now, move the parse to beyond them. */
6110 if (endbrace) {
6111 RExC_parse_set(endbrace + 1);
6112 }
6113 else while (isDIGIT(*RExC_parse)) {
6114 RExC_parse_inc_by(1);
6115 }
6116 if (num < 0)
6117 vFAIL("Reference to nonexistent group");
6118
6119 if (num >= (I32)RExC_npar) {
6120 /* It might be a forward reference; we can't fail until we
6121 * know, by completing the parse to get all the groups, and
6122 * then reparsing */
6123 if (ALL_PARENS_COUNTED) {
6124 if (num >= RExC_total_parens) {
6125 vFAIL("Reference to nonexistent group");
6126 }
6127 }
6128 else {
6129 REQUIRE_PARENS_PASS;
6130 }
6131 }
6132 RExC_sawback = 1;
6133 ret = reg2node(pRExC_state,
6134 ((! FOLD)
6135 ? REF
6136 : (ASCII_FOLD_RESTRICTED)
6137 ? REFFA
6138 : (AT_LEAST_UNI_SEMANTICS)
6139 ? REFFU
6140 : (LOC)
6141 ? REFFL
6142 : REFF),
6143 num, RExC_nestroot);
6144 if (RExC_nestroot && num >= RExC_nestroot)
6145 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6146 if (OP(REGNODE_p(ret)) == REFF) {
6147 RExC_seen_d_op = TRUE;
6148 }
6149 *flagp |= HASWIDTH;
6150
6151 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6152 FALSE /* Don't force to /x */ );
6153 }
6154 break;
6155 case '\0':
6156 if (RExC_parse >= RExC_end)
6157 FAIL("Trailing \\");
6158 /* FALLTHROUGH */
6159 default:
6160 /* Do not generate "unrecognized" warnings here, we fall
6161 back into the quick-grab loop below */
6162 RExC_parse_set(atom_parse_start);
6163 goto defchar;
6164 } /* end of switch on a \foo sequence */
6165 break;
6166
6167 case '#':
6168
6169 /* '#' comments should have been spaced over before this function was
6170 * called */
6171 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6172 /*
6173 if (RExC_flags & RXf_PMf_EXTENDED) {
6174 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6175 if (RExC_parse < RExC_end)
6176 goto tryagain;
6177 }
6178 */
6179
6180 /* FALLTHROUGH */
6181
6182 default:
6183 defchar: {
6184
6185 /* Here, we have determined that the next thing is probably a
6186 * literal character. RExC_parse points to the first byte of its
6187 * definition. (It still may be an escape sequence that evaluates
6188 * to a single character) */
6189
6190 STRLEN len = 0;
6191 UV ender = 0;
6192 char *p;
6193 char *s, *old_s = NULL, *old_old_s = NULL;
6194 char *s0;
6195 U32 max_string_len = 255;
6196
6197 /* We may have to reparse the node, artificially stopping filling
6198 * it early, based on info gleaned in the first parse. This
6199 * variable gives where we stop. Make it above the normal stopping
6200 * place first time through; otherwise it would stop too early */
6201 U32 upper_fill = max_string_len + 1;
6202
6203 /* We start out as an EXACT node, even if under /i, until we find a
6204 * character which is in a fold. The algorithm now segregates into
6205 * separate nodes, characters that fold from those that don't under
6206 * /i. (This hopefully will create nodes that are fixed strings
6207 * even under /i, giving the optimizer something to grab on to.)
6208 * So, if a node has something in it and the next character is in
6209 * the opposite category, that node is closed up, and the function
6210 * returns. Then regatom is called again, and a new node is
6211 * created for the new category. */
6212 U8 node_type = EXACT;
6213
6214 /* Assume the node will be fully used; the excess is given back at
6215 * the end. Under /i, we may need to temporarily add the fold of
6216 * an extra character or two at the end to check for splitting
6217 * multi-char folds, so allocate extra space for that. We can't
6218 * make any other length assumptions, as a byte input sequence
6219 * could shrink down. */
6220 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6221 + ((! FOLD)
6222 ? 0
6223 : 2 * ((UTF)
6224 ? UTF8_MAXBYTES_CASE
6225 /* Max non-UTF-8 expansion is 2 */ : 2)));
6226
6227 bool next_is_quantifier;
6228 char * oldp = NULL;
6229
6230 /* We can convert EXACTF nodes to EXACTFU if they contain only
6231 * characters that match identically regardless of the target
6232 * string's UTF8ness. The reason to do this is that EXACTF is not
6233 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6234 * runtime.
6235 *
6236 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6237 * contain only above-Latin1 characters (hence must be in UTF8),
6238 * which don't participate in folds with Latin1-range characters,
6239 * as the latter's folds aren't known until runtime. */
6240 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6241
6242 /* Single-character EXACTish nodes are almost always SIMPLE. This
6243 * allows us to override this as encountered */
6244 U8 maybe_SIMPLE = SIMPLE;
6245
6246 /* Does this node contain something that can't match unless the
6247 * target string is (also) in UTF-8 */
6248 bool requires_utf8_target = FALSE;
6249
6250 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6251 bool has_ss = FALSE;
6252
6253 /* So is the MICRO SIGN */
6254 bool has_micro_sign = FALSE;
6255
6256 /* Set when we fill up the current node and there is still more
6257 * text to process */
6258 bool overflowed;
6259
6260 /* Allocate an EXACT node. The node_type may change below to
6261 * another EXACTish node, but since the size of the node doesn't
6262 * change, it works */
6263 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6264 FILL_NODE(ret, node_type);
6265 RExC_emit += NODE_STEP_REGNODE;
6266
6267 s = STRING(REGNODE_p(ret));
6268
6269 s0 = s;
6270
6271 reparse:
6272
6273 p = RExC_parse;
6274 len = 0;
6275 s = s0;
6276 node_type = EXACT;
6277 oldp = NULL;
6278 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6279 maybe_SIMPLE = SIMPLE;
6280 requires_utf8_target = FALSE;
6281 has_ss = FALSE;
6282 has_micro_sign = FALSE;
6283
6284 continue_parse:
6285
6286 /* This breaks under rare circumstances. If folding, we do not
6287 * want to split a node at a character that is a non-final in a
6288 * multi-char fold, as an input string could just happen to want to
6289 * match across the node boundary. The code at the end of the loop
6290 * looks for this, and backs off until it finds not such a
6291 * character, but it is possible (though extremely, extremely
6292 * unlikely) for all characters in the node to be non-final fold
6293 * ones, in which case we just leave the node fully filled, and
6294 * hope that it doesn't match the string in just the wrong place */
6295
6296 assert( ! UTF /* Is at the beginning of a character */
6297 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6298 || UTF8_IS_START(UCHARAT(RExC_parse)));
6299
6300 overflowed = FALSE;
6301
6302 /* Here, we have a literal character. Find the maximal string of
6303 * them in the input that we can fit into a single EXACTish node.
6304 * We quit at the first non-literal or when the node gets full, or
6305 * under /i the categorization of folding/non-folding character
6306 * changes */
6307 while (p < RExC_end && len < upper_fill) {
6308
6309 /* In most cases each iteration adds one byte to the output.
6310 * The exceptions override this */
6311 Size_t added_len = 1;
6312
6313 oldp = p;
6314 old_old_s = old_s;
6315 old_s = s;
6316
6317 /* White space has already been ignored */
6318 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
6319 || ! is_PATWS_safe((p), RExC_end, UTF));
6320
6321 switch ((U8)*p) {
6322 const char* message;
6323 U32 packed_warn;
6324 U8 grok_c_char;
6325
6326 case '^':
6327 case '$':
6328 case '.':
6329 case '[':
6330 case '(':
6331 case ')':
6332 case '|':
6333 goto loopdone;
6334 case '\\':
6335 /* Literal Escapes Switch
6336
6337 This switch is meant to handle escape sequences that
6338 resolve to a literal character.
6339
6340 Every escape sequence that represents something
6341 else, like an assertion or a char class, is handled
6342 in the switch marked 'Special Escapes' above in this
6343 routine, but also has an entry here as anything that
6344 isn't explicitly mentioned here will be treated as
6345 an unescaped equivalent literal.
6346 */
6347
6348 switch ((U8)*++p) {
6349
6350 /* These are all the special escapes. */
6351 case 'A': /* Start assertion */
6352 case 'b': case 'B': /* Word-boundary assertion*/
6353 case 'C': /* Single char !DANGEROUS! */
6354 case 'd': case 'D': /* digit class */
6355 case 'g': case 'G': /* generic-backref, pos assertion */
6356 case 'h': case 'H': /* HORIZWS */
6357 case 'k': case 'K': /* named backref, keep marker */
6358 case 'p': case 'P': /* Unicode property */
6359 case 'R': /* LNBREAK */
6360 case 's': case 'S': /* space class */
6361 case 'v': case 'V': /* VERTWS */
6362 case 'w': case 'W': /* word class */
6363 case 'X': /* eXtended Unicode "combining
6364 character sequence" */
6365 case 'z': case 'Z': /* End of line/string assertion */
6366 --p;
6367 goto loopdone;
6368
6369 /* Anything after here is an escape that resolves to a
6370 literal. (Except digits, which may or may not)
6371 */
6372 case 'n':
6373 ender = '\n';
6374 p++;
6375 break;
6376 case 'N': /* Handle a single-code point named character. */
6377 RExC_parse_set( p + 1 );
6378 if (! grok_bslash_N(pRExC_state,
6379 NULL, /* Fail if evaluates to
6380 anything other than a
6381 single code point */
6382 &ender, /* The returned single code
6383 point */
6384 NULL, /* Don't need a count of
6385 how many code points */
6386 flagp,
6387 RExC_strict,
6388 depth)
6389 ) {
6390 if (*flagp & NEED_UTF8)
6391 FAIL("panic: grok_bslash_N set NEED_UTF8");
6392 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6393
6394 /* Here, it wasn't a single code point. Go close
6395 * up this EXACTish node. The switch() prior to
6396 * this switch handles the other cases */
6397 p = oldp;
6398 RExC_parse_set(p);
6399 goto loopdone;
6400 }
6401 p = RExC_parse;
6402 RExC_parse_set(atom_parse_start);
6403
6404 /* The \N{} means the pattern, if previously /d,
6405 * becomes /u. That means it can't be an EXACTF node,
6406 * but an EXACTFU */
6407 if (node_type == EXACTF) {
6408 node_type = EXACTFU;
6409
6410 /* If the node already contains something that
6411 * differs between EXACTF and EXACTFU, reparse it
6412 * as EXACTFU */
6413 if (! maybe_exactfu) {
6414 len = 0;
6415 s = s0;
6416 goto reparse;
6417 }
6418 }
6419
6420 break;
6421 case 'r':
6422 ender = '\r';
6423 p++;
6424 break;
6425 case 't':
6426 ender = '\t';
6427 p++;
6428 break;
6429 case 'f':
6430 ender = '\f';
6431 p++;
6432 break;
6433 case 'e':
6434 ender = ESC_NATIVE;
6435 p++;
6436 break;
6437 case 'a':
6438 ender = '\a';
6439 p++;
6440 break;
6441 case 'o':
6442 if (! grok_bslash_o(&p,
6443 RExC_end,
6444 &ender,
6445 &message,
6446 &packed_warn,
6447 (bool) RExC_strict,
6448 FALSE, /* No illegal cp's */
6449 UTF))
6450 {
6451 RExC_parse_set(p); /* going to die anyway; point to
6452 exact spot of failure */
6453 vFAIL(message);
6454 }
6455
6456 if (message && TO_OUTPUT_WARNINGS(p)) {
6457 warn_non_literal_string(p, packed_warn, message);
6458 }
6459 break;
6460 case 'x':
6461 if (! grok_bslash_x(&p,
6462 RExC_end,
6463 &ender,
6464 &message,
6465 &packed_warn,
6466 (bool) RExC_strict,
6467 FALSE, /* No illegal cp's */
6468 UTF))
6469 {
6470 RExC_parse_set(p); /* going to die anyway; point
6471 to exact spot of failure */
6472 vFAIL(message);
6473 }
6474
6475 if (message && TO_OUTPUT_WARNINGS(p)) {
6476 warn_non_literal_string(p, packed_warn, message);
6477 }
6478
6479 #ifdef EBCDIC
6480 if (ender < 0x100) {
6481 if (RExC_recode_x_to_native) {
6482 ender = LATIN1_TO_NATIVE(ender);
6483 }
6484 }
6485 #endif
6486 break;
6487 case 'c':
6488 p++;
6489 if (! grok_bslash_c(*p, &grok_c_char,
6490 &message, &packed_warn))
6491 {
6492 /* going to die anyway; point to exact spot of
6493 * failure */
6494 char *new_p= p + ((UTF)
6495 ? UTF8_SAFE_SKIP(p, RExC_end)
6496 : 1);
6497 RExC_parse_set(new_p);
6498 vFAIL(message);
6499 }
6500
6501 ender = grok_c_char;
6502 p++;
6503 if (message && TO_OUTPUT_WARNINGS(p)) {
6504 warn_non_literal_string(p, packed_warn, message);
6505 }
6506
6507 break;
6508 case '8': case '9': /* must be a backreference */
6509 --p;
6510 /* we have an escape like \8 which cannot be an octal escape
6511 * so we exit the loop, and let the outer loop handle this
6512 * escape which may or may not be a legitimate backref. */
6513 goto loopdone;
6514 case '1': case '2': case '3':case '4':
6515 case '5': case '6': case '7':
6516
6517 /* When we parse backslash escapes there is ambiguity
6518 * between backreferences and octal escapes. Any escape
6519 * from \1 - \9 is a backreference, any multi-digit
6520 * escape which does not start with 0 and which when
6521 * evaluated as decimal could refer to an already
6522 * parsed capture buffer is a back reference. Anything
6523 * else is octal.
6524 *
6525 * Note this implies that \118 could be interpreted as
6526 * 118 OR as "\11" . "8" depending on whether there
6527 * were 118 capture buffers defined already in the
6528 * pattern. */
6529
6530 /* NOTE, RExC_npar is 1 more than the actual number of
6531 * parens we have seen so far, hence the "<" as opposed
6532 * to "<=" */
6533 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6534 { /* Not to be treated as an octal constant, go
6535 find backref */
6536 p = oldp;
6537 goto loopdone;
6538 }
6539 /* FALLTHROUGH */
6540 case '0':
6541 {
6542 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6543 | PERL_SCAN_NOTIFY_ILLDIGIT;
6544 STRLEN numlen = 3;
6545 ender = grok_oct(p, &numlen, &flags, NULL);
6546 p += numlen;
6547 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6548 && isDIGIT(*p) /* like \08, \178 */
6549 && ckWARN(WARN_REGEXP))
6550 {
6551 reg_warn_non_literal_string(
6552 p + 1,
6553 form_alien_digit_msg(8, numlen, p,
6554 RExC_end, UTF, FALSE));
6555 }
6556 }
6557 break;
6558 case '\0':
6559 if (p >= RExC_end)
6560 FAIL("Trailing \\");
6561 /* FALLTHROUGH */
6562 default:
6563 if (isALPHANUMERIC(*p)) {
6564 /* An alpha followed by '{' is going to fail next
6565 * iteration, so don't output this warning in that
6566 * case */
6567 if (! isALPHA(*p) || *(p + 1) != '{') {
6568 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6569 " passed through", p);
6570 }
6571 }
6572 goto normal_default;
6573 } /* End of switch on '\' */
6574 break;
6575 case '{':
6576 /* Trying to gain new uses for '{' without breaking too
6577 * much existing code is hard. The solution currently
6578 * adopted is:
6579 * 1) If there is no ambiguity that a '{' should always
6580 * be taken literally, at the start of a construct, we
6581 * just do so.
6582 * 2) If the literal '{' conflicts with our desired use
6583 * of it as a metacharacter, we die. The deprecation
6584 * cycles for this have come and gone.
6585 * 3) If there is ambiguity, we raise a simple warning.
6586 * This could happen, for example, if the user
6587 * intended it to introduce a quantifier, but slightly
6588 * misspelled the quantifier. Without this warning,
6589 * the quantifier would silently be taken as a literal
6590 * string of characters instead of a meta construct */
6591 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6592 if ( RExC_strict
6593 || ( p > atom_parse_start + 1
6594 && isALPHA_A(*(p - 1))
6595 && *(p - 2) == '\\'))
6596 {
6597 RExC_parse_set(p + 1);
6598 vFAIL("Unescaped left brace in regex is "
6599 "illegal here");
6600 }
6601 ckWARNreg(p + 1, "Unescaped left brace in regex is"
6602 " passed through");
6603 }
6604 goto normal_default;
6605 case '}':
6606 case ']':
6607 if (p > RExC_parse && RExC_strict) {
6608 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6609 }
6610 /*FALLTHROUGH*/
6611 default: /* A literal character */
6612 normal_default:
6613 if (! UTF8_IS_INVARIANT(*p) && UTF) {
6614 STRLEN numlen;
6615 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6616 &numlen, UTF8_ALLOW_DEFAULT);
6617 p += numlen;
6618 }
6619 else
6620 ender = (U8) *p++;
6621 break;
6622 } /* End of switch on the literal */
6623
6624 /* Here, have looked at the literal character, and <ender>
6625 * contains its ordinal; <p> points to the character after it.
6626 * */
6627
6628 if (ender > 255) {
6629 REQUIRE_UTF8(flagp);
6630 if ( UNICODE_IS_PERL_EXTENDED(ender)
6631 && TO_OUTPUT_WARNINGS(p))
6632 {
6633 ckWARN2_non_literal_string(p,
6634 packWARN(WARN_PORTABLE),
6635 PL_extended_cp_format,
6636 ender);
6637 }
6638 }
6639
6640 /* We need to check if the next non-ignored thing is a
6641 * quantifier. Move <p> to after anything that should be
6642 * ignored, which, as a side effect, positions <p> for the next
6643 * loop iteration */
6644 skip_to_be_ignored_text(pRExC_state, &p,
6645 FALSE /* Don't force to /x */ );
6646
6647 /* If the next thing is a quantifier, it applies to this
6648 * character only, which means that this character has to be in
6649 * its own node and can't just be appended to the string in an
6650 * existing node, so if there are already other characters in
6651 * the node, close the node with just them, and set up to do
6652 * this character again next time through, when it will be the
6653 * only thing in its new node */
6654
6655 next_is_quantifier = LIKELY(p < RExC_end)
6656 && UNLIKELY(isQUANTIFIER(p, RExC_end));
6657
6658 if (next_is_quantifier && LIKELY(len)) {
6659 p = oldp;
6660 goto loopdone;
6661 }
6662
6663 /* Ready to add 'ender' to the node */
6664
6665 if (! FOLD) { /* The simple case, just append the literal */
6666 not_fold_common:
6667
6668 /* Don't output if it would overflow */
6669 if (UNLIKELY(len > max_string_len - ((UTF)
6670 ? UVCHR_SKIP(ender)
6671 : 1)))
6672 {
6673 overflowed = TRUE;
6674 break;
6675 }
6676
6677 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6678 *(s++) = (char) ender;
6679 }
6680 else {
6681 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6682 added_len = (char *) new_s - s;
6683 s = (char *) new_s;
6684
6685 if (ender > 255) {
6686 requires_utf8_target = TRUE;
6687 }
6688 }
6689 }
6690 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6691
6692 /* Here are folding under /l, and the code point is
6693 * problematic. If this is the first character in the
6694 * node, change the node type to folding. Otherwise, if
6695 * this is the first problematic character, close up the
6696 * existing node, so can start a new node with this one */
6697 if (! len) {
6698 node_type = EXACTFL;
6699 RExC_contains_locale = 1;
6700 }
6701 else if (node_type == EXACT) {
6702 p = oldp;
6703 goto loopdone;
6704 }
6705
6706 /* This problematic code point means we can't simplify
6707 * things */
6708 maybe_exactfu = FALSE;
6709
6710 /* Although these two characters have folds that are
6711 * locale-problematic, they also have folds to above Latin1
6712 * that aren't a problem. Doing these now helps at
6713 * runtime. */
6714 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
6715 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6716 {
6717 goto fold_anyway;
6718 }
6719
6720 /* Here, we are adding a problematic fold character.
6721 * "Problematic" in this context means that its fold isn't
6722 * known until runtime. (The non-problematic code points
6723 * are the above-Latin1 ones that fold to also all
6724 * above-Latin1. Their folds don't vary no matter what the
6725 * locale is.) But here we have characters whose fold
6726 * depends on the locale. We just add in the unfolded
6727 * character, and wait until runtime to fold it */
6728 goto not_fold_common;
6729 }
6730 else /* regular fold; see if actually is in a fold */
6731 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6732 || (ender > 255
6733 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6734 {
6735 /* Here, folding, but the character isn't in a fold.
6736 *
6737 * Start a new node if previous characters in the node were
6738 * folded */
6739 if (len && node_type != EXACT) {
6740 p = oldp;
6741 goto loopdone;
6742 }
6743
6744 /* Here, continuing a node with non-folded characters. Add
6745 * this one */
6746 goto not_fold_common;
6747 }
6748 else { /* Here, does participate in some fold */
6749
6750 /* If this is the first character in the node, change its
6751 * type to folding. Otherwise, if this is the first
6752 * folding character in the node, close up the existing
6753 * node, so can start a new node with this one. */
6754 if (! len) {
6755 node_type = compute_EXACTish(pRExC_state);
6756 }
6757 else if (node_type == EXACT) {
6758 p = oldp;
6759 goto loopdone;
6760 }
6761
6762 if (UTF) { /* Alway use the folded value for UTF-8
6763 patterns */
6764 if (UVCHR_IS_INVARIANT(ender)) {
6765 if (UNLIKELY(len + 1 > max_string_len)) {
6766 overflowed = TRUE;
6767 break;
6768 }
6769
6770 *(s)++ = (U8) toFOLD(ender);
6771 }
6772 else {
6773 UV folded;
6774
6775 fold_anyway:
6776 folded = _to_uni_fold_flags(
6777 ender,
6778 (U8 *) s, /* We have allocated extra space
6779 in 's' so can't run off the
6780 end */
6781 &added_len,
6782 FOLD_FLAGS_FULL
6783 | (( ASCII_FOLD_RESTRICTED
6784 || node_type == EXACTFL)
6785 ? FOLD_FLAGS_NOMIX_ASCII
6786 : 0));
6787 if (UNLIKELY(len + added_len > max_string_len)) {
6788 overflowed = TRUE;
6789 break;
6790 }
6791
6792 s += added_len;
6793
6794 if ( folded > 255
6795 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6796 {
6797 /* U+B5 folds to the MU, so its possible for a
6798 * non-UTF-8 target to match it */
6799 requires_utf8_target = TRUE;
6800 }
6801 }
6802 }
6803 else { /* Here is non-UTF8. */
6804
6805 /* The fold will be one or (rarely) two characters.
6806 * Check that there's room for at least a single one
6807 * before setting any flags, etc. Because otherwise an
6808 * overflowing character could cause a flag to be set
6809 * even though it doesn't end up in this node. (For
6810 * the two character fold, we check again, before
6811 * setting any flags) */
6812 if (UNLIKELY(len + 1 > max_string_len)) {
6813 overflowed = TRUE;
6814 break;
6815 }
6816
6817 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
6818 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
6819 || UNICODE_DOT_DOT_VERSION > 0)
6820
6821 /* On non-ancient Unicodes, check for the only possible
6822 * multi-char fold */
6823 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6824
6825 /* This potential multi-char fold means the node
6826 * can't be simple (because it could match more
6827 * than a single char). And in some cases it will
6828 * match 'ss', so set that flag */
6829 maybe_SIMPLE = 0;
6830 has_ss = TRUE;
6831
6832 /* It can't change to be an EXACTFU (unless already
6833 * is one). We fold it iff under /u rules. */
6834 if (node_type != EXACTFU) {
6835 maybe_exactfu = FALSE;
6836 }
6837 else {
6838 if (UNLIKELY(len + 2 > max_string_len)) {
6839 overflowed = TRUE;
6840 break;
6841 }
6842
6843 *(s++) = 's';
6844 *(s++) = 's';
6845 added_len = 2;
6846
6847 goto done_with_this_char;
6848 }
6849 }
6850 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6851 && LIKELY(len > 0)
6852 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6853 {
6854 /* Also, the sequence 'ss' is special when not
6855 * under /u. If the target string is UTF-8, it
6856 * should match SHARP S; otherwise it won't. So,
6857 * here we have to exclude the possibility of this
6858 * node moving to /u.*/
6859 has_ss = TRUE;
6860 maybe_exactfu = FALSE;
6861 }
6862 #endif
6863 /* Here, the fold will be a single character */
6864
6865 if (UNLIKELY(ender == MICRO_SIGN)) {
6866 has_micro_sign = TRUE;
6867 }
6868 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6869
6870 /* If the character's fold differs between /d and
6871 * /u, this can't change to be an EXACTFU node */
6872 maybe_exactfu = FALSE;
6873 }
6874
6875 *(s++) = (DEPENDS_SEMANTICS)
6876 ? (char) toFOLD(ender)
6877
6878 /* Under /u, the fold of any character in
6879 * the 0-255 range happens to be its
6880 * lowercase equivalent, except for LATIN
6881 * SMALL LETTER SHARP S, which was handled
6882 * above, and the MICRO SIGN, whose fold
6883 * requires UTF-8 to represent. */
6884 : (char) toLOWER_L1(ender);
6885 }
6886 } /* End of adding current character to the node */
6887
6888 done_with_this_char:
6889
6890 len += added_len;
6891
6892 if (next_is_quantifier) {
6893
6894 /* Here, the next input is a quantifier, and to get here,
6895 * the current character is the only one in the node. */
6896 goto loopdone;
6897 }
6898
6899 } /* End of loop through literal characters */
6900
6901 /* Here we have either exhausted the input or run out of room in
6902 * the node. If the former, we are done. (If we encountered a
6903 * character that can't be in the node, transfer is made directly
6904 * to <loopdone>, and so we wouldn't have fallen off the end of the
6905 * loop.) */
6906 if (LIKELY(! overflowed)) {
6907 goto loopdone;
6908 }
6909
6910 /* Here we have run out of room. We can grow plain EXACT and
6911 * LEXACT nodes. If the pattern is gigantic enough, though,
6912 * eventually we'll have to artificially chunk the pattern into
6913 * multiple nodes. */
6914 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6915 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6916 Size_t overhead_expansion = 0;
6917 char temp[256];
6918 Size_t max_nodes_for_string;
6919 Size_t achievable;
6920 SSize_t delta;
6921
6922 /* Here we couldn't fit the final character in the current
6923 * node, so it will have to be reparsed, no matter what else we
6924 * do */
6925 p = oldp;
6926
6927 /* If would have overflowed a regular EXACT node, switch
6928 * instead to an LEXACT. The code below is structured so that
6929 * the actual growing code is common to changing from an EXACT
6930 * or just increasing the LEXACT size. This means that we have
6931 * to save the string in the EXACT case before growing, and
6932 * then copy it afterwards to its new location */
6933 if (node_type == EXACT) {
6934 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6935 RExC_emit += overhead_expansion;
6936 Copy(s0, temp, len, char);
6937 }
6938
6939 /* Ready to grow. If it was a plain EXACT, the string was
6940 * saved, and the first few bytes of it overwritten by adding
6941 * an argument field. We assume, as we do elsewhere in this
6942 * file, that one byte of remaining input will translate into
6943 * one byte of output, and if that's too small, we grow again,
6944 * if too large the excess memory is freed at the end */
6945
6946 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6947 achievable = MIN(max_nodes_for_string,
6948 current_string_nodes + STR_SZ(RExC_end - p));
6949 delta = achievable - current_string_nodes;
6950
6951 /* If there is just no more room, go finish up this chunk of
6952 * the pattern. */
6953 if (delta <= 0) {
6954 goto loopdone;
6955 }
6956
6957 change_engine_size(pRExC_state, delta + overhead_expansion);
6958 current_string_nodes += delta;
6959 max_string_len
6960 = sizeof(struct regnode) * current_string_nodes;
6961 upper_fill = max_string_len + 1;
6962
6963 /* If the length was small, we know this was originally an
6964 * EXACT node now converted to LEXACT, and the string has to be
6965 * restored. Otherwise the string was untouched. 260 is just
6966 * a number safely above 255 so don't have to worry about
6967 * getting it precise */
6968 if (len < 260) {
6969 node_type = LEXACT;
6970 FILL_NODE(ret, node_type);
6971 s0 = STRING(REGNODE_p(ret));
6972 Copy(temp, s0, len, char);
6973 s = s0 + len;
6974 }
6975
6976 goto continue_parse;
6977 }
6978 else if (FOLD) {
6979 bool splittable = FALSE;
6980 bool backed_up = FALSE;
6981 char * e; /* should this be U8? */
6982 char * s_start; /* should this be U8? */
6983
6984 /* Here is /i. Running out of room creates a problem if we are
6985 * folding, and the split happens in the middle of a
6986 * multi-character fold, as a match that should have occurred,
6987 * won't, due to the way nodes are matched, and our artificial
6988 * boundary. So back off until we aren't splitting such a
6989 * fold. If there is no such place to back off to, we end up
6990 * taking the entire node as-is. This can happen if the node
6991 * consists entirely of 'f' or entirely of 's' characters (or
6992 * things that fold to them) as 'ff' and 'ss' are
6993 * multi-character folds.
6994 *
6995 * The Unicode standard says that multi character folds consist
6996 * of either two or three characters. That means we would be
6997 * splitting one if the final character in the node is at the
6998 * beginning of either type, or is the second of a three
6999 * character fold.
7000 *
7001 * At this point:
7002 * ender is the code point of the character that won't fit
7003 * in the node
7004 * s points to just beyond the final byte in the node.
7005 * It's where we would place ender if there were
7006 * room, and where in fact we do place ender's fold
7007 * in the code below, as we've over-allocated space
7008 * for s0 (hence s) to allow for this
7009 * e starts at 's' and advances as we append things.
7010 * old_s is the same as 's'. (If ender had fit, 's' would
7011 * have been advanced to beyond it).
7012 * old_old_s points to the beginning byte of the final
7013 * character in the node
7014 * p points to the beginning byte in the input of the
7015 * character beyond 'ender'.
7016 * oldp points to the beginning byte in the input of
7017 * 'ender'.
7018 *
7019 * In the case of /il, we haven't folded anything that could be
7020 * affected by the locale. That means only above-Latin1
7021 * characters that fold to other above-latin1 characters get
7022 * folded at compile time. To check where a good place to
7023 * split nodes is, everything in it will have to be folded.
7024 * The boolean 'maybe_exactfu' keeps track in /il if there are
7025 * any unfolded characters in the node. */
7026 bool need_to_fold_loc = LOC && ! maybe_exactfu;
7027
7028 /* If we do need to fold the node, we need a place to store the
7029 * folded copy, and a way to map back to the unfolded original
7030 * */
7031 char * locfold_buf = NULL;
7032 Size_t * loc_correspondence = NULL;
7033
7034 if (! need_to_fold_loc) { /* The normal case. Just
7035 initialize to the actual node */
7036 e = s;
7037 s_start = s0;
7038 s = old_old_s; /* Point to the beginning of the final char
7039 that fits in the node */
7040 }
7041 else {
7042
7043 /* Here, we have filled a /il node, and there are unfolded
7044 * characters in it. If the runtime locale turns out to be
7045 * UTF-8, there are possible multi-character folds, just
7046 * like when not under /l. The node hence can't terminate
7047 * in the middle of such a fold. To determine this, we
7048 * have to create a folded copy of this node. That means
7049 * reparsing the node, folding everything assuming a UTF-8
7050 * locale. (If at runtime it isn't such a locale, the
7051 * actions here wouldn't have been necessary, but we have
7052 * to assume the worst case.) If we find we need to back
7053 * off the folded string, we do so, and then map that
7054 * position back to the original unfolded node, which then
7055 * gets output, truncated at that spot */
7056
7057 char * redo_p = RExC_parse;
7058 char * redo_e;
7059 char * old_redo_e;
7060
7061 /* Allow enough space assuming a single byte input folds to
7062 * a single byte output, plus assume that the two unparsed
7063 * characters (that we may need) fold to the largest number
7064 * of bytes possible, plus extra for one more worst case
7065 * scenario. In the loop below, if we start eating into
7066 * that final spare space, we enlarge this initial space */
7067 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
7068
7069 Newxz(locfold_buf, size, char);
7070 Newxz(loc_correspondence, size, Size_t);
7071
7072 /* Redo this node's parse, folding into 'locfold_buf' */
7073 redo_p = RExC_parse;
7074 old_redo_e = redo_e = locfold_buf;
7075 while (redo_p <= oldp) {
7076
7077 old_redo_e = redo_e;
7078 loc_correspondence[redo_e - locfold_buf]
7079 = redo_p - RExC_parse;
7080
7081 if (UTF) {
7082 Size_t added_len;
7083
7084 (void) _to_utf8_fold_flags((U8 *) redo_p,
7085 (U8 *) RExC_end,
7086 (U8 *) redo_e,
7087 &added_len,
7088 FOLD_FLAGS_FULL);
7089 redo_e += added_len;
7090 redo_p += UTF8SKIP(redo_p);
7091 }
7092 else {
7093
7094 /* Note that if this code is run on some ancient
7095 * Unicode versions, SHARP S doesn't fold to 'ss',
7096 * but rather than clutter the code with #ifdef's,
7097 * as is done above, we ignore that possibility.
7098 * This is ok because this code doesn't affect what
7099 * gets matched, but merely where the node gets
7100 * split */
7101 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7102 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7103 }
7104 else {
7105 *redo_e++ = 's';
7106 *redo_e++ = 's';
7107 }
7108 redo_p++;
7109 }
7110
7111
7112 /* If we're getting so close to the end that a
7113 * worst-case fold in the next character would cause us
7114 * to overflow, increase, assuming one byte output byte
7115 * per one byte input one, plus room for another worst
7116 * case fold */
7117 if ( redo_p <= oldp
7118 && redo_e > locfold_buf + size
7119 - (UTF8_MAXBYTES_CASE + 1))
7120 {
7121 Size_t new_size = size
7122 + (oldp - redo_p)
7123 + UTF8_MAXBYTES_CASE + 1;
7124 Ptrdiff_t e_offset = redo_e - locfold_buf;
7125
7126 Renew(locfold_buf, new_size, char);
7127 Renew(loc_correspondence, new_size, Size_t);
7128 size = new_size;
7129
7130 redo_e = locfold_buf + e_offset;
7131 }
7132 }
7133
7134 /* Set so that things are in terms of the folded, temporary
7135 * string */
7136 s = old_redo_e;
7137 s_start = locfold_buf;
7138 e = redo_e;
7139
7140 }
7141
7142 /* Here, we have 's', 's_start' and 'e' set up to point to the
7143 * input that goes into the node, folded.
7144 *
7145 * If the final character of the node and the fold of ender
7146 * form the first two characters of a three character fold, we
7147 * need to peek ahead at the next (unparsed) character in the
7148 * input to determine if the three actually do form such a
7149 * fold. Just looking at that character is not generally
7150 * sufficient, as it could be, for example, an escape sequence
7151 * that evaluates to something else, and it needs to be folded.
7152 *
7153 * khw originally thought to just go through the parse loop one
7154 * extra time, but that doesn't work easily as that iteration
7155 * could cause things to think that the parse is over and to
7156 * goto loopdone. The character could be a '$' for example, or
7157 * the character beyond could be a quantifier, and other
7158 * glitches as well.
7159 *
7160 * The solution used here for peeking ahead is to look at that
7161 * next character. If it isn't ASCII punctuation, then it will
7162 * be something that would continue on in an EXACTish node if
7163 * there were space. We append the fold of it to s, having
7164 * reserved enough room in s0 for the purpose. If we can't
7165 * reasonably peek ahead, we instead assume the worst case:
7166 * that it is something that would form the completion of a
7167 * multi-char fold.
7168 *
7169 * If we can't split between s and ender, we work backwards
7170 * character-by-character down to s0. At each current point
7171 * see if we are at the beginning of a multi-char fold. If so,
7172 * that means we would be splitting the fold across nodes, and
7173 * so we back up one and try again.
7174 *
7175 * If we're not at the beginning, we still could be at the
7176 * final two characters of a (rare) three character fold. We
7177 * check if the sequence starting at the character before the
7178 * current position (and including the current and next
7179 * characters) is a three character fold. If not, the node can
7180 * be split here. If it is, we have to backup two characters
7181 * and try again.
7182 *
7183 * Otherwise, the node can be split at the current position.
7184 *
7185 * The same logic is used for UTF-8 patterns and not */
7186 if (UTF) {
7187 Size_t added_len;
7188
7189 /* Append the fold of ender */
7190 (void) _to_uni_fold_flags(
7191 ender,
7192 (U8 *) e,
7193 &added_len,
7194 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7195 ? FOLD_FLAGS_NOMIX_ASCII
7196 : 0));
7197 e += added_len;
7198
7199 /* 's' and the character folded to by ender may be the
7200 * first two of a three-character fold, in which case the
7201 * node should not be split here. That may mean examining
7202 * the so-far unparsed character starting at 'p'. But if
7203 * ender folded to more than one character, we already have
7204 * three characters to look at. Also, we first check if
7205 * the sequence consisting of s and the next character form
7206 * the first two of some three character fold. If not,
7207 * there's no need to peek ahead. */
7208 if ( added_len <= UTF8SKIP(e - added_len)
7209 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7210 {
7211 /* Here, the two do form the beginning of a potential
7212 * three character fold. The unexamined character may
7213 * or may not complete it. Peek at it. It might be
7214 * something that ends the node or an escape sequence,
7215 * in which case we don't know without a lot of work
7216 * what it evaluates to, so we have to assume the worst
7217 * case: that it does complete the fold, and so we
7218 * can't split here. All such instances will have
7219 * that character be an ASCII punctuation character,
7220 * like a backslash. So, for that case, backup one and
7221 * drop down to try at that position */
7222 if (isPUNCT(*p)) {
7223 s = (char *) utf8_hop_back((U8 *) s, -1,
7224 (U8 *) s_start);
7225 backed_up = TRUE;
7226 }
7227 else {
7228 /* Here, since it's not punctuation, it must be a
7229 * real character, and we can append its fold to
7230 * 'e' (having deliberately reserved enough space
7231 * for this eventuality) and drop down to check if
7232 * the three actually do form a folded sequence */
7233 (void) _to_utf8_fold_flags(
7234 (U8 *) p, (U8 *) RExC_end,
7235 (U8 *) e,
7236 &added_len,
7237 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7238 ? FOLD_FLAGS_NOMIX_ASCII
7239 : 0));
7240 e += added_len;
7241 }
7242 }
7243
7244 /* Here, we either have three characters available in
7245 * sequence starting at 's', or we have two characters and
7246 * know that the following one can't possibly be part of a
7247 * three character fold. We go through the node backwards
7248 * until we find a place where we can split it without
7249 * breaking apart a multi-character fold. At any given
7250 * point we have to worry about if such a fold begins at
7251 * the current 's', and also if a three-character fold
7252 * begins at s-1, (containing s and s+1). Splitting in
7253 * either case would break apart a fold */
7254 do {
7255 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7256 (U8 *) s_start);
7257
7258 /* If is a multi-char fold, can't split here. Backup
7259 * one char and try again */
7260 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7261 s = prev_s;
7262 backed_up = TRUE;
7263 continue;
7264 }
7265
7266 /* If the two characters beginning at 's' are part of a
7267 * three character fold starting at the character
7268 * before s, we can't split either before or after s.
7269 * Backup two chars and try again */
7270 if ( LIKELY(s > s_start)
7271 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7272 {
7273 s = prev_s;
7274 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7275 backed_up = TRUE;
7276 continue;
7277 }
7278
7279 /* Here there's no multi-char fold between s and the
7280 * next character following it. We can split */
7281 splittable = TRUE;
7282 break;
7283
7284 } while (s > s_start); /* End of loops backing up through the node */
7285
7286 /* Here we either couldn't find a place to split the node,
7287 * or else we broke out of the loop setting 'splittable' to
7288 * true. In the latter case, the place to split is between
7289 * the first and second characters in the sequence starting
7290 * at 's' */
7291 if (splittable) {
7292 s += UTF8SKIP(s);
7293 }
7294 }
7295 else { /* Pattern not UTF-8 */
7296 if ( ender != LATIN_SMALL_LETTER_SHARP_S
7297 || ASCII_FOLD_RESTRICTED)
7298 {
7299 assert( toLOWER_L1(ender) < 256 );
7300 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7301 }
7302 else {
7303 *e++ = 's';
7304 *e++ = 's';
7305 }
7306
7307 if ( e - s <= 1
7308 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7309 {
7310 if (isPUNCT(*p)) {
7311 s--;
7312 backed_up = TRUE;
7313 }
7314 else {
7315 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7316 || ASCII_FOLD_RESTRICTED)
7317 {
7318 assert( toLOWER_L1(ender) < 256 );
7319 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7320 }
7321 else {
7322 *e++ = 's';
7323 *e++ = 's';
7324 }
7325 }
7326 }
7327
7328 do {
7329 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7330 s--;
7331 backed_up = TRUE;
7332 continue;
7333 }
7334
7335 if ( LIKELY(s > s_start)
7336 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7337 {
7338 s -= 2;
7339 backed_up = TRUE;
7340 continue;
7341 }
7342
7343 splittable = TRUE;
7344 break;
7345
7346 } while (s > s_start);
7347
7348 if (splittable) {
7349 s++;
7350 }
7351 }
7352
7353 /* Here, we are done backing up. If we didn't backup at all
7354 * (the likely case), just proceed */
7355 if (backed_up) {
7356
7357 /* If we did find a place to split, reparse the entire node
7358 * stopping where we have calculated. */
7359 if (splittable) {
7360
7361 /* If we created a temporary folded string under /l, we
7362 * have to map that back to the original */
7363 if (need_to_fold_loc) {
7364 upper_fill = loc_correspondence[s - s_start];
7365 if (upper_fill == 0) {
7366 FAIL2("panic: loc_correspondence[%d] is 0",
7367 (int) (s - s_start));
7368 }
7369 Safefree(locfold_buf);
7370 Safefree(loc_correspondence);
7371 }
7372 else {
7373 upper_fill = s - s0;
7374 }
7375 goto reparse;
7376 }
7377
7378 /* Here the node consists entirely of non-final multi-char
7379 * folds. (Likely it is all 'f's or all 's's.) There's no
7380 * decent place to split it, so give up and just take the
7381 * whole thing */
7382 len = old_s - s0;
7383 }
7384
7385 if (need_to_fold_loc) {
7386 Safefree(locfold_buf);
7387 Safefree(loc_correspondence);
7388 }
7389 } /* End of verifying node ends with an appropriate char */
7390
7391 /* We need to start the next node at the character that didn't fit
7392 * in this one */
7393 p = oldp;
7394
7395 loopdone: /* Jumped to when encounters something that shouldn't be
7396 in the node */
7397
7398 /* Free up any over-allocated space; cast is to silence bogus
7399 * warning in MS VC */
7400 change_engine_size(pRExC_state,
7401 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7402
7403 /* I (khw) don't know if you can get here with zero length, but the
7404 * old code handled this situation by creating a zero-length EXACT
7405 * node. Might as well be NOTHING instead */
7406 if (len == 0) {
7407 OP(REGNODE_p(ret)) = NOTHING;
7408 }
7409 else {
7410
7411 /* If the node type is EXACT here, check to see if it
7412 * should be EXACTL, or EXACT_REQ8. */
7413 if (node_type == EXACT) {
7414 if (LOC) {
7415 node_type = EXACTL;
7416 }
7417 else if (requires_utf8_target) {
7418 node_type = EXACT_REQ8;
7419 }
7420 }
7421 else if (node_type == LEXACT) {
7422 if (requires_utf8_target) {
7423 node_type = LEXACT_REQ8;
7424 }
7425 }
7426 else if (FOLD) {
7427 if ( UNLIKELY(has_micro_sign || has_ss)
7428 && (node_type == EXACTFU || ( node_type == EXACTF
7429 && maybe_exactfu)))
7430 { /* These two conditions are problematic in non-UTF-8
7431 EXACTFU nodes. */
7432 assert(! UTF);
7433 node_type = EXACTFUP;
7434 }
7435 else if (node_type == EXACTFL) {
7436
7437 /* 'maybe_exactfu' is deliberately set above to
7438 * indicate this node type, where all code points in it
7439 * are above 255 */
7440 if (maybe_exactfu) {
7441 node_type = EXACTFLU8;
7442 }
7443 else if (UNLIKELY(
7444 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7445 {
7446 /* A character that folds to more than one will
7447 * match multiple characters, so can't be SIMPLE.
7448 * We don't have to worry about this with EXACTFLU8
7449 * nodes just above, as they have already been
7450 * folded (since the fold doesn't vary at run
7451 * time). Here, if the final character in the node
7452 * folds to multiple, it can't be simple. (This
7453 * only has an effect if the node has only a single
7454 * character, hence the final one, as elsewhere we
7455 * turn off simple for nodes whose length > 1 */
7456 maybe_SIMPLE = 0;
7457 }
7458 }
7459 else if (node_type == EXACTF) { /* Means is /di */
7460
7461 /* This intermediate variable is needed solely because
7462 * the asserts in the macro where used exceed Win32's
7463 * literal string capacity */
7464 char first_char = * STRING(REGNODE_p(ret));
7465
7466 /* If 'maybe_exactfu' is clear, then we need to stay
7467 * /di. If it is set, it means there are no code
7468 * points that match differently depending on UTF8ness
7469 * of the target string, so it can become an EXACTFU
7470 * node */
7471 if (! maybe_exactfu) {
7472 RExC_seen_d_op = TRUE;
7473 }
7474 else if ( isALPHA_FOLD_EQ(first_char, 's')
7475 || isALPHA_FOLD_EQ(ender, 's'))
7476 {
7477 /* But, if the node begins or ends in an 's' we
7478 * have to defer changing it into an EXACTFU, as
7479 * the node could later get joined with another one
7480 * that ends or begins with 's' creating an 'ss'
7481 * sequence which would then wrongly match the
7482 * sharp s without the target being UTF-8. We
7483 * create a special node that we resolve later when
7484 * we join nodes together */
7485
7486 node_type = EXACTFU_S_EDGE;
7487 }
7488 else {
7489 node_type = EXACTFU;
7490 }
7491 }
7492
7493 if (requires_utf8_target && node_type == EXACTFU) {
7494 node_type = EXACTFU_REQ8;
7495 }
7496 }
7497
7498 OP(REGNODE_p(ret)) = node_type;
7499 setSTR_LEN(REGNODE_p(ret), len);
7500 RExC_emit += STR_SZ(len);
7501
7502 /* If the node isn't a single character, it can't be SIMPLE */
7503 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7504 maybe_SIMPLE = 0;
7505 }
7506
7507 *flagp |= HASWIDTH | maybe_SIMPLE;
7508 }
7509
7510 RExC_parse_set(p);
7511
7512 {
7513 /* len is STRLEN which is unsigned, need to copy to signed */
7514 IV iv = len;
7515 if (iv < 0)
7516 vFAIL("Internal disaster");
7517 }
7518
7519 } /* End of label 'defchar:' */
7520 break;
7521 } /* End of giant switch on input character */
7522
7523 /* Position parse to next real character */
7524 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7525 FALSE /* Don't force to /x */ );
7526 if ( *RExC_parse == '{'
7527 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7528 {
7529 if (RExC_strict) {
7530 RExC_parse_inc_by(1);
7531 vFAIL("Unescaped left brace in regex is illegal here");
7532 }
7533 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7534 " passed through");
7535 }
7536
7537 return(ret);
7538 }
7539
7540
7541 #ifdef PERL_RE_BUILD_AUX
7542 void
Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)7543 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7544 {
7545 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
7546 * sets up the bitmap and any flags, removing those code points from the
7547 * inversion list, setting it to NULL should it become completely empty */
7548
7549
7550 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7551
7552 /* There is no bitmap for this node type */
7553 if (REGNODE_TYPE(OP(node)) != ANYOF) {
7554 return;
7555 }
7556
7557 ANYOF_BITMAP_ZERO(node);
7558 if (*invlist_ptr) {
7559
7560 /* This gets set if we actually need to modify things */
7561 bool change_invlist = FALSE;
7562
7563 UV start, end;
7564
7565 /* Start looking through *invlist_ptr */
7566 invlist_iterinit(*invlist_ptr);
7567 while (invlist_iternext(*invlist_ptr, &start, &end)) {
7568 UV high;
7569 int i;
7570
7571 /* Quit if are above what we should change */
7572 if (start >= NUM_ANYOF_CODE_POINTS) {
7573 break;
7574 }
7575
7576 change_invlist = TRUE;
7577
7578 /* Set all the bits in the range, up to the max that we are doing */
7579 high = (end < NUM_ANYOF_CODE_POINTS - 1)
7580 ? end
7581 : NUM_ANYOF_CODE_POINTS - 1;
7582 for (i = start; i <= (int) high; i++) {
7583 ANYOF_BITMAP_SET(node, i);
7584 }
7585 }
7586 invlist_iterfinish(*invlist_ptr);
7587
7588 /* Done with loop; remove any code points that are in the bitmap from
7589 * *invlist_ptr */
7590 if (change_invlist) {
7591 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7592 }
7593
7594 /* If have completely emptied it, remove it completely */
7595 if (_invlist_len(*invlist_ptr) == 0) {
7596 SvREFCNT_dec_NN(*invlist_ptr);
7597 *invlist_ptr = NULL;
7598 }
7599 }
7600 }
7601 #endif /* PERL_RE_BUILD_AUX */
7602
7603 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7604 Character classes ([:foo:]) can also be negated ([:^foo:]).
7605 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7606 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7607 but trigger failures because they are currently unimplemented. */
7608
7609 #define POSIXCC_DONE(c) ((c) == ':')
7610 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7611 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7612 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7613
7614 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
7615 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
7616 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
7617
7618 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7619
7620 /* 'posix_warnings' and 'warn_text' are names of variables in the following
7621 * routine. q.v. */
7622 #define ADD_POSIX_WARNING(p, text) STMT_START { \
7623 if (posix_warnings) { \
7624 if (! RExC_warn_text ) RExC_warn_text = \
7625 (AV *) sv_2mortal((SV *) newAV()); \
7626 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \
7627 WARNING_PREFIX \
7628 text \
7629 REPORT_LOCATION, \
7630 REPORT_LOCATION_ARGS(p))); \
7631 } \
7632 } STMT_END
7633 #define CLEAR_POSIX_WARNINGS() \
7634 STMT_START { \
7635 if (posix_warnings && RExC_warn_text) \
7636 av_clear(RExC_warn_text); \
7637 } STMT_END
7638
7639 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
7640 STMT_START { \
7641 CLEAR_POSIX_WARNINGS(); \
7642 return ret; \
7643 } STMT_END
7644
7645 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)7646 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7647
7648 const char * const s, /* Where the putative posix class begins.
7649 Normally, this is one past the '['. This
7650 parameter exists so it can be somewhere
7651 besides RExC_parse. */
7652 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7653 NULL */
7654 AV ** posix_warnings, /* Where to place any generated warnings, or
7655 NULL */
7656 const bool check_only /* Don't die if error */
7657 )
7658 {
7659 /* This parses what the caller thinks may be one of the three POSIX
7660 * constructs:
7661 * 1) a character class, like [:blank:]
7662 * 2) a collating symbol, like [. .]
7663 * 3) an equivalence class, like [= =]
7664 * In the latter two cases, it croaks if it finds a syntactically legal
7665 * one, as these are not handled by Perl.
7666 *
7667 * The main purpose is to look for a POSIX character class. It returns:
7668 * a) the class number
7669 * if it is a completely syntactically and semantically legal class.
7670 * 'updated_parse_ptr', if not NULL, is set to point to just after the
7671 * closing ']' of the class
7672 * b) OOB_NAMEDCLASS
7673 * if it appears that one of the three POSIX constructs was meant, but
7674 * its specification was somehow defective. 'updated_parse_ptr', if
7675 * not NULL, is set to point to the character just after the end
7676 * character of the class. See below for handling of warnings.
7677 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7678 * if it doesn't appear that a POSIX construct was intended.
7679 * 'updated_parse_ptr' is not changed. No warnings nor errors are
7680 * raised.
7681 *
7682 * In b) there may be errors or warnings generated. If 'check_only' is
7683 * TRUE, then any errors are discarded. Warnings are returned to the
7684 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
7685 * instead it is NULL, warnings are suppressed.
7686 *
7687 * The reason for this function, and its complexity is that a bracketed
7688 * character class can contain just about anything. But it's easy to
7689 * mistype the very specific posix class syntax but yielding a valid
7690 * regular bracketed class, so it silently gets compiled into something
7691 * quite unintended.
7692 *
7693 * The solution adopted here maintains backward compatibility except that
7694 * it adds a warning if it looks like a posix class was intended but
7695 * improperly specified. The warning is not raised unless what is input
7696 * very closely resembles one of the 14 legal posix classes. To do this,
7697 * it uses fuzzy parsing. It calculates how many single-character edits it
7698 * would take to transform what was input into a legal posix class. Only
7699 * if that number is quite small does it think that the intention was a
7700 * posix class. Obviously these are heuristics, and there will be cases
7701 * where it errs on one side or another, and they can be tweaked as
7702 * experience informs.
7703 *
7704 * The syntax for a legal posix class is:
7705 *
7706 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7707 *
7708 * What this routine considers syntactically to be an intended posix class
7709 * is this (the comments indicate some restrictions that the pattern
7710 * doesn't show):
7711 *
7712 * qr/(?x: \[? # The left bracket, possibly
7713 * # omitted
7714 * \h* # possibly followed by blanks
7715 * (?: \^ \h* )? # possibly a misplaced caret
7716 * [:;]? # The opening class character,
7717 * # possibly omitted. A typo
7718 * # semi-colon can also be used.
7719 * \h*
7720 * \^? # possibly a correctly placed
7721 * # caret, but not if there was also
7722 * # a misplaced one
7723 * \h*
7724 * .{3,15} # The class name. If there are
7725 * # deviations from the legal syntax,
7726 * # its edit distance must be close
7727 * # to a real class name in order
7728 * # for it to be considered to be
7729 * # an intended posix class.
7730 * \h*
7731 * [[:punct:]]? # The closing class character,
7732 * # possibly omitted. If not a colon
7733 * # nor semi colon, the class name
7734 * # must be even closer to a valid
7735 * # one
7736 * \h*
7737 * \]? # The right bracket, possibly
7738 * # omitted.
7739 * )/
7740 *
7741 * In the above, \h must be ASCII-only.
7742 *
7743 * These are heuristics, and can be tweaked as field experience dictates.
7744 * There will be cases when someone didn't intend to specify a posix class
7745 * that this warns as being so. The goal is to minimize these, while
7746 * maximizing the catching of things intended to be a posix class that
7747 * aren't parsed as such.
7748 */
7749
7750 const char* p = s;
7751 const char * const e = RExC_end;
7752 unsigned complement = 0; /* If to complement the class */
7753 bool found_problem = FALSE; /* Assume OK until proven otherwise */
7754 bool has_opening_bracket = FALSE;
7755 bool has_opening_colon = FALSE;
7756 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
7757 valid class */
7758 const char * possible_end = NULL; /* used for a 2nd parse pass */
7759 const char* name_start; /* ptr to class name first char */
7760
7761 /* If the number of single-character typos the input name is away from a
7762 * legal name is no more than this number, it is considered to have meant
7763 * the legal name */
7764 int max_distance = 2;
7765
7766 /* to store the name. The size determines the maximum length before we
7767 * decide that no posix class was intended. Should be at least
7768 * sizeof("alphanumeric") */
7769 UV input_text[15];
7770 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7771
7772 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7773
7774 CLEAR_POSIX_WARNINGS();
7775
7776 if (p >= e) {
7777 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7778 }
7779
7780 if (*(p - 1) != '[') {
7781 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7782 found_problem = TRUE;
7783 }
7784 else {
7785 has_opening_bracket = TRUE;
7786 }
7787
7788 /* They could be confused and think you can put spaces between the
7789 * components */
7790 if (isBLANK(*p)) {
7791 found_problem = TRUE;
7792
7793 do {
7794 p++;
7795 } while (p < e && isBLANK(*p));
7796
7797 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7798 }
7799
7800 /* For [. .] and [= =]. These are quite different internally from [: :],
7801 * so they are handled separately. */
7802 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7803 and 1 for at least one char in it
7804 */
7805 {
7806 const char open_char = *p;
7807 const char * temp_ptr = p + 1;
7808
7809 /* These two constructs are not handled by perl, and if we find a
7810 * syntactically valid one, we croak. khw, who wrote this code, finds
7811 * this explanation of them very unclear:
7812 * https://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7813 * And searching the rest of the internet wasn't very helpful either.
7814 * It looks like just about any byte can be in these constructs,
7815 * depending on the locale. But unless the pattern is being compiled
7816 * under /l, which is very rare, Perl runs under the C or POSIX locale.
7817 * In that case, it looks like [= =] isn't allowed at all, and that
7818 * [. .] could be any single code point, but for longer strings the
7819 * constituent characters would have to be the ASCII alphabetics plus
7820 * the minus-hyphen. Any sensible locale definition would limit itself
7821 * to these. And any portable one definitely should. Trying to parse
7822 * the general case is a nightmare (see [perl #127604]). So, this code
7823 * looks only for interiors of these constructs that match:
7824 * qr/.|[-\w]{2,}/
7825 * Using \w relaxes the apparent rules a little, without adding much
7826 * danger of mistaking something else for one of these constructs.
7827 *
7828 * [. .] in some implementations described on the internet is usable to
7829 * escape a character that otherwise is special in bracketed character
7830 * classes. For example [.].] means a literal right bracket instead of
7831 * the ending of the class
7832 *
7833 * [= =] can legitimately contain a [. .] construct, but we don't
7834 * handle this case, as that [. .] construct will later get parsed
7835 * itself and croak then. And [= =] is checked for even when not under
7836 * /l, as Perl has long done so.
7837 *
7838 * The code below relies on there being a trailing NUL, so it doesn't
7839 * have to keep checking if the parse ptr < e.
7840 */
7841 if (temp_ptr[1] == open_char) {
7842 temp_ptr++;
7843 }
7844 else while ( temp_ptr < e
7845 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7846 {
7847 temp_ptr++;
7848 }
7849
7850 if (*temp_ptr == open_char) {
7851 temp_ptr++;
7852 if (*temp_ptr == ']') {
7853 temp_ptr++;
7854 if (! found_problem && ! check_only) {
7855 RExC_parse_set((char *) temp_ptr);
7856 vFAIL3("POSIX syntax [%c %c] is reserved for future "
7857 "extensions", open_char, open_char);
7858 }
7859
7860 /* Here, the syntax wasn't completely valid, or else the call
7861 * is to check-only */
7862 if (updated_parse_ptr) {
7863 *updated_parse_ptr = (char *) temp_ptr;
7864 }
7865
7866 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7867 }
7868 }
7869
7870 /* If we find something that started out to look like one of these
7871 * constructs, but isn't, we continue below so that it can be checked
7872 * for being a class name with a typo of '.' or '=' instead of a colon.
7873 * */
7874 }
7875
7876 /* Here, we think there is a possibility that a [: :] class was meant, and
7877 * we have the first real character. It could be they think the '^' comes
7878 * first */
7879 if (*p == '^') {
7880 found_problem = TRUE;
7881 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7882 complement = 1;
7883 p++;
7884
7885 if (isBLANK(*p)) {
7886 found_problem = TRUE;
7887
7888 do {
7889 p++;
7890 } while (p < e && isBLANK(*p));
7891
7892 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7893 }
7894 }
7895
7896 /* But the first character should be a colon, which they could have easily
7897 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7898 * distinguish from a colon, so treat that as a colon). */
7899 if (*p == ':') {
7900 p++;
7901 has_opening_colon = TRUE;
7902 }
7903 else if (*p == ';') {
7904 found_problem = TRUE;
7905 p++;
7906 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7907 has_opening_colon = TRUE;
7908 }
7909 else {
7910 found_problem = TRUE;
7911 ADD_POSIX_WARNING(p, "there must be a starting ':'");
7912
7913 /* Consider an initial punctuation (not one of the recognized ones) to
7914 * be a left terminator */
7915 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7916 p++;
7917 }
7918 }
7919
7920 /* They may think that you can put spaces between the components */
7921 if (isBLANK(*p)) {
7922 found_problem = TRUE;
7923
7924 do {
7925 p++;
7926 } while (p < e && isBLANK(*p));
7927
7928 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7929 }
7930
7931 if (*p == '^') {
7932
7933 /* We consider something like [^:^alnum:]] to not have been intended to
7934 * be a posix class, but XXX maybe we should */
7935 if (complement) {
7936 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7937 }
7938
7939 complement = 1;
7940 p++;
7941 }
7942
7943 /* Again, they may think that you can put spaces between the components */
7944 if (isBLANK(*p)) {
7945 found_problem = TRUE;
7946
7947 do {
7948 p++;
7949 } while (p < e && isBLANK(*p));
7950
7951 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7952 }
7953
7954 if (*p == ']') {
7955
7956 /* XXX This ']' may be a typo, and something else was meant. But
7957 * treating it as such creates enough complications, that that
7958 * possibility isn't currently considered here. So we assume that the
7959 * ']' is what is intended, and if we've already found an initial '[',
7960 * this leaves this construct looking like [:] or [:^], which almost
7961 * certainly weren't intended to be posix classes */
7962 if (has_opening_bracket) {
7963 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7964 }
7965
7966 /* But this function can be called when we parse the colon for
7967 * something like qr/[alpha:]]/, so we back up to look for the
7968 * beginning */
7969 p--;
7970
7971 if (*p == ';') {
7972 found_problem = TRUE;
7973 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7974 }
7975 else if (*p != ':') {
7976
7977 /* XXX We are currently very restrictive here, so this code doesn't
7978 * consider the possibility that, say, /[alpha.]]/ was intended to
7979 * be a posix class. */
7980 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7981 }
7982
7983 /* Here we have something like 'foo:]'. There was no initial colon,
7984 * and we back up over 'foo. XXX Unlike the going forward case, we
7985 * don't handle typos of non-word chars in the middle */
7986 has_opening_colon = FALSE;
7987 p--;
7988
7989 while (p > RExC_start && isWORDCHAR(*p)) {
7990 p--;
7991 }
7992 p++;
7993
7994 /* Here, we have positioned ourselves to where we think the first
7995 * character in the potential class is */
7996 }
7997
7998 /* Now the interior really starts. There are certain key characters that
7999 * can end the interior, or these could just be typos. To catch both
8000 * cases, we may have to do two passes. In the first pass, we keep on
8001 * going unless we come to a sequence that matches
8002 * qr/ [[:punct:]] [[:blank:]]* \] /xa
8003 * This means it takes a sequence to end the pass, so two typos in a row if
8004 * that wasn't what was intended. If the class is perfectly formed, just
8005 * this one pass is needed. We also stop if there are too many characters
8006 * being accumulated, but this number is deliberately set higher than any
8007 * real class. It is set high enough so that someone who thinks that
8008 * 'alphanumeric' is a correct name would get warned that it wasn't.
8009 * While doing the pass, we keep track of where the key characters were in
8010 * it. If we don't find an end to the class, and one of the key characters
8011 * was found, we redo the pass, but stop when we get to that character.
8012 * Thus the key character was considered a typo in the first pass, but a
8013 * terminator in the second. If two key characters are found, we stop at
8014 * the second one in the first pass. Again this can miss two typos, but
8015 * catches a single one
8016 *
8017 * In the first pass, 'possible_end' starts as NULL, and then gets set to
8018 * point to the first key character. For the second pass, it starts as -1.
8019 * */
8020
8021 name_start = p;
8022 parse_name:
8023 {
8024 bool has_blank = FALSE;
8025 bool has_upper = FALSE;
8026 bool has_terminating_colon = FALSE;
8027 bool has_terminating_bracket = FALSE;
8028 bool has_semi_colon = FALSE;
8029 unsigned int name_len = 0;
8030 int punct_count = 0;
8031
8032 while (p < e) {
8033
8034 /* Squeeze out blanks when looking up the class name below */
8035 if (isBLANK(*p) ) {
8036 has_blank = TRUE;
8037 found_problem = TRUE;
8038 p++;
8039 continue;
8040 }
8041
8042 /* The name will end with a punctuation */
8043 if (isPUNCT(*p)) {
8044 const char * peek = p + 1;
8045
8046 /* Treat any non-']' punctuation followed by a ']' (possibly
8047 * with intervening blanks) as trying to terminate the class.
8048 * ']]' is very likely to mean a class was intended (but
8049 * missing the colon), but the warning message that gets
8050 * generated shows the error position better if we exit the
8051 * loop at the bottom (eventually), so skip it here. */
8052 if (*p != ']') {
8053 if (peek < e && isBLANK(*peek)) {
8054 has_blank = TRUE;
8055 found_problem = TRUE;
8056 do {
8057 peek++;
8058 } while (peek < e && isBLANK(*peek));
8059 }
8060
8061 if (peek < e && *peek == ']') {
8062 has_terminating_bracket = TRUE;
8063 if (*p == ':') {
8064 has_terminating_colon = TRUE;
8065 }
8066 else if (*p == ';') {
8067 has_semi_colon = TRUE;
8068 has_terminating_colon = TRUE;
8069 }
8070 else {
8071 found_problem = TRUE;
8072 }
8073 p = peek + 1;
8074 goto try_posix;
8075 }
8076 }
8077
8078 /* Here we have punctuation we thought didn't end the class.
8079 * Keep track of the position of the key characters that are
8080 * more likely to have been class-enders */
8081 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
8082
8083 /* Allow just one such possible class-ender not actually
8084 * ending the class. */
8085 if (possible_end) {
8086 break;
8087 }
8088 possible_end = p;
8089 }
8090
8091 /* If we have too many punctuation characters, no use in
8092 * keeping going */
8093 if (++punct_count > max_distance) {
8094 break;
8095 }
8096
8097 /* Treat the punctuation as a typo. */
8098 input_text[name_len++] = *p;
8099 p++;
8100 }
8101 else if (isUPPER(*p)) { /* Use lowercase for lookup */
8102 input_text[name_len++] = toLOWER(*p);
8103 has_upper = TRUE;
8104 found_problem = TRUE;
8105 p++;
8106 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8107 input_text[name_len++] = *p;
8108 p++;
8109 }
8110 else {
8111 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8112 p+= UTF8SKIP(p);
8113 }
8114
8115 /* The declaration of 'input_text' is how long we allow a potential
8116 * class name to be, before saying they didn't mean a class name at
8117 * all */
8118 if (name_len >= C_ARRAY_LENGTH(input_text)) {
8119 break;
8120 }
8121 }
8122
8123 /* We get to here when the possible class name hasn't been properly
8124 * terminated before:
8125 * 1) we ran off the end of the pattern; or
8126 * 2) found two characters, each of which might have been intended to
8127 * be the name's terminator
8128 * 3) found so many punctuation characters in the purported name,
8129 * that the edit distance to a valid one is exceeded
8130 * 4) we decided it was more characters than anyone could have
8131 * intended to be one. */
8132
8133 found_problem = TRUE;
8134
8135 /* In the final two cases, we know that looking up what we've
8136 * accumulated won't lead to a match, even a fuzzy one. */
8137 if ( name_len >= C_ARRAY_LENGTH(input_text)
8138 || punct_count > max_distance)
8139 {
8140 /* If there was an intermediate key character that could have been
8141 * an intended end, redo the parse, but stop there */
8142 if (possible_end && possible_end != (char *) -1) {
8143 possible_end = (char *) -1; /* Special signal value to say
8144 we've done a first pass */
8145 p = name_start;
8146 goto parse_name;
8147 }
8148
8149 /* Otherwise, it can't have meant to have been a class */
8150 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8151 }
8152
8153 /* If we ran off the end, and the final character was a punctuation
8154 * one, back up one, to look at that final one just below. Later, we
8155 * will restore the parse pointer if appropriate */
8156 if (name_len && p == e && isPUNCT(*(p-1))) {
8157 p--;
8158 name_len--;
8159 }
8160
8161 if (p < e && isPUNCT(*p)) {
8162 if (*p == ']') {
8163 has_terminating_bracket = TRUE;
8164
8165 /* If this is a 2nd ']', and the first one is just below this
8166 * one, consider that to be the real terminator. This gives a
8167 * uniform and better positioning for the warning message */
8168 if ( possible_end
8169 && possible_end != (char *) -1
8170 && *possible_end == ']'
8171 && name_len && input_text[name_len - 1] == ']')
8172 {
8173 name_len--;
8174 p = possible_end;
8175
8176 /* And this is actually equivalent to having done the 2nd
8177 * pass now, so set it to not try again */
8178 possible_end = (char *) -1;
8179 }
8180 }
8181 else {
8182 if (*p == ':') {
8183 has_terminating_colon = TRUE;
8184 }
8185 else if (*p == ';') {
8186 has_semi_colon = TRUE;
8187 has_terminating_colon = TRUE;
8188 }
8189 p++;
8190 }
8191 }
8192
8193 try_posix:
8194
8195 /* Here, we have a class name to look up. We can short circuit the
8196 * stuff below for short names that can't possibly be meant to be a
8197 * class name. (We can do this on the first pass, as any second pass
8198 * will yield an even shorter name) */
8199 if (name_len < 3) {
8200 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8201 }
8202
8203 /* Find which class it is. Initially switch on the length of the name.
8204 * */
8205 switch (name_len) {
8206 case 4:
8207 if (memEQs(name_start, 4, "word")) {
8208 /* this is not POSIX, this is the Perl \w */
8209 class_number = ANYOF_WORDCHAR;
8210 }
8211 break;
8212 case 5:
8213 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8214 * graph lower print punct space upper
8215 * Offset 4 gives the best switch position. */
8216 switch (name_start[4]) {
8217 case 'a':
8218 if (memBEGINs(name_start, 5, "alph")) /* alpha */
8219 class_number = ANYOF_ALPHA;
8220 break;
8221 case 'e':
8222 if (memBEGINs(name_start, 5, "spac")) /* space */
8223 class_number = ANYOF_SPACE;
8224 break;
8225 case 'h':
8226 if (memBEGINs(name_start, 5, "grap")) /* graph */
8227 class_number = ANYOF_GRAPH;
8228 break;
8229 case 'i':
8230 if (memBEGINs(name_start, 5, "asci")) /* ascii */
8231 class_number = ANYOF_ASCII;
8232 break;
8233 case 'k':
8234 if (memBEGINs(name_start, 5, "blan")) /* blank */
8235 class_number = ANYOF_BLANK;
8236 break;
8237 case 'l':
8238 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8239 class_number = ANYOF_CNTRL;
8240 break;
8241 case 'm':
8242 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8243 class_number = ANYOF_ALPHANUMERIC;
8244 break;
8245 case 'r':
8246 if (memBEGINs(name_start, 5, "lowe")) /* lower */
8247 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8248 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8249 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8250 break;
8251 case 't':
8252 if (memBEGINs(name_start, 5, "digi")) /* digit */
8253 class_number = ANYOF_DIGIT;
8254 else if (memBEGINs(name_start, 5, "prin")) /* print */
8255 class_number = ANYOF_PRINT;
8256 else if (memBEGINs(name_start, 5, "punc")) /* punct */
8257 class_number = ANYOF_PUNCT;
8258 break;
8259 }
8260 break;
8261 case 6:
8262 if (memEQs(name_start, 6, "xdigit"))
8263 class_number = ANYOF_XDIGIT;
8264 break;
8265 }
8266
8267 /* If the name exactly matches a posix class name the class number will
8268 * here be set to it, and the input almost certainly was meant to be a
8269 * posix class, so we can skip further checking. If instead the syntax
8270 * is exactly correct, but the name isn't one of the legal ones, we
8271 * will return that as an error below. But if neither of these apply,
8272 * it could be that no posix class was intended at all, or that one
8273 * was, but there was a typo. We tease these apart by doing fuzzy
8274 * matching on the name */
8275 if (class_number == OOB_NAMEDCLASS && found_problem) {
8276 const UV posix_names[][6] = {
8277 { 'a', 'l', 'n', 'u', 'm' },
8278 { 'a', 'l', 'p', 'h', 'a' },
8279 { 'a', 's', 'c', 'i', 'i' },
8280 { 'b', 'l', 'a', 'n', 'k' },
8281 { 'c', 'n', 't', 'r', 'l' },
8282 { 'd', 'i', 'g', 'i', 't' },
8283 { 'g', 'r', 'a', 'p', 'h' },
8284 { 'l', 'o', 'w', 'e', 'r' },
8285 { 'p', 'r', 'i', 'n', 't' },
8286 { 'p', 'u', 'n', 'c', 't' },
8287 { 's', 'p', 'a', 'c', 'e' },
8288 { 'u', 'p', 'p', 'e', 'r' },
8289 { 'w', 'o', 'r', 'd' },
8290 { 'x', 'd', 'i', 'g', 'i', 't' }
8291 };
8292 /* The names of the above all have added NULs to make them the same
8293 * size, so we need to also have the real lengths */
8294 const UV posix_name_lengths[] = {
8295 sizeof("alnum") - 1,
8296 sizeof("alpha") - 1,
8297 sizeof("ascii") - 1,
8298 sizeof("blank") - 1,
8299 sizeof("cntrl") - 1,
8300 sizeof("digit") - 1,
8301 sizeof("graph") - 1,
8302 sizeof("lower") - 1,
8303 sizeof("print") - 1,
8304 sizeof("punct") - 1,
8305 sizeof("space") - 1,
8306 sizeof("upper") - 1,
8307 sizeof("word") - 1,
8308 sizeof("xdigit")- 1
8309 };
8310 unsigned int i;
8311 int temp_max = max_distance; /* Use a temporary, so if we
8312 reparse, we haven't changed the
8313 outer one */
8314
8315 /* Use a smaller max edit distance if we are missing one of the
8316 * delimiters */
8317 if ( has_opening_bracket + has_opening_colon < 2
8318 || has_terminating_bracket + has_terminating_colon < 2)
8319 {
8320 temp_max--;
8321 }
8322
8323 /* See if the input name is close to a legal one */
8324 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8325
8326 /* Short circuit call if the lengths are too far apart to be
8327 * able to match */
8328 if (abs( (int) (name_len - posix_name_lengths[i]))
8329 > temp_max)
8330 {
8331 continue;
8332 }
8333
8334 if (edit_distance(input_text,
8335 posix_names[i],
8336 name_len,
8337 posix_name_lengths[i],
8338 temp_max
8339 )
8340 > -1)
8341 { /* If it is close, it probably was intended to be a class */
8342 goto probably_meant_to_be;
8343 }
8344 }
8345
8346 /* Here the input name is not close enough to a valid class name
8347 * for us to consider it to be intended to be a posix class. If
8348 * we haven't already done so, and the parse found a character that
8349 * could have been terminators for the name, but which we absorbed
8350 * as typos during the first pass, repeat the parse, signalling it
8351 * to stop at that character */
8352 if (possible_end && possible_end != (char *) -1) {
8353 possible_end = (char *) -1;
8354 p = name_start;
8355 goto parse_name;
8356 }
8357
8358 /* Here neither pass found a close-enough class name */
8359 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8360 }
8361
8362 probably_meant_to_be:
8363
8364 /* Here we think that a posix specification was intended. Update any
8365 * parse pointer */
8366 if (updated_parse_ptr) {
8367 *updated_parse_ptr = (char *) p;
8368 }
8369
8370 /* If a posix class name was intended but incorrectly specified, we
8371 * output or return the warnings */
8372 if (found_problem) {
8373
8374 /* We set flags for these issues in the parse loop above instead of
8375 * adding them to the list of warnings, because we can parse it
8376 * twice, and we only want one warning instance */
8377 if (has_upper) {
8378 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8379 }
8380 if (has_blank) {
8381 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8382 }
8383 if (has_semi_colon) {
8384 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8385 }
8386 else if (! has_terminating_colon) {
8387 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8388 }
8389 if (! has_terminating_bracket) {
8390 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8391 }
8392
8393 if ( posix_warnings
8394 && RExC_warn_text
8395 && av_count(RExC_warn_text) > 0)
8396 {
8397 *posix_warnings = RExC_warn_text;
8398 }
8399 }
8400 else if (class_number != OOB_NAMEDCLASS) {
8401 /* If it is a known class, return the class. The class number
8402 * #defines are structured so each complement is +1 to the normal
8403 * one */
8404 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8405 }
8406 else if (! check_only) {
8407
8408 /* Here, it is an unrecognized class. This is an error (unless the
8409 * call is to check only, which we've already handled above) */
8410 const char * const complement_string = (complement)
8411 ? "^"
8412 : "";
8413 RExC_parse_set((char *) p);
8414 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8415 complement_string,
8416 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8417 }
8418 }
8419
8420 return OOB_NAMEDCLASS;
8421 }
8422 #undef ADD_POSIX_WARNING
8423
8424 STATIC unsigned int
S_regex_set_precedence(const U8 my_operator)8425 S_regex_set_precedence(const U8 my_operator) {
8426
8427 /* Returns the precedence in the (?[...]) construct of the input operator,
8428 * specified by its character representation. The precedence follows
8429 * general Perl rules, but it extends this so that ')' and ']' have (low)
8430 * precedence even though they aren't really operators */
8431
8432 switch (my_operator) {
8433 case '!':
8434 return 5;
8435 case '&':
8436 return 4;
8437 case '^':
8438 case '|':
8439 case '+':
8440 case '-':
8441 return 3;
8442 case ')':
8443 return 2;
8444 case ']':
8445 return 1;
8446 }
8447
8448 NOT_REACHED; /* NOTREACHED */
8449 return 0; /* Silence compiler warning */
8450 }
8451
8452 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth)8453 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8454 I32 *flagp, U32 depth)
8455 {
8456 /* Handle the (?[...]) construct to do set operations */
8457
8458 U8 curchar; /* Current character being parsed */
8459 UV start, end; /* End points of code point ranges */
8460 SV* final = NULL; /* The end result inversion list */
8461 SV* result_string; /* 'final' stringified */
8462 AV* stack; /* stack of operators and operands not yet
8463 resolved */
8464 AV* fence_stack = NULL; /* A stack containing the positions in
8465 'stack' of where the undealt-with left
8466 parens would be if they were actually
8467 put there */
8468 /* The 'volatile' is a workaround for an optimiser bug
8469 * in Solaris Studio 12.3. See RT #127455 */
8470 volatile IV fence = 0; /* Position of where most recent undealt-
8471 with left paren in stack is; -1 if none.
8472 */
8473 STRLEN len; /* Temporary */
8474 regnode_offset node; /* Temporary, and final regnode returned by
8475 this function */
8476 const bool save_fold = FOLD; /* Temporary */
8477 char *save_end, *save_parse; /* Temporaries */
8478 const bool in_locale = LOC; /* we turn off /l during processing */
8479
8480 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8481
8482 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8483
8484 DEBUG_PARSE("xcls");
8485
8486 if (in_locale) {
8487 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8488 }
8489
8490 /* The use of this operator implies /u. This is required so that the
8491 * compile time values are valid in all runtime cases */
8492 REQUIRE_UNI_RULES(flagp, 0);
8493
8494 /* Everything in this construct is a metacharacter. Operands begin with
8495 * either a '\' (for an escape sequence), or a '[' for a bracketed
8496 * character class. Any other character should be an operator, or
8497 * parenthesis for grouping. Both types of operands are handled by calling
8498 * regclass() to parse them. It is called with a parameter to indicate to
8499 * return the computed inversion list. The parsing here is implemented via
8500 * a stack. Each entry on the stack is a single character representing one
8501 * of the operators; or else a pointer to an operand inversion list. */
8502
8503 #define IS_OPERATOR(a) SvIOK(a)
8504 #define IS_OPERAND(a) (! IS_OPERATOR(a))
8505
8506 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
8507 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8508 * with pronouncing it called it Reverse Polish instead, but now that YOU
8509 * know how to pronounce it you can use the correct term, thus giving due
8510 * credit to the person who invented it, and impressing your geek friends.
8511 * Wikipedia says that the pronunciation of "Ł" has been changing so that
8512 * it is now more like an English initial W (as in wonk) than an L.)
8513 *
8514 * This means that, for example, 'a | b & c' is stored on the stack as
8515 *
8516 * c [4]
8517 * b [3]
8518 * & [2]
8519 * a [1]
8520 * | [0]
8521 *
8522 * where the numbers in brackets give the stack [array] element number.
8523 * In this implementation, parentheses are not stored on the stack.
8524 * Instead a '(' creates a "fence" so that the part of the stack below the
8525 * fence is invisible except to the corresponding ')' (this allows us to
8526 * replace testing for parens, by using instead subtraction of the fence
8527 * position). As new operands are processed they are pushed onto the stack
8528 * (except as noted in the next paragraph). New operators of higher
8529 * precedence than the current final one are inserted on the stack before
8530 * the lhs operand (so that when the rhs is pushed next, everything will be
8531 * in the correct positions shown above. When an operator of equal or
8532 * lower precedence is encountered in parsing, all the stacked operations
8533 * of equal or higher precedence are evaluated, leaving the result as the
8534 * top entry on the stack. This makes higher precedence operations
8535 * evaluate before lower precedence ones, and causes operations of equal
8536 * precedence to left associate.
8537 *
8538 * The only unary operator '!' is immediately pushed onto the stack when
8539 * encountered. When an operand is encountered, if the top of the stack is
8540 * a '!", the complement is immediately performed, and the '!' popped. The
8541 * resulting value is treated as a new operand, and the logic in the
8542 * previous paragraph is executed. Thus in the expression
8543 * [a] + ! [b]
8544 * the stack looks like
8545 *
8546 * !
8547 * a
8548 * +
8549 *
8550 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8551 * becomes
8552 *
8553 * !b
8554 * a
8555 * +
8556 *
8557 * A ')' is treated as an operator with lower precedence than all the
8558 * aforementioned ones, which causes all operations on the stack above the
8559 * corresponding '(' to be evaluated down to a single resultant operand.
8560 * Then the fence for the '(' is removed, and the operand goes through the
8561 * algorithm above, without the fence.
8562 *
8563 * A separate stack is kept of the fence positions, so that the position of
8564 * the latest so-far unbalanced '(' is at the top of it.
8565 *
8566 * The ']' ending the construct is treated as the lowest operator of all,
8567 * so that everything gets evaluated down to a single operand, which is the
8568 * result */
8569
8570 stack = (AV*)newSV_type_mortal(SVt_PVAV);
8571 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8572
8573 while (RExC_parse < RExC_end) {
8574 I32 top_index; /* Index of top-most element in 'stack' */
8575 SV** top_ptr; /* Pointer to top 'stack' element */
8576 SV* current = NULL; /* To contain the current inversion list
8577 operand */
8578 SV* only_to_avoid_leaks;
8579
8580 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8581 TRUE /* Force /x */ );
8582 if (RExC_parse >= RExC_end) { /* Fail */
8583 break;
8584 }
8585
8586 curchar = UCHARAT(RExC_parse);
8587
8588 redo_curchar:
8589
8590 #ifdef ENABLE_REGEX_SETS_DEBUGGING
8591 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8592 DEBUG_U(dump_regex_sets_structures(pRExC_state,
8593 stack, fence, fence_stack));
8594 #endif
8595
8596 top_index = av_tindex_skip_len_mg(stack);
8597
8598 switch (curchar) {
8599 SV** stacked_ptr; /* Ptr to something already on 'stack' */
8600 char stacked_operator; /* The topmost operator on the 'stack'. */
8601 SV* lhs; /* Operand to the left of the operator */
8602 SV* rhs; /* Operand to the right of the operator */
8603 SV* fence_ptr; /* Pointer to top element of the fence
8604 stack */
8605 case '(':
8606
8607 if ( RExC_parse < RExC_end - 2
8608 && UCHARAT(RExC_parse + 1) == '?'
8609 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8610 {
8611 const regnode_offset orig_emit = RExC_emit;
8612 SV * resultant_invlist;
8613
8614 /* Here it could be an embedded '(?flags:(?[...])'.
8615 * This happens when we have some thing like
8616 *
8617 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8618 * ...
8619 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
8620 *
8621 * Here we would be handling the interpolated
8622 * '$thai_or_lao'. We handle this by a recursive call to
8623 * reg which returns the inversion list the
8624 * interpolated expression evaluates to. Actually, the
8625 * return is a special regnode containing a pointer to that
8626 * inversion list. If the return isn't that regnode alone,
8627 * we know that this wasn't such an interpolation, which is
8628 * an error: we need to get a single inversion list back
8629 * from the recursion */
8630
8631 RExC_parse_inc_by(1);
8632 RExC_sets_depth++;
8633
8634 node = reg(pRExC_state, 2, flagp, depth+1);
8635 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8636
8637 if ( OP(REGNODE_p(node)) != REGEX_SET
8638 /* If more than a single node returned, the nested
8639 * parens evaluated to more than just a (?[...]),
8640 * which isn't legal */
8641 || RExC_emit != orig_emit
8642 + NODE_STEP_REGNODE
8643 + REGNODE_ARG_LEN(REGEX_SET))
8644 {
8645 vFAIL("Expecting interpolated extended charclass");
8646 }
8647 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8648 current = invlist_clone(resultant_invlist, NULL);
8649 SvREFCNT_dec(resultant_invlist);
8650
8651 RExC_sets_depth--;
8652 RExC_emit = orig_emit;
8653 goto handle_operand;
8654 }
8655
8656 /* A regular '('. Look behind for illegal syntax */
8657 if (top_index - fence >= 0) {
8658 /* If the top entry on the stack is an operator, it had
8659 * better be a '!', otherwise the entry below the top
8660 * operand should be an operator */
8661 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
8662 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8663 || ( IS_OPERAND(*top_ptr)
8664 && ( top_index - fence < 1
8665 || ! (stacked_ptr = av_fetch(stack,
8666 top_index - 1,
8667 FALSE))
8668 || ! IS_OPERATOR(*stacked_ptr))))
8669 {
8670 RExC_parse_inc_by(1);
8671 vFAIL("Unexpected '(' with no preceding operator");
8672 }
8673 }
8674
8675 /* Stack the position of this undealt-with left paren */
8676 av_push_simple(fence_stack, newSViv(fence));
8677 fence = top_index + 1;
8678 break;
8679
8680 case '\\':
8681 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8682 * multi-char folds are allowed. */
8683 if (!regclass(pRExC_state, flagp, depth+1,
8684 TRUE, /* means parse just the next thing */
8685 FALSE, /* don't allow multi-char folds */
8686 FALSE, /* don't silence non-portable warnings. */
8687 TRUE, /* strict */
8688 FALSE, /* Require return to be an ANYOF */
8689 ¤t))
8690 {
8691 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8692 goto regclass_failed;
8693 }
8694
8695 assert(current);
8696
8697 /* regclass() will return with parsing just the \ sequence,
8698 * leaving the parse pointer at the next thing to parse */
8699 RExC_parse--;
8700 goto handle_operand;
8701
8702 case '[': /* Is a bracketed character class */
8703 {
8704 /* See if this is a [:posix:] class. */
8705 bool is_posix_class = (OOB_NAMEDCLASS
8706 < handle_possible_posix(pRExC_state,
8707 RExC_parse + 1,
8708 NULL,
8709 NULL,
8710 TRUE /* checking only */));
8711 /* If it is a posix class, leave the parse pointer at the '['
8712 * to fool regclass() into thinking it is part of a
8713 * '[[:posix:]]'. */
8714 if (! is_posix_class) {
8715 RExC_parse_inc_by(1);
8716 }
8717
8718 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8719 * multi-char folds are allowed. */
8720 if (!regclass(pRExC_state, flagp, depth+1,
8721 is_posix_class, /* parse the whole char
8722 class only if not a
8723 posix class */
8724 FALSE, /* don't allow multi-char folds */
8725 TRUE, /* silence non-portable warnings. */
8726 TRUE, /* strict */
8727 FALSE, /* Require return to be an ANYOF */
8728 ¤t))
8729 {
8730 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8731 goto regclass_failed;
8732 }
8733
8734 assert(current);
8735
8736 /* function call leaves parse pointing to the ']', except if we
8737 * faked it */
8738 if (is_posix_class) {
8739 RExC_parse--;
8740 }
8741
8742 goto handle_operand;
8743 }
8744
8745 case ']':
8746 if (top_index >= 1) {
8747 goto join_operators;
8748 }
8749
8750 /* Only a single operand on the stack: are done */
8751 goto done;
8752
8753 case ')':
8754 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8755 if (UCHARAT(RExC_parse - 1) == ']') {
8756 break;
8757 }
8758 RExC_parse_inc_by(1);
8759 vFAIL("Unexpected ')'");
8760 }
8761
8762 /* If nothing after the fence, is missing an operand */
8763 if (top_index - fence < 0) {
8764 RExC_parse_inc_by(1);
8765 goto bad_syntax;
8766 }
8767 /* If at least two things on the stack, treat this as an
8768 * operator */
8769 if (top_index - fence >= 1) {
8770 goto join_operators;
8771 }
8772
8773 /* Here only a single thing on the fenced stack, and there is a
8774 * fence. Get rid of it */
8775 fence_ptr = av_pop(fence_stack);
8776 assert(fence_ptr);
8777 fence = SvIV(fence_ptr);
8778 SvREFCNT_dec_NN(fence_ptr);
8779 fence_ptr = NULL;
8780
8781 if (fence < 0) {
8782 fence = 0;
8783 }
8784
8785 /* Having gotten rid of the fence, we pop the operand at the
8786 * stack top and process it as a newly encountered operand */
8787 current = av_pop(stack);
8788 if (IS_OPERAND(current)) {
8789 goto handle_operand;
8790 }
8791
8792 RExC_parse_inc_by(1);
8793 goto bad_syntax;
8794
8795 case '&':
8796 case '|':
8797 case '+':
8798 case '-':
8799 case '^':
8800
8801 /* These binary operators should have a left operand already
8802 * parsed */
8803 if ( top_index - fence < 0
8804 || top_index - fence == 1
8805 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8806 || ! IS_OPERAND(*top_ptr))
8807 {
8808 goto unexpected_binary;
8809 }
8810
8811 /* If only the one operand is on the part of the stack visible
8812 * to us, we just place this operator in the proper position */
8813 if (top_index - fence < 2) {
8814
8815 /* Place the operator before the operand */
8816
8817 SV* lhs = av_pop(stack);
8818 av_push_simple(stack, newSVuv(curchar));
8819 av_push_simple(stack, lhs);
8820 break;
8821 }
8822
8823 /* But if there is something else on the stack, we need to
8824 * process it before this new operator if and only if the
8825 * stacked operation has equal or higher precedence than the
8826 * new one */
8827
8828 join_operators:
8829
8830 /* The operator on the stack is supposed to be below both its
8831 * operands */
8832 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8833 || IS_OPERAND(*stacked_ptr))
8834 {
8835 /* But if not, it's legal and indicates we are completely
8836 * done if and only if we're currently processing a ']',
8837 * which should be the final thing in the expression */
8838 if (curchar == ']') {
8839 goto done;
8840 }
8841
8842 unexpected_binary:
8843 RExC_parse_inc_by(1);
8844 vFAIL2("Unexpected binary operator '%c' with no "
8845 "preceding operand", curchar);
8846 }
8847 stacked_operator = (char) SvUV(*stacked_ptr);
8848
8849 if (regex_set_precedence(curchar)
8850 > regex_set_precedence(stacked_operator))
8851 {
8852 /* Here, the new operator has higher precedence than the
8853 * stacked one. This means we need to add the new one to
8854 * the stack to await its rhs operand (and maybe more
8855 * stuff). We put it before the lhs operand, leaving
8856 * untouched the stacked operator and everything below it
8857 * */
8858 lhs = av_pop(stack);
8859 assert(IS_OPERAND(lhs));
8860 av_push_simple(stack, newSVuv(curchar));
8861 av_push_simple(stack, lhs);
8862 break;
8863 }
8864
8865 /* Here, the new operator has equal or lower precedence than
8866 * what's already there. This means the operation already
8867 * there should be performed now, before the new one. */
8868
8869 rhs = av_pop(stack);
8870 if (! IS_OPERAND(rhs)) {
8871
8872 /* This can happen when a ! is not followed by an operand,
8873 * like in /(?[\t &!])/ */
8874 goto bad_syntax;
8875 }
8876
8877 lhs = av_pop(stack);
8878
8879 if (! IS_OPERAND(lhs)) {
8880
8881 /* This can happen when there is an empty (), like in
8882 * /(?[[0]+()+])/ */
8883 goto bad_syntax;
8884 }
8885
8886 switch (stacked_operator) {
8887 case '&':
8888 _invlist_intersection(lhs, rhs, &rhs);
8889 break;
8890
8891 case '|':
8892 case '+':
8893 _invlist_union(lhs, rhs, &rhs);
8894 break;
8895
8896 case '-':
8897 _invlist_subtract(lhs, rhs, &rhs);
8898 break;
8899
8900 case '^': /* The union minus the intersection */
8901 {
8902 SV* i = NULL;
8903 SV* u = NULL;
8904
8905 _invlist_union(lhs, rhs, &u);
8906 _invlist_intersection(lhs, rhs, &i);
8907 _invlist_subtract(u, i, &rhs);
8908 SvREFCNT_dec_NN(i);
8909 SvREFCNT_dec_NN(u);
8910 break;
8911 }
8912 }
8913 SvREFCNT_dec(lhs);
8914
8915 /* Here, the higher precedence operation has been done, and the
8916 * result is in 'rhs'. We overwrite the stacked operator with
8917 * the result. Then we redo this code to either push the new
8918 * operator onto the stack or perform any higher precedence
8919 * stacked operation */
8920 only_to_avoid_leaks = av_pop(stack);
8921 SvREFCNT_dec(only_to_avoid_leaks);
8922 av_push_simple(stack, rhs);
8923 goto redo_curchar;
8924
8925 case '!': /* Highest priority, right associative */
8926
8927 /* If what's already at the top of the stack is another '!",
8928 * they just cancel each other out */
8929 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
8930 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8931 {
8932 only_to_avoid_leaks = av_pop(stack);
8933 SvREFCNT_dec(only_to_avoid_leaks);
8934 }
8935 else { /* Otherwise, since it's right associative, just push
8936 onto the stack */
8937 av_push_simple(stack, newSVuv(curchar));
8938 }
8939 break;
8940
8941 default:
8942 RExC_parse_inc();
8943 if (RExC_parse >= RExC_end) {
8944 break;
8945 }
8946 vFAIL("Unexpected character");
8947
8948 handle_operand:
8949
8950 /* Here 'current' is the operand. If something is already on the
8951 * stack, we have to check if it is a !. But first, the code above
8952 * may have altered the stack in the time since we earlier set
8953 * 'top_index'. */
8954
8955 top_index = av_tindex_skip_len_mg(stack);
8956 if (top_index - fence >= 0) {
8957 /* If the top entry on the stack is an operator, it had better
8958 * be a '!', otherwise the entry below the top operand should
8959 * be an operator */
8960 top_ptr = av_fetch(stack, top_index, FALSE);
8961 assert(top_ptr);
8962 if (IS_OPERATOR(*top_ptr)) {
8963
8964 /* The only permissible operator at the top of the stack is
8965 * '!', which is applied immediately to this operand. */
8966 curchar = (char) SvUV(*top_ptr);
8967 if (curchar != '!') {
8968 SvREFCNT_dec(current);
8969 vFAIL2("Unexpected binary operator '%c' with no "
8970 "preceding operand", curchar);
8971 }
8972
8973 _invlist_invert(current);
8974
8975 only_to_avoid_leaks = av_pop(stack);
8976 SvREFCNT_dec(only_to_avoid_leaks);
8977
8978 /* And we redo with the inverted operand. This allows
8979 * handling multiple ! in a row */
8980 goto handle_operand;
8981 }
8982 /* Single operand is ok only for the non-binary ')'
8983 * operator */
8984 else if ((top_index - fence == 0 && curchar != ')')
8985 || (top_index - fence > 0
8986 && (! (stacked_ptr = av_fetch(stack,
8987 top_index - 1,
8988 FALSE))
8989 || IS_OPERAND(*stacked_ptr))))
8990 {
8991 SvREFCNT_dec(current);
8992 vFAIL("Operand with no preceding operator");
8993 }
8994 }
8995
8996 /* Here there was nothing on the stack or the top element was
8997 * another operand. Just add this new one */
8998 av_push_simple(stack, current);
8999
9000 } /* End of switch on next parse token */
9001
9002 RExC_parse_inc();
9003 } /* End of loop parsing through the construct */
9004
9005 vFAIL("Syntax error in (?[...])");
9006
9007 done:
9008
9009 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
9010 if (RExC_parse < RExC_end) {
9011 RExC_parse_inc_by(1);
9012 }
9013
9014 vFAIL("Unexpected ']' with no following ')' in (?[...");
9015 }
9016
9017 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
9018 vFAIL("Unmatched (");
9019 }
9020
9021 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
9022 || ((final = av_pop(stack)) == NULL)
9023 || ! IS_OPERAND(final)
9024 || ! is_invlist(final)
9025 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
9026 {
9027 bad_syntax:
9028 SvREFCNT_dec(final);
9029 vFAIL("Incomplete expression within '(?[ ])'");
9030 }
9031
9032 /* Here, 'final' is the resultant inversion list from evaluating the
9033 * expression. Return it if so requested */
9034 if (return_invlist) {
9035 *return_invlist = final;
9036 return END;
9037 }
9038
9039 if (RExC_sets_depth) { /* If within a recursive call, return in a special
9040 regnode */
9041 RExC_parse_inc_by(1);
9042 node = regpnode(pRExC_state, REGEX_SET, final);
9043 }
9044 else {
9045
9046 /* Otherwise generate a resultant node, based on 'final'. regclass()
9047 * is expecting a string of ranges and individual code points */
9048 invlist_iterinit(final);
9049 result_string = newSVpvs("");
9050 while (invlist_iternext(final, &start, &end)) {
9051 if (start == end) {
9052 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
9053 }
9054 else {
9055 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
9056 UVXf "}", start, end);
9057 }
9058 }
9059
9060 /* About to generate an ANYOF (or similar) node from the inversion list
9061 * we have calculated */
9062 save_parse = RExC_parse;
9063 RExC_parse_set(SvPV(result_string, len));
9064 save_end = RExC_end;
9065 RExC_end = RExC_parse + len;
9066 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
9067
9068 /* We turn off folding around the call, as the class we have
9069 * constructed already has all folding taken into consideration, and we
9070 * don't want regclass() to add to that */
9071 RExC_flags &= ~RXf_PMf_FOLD;
9072 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
9073 * folds are allowed. */
9074 node = regclass(pRExC_state, flagp, depth+1,
9075 FALSE, /* means parse the whole char class */
9076 FALSE, /* don't allow multi-char folds */
9077 TRUE, /* silence non-portable warnings. The above may
9078 very well have generated non-portable code
9079 points, but they're valid on this machine */
9080 FALSE, /* similarly, no need for strict */
9081
9082 /* We can optimize into something besides an ANYOF,
9083 * except under /l, which needs to be ANYOF because of
9084 * runtime checks for locale sanity, etc */
9085 ! in_locale,
9086 NULL
9087 );
9088
9089 RESTORE_WARNINGS;
9090 RExC_parse_set(save_parse + 1);
9091 RExC_end = save_end;
9092 SvREFCNT_dec_NN(final);
9093 SvREFCNT_dec_NN(result_string);
9094
9095 if (save_fold) {
9096 RExC_flags |= RXf_PMf_FOLD;
9097 }
9098
9099 if (!node) {
9100 RETURN_FAIL_ON_RESTART(*flagp, flagp);
9101 goto regclass_failed;
9102 }
9103
9104 /* Fix up the node type if we are in locale. (We have pretended we are
9105 * under /u for the purposes of regclass(), as this construct will only
9106 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
9107 * (so as to cause any warnings about bad locales to be output in
9108 * regexec.c), and add the flag that indicates to check if not in a
9109 * UTF-8 locale. The reason we above forbid optimization into
9110 * something other than an ANYOF node is simply to minimize the number
9111 * of code changes in regexec.c. Otherwise we would have to create new
9112 * EXACTish node types and deal with them. This decision could be
9113 * revisited should this construct become popular.
9114 *
9115 * (One might think we could look at the resulting ANYOF node and
9116 * suppress the flag if everything is above 255, as those would be
9117 * UTF-8 only, but this isn't true, as the components that led to that
9118 * result could have been locale-affected, and just happen to cancel
9119 * each other out under UTF-8 locales.) */
9120 if (in_locale) {
9121 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9122
9123 assert(OP(REGNODE_p(node)) == ANYOF);
9124
9125 OP(REGNODE_p(node)) = ANYOFL;
9126 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9127 }
9128 }
9129
9130 nextchar(pRExC_state);
9131 return node;
9132
9133 regclass_failed:
9134 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9135 (UV) *flagp);
9136 }
9137
9138 #ifdef ENABLE_REGEX_SETS_DEBUGGING
9139
9140 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)9141 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9142 AV * stack, const IV fence, AV * fence_stack)
9143 { /* Dumps the stacks in handle_regex_sets() */
9144
9145 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9146 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9147 SSize_t i;
9148
9149 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9150
9151 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9152
9153 if (stack_top < 0) {
9154 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9155 }
9156 else {
9157 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9158 for (i = stack_top; i >= 0; i--) {
9159 SV ** element_ptr = av_fetch(stack, i, FALSE);
9160 if (! element_ptr) {
9161 }
9162
9163 if (IS_OPERATOR(*element_ptr)) {
9164 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9165 (int) i, (int) SvIV(*element_ptr));
9166 }
9167 else {
9168 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9169 sv_dump(*element_ptr);
9170 }
9171 }
9172 }
9173
9174 if (fence_stack_top < 0) {
9175 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9176 }
9177 else {
9178 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9179 for (i = fence_stack_top; i >= 0; i--) {
9180 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9181 if (! element_ptr) {
9182 }
9183
9184 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9185 (int) i, (int) SvIV(*element_ptr));
9186 }
9187 }
9188 }
9189
9190 #endif
9191
9192 #undef IS_OPERATOR
9193 #undef IS_OPERAND
9194
9195 #ifdef PERL_RE_BUILD_AUX
9196 void
Perl_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)9197 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9198 {
9199 /* This adds the Latin1/above-Latin1 folding rules.
9200 *
9201 * This should be called only for a Latin1-range code points, cp, which is
9202 * known to be involved in a simple fold with other code points above
9203 * Latin1. It would give false results if /aa has been specified.
9204 * Multi-char folds are outside the scope of this, and must be handled
9205 * specially. */
9206
9207 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9208
9209 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9210
9211 /* The rules that are valid for all Unicode versions are hard-coded in */
9212 switch (cp) {
9213 case 'k':
9214 case 'K':
9215 *invlist =
9216 add_cp_to_invlist(*invlist, KELVIN_SIGN);
9217 break;
9218 case 's':
9219 case 'S':
9220 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9221 break;
9222 case MICRO_SIGN:
9223 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9224 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9225 break;
9226 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9227 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9228 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9229 break;
9230 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9231 *invlist = add_cp_to_invlist(*invlist,
9232 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9233 break;
9234
9235 default: /* Other code points are checked against the data for the
9236 current Unicode version */
9237 {
9238 Size_t folds_count;
9239 U32 first_fold;
9240 const U32 * remaining_folds;
9241 UV folded_cp;
9242
9243 if (isASCII(cp)) {
9244 folded_cp = toFOLD(cp);
9245 }
9246 else {
9247 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9248 Size_t dummy_len;
9249 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9250 }
9251
9252 if (folded_cp > 255) {
9253 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9254 }
9255
9256 folds_count = _inverse_folds(folded_cp, &first_fold,
9257 &remaining_folds);
9258 if (folds_count == 0) {
9259
9260 /* Use deprecated warning to increase the chances of this being
9261 * output */
9262 ckWARN2reg_d(RExC_parse,
9263 "Perl folding rules are not up-to-date for 0x%02X;"
9264 " please use the perlbug utility to report;", cp);
9265 }
9266 else {
9267 unsigned int i;
9268
9269 if (first_fold > 255) {
9270 *invlist = add_cp_to_invlist(*invlist, first_fold);
9271 }
9272 for (i = 0; i < folds_count - 1; i++) {
9273 if (remaining_folds[i] > 255) {
9274 *invlist = add_cp_to_invlist(*invlist,
9275 remaining_folds[i]);
9276 }
9277 }
9278 }
9279 break;
9280 }
9281 }
9282 }
9283 #endif /* PERL_RE_BUILD_AUX */
9284
9285
9286 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)9287 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9288 {
9289 /* Output the elements of the array given by '*posix_warnings' as REGEXP
9290 * warnings. */
9291
9292 SV * msg;
9293 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9294
9295 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9296
9297 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9298 CLEAR_POSIX_WARNINGS();
9299 return;
9300 }
9301
9302 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9303 if (first_is_fatal) { /* Avoid leaking this */
9304 av_undef(posix_warnings); /* This isn't necessary if the
9305 array is mortal, but is a
9306 fail-safe */
9307 (void) sv_2mortal(msg);
9308 }
9309 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9310 SvREFCNT_dec_NN(msg);
9311 }
9312
9313 UPDATE_WARNINGS_LOC(RExC_parse);
9314 }
9315
9316 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)9317 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9318 {
9319 const U8 * const start = s1;
9320 const U8 * const send = start + max;
9321
9322 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9323
9324 while (s1 < send && *s1 == *s2) {
9325 s1++; s2++;
9326 }
9327
9328 return s1 - start;
9329 }
9330
9331 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)9332 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9333 {
9334 /* This adds the string scalar <multi_string> to the array
9335 * <multi_char_matches>. <multi_string> is known to have exactly
9336 * <cp_count> code points in it. This is used when constructing a
9337 * bracketed character class and we find something that needs to match more
9338 * than a single character.
9339 *
9340 * <multi_char_matches> is actually an array of arrays. Each top-level
9341 * element is an array that contains all the strings known so far that are
9342 * the same length. And that length (in number of code points) is the same
9343 * as the index of the top-level array. Hence, the [2] element is an
9344 * array, each element thereof is a string containing TWO code points;
9345 * while element [3] is for strings of THREE characters, and so on. Since
9346 * this is for multi-char strings there can never be a [0] nor [1] element.
9347 *
9348 * When we rewrite the character class below, we will do so such that the
9349 * longest strings are written first, so that it prefers the longest
9350 * matching strings first. This is done even if it turns out that any
9351 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
9352 * Christiansen has agreed that this is ok. This makes the test for the
9353 * ligature 'ffi' come before the test for 'ff', for example */
9354
9355 AV* this_array;
9356 AV** this_array_ptr;
9357
9358 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9359
9360 if (! multi_char_matches) {
9361 multi_char_matches = newAV();
9362 }
9363
9364 if (av_exists(multi_char_matches, cp_count)) {
9365 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9366 this_array = *this_array_ptr;
9367 }
9368 else {
9369 this_array = newAV();
9370 av_store_simple(multi_char_matches, cp_count,
9371 (SV*) this_array);
9372 }
9373 av_push_simple(this_array, multi_string);
9374
9375 return multi_char_matches;
9376 }
9377
9378 /* The names of properties whose definitions are not known at compile time are
9379 * stored in this SV, after a constant heading. So if the length has been
9380 * changed since initialization, then there is a run-time definition. */
9381 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
9382 (SvCUR(listsv) != initial_listsv_len)
9383
9384 /* There is a restricted set of white space characters that are legal when
9385 * ignoring white space in a bracketed character class. This generates the
9386 * code to skip them.
9387 *
9388 * There is a line below that uses the same white space criteria but is outside
9389 * this macro. Both here and there must use the same definition */
9390 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
9391 STMT_START { \
9392 if (do_skip) { \
9393 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
9394 { \
9395 p++; \
9396 } \
9397 } \
9398 } STMT_END
9399
9400 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)9401 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9402 const bool stop_at_1, /* Just parse the next thing, don't
9403 look for a full character class */
9404 bool allow_mutiple_chars,
9405 const bool silence_non_portable, /* Don't output warnings
9406 about too large
9407 characters */
9408 const bool strict,
9409 bool optimizable, /* ? Allow a non-ANYOF return
9410 node */
9411 SV** ret_invlist /* Return an inversion list, not a node */
9412 )
9413 {
9414 /* parse a bracketed class specification. Most of these will produce an
9415 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9416 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
9417 * under /i with multi-character folds: it will be rewritten following the
9418 * paradigm of this example, where the <multi-fold>s are characters which
9419 * fold to multiple character sequences:
9420 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9421 * gets effectively rewritten as:
9422 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9423 * reg() gets called (recursively) on the rewritten version, and this
9424 * function will return what it constructs. (Actually the <multi-fold>s
9425 * aren't physically removed from the [abcdefghi], it's just that they are
9426 * ignored in the recursion by means of a flag:
9427 * <RExC_in_multi_char_class>.)
9428 *
9429 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9430 * characters, with the corresponding bit set if that character is in the
9431 * list. For characters above this, an inversion list is used. There
9432 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9433 * determinable at compile time
9434 *
9435 * On success, returns the offset at which any next node should be placed
9436 * into the regex engine program being compiled.
9437 *
9438 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9439 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9440 * UTF-8
9441 */
9442
9443 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9444 IV range = 0;
9445 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9446 regnode_offset ret = -1; /* Initialized to an illegal value */
9447 STRLEN numlen;
9448 int namedclass = OOB_NAMEDCLASS;
9449 char *rangebegin = NULL;
9450 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
9451 aren't available at the time this was called */
9452 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9453 than just initialized. */
9454 SV* properties = NULL; /* Code points that match \p{} \P{} */
9455 SV* posixes = NULL; /* Code points that match classes like [:word:],
9456 extended beyond the Latin1 range. These have to
9457 be kept separate from other code points for much
9458 of this function because their handling is
9459 different under /i, and for most classes under
9460 /d as well */
9461 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
9462 separate for a while from the non-complemented
9463 versions because of complications with /d
9464 matching */
9465 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9466 treated more simply than the general case,
9467 leading to less compilation and execution
9468 work */
9469 UV element_count = 0; /* Number of distinct elements in the class.
9470 Optimizations may be possible if this is tiny */
9471 AV * multi_char_matches = NULL; /* Code points that fold to more than one
9472 character; used under /i */
9473 UV n;
9474 char * stop_ptr = RExC_end; /* where to stop parsing */
9475
9476 /* ignore unescaped whitespace? */
9477 const bool skip_white = cBOOL( ret_invlist
9478 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9479
9480 /* inversion list of code points this node matches only when the target
9481 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
9482 * /d) */
9483 SV* upper_latin1_only_utf8_matches = NULL;
9484
9485 /* Inversion list of code points this node matches regardless of things
9486 * like locale, folding, utf8ness of the target string */
9487 SV* cp_list = NULL;
9488
9489 /* Like cp_list, but code points on this list need to be checked for things
9490 * that fold to/from them under /i */
9491 SV* cp_foldable_list = NULL;
9492
9493 /* Like cp_list, but code points on this list are valid only when the
9494 * runtime locale is UTF-8 */
9495 SV* only_utf8_locale_list = NULL;
9496
9497 /* In a range, if one of the endpoints is non-character-set portable,
9498 * meaning that it hard-codes a code point that may mean a different
9499 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9500 * mnemonic '\t' which each mean the same character no matter which
9501 * character set the platform is on. */
9502 unsigned int non_portable_endpoint = 0;
9503
9504 /* Is the range unicode? which means on a platform that isn't 1-1 native
9505 * to Unicode (i.e. non-ASCII), each code point in it should be considered
9506 * to be a Unicode value. */
9507 bool unicode_range = FALSE;
9508 bool invert = FALSE; /* Is this class to be complemented */
9509
9510 bool warn_super = ALWAYS_WARN_SUPER;
9511
9512 const char * orig_parse = RExC_parse;
9513
9514 /* This variable is used to mark where the end in the input is of something
9515 * that looks like a POSIX construct but isn't. During the parse, when
9516 * something looks like it could be such a construct is encountered, it is
9517 * checked for being one, but not if we've already checked this area of the
9518 * input. Only after this position is reached do we check again */
9519 char *not_posix_region_end = RExC_parse - 1;
9520
9521 AV* posix_warnings = NULL;
9522 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9523 U8 op = ANYOF; /* The returned node-type, initialized to the expected
9524 type. */
9525 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
9526 U32 posixl = 0; /* bit field of posix classes matched under /l */
9527
9528
9529 /* Flags as to what things aren't knowable until runtime. (Note that these are
9530 * mutually exclusive.) */
9531 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
9532 haven't been defined as of yet */
9533 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
9534 UTF-8 or not */
9535 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
9536 what gets folded */
9537 U32 has_runtime_dependency = 0; /* OR of the above flags */
9538
9539 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9540
9541 PERL_ARGS_ASSERT_REGCLASS;
9542 #ifndef DEBUGGING
9543 PERL_UNUSED_ARG(depth);
9544 #endif
9545
9546 assert(! (ret_invlist && allow_mutiple_chars));
9547
9548 /* If wants an inversion list returned, we can't optimize to something
9549 * else. */
9550 if (ret_invlist) {
9551 optimizable = FALSE;
9552 }
9553
9554 DEBUG_PARSE("clas");
9555
9556 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
9557 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
9558 && UNICODE_DOT_DOT_VERSION == 0)
9559 allow_mutiple_chars = FALSE;
9560 #endif
9561
9562 /* We include the /i status at the beginning of this so that we can
9563 * know it at runtime */
9564 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9565 initial_listsv_len = SvCUR(listsv);
9566 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
9567
9568 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9569
9570 assert(RExC_parse <= RExC_end);
9571
9572 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
9573 RExC_parse_inc_by(1);
9574 invert = TRUE;
9575 allow_mutiple_chars = FALSE;
9576 MARK_NAUGHTY(1);
9577 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9578 }
9579
9580 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9581 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9582 int maybe_class = handle_possible_posix(pRExC_state,
9583 RExC_parse,
9584 ¬_posix_region_end,
9585 NULL,
9586 TRUE /* checking only */);
9587 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9588 ckWARN4reg(not_posix_region_end,
9589 "POSIX syntax [%c %c] belongs inside character classes%s",
9590 *RExC_parse, *RExC_parse,
9591 (maybe_class == OOB_NAMEDCLASS)
9592 ? ((POSIXCC_NOTYET(*RExC_parse))
9593 ? " (but this one isn't implemented)"
9594 : " (but this one isn't fully valid)")
9595 : ""
9596 );
9597 }
9598 }
9599
9600 /* If the caller wants us to just parse a single element, accomplish this
9601 * by faking the loop ending condition */
9602 if (stop_at_1 && RExC_end > RExC_parse) {
9603 stop_ptr = RExC_parse + 1;
9604 }
9605
9606 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9607 if (UCHARAT(RExC_parse) == ']')
9608 goto charclassloop;
9609
9610 while (1) {
9611
9612 if ( posix_warnings
9613 && av_tindex_skip_len_mg(posix_warnings) >= 0
9614 && RExC_parse > not_posix_region_end)
9615 {
9616 /* Warnings about posix class issues are considered tentative until
9617 * we are far enough along in the parse that we can no longer
9618 * change our mind, at which point we output them. This is done
9619 * each time through the loop so that a later class won't zap them
9620 * before they have been dealt with. */
9621 output_posix_warnings(pRExC_state, posix_warnings);
9622 }
9623
9624 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9625
9626 if (RExC_parse >= stop_ptr) {
9627 break;
9628 }
9629
9630 if (UCHARAT(RExC_parse) == ']') {
9631 break;
9632 }
9633
9634 charclassloop:
9635
9636 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9637 save_value = value;
9638 save_prevvalue = prevvalue;
9639
9640 if (!range) {
9641 rangebegin = RExC_parse;
9642 element_count++;
9643 non_portable_endpoint = 0;
9644 }
9645 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9646 value = utf8n_to_uvchr((U8*)RExC_parse,
9647 RExC_end - RExC_parse,
9648 &numlen, UTF8_ALLOW_DEFAULT);
9649 RExC_parse_inc_by(numlen);
9650 }
9651 else {
9652 value = UCHARAT(RExC_parse);
9653 RExC_parse_inc_by(1);
9654 }
9655
9656 if (value == '[') {
9657 char * posix_class_end;
9658 namedclass = handle_possible_posix(pRExC_state,
9659 RExC_parse,
9660 &posix_class_end,
9661 do_posix_warnings ? &posix_warnings : NULL,
9662 FALSE /* die if error */);
9663 if (namedclass > OOB_NAMEDCLASS) {
9664
9665 /* If there was an earlier attempt to parse this particular
9666 * posix class, and it failed, it was a false alarm, as this
9667 * successful one proves */
9668 if ( posix_warnings
9669 && av_tindex_skip_len_mg(posix_warnings) >= 0
9670 && not_posix_region_end >= RExC_parse
9671 && not_posix_region_end <= posix_class_end)
9672 {
9673 av_undef(posix_warnings);
9674 }
9675
9676 RExC_parse_set(posix_class_end);
9677 }
9678 else if (namedclass == OOB_NAMEDCLASS) {
9679 not_posix_region_end = posix_class_end;
9680 }
9681 else {
9682 namedclass = OOB_NAMEDCLASS;
9683 }
9684 }
9685 else if ( RExC_parse - 1 > not_posix_region_end
9686 && MAYBE_POSIXCC(value))
9687 {
9688 (void) handle_possible_posix(
9689 pRExC_state,
9690 RExC_parse - 1, /* -1 because parse has already been
9691 advanced */
9692 ¬_posix_region_end,
9693 do_posix_warnings ? &posix_warnings : NULL,
9694 TRUE /* checking only */);
9695 }
9696 else if ( strict && ! skip_white
9697 && ( generic_isCC_(value, CC_VERTSPACE_)
9698 || is_VERTWS_cp_high(value)))
9699 {
9700 vFAIL("Literal vertical space in [] is illegal except under /x");
9701 }
9702 else if (value == '\\') {
9703 /* Is a backslash; get the code point of the char after it */
9704
9705 if (RExC_parse >= RExC_end) {
9706 vFAIL("Unmatched [");
9707 }
9708
9709 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9710 value = utf8n_to_uvchr((U8*)RExC_parse,
9711 RExC_end - RExC_parse,
9712 &numlen, UTF8_ALLOW_DEFAULT);
9713 RExC_parse_inc_by(numlen);
9714 }
9715 else {
9716 value = UCHARAT(RExC_parse);
9717 RExC_parse_inc_by(1);
9718 }
9719
9720 /* Some compilers cannot handle switching on 64-bit integer
9721 * values, therefore value cannot be an UV. Yes, this will
9722 * be a problem later if we want switch on Unicode.
9723 * A similar issue a little bit later when switching on
9724 * namedclass. --jhi */
9725
9726 /* If the \ is escaping white space when white space is being
9727 * skipped, it means that that white space is wanted literally, and
9728 * is already in 'value'. Otherwise, need to translate the escape
9729 * into what it signifies. */
9730 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9731 const char * message;
9732 U32 packed_warn;
9733 U8 grok_c_char;
9734
9735 case 'w': namedclass = ANYOF_WORDCHAR; break;
9736 case 'W': namedclass = ANYOF_NWORDCHAR; break;
9737 case 's': namedclass = ANYOF_SPACE; break;
9738 case 'S': namedclass = ANYOF_NSPACE; break;
9739 case 'd': namedclass = ANYOF_DIGIT; break;
9740 case 'D': namedclass = ANYOF_NDIGIT; break;
9741 case 'v': namedclass = ANYOF_VERTWS; break;
9742 case 'V': namedclass = ANYOF_NVERTWS; break;
9743 case 'h': namedclass = ANYOF_HORIZWS; break;
9744 case 'H': namedclass = ANYOF_NHORIZWS; break;
9745 case 'N': /* Handle \N{NAME} in class */
9746 {
9747 const char * const backslash_N_beg = RExC_parse - 2;
9748 int cp_count;
9749
9750 if (! grok_bslash_N(pRExC_state,
9751 NULL, /* No regnode */
9752 &value, /* Yes single value */
9753 &cp_count, /* Multiple code pt count */
9754 flagp,
9755 strict,
9756 depth)
9757 ) {
9758
9759 if (*flagp & NEED_UTF8)
9760 FAIL("panic: grok_bslash_N set NEED_UTF8");
9761
9762 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9763
9764 if (cp_count < 0) {
9765 vFAIL("\\N in a character class must be a named character: \\N{...}");
9766 }
9767 else if (cp_count == 0) {
9768 ckWARNreg(RExC_parse,
9769 "Ignoring zero length \\N{} in character class");
9770 }
9771 else { /* cp_count > 1 */
9772 assert(cp_count > 1);
9773 if (! RExC_in_multi_char_class) {
9774 if ( ! allow_mutiple_chars
9775 || invert
9776 || range
9777 || *RExC_parse == '-')
9778 {
9779 if (strict) {
9780 RExC_parse--;
9781 vFAIL("\\N{} here is restricted to one character");
9782 }
9783 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9784 break; /* <value> contains the first code
9785 point. Drop out of the switch to
9786 process it */
9787 }
9788 else {
9789 SV * multi_char_N = newSVpvn(backslash_N_beg,
9790 RExC_parse - backslash_N_beg);
9791 multi_char_matches
9792 = add_multi_match(multi_char_matches,
9793 multi_char_N,
9794 cp_count);
9795 }
9796 }
9797 } /* End of cp_count != 1 */
9798
9799 /* This element should not be processed further in this
9800 * class */
9801 element_count--;
9802 value = save_value;
9803 prevvalue = save_prevvalue;
9804 continue; /* Back to top of loop to get next char */
9805 }
9806
9807 /* Here, is a single code point, and <value> contains it */
9808 unicode_range = TRUE; /* \N{} are Unicode */
9809 }
9810 break;
9811 case 'p':
9812 case 'P':
9813 {
9814 char *e;
9815
9816 if (RExC_pm_flags & PMf_WILDCARD) {
9817 RExC_parse_inc_by(1);
9818 /* diag_listed_as: Use of %s is not allowed in Unicode
9819 property wildcard subpatterns in regex; marked by <--
9820 HERE in m/%s/ */
9821 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9822 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9823 }
9824
9825 /* \p means they want Unicode semantics */
9826 REQUIRE_UNI_RULES(flagp, 0);
9827
9828 if (RExC_parse >= RExC_end)
9829 vFAIL2("Empty \\%c", (U8)value);
9830 if (*RExC_parse == '{') {
9831 const U8 c = (U8)value;
9832 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9833 if (!e) {
9834 RExC_parse_inc_by(1);
9835 vFAIL2("Missing right brace on \\%c{}", c);
9836 }
9837
9838 RExC_parse_inc_by(1);
9839
9840 /* White space is allowed adjacent to the braces and after
9841 * any '^', even when not under /x */
9842 while (isSPACE(*RExC_parse)) {
9843 RExC_parse_inc_by(1);
9844 }
9845
9846 if (UCHARAT(RExC_parse) == '^') {
9847
9848 /* toggle. (The rhs xor gets the single bit that
9849 * differs between P and p; the other xor inverts just
9850 * that bit) */
9851 value ^= 'P' ^ 'p';
9852
9853 RExC_parse_inc_by(1);
9854 while (isSPACE(*RExC_parse)) {
9855 RExC_parse_inc_by(1);
9856 }
9857 }
9858
9859 if (e == RExC_parse)
9860 vFAIL2("Empty \\%c{}", c);
9861
9862 n = e - RExC_parse;
9863 while (isSPACE(*(RExC_parse + n - 1)))
9864 n--;
9865
9866 } /* The \p isn't immediately followed by a '{' */
9867 else if (! isALPHA(*RExC_parse)) {
9868 RExC_parse_inc_safe();
9869 vFAIL2("Character following \\%c must be '{' or a "
9870 "single-character Unicode property name",
9871 (U8) value);
9872 }
9873 else {
9874 e = RExC_parse;
9875 n = 1;
9876 }
9877 {
9878 char* name = RExC_parse;
9879
9880 /* Any message returned about expanding the definition */
9881 SV* msg = newSVpvs_flags("", SVs_TEMP);
9882
9883 /* If set TRUE, the property is user-defined as opposed to
9884 * official Unicode */
9885 bool user_defined = FALSE;
9886 AV * strings = NULL;
9887
9888 SV * prop_definition = parse_uniprop_string(
9889 name, n, UTF, FOLD,
9890 FALSE, /* This is compile-time */
9891
9892 /* We can't defer this defn when
9893 * the full result is required in
9894 * this call */
9895 ! cBOOL(ret_invlist),
9896
9897 &strings,
9898 &user_defined,
9899 msg,
9900 0 /* Base level */
9901 );
9902 if (SvCUR(msg)) { /* Assumes any error causes a msg */
9903 assert(prop_definition == NULL);
9904 RExC_parse_set(e + 1);
9905 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
9906 thing so, or else the display is
9907 mojibake */
9908 RExC_utf8 = TRUE;
9909 }
9910 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9911 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9912 SvCUR(msg), SvPVX(msg)));
9913 }
9914
9915 assert(prop_definition || strings);
9916
9917 if (strings) {
9918 if (ret_invlist) {
9919 if (! prop_definition) {
9920 RExC_parse_set(e + 1);
9921 vFAIL("Unicode string properties are not implemented in (?[...])");
9922 }
9923 else {
9924 ckWARNreg(e + 1,
9925 "Using just the single character results"
9926 " returned by \\p{} in (?[...])");
9927 }
9928 }
9929 else if (! RExC_in_multi_char_class) {
9930 if (invert ^ (value == 'P')) {
9931 RExC_parse_set(e + 1);
9932 vFAIL("Inverting a character class which contains"
9933 " a multi-character sequence is illegal");
9934 }
9935
9936 /* For each multi-character string ... */
9937 while (av_count(strings) > 0) {
9938 /* ... Each entry is itself an array of code
9939 * points. */
9940 AV * this_string = (AV *) av_shift( strings);
9941 STRLEN cp_count = av_count(this_string);
9942 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9943 SvPVCLEAR_FRESH(final);
9944
9945 /* Create another string of sequences of \x{...} */
9946 while (av_count(this_string) > 0) {
9947 SV * character = av_shift(this_string);
9948 UV cp = SvUV(character);
9949
9950 if (cp > 255) {
9951 REQUIRE_UTF8(flagp);
9952 }
9953 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9954 cp);
9955 SvREFCNT_dec_NN(character);
9956 }
9957 SvREFCNT_dec_NN(this_string);
9958
9959 /* And add that to the list of such things */
9960 multi_char_matches
9961 = add_multi_match(multi_char_matches,
9962 final,
9963 cp_count);
9964 }
9965 }
9966 SvREFCNT_dec_NN(strings);
9967 }
9968
9969 if (! prop_definition) { /* If we got only a string,
9970 this iteration didn't really
9971 find a character */
9972 element_count--;
9973 }
9974 else if (! is_invlist(prop_definition)) {
9975
9976 /* Here, the definition isn't known, so we have gotten
9977 * returned a string that will be evaluated if and when
9978 * encountered at runtime. We add it to the list of
9979 * such properties, along with whether it should be
9980 * complemented or not */
9981 if (value == 'P') {
9982 sv_catpvs(listsv, "!");
9983 }
9984 else {
9985 sv_catpvs(listsv, "+");
9986 }
9987 sv_catsv(listsv, prop_definition);
9988
9989 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9990
9991 /* We don't know yet what this matches, so have to flag
9992 * it */
9993 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9994 }
9995 else {
9996 assert (prop_definition && is_invlist(prop_definition));
9997
9998 /* Here we do have the complete property definition
9999 *
10000 * Temporary workaround for [GH #16520]. For this
10001 * precise input that is in the .t that is failing,
10002 * load utf8.pm, which is what the test wants, so that
10003 * that .t passes */
10004 if ( memEQs(RExC_start, e + 1 - RExC_start,
10005 "foo\\p{Alnum}")
10006 && ! hv_common(GvHVn(PL_incgv),
10007 NULL,
10008 "utf8.pm", sizeof("utf8.pm") - 1,
10009 0, HV_FETCH_ISEXISTS, NULL, 0))
10010 {
10011 require_pv("utf8.pm");
10012 }
10013
10014 if (! user_defined &&
10015 /* We warn on matching an above-Unicode code point
10016 * if the match would return true, except don't
10017 * warn for \p{All}, which has exactly one element
10018 * = 0 */
10019 (_invlist_contains_cp(prop_definition, 0x110000)
10020 && (! (_invlist_len(prop_definition) == 1
10021 && *invlist_array(prop_definition) == 0))))
10022 {
10023 warn_super = TRUE;
10024 }
10025
10026 /* Invert if asking for the complement */
10027 if (value == 'P') {
10028 _invlist_union_complement_2nd(properties,
10029 prop_definition,
10030 &properties);
10031 }
10032 else {
10033 _invlist_union(properties, prop_definition, &properties);
10034 }
10035 }
10036 }
10037
10038 RExC_parse_set(e + 1);
10039 namedclass = ANYOF_UNIPROP; /* no official name, but it's
10040 named */
10041 }
10042 break;
10043 case 'n': value = '\n'; break;
10044 case 'r': value = '\r'; break;
10045 case 't': value = '\t'; break;
10046 case 'f': value = '\f'; break;
10047 case 'b': value = '\b'; break;
10048 case 'e': value = ESC_NATIVE; break;
10049 case 'a': value = '\a'; break;
10050 case 'o':
10051 RExC_parse--; /* function expects to be pointed at the 'o' */
10052 if (! grok_bslash_o(&RExC_parse,
10053 RExC_end,
10054 &value,
10055 &message,
10056 &packed_warn,
10057 strict,
10058 cBOOL(range), /* MAX_UV allowed for range
10059 upper limit */
10060 UTF))
10061 {
10062 vFAIL(message);
10063 }
10064 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10065 warn_non_literal_string(RExC_parse, packed_warn, message);
10066 }
10067
10068 if (value < 256) {
10069 non_portable_endpoint++;
10070 }
10071 break;
10072 case 'x':
10073 RExC_parse--; /* function expects to be pointed at the 'x' */
10074 if (! grok_bslash_x(&RExC_parse,
10075 RExC_end,
10076 &value,
10077 &message,
10078 &packed_warn,
10079 strict,
10080 cBOOL(range), /* MAX_UV allowed for range
10081 upper limit */
10082 UTF))
10083 {
10084 vFAIL(message);
10085 }
10086 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10087 warn_non_literal_string(RExC_parse, packed_warn, message);
10088 }
10089
10090 if (value < 256) {
10091 non_portable_endpoint++;
10092 }
10093 break;
10094 case 'c':
10095 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
10096 &packed_warn))
10097 {
10098 /* going to die anyway; point to exact spot of
10099 * failure */
10100 RExC_parse_inc_safe();
10101 vFAIL(message);
10102 }
10103
10104 value = grok_c_char;
10105 RExC_parse_inc_by(1);
10106 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10107 warn_non_literal_string(RExC_parse, packed_warn, message);
10108 }
10109
10110 non_portable_endpoint++;
10111 break;
10112 case '0': case '1': case '2': case '3': case '4':
10113 case '5': case '6': case '7':
10114 {
10115 /* Take 1-3 octal digits */
10116 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10117 | PERL_SCAN_NOTIFY_ILLDIGIT;
10118 numlen = (strict) ? 4 : 3;
10119 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10120 RExC_parse_inc_by(numlen);
10121 if (numlen != 3) {
10122 if (strict) {
10123 RExC_parse_inc_safe();
10124 vFAIL("Need exactly 3 octal digits");
10125 }
10126 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10127 && RExC_parse < RExC_end
10128 && isDIGIT(*RExC_parse)
10129 && ckWARN(WARN_REGEXP))
10130 {
10131 reg_warn_non_literal_string(
10132 RExC_parse + 1,
10133 form_alien_digit_msg(8, numlen, RExC_parse,
10134 RExC_end, UTF, FALSE));
10135 }
10136 }
10137 if (value < 256) {
10138 non_portable_endpoint++;
10139 }
10140 break;
10141 }
10142 default:
10143 /* Allow \_ to not give an error */
10144 if (isWORDCHAR(value) && value != '_') {
10145 if (strict) {
10146 vFAIL2("Unrecognized escape \\%c in character class",
10147 (int)value);
10148 }
10149 else {
10150 ckWARN2reg(RExC_parse,
10151 "Unrecognized escape \\%c in character class passed through",
10152 (int)value);
10153 }
10154 }
10155 break;
10156 } /* End of switch on char following backslash */
10157 } /* end of handling backslash escape sequences */
10158
10159 /* Here, we have the current token in 'value' */
10160
10161 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10162 U8 classnum;
10163
10164 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10165 * literal, as is the character that began the false range, i.e.
10166 * the 'a' in the examples */
10167 if (range) {
10168 const int w = (RExC_parse >= rangebegin)
10169 ? RExC_parse - rangebegin
10170 : 0;
10171 if (strict) {
10172 vFAIL2utf8f(
10173 "False [] range \"%" UTF8f "\"",
10174 UTF8fARG(UTF, w, rangebegin));
10175 }
10176 else {
10177 ckWARN2reg(RExC_parse,
10178 "False [] range \"%" UTF8f "\"",
10179 UTF8fARG(UTF, w, rangebegin));
10180 cp_list = add_cp_to_invlist(cp_list, '-');
10181 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10182 prevvalue);
10183 }
10184
10185 range = 0; /* this was not a true range */
10186 element_count += 2; /* So counts for three values */
10187 }
10188
10189 classnum = namedclass_to_classnum(namedclass);
10190
10191 if (LOC && namedclass < ANYOF_POSIXL_MAX
10192 #ifndef HAS_ISASCII
10193 && classnum != CC_ASCII_
10194 #endif
10195 ) {
10196 SV* scratch_list = NULL;
10197
10198 /* What the Posix classes (like \w, [:space:]) match isn't
10199 * generally knowable under locale until actual match time. A
10200 * special node is used for these which has extra space for a
10201 * bitmap, with a bit reserved for each named class that is to
10202 * be matched against. (This isn't needed for \p{} and
10203 * pseudo-classes, as they are not affected by locale, and
10204 * hence are dealt with separately.) However, if a named class
10205 * and its complement are both present, then it matches
10206 * everything, and there is no runtime dependency. Odd numbers
10207 * are the complements of the next lower number, so xor works.
10208 * (Note that something like [\w\D] should match everything,
10209 * because \d should be a proper subset of \w. But rather than
10210 * trust that the locale is well behaved, we leave this to
10211 * runtime to sort out) */
10212 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10213 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10214 POSIXL_ZERO(posixl);
10215 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10216 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10217 continue; /* We could ignore the rest of the class, but
10218 best to parse it for any errors */
10219 }
10220 else { /* Here, isn't the complement of any already parsed
10221 class */
10222 POSIXL_SET(posixl, namedclass);
10223 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10224 anyof_flags |= ANYOF_MATCHES_POSIXL;
10225
10226 /* The above-Latin1 characters are not subject to locale
10227 * rules. Just add them to the unconditionally-matched
10228 * list */
10229
10230 /* Get the list of the above-Latin1 code points this
10231 * matches */
10232 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10233 PL_XPosix_ptrs[classnum],
10234
10235 /* Odd numbers are complements,
10236 * like NDIGIT, NASCII, ... */
10237 namedclass % 2 != 0,
10238 &scratch_list);
10239 /* Checking if 'cp_list' is NULL first saves an extra
10240 * clone. Its reference count will be decremented at the
10241 * next union, etc, or if this is the only instance, at the
10242 * end of the routine */
10243 if (! cp_list) {
10244 cp_list = scratch_list;
10245 }
10246 else {
10247 _invlist_union(cp_list, scratch_list, &cp_list);
10248 SvREFCNT_dec_NN(scratch_list);
10249 }
10250 continue; /* Go get next character */
10251 }
10252 }
10253 else {
10254
10255 /* Here, is not /l, or is a POSIX class for which /l doesn't
10256 * matter (or is a Unicode property, which is skipped here). */
10257 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
10258 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10259
10260 /* Here, should be \h, \H, \v, or \V. None of /d, /i
10261 * nor /l make a difference in what these match,
10262 * therefore we just add what they match to cp_list. */
10263 if (classnum != CC_VERTSPACE_) {
10264 assert( namedclass == ANYOF_HORIZWS
10265 || namedclass == ANYOF_NHORIZWS);
10266
10267 /* It turns out that \h is just a synonym for
10268 * XPosixBlank */
10269 classnum = CC_BLANK_;
10270 }
10271
10272 _invlist_union_maybe_complement_2nd(
10273 cp_list,
10274 PL_XPosix_ptrs[classnum],
10275 namedclass % 2 != 0, /* Complement if odd
10276 (NHORIZWS, NVERTWS)
10277 */
10278 &cp_list);
10279 }
10280 }
10281 else if ( AT_LEAST_UNI_SEMANTICS
10282 || classnum == CC_ASCII_
10283 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
10284 || classnum == CC_XDIGIT_)))
10285 {
10286 /* We usually have to worry about /d affecting what POSIX
10287 * classes match, with special code needed because we won't
10288 * know until runtime what all matches. But there is no
10289 * extra work needed under /u and /a; and [:ascii:] is
10290 * unaffected by /d; and :digit: and :xdigit: don't have
10291 * runtime differences under /d. So we can special case
10292 * these, and avoid some extra work below, and at runtime.
10293 * */
10294 _invlist_union_maybe_complement_2nd(
10295 simple_posixes,
10296 ((AT_LEAST_ASCII_RESTRICTED)
10297 ? PL_Posix_ptrs[classnum]
10298 : PL_XPosix_ptrs[classnum]),
10299 namedclass % 2 != 0,
10300 &simple_posixes);
10301 }
10302 else { /* Garden variety class. If is NUPPER, NALPHA, ...
10303 complement and use nposixes */
10304 SV** posixes_ptr = namedclass % 2 == 0
10305 ? &posixes
10306 : &nposixes;
10307 _invlist_union_maybe_complement_2nd(
10308 *posixes_ptr,
10309 PL_XPosix_ptrs[classnum],
10310 namedclass % 2 != 0,
10311 posixes_ptr);
10312 }
10313 }
10314 } /* end of namedclass \blah */
10315
10316 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10317
10318 /* If 'range' is set, 'value' is the ending of a range--check its
10319 * validity. (If value isn't a single code point in the case of a
10320 * range, we should have figured that out above in the code that
10321 * catches false ranges). Later, we will handle each individual code
10322 * point in the range. If 'range' isn't set, this could be the
10323 * beginning of a range, so check for that by looking ahead to see if
10324 * the next real character to be processed is the range indicator--the
10325 * minus sign */
10326
10327 if (range) {
10328 #ifdef EBCDIC
10329 /* For unicode ranges, we have to test that the Unicode as opposed
10330 * to the native values are not decreasing. (Above 255, there is
10331 * no difference between native and Unicode) */
10332 if (unicode_range && prevvalue < 255 && value < 255) {
10333 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10334 goto backwards_range;
10335 }
10336 }
10337 else
10338 #endif
10339 if (prevvalue > value) /* b-a */ {
10340 int w;
10341 #ifdef EBCDIC
10342 backwards_range:
10343 #endif
10344 w = RExC_parse - rangebegin;
10345 vFAIL2utf8f(
10346 "Invalid [] range \"%" UTF8f "\"",
10347 UTF8fARG(UTF, w, rangebegin));
10348 NOT_REACHED; /* NOTREACHED */
10349 }
10350 }
10351 else {
10352 prevvalue = value; /* save the beginning of the potential range */
10353 if (! stop_at_1 /* Can't be a range if parsing just one thing */
10354 && *RExC_parse == '-')
10355 {
10356 char* next_char_ptr = RExC_parse + 1;
10357
10358 /* Get the next real char after the '-' */
10359 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10360
10361 /* If the '-' is at the end of the class (just before the ']',
10362 * it is a literal minus; otherwise it is a range */
10363 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10364 RExC_parse_set(next_char_ptr);
10365
10366 /* a bad range like \w-, [:word:]- ? */
10367 if (namedclass > OOB_NAMEDCLASS) {
10368 if (strict || ckWARN(WARN_REGEXP)) {
10369 const int w = RExC_parse >= rangebegin
10370 ? RExC_parse - rangebegin
10371 : 0;
10372 if (strict) {
10373 vFAIL4("False [] range \"%*.*s\"",
10374 w, w, rangebegin);
10375 }
10376 else {
10377 vWARN4(RExC_parse,
10378 "False [] range \"%*.*s\"",
10379 w, w, rangebegin);
10380 }
10381 }
10382 cp_list = add_cp_to_invlist(cp_list, '-');
10383 element_count++;
10384 } else
10385 range = 1; /* yeah, it's a range! */
10386 continue; /* but do it the next time */
10387 }
10388 }
10389 }
10390
10391 if (namedclass > OOB_NAMEDCLASS) {
10392 continue;
10393 }
10394
10395 /* Here, we have a single value this time through the loop, and
10396 * <prevvalue> is the beginning of the range, if any; or <value> if
10397 * not. */
10398
10399 /* non-Latin1 code point implies unicode semantics. */
10400 if (value > 255) {
10401 if (value > MAX_LEGAL_CP && ( value != UV_MAX
10402 || prevvalue > MAX_LEGAL_CP))
10403 {
10404 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10405 }
10406 REQUIRE_UNI_RULES(flagp, 0);
10407 if ( ! silence_non_portable
10408 && UNICODE_IS_PERL_EXTENDED(value)
10409 && TO_OUTPUT_WARNINGS(RExC_parse))
10410 {
10411 ckWARN2_non_literal_string(RExC_parse,
10412 packWARN(WARN_PORTABLE),
10413 PL_extended_cp_format,
10414 value);
10415 }
10416 }
10417
10418 /* Ready to process either the single value, or the completed range.
10419 * For single-valued non-inverted ranges, we consider the possibility
10420 * of multi-char folds. (We made a conscious decision to not do this
10421 * for the other cases because it can often lead to non-intuitive
10422 * results. For example, you have the peculiar case that:
10423 * "s s" =~ /^[^\xDF]+$/i => Y
10424 * "ss" =~ /^[^\xDF]+$/i => N
10425 *
10426 * See [perl #89750] */
10427 if (FOLD && allow_mutiple_chars && value == prevvalue) {
10428 if ( value == LATIN_SMALL_LETTER_SHARP_S
10429 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10430 value)))
10431 {
10432 /* Here <value> is indeed a multi-char fold. Get what it is */
10433
10434 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10435 STRLEN foldlen;
10436
10437 UV folded = _to_uni_fold_flags(
10438 value,
10439 foldbuf,
10440 &foldlen,
10441 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10442 ? FOLD_FLAGS_NOMIX_ASCII
10443 : 0)
10444 );
10445
10446 /* Here, <folded> should be the first character of the
10447 * multi-char fold of <value>, with <foldbuf> containing the
10448 * whole thing. But, if this fold is not allowed (because of
10449 * the flags), <fold> will be the same as <value>, and should
10450 * be processed like any other character, so skip the special
10451 * handling */
10452 if (folded != value) {
10453
10454 /* Skip if we are recursed, currently parsing the class
10455 * again. Otherwise add this character to the list of
10456 * multi-char folds. */
10457 if (! RExC_in_multi_char_class) {
10458 STRLEN cp_count = utf8_length(foldbuf,
10459 foldbuf + foldlen);
10460 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10461
10462 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10463
10464 multi_char_matches
10465 = add_multi_match(multi_char_matches,
10466 multi_fold,
10467 cp_count);
10468
10469 }
10470
10471 /* This element should not be processed further in this
10472 * class */
10473 element_count--;
10474 value = save_value;
10475 prevvalue = save_prevvalue;
10476 continue;
10477 }
10478 }
10479 }
10480
10481 if (strict && ckWARN(WARN_REGEXP)) {
10482 if (range) {
10483
10484 /* If the range starts above 255, everything is portable and
10485 * likely to be so for any forseeable character set, so don't
10486 * warn. */
10487 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10488 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10489 }
10490 else if (prevvalue != value) {
10491
10492 /* Under strict, ranges that stop and/or end in an ASCII
10493 * printable should have each end point be a portable value
10494 * for it (preferably like 'A', but we don't warn if it is
10495 * a (portable) Unicode name or code point), and the range
10496 * must be all digits or all letters of the same case.
10497 * Otherwise, the range is non-portable and unclear as to
10498 * what it contains */
10499 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
10500 && ( non_portable_endpoint
10501 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10502 || (isLOWER_A(prevvalue) && isLOWER_A(value))
10503 || (isUPPER_A(prevvalue) && isUPPER_A(value))
10504 ))) {
10505 vWARN(RExC_parse, "Ranges of ASCII printables should"
10506 " be some subset of \"0-9\","
10507 " \"A-Z\", or \"a-z\"");
10508 }
10509 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10510 SSize_t index_start;
10511 SSize_t index_final;
10512
10513 /* But the nature of Unicode and languages mean we
10514 * can't do the same checks for above-ASCII ranges,
10515 * except in the case of digit ones. These should
10516 * contain only digits from the same group of 10. The
10517 * ASCII case is handled just above. Hence here, the
10518 * range could be a range of digits. First some
10519 * unlikely special cases. Grandfather in that a range
10520 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10521 * if its starting value is one of the 10 digits prior
10522 * to it. This is because it is an alternate way of
10523 * writing 19D1, and some people may expect it to be in
10524 * that group. But it is bad, because it won't give
10525 * the expected results. In Unicode 5.2 it was
10526 * considered to be in that group (of 11, hence), but
10527 * this was fixed in the next version */
10528
10529 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10530 goto warn_bad_digit_range;
10531 }
10532 else if (UNLIKELY( prevvalue >= 0x1D7CE
10533 && value <= 0x1D7FF))
10534 {
10535 /* This is the only other case currently in Unicode
10536 * where the algorithm below fails. The code
10537 * points just above are the end points of a single
10538 * range containing only decimal digits. It is 5
10539 * different series of 0-9. All other ranges of
10540 * digits currently in Unicode are just a single
10541 * series. (And mktables will notify us if a later
10542 * Unicode version breaks this.)
10543 *
10544 * If the range being checked is at most 9 long,
10545 * and the digit values represented are in
10546 * numerical order, they are from the same series.
10547 * */
10548 if ( value - prevvalue > 9
10549 || ((( value - 0x1D7CE) % 10)
10550 <= (prevvalue - 0x1D7CE) % 10))
10551 {
10552 goto warn_bad_digit_range;
10553 }
10554 }
10555 else {
10556
10557 /* For all other ranges of digits in Unicode, the
10558 * algorithm is just to check if both end points
10559 * are in the same series, which is the same range.
10560 * */
10561 index_start = _invlist_search(
10562 PL_XPosix_ptrs[CC_DIGIT_],
10563 prevvalue);
10564
10565 /* Warn if the range starts and ends with a digit,
10566 * and they are not in the same group of 10. */
10567 if ( index_start >= 0
10568 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10569 && (index_final =
10570 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10571 value)) != index_start
10572 && index_final >= 0
10573 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10574 {
10575 warn_bad_digit_range:
10576 vWARN(RExC_parse, "Ranges of digits should be"
10577 " from the same group of"
10578 " 10");
10579 }
10580 }
10581 }
10582 }
10583 }
10584 if ((! range || prevvalue == value) && non_portable_endpoint) {
10585 if (isPRINT_A(value)) {
10586 char literal[3];
10587 unsigned d = 0;
10588 if (isBACKSLASHED_PUNCT(value)) {
10589 literal[d++] = '\\';
10590 }
10591 literal[d++] = (char) value;
10592 literal[d++] = '\0';
10593
10594 vWARN4(RExC_parse,
10595 "\"%.*s\" is more clearly written simply as \"%s\"",
10596 (int) (RExC_parse - rangebegin),
10597 rangebegin,
10598 literal
10599 );
10600 }
10601 else if (isMNEMONIC_CNTRL(value)) {
10602 vWARN4(RExC_parse,
10603 "\"%.*s\" is more clearly written simply as \"%s\"",
10604 (int) (RExC_parse - rangebegin),
10605 rangebegin,
10606 cntrl_to_mnemonic((U8) value)
10607 );
10608 }
10609 }
10610 }
10611
10612 /* Deal with this element of the class */
10613
10614 #ifndef EBCDIC
10615 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10616 prevvalue, value);
10617 #else
10618 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10619 * that don't require special handling, we can just add the range like
10620 * we do for ASCII platforms */
10621 if ((UNLIKELY(prevvalue == 0) && value >= 255)
10622 || ! (prevvalue < 256
10623 && (unicode_range
10624 || (! non_portable_endpoint
10625 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10626 || (isUPPER_A(prevvalue)
10627 && isUPPER_A(value)))))))
10628 {
10629 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10630 prevvalue, value);
10631 }
10632 else {
10633 /* Here, requires special handling. This can be because it is a
10634 * range whose code points are considered to be Unicode, and so
10635 * must be individually translated into native, or because its a
10636 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10637 * EBCDIC, but we have defined them to include only the "expected"
10638 * upper or lower case ASCII alphabetics. Subranges above 255 are
10639 * the same in native and Unicode, so can be added as a range */
10640 U8 start = NATIVE_TO_LATIN1(prevvalue);
10641 unsigned j;
10642 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10643 for (j = start; j <= end; j++) {
10644 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10645 }
10646 if (value > 255) {
10647 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10648 256, value);
10649 }
10650 }
10651 #endif
10652
10653 range = 0; /* this range (if it was one) is done now */
10654 } /* End of loop through all the text within the brackets */
10655
10656 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10657 output_posix_warnings(pRExC_state, posix_warnings);
10658 }
10659
10660 /* If anything in the class expands to more than one character, we have to
10661 * deal with them by building up a substitute parse string, and recursively
10662 * calling reg() on it, instead of proceeding */
10663 if (multi_char_matches) {
10664 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10665 I32 cp_count;
10666 STRLEN len;
10667 char *save_end = RExC_end;
10668 char *save_parse = RExC_parse;
10669 char *save_start = RExC_start;
10670 Size_t constructed_prefix_len = 0; /* This gives the length of the
10671 constructed portion of the
10672 substitute parse. */
10673 bool first_time = TRUE; /* First multi-char occurrence doesn't get
10674 a "|" */
10675 I32 reg_flags;
10676
10677 assert(! invert);
10678 /* Only one level of recursion allowed */
10679 assert(RExC_copy_start_in_constructed == RExC_precomp);
10680
10681 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
10682 because too confusing */
10683 if (invert) {
10684 sv_catpvs(substitute_parse, "(?:");
10685 }
10686 #endif
10687
10688 /* Look at the longest strings first */
10689 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10690 cp_count > 0;
10691 cp_count--)
10692 {
10693
10694 if (av_exists(multi_char_matches, cp_count)) {
10695 AV** this_array_ptr;
10696 SV* this_sequence;
10697
10698 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10699 cp_count, FALSE);
10700 while ((this_sequence = av_pop(*this_array_ptr)) !=
10701 &PL_sv_undef)
10702 {
10703 if (! first_time) {
10704 sv_catpvs(substitute_parse, "|");
10705 }
10706 first_time = FALSE;
10707
10708 sv_catpv(substitute_parse, SvPVX(this_sequence));
10709 }
10710 }
10711 }
10712
10713 /* If the character class contains anything else besides these
10714 * multi-character strings, have to include it in recursive parsing */
10715 if (element_count) {
10716 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10717
10718 sv_catpvs(substitute_parse, "|");
10719 if (has_l_bracket) { /* Add an [ if the original had one */
10720 sv_catpvs(substitute_parse, "[");
10721 }
10722 constructed_prefix_len = SvCUR(substitute_parse);
10723 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10724
10725 /* Put in a closing ']' to match any opening one, but not if going
10726 * off the end, as otherwise we are adding something that really
10727 * isn't there */
10728 if (has_l_bracket && RExC_parse < RExC_end) {
10729 sv_catpvs(substitute_parse, "]");
10730 }
10731 }
10732
10733 sv_catpvs(substitute_parse, ")");
10734 #if 0
10735 if (invert) {
10736 /* This is a way to get the parse to skip forward a whole named
10737 * sequence instead of matching the 2nd character when it fails the
10738 * first */
10739 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10740 }
10741 #endif
10742
10743 /* Set up the data structure so that any errors will be properly
10744 * reported. See the comments at the definition of
10745 * REPORT_LOCATION_ARGS for details */
10746 RExC_copy_start_in_input = (char *) orig_parse;
10747 RExC_start = SvPV(substitute_parse, len);
10748 RExC_parse_set( RExC_start );
10749 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10750 RExC_end = RExC_parse + len;
10751 RExC_in_multi_char_class = 1;
10752
10753 ret = reg(pRExC_state, 1, ®_flags, depth+1);
10754
10755 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10756
10757 /* And restore so can parse the rest of the pattern */
10758 RExC_parse_set(save_parse);
10759 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10760 RExC_end = save_end;
10761 RExC_in_multi_char_class = 0;
10762 SvREFCNT_dec_NN(multi_char_matches);
10763 SvREFCNT_dec(properties);
10764 SvREFCNT_dec(cp_list);
10765 SvREFCNT_dec(simple_posixes);
10766 SvREFCNT_dec(posixes);
10767 SvREFCNT_dec(nposixes);
10768 SvREFCNT_dec(cp_foldable_list);
10769 return ret;
10770 }
10771
10772 /* If folding, we calculate all characters that could fold to or from the
10773 * ones already on the list */
10774 if (cp_foldable_list) {
10775 if (FOLD) {
10776 UV start, end; /* End points of code point ranges */
10777
10778 SV* fold_intersection = NULL;
10779 SV** use_list;
10780
10781 /* Our calculated list will be for Unicode rules. For locale
10782 * matching, we have to keep a separate list that is consulted at
10783 * runtime only when the locale indicates Unicode rules (and we
10784 * don't include potential matches in the ASCII/Latin1 range, as
10785 * any code point could fold to any other, based on the run-time
10786 * locale). For non-locale, we just use the general list */
10787 if (LOC) {
10788 use_list = &only_utf8_locale_list;
10789 }
10790 else {
10791 use_list = &cp_list;
10792 }
10793
10794 /* Only the characters in this class that participate in folds need
10795 * be checked. Get the intersection of this class and all the
10796 * possible characters that are foldable. This can quickly narrow
10797 * down a large class */
10798 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10799 &fold_intersection);
10800
10801 /* Now look at the foldable characters in this class individually */
10802 invlist_iterinit(fold_intersection);
10803 while (invlist_iternext(fold_intersection, &start, &end)) {
10804 UV j;
10805 UV folded;
10806
10807 /* Look at every character in the range */
10808 for (j = start; j <= end; j++) {
10809 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10810 STRLEN foldlen;
10811 unsigned int k;
10812 Size_t folds_count;
10813 U32 first_fold;
10814 const U32 * remaining_folds;
10815
10816 if (j < 256) {
10817
10818 /* Under /l, we don't know what code points below 256
10819 * fold to, except we do know the MICRO SIGN folds to
10820 * an above-255 character if the locale is UTF-8, so we
10821 * add it to the special list (in *use_list) Otherwise
10822 * we know now what things can match, though some folds
10823 * are valid under /d only if the target is UTF-8.
10824 * Those go in a separate list */
10825 if ( IS_IN_SOME_FOLD_L1(j)
10826 && ! (LOC && j != MICRO_SIGN))
10827 {
10828
10829 /* ASCII is always matched; non-ASCII is matched
10830 * only under Unicode rules (which could happen
10831 * under /l if the locale is a UTF-8 one */
10832 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10833 *use_list = add_cp_to_invlist(*use_list,
10834 PL_fold_latin1[j]);
10835 }
10836 else if (j != PL_fold_latin1[j]) {
10837 upper_latin1_only_utf8_matches
10838 = add_cp_to_invlist(
10839 upper_latin1_only_utf8_matches,
10840 PL_fold_latin1[j]);
10841 }
10842 }
10843
10844 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10845 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10846 {
10847 add_above_Latin1_folds(pRExC_state,
10848 (U8) j,
10849 use_list);
10850 }
10851 continue;
10852 }
10853
10854 /* Here is an above Latin1 character. We don't have the
10855 * rules hard-coded for it. First, get its fold. This is
10856 * the simple fold, as the multi-character folds have been
10857 * handled earlier and separated out */
10858 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10859 (ASCII_FOLD_RESTRICTED)
10860 ? FOLD_FLAGS_NOMIX_ASCII
10861 : 0);
10862
10863 /* Single character fold of above Latin1. Add everything
10864 * in its fold closure to the list that this node should
10865 * match. */
10866 folds_count = _inverse_folds(folded, &first_fold,
10867 &remaining_folds);
10868 for (k = 0; k <= folds_count; k++) {
10869 UV c = (k == 0) /* First time through use itself */
10870 ? folded
10871 : (k == 1) /* 2nd time use, the first fold */
10872 ? first_fold
10873
10874 /* Then the remaining ones */
10875 : remaining_folds[k-2];
10876
10877 /* /aa doesn't allow folds between ASCII and non- */
10878 if (( ASCII_FOLD_RESTRICTED
10879 && (isASCII(c) != isASCII(j))))
10880 {
10881 continue;
10882 }
10883
10884 /* Folds under /l which cross the 255/256 boundary are
10885 * added to a separate list. (These are valid only
10886 * when the locale is UTF-8.) */
10887 if (c < 256 && LOC) {
10888 *use_list = add_cp_to_invlist(*use_list, c);
10889 continue;
10890 }
10891
10892 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10893 {
10894 cp_list = add_cp_to_invlist(cp_list, c);
10895 }
10896 else {
10897 /* Similarly folds involving non-ascii Latin1
10898 * characters under /d are added to their list */
10899 upper_latin1_only_utf8_matches
10900 = add_cp_to_invlist(
10901 upper_latin1_only_utf8_matches,
10902 c);
10903 }
10904 }
10905 }
10906 }
10907 SvREFCNT_dec_NN(fold_intersection);
10908 }
10909
10910 /* Now that we have finished adding all the folds, there is no reason
10911 * to keep the foldable list separate */
10912 _invlist_union(cp_list, cp_foldable_list, &cp_list);
10913 SvREFCNT_dec_NN(cp_foldable_list);
10914 }
10915
10916 /* And combine the result (if any) with any inversion lists from posix
10917 * classes. The lists are kept separate up to now because we don't want to
10918 * fold the classes */
10919 if (simple_posixes) { /* These are the classes known to be unaffected by
10920 /a, /aa, and /d */
10921 if (cp_list) {
10922 _invlist_union(cp_list, simple_posixes, &cp_list);
10923 SvREFCNT_dec_NN(simple_posixes);
10924 }
10925 else {
10926 cp_list = simple_posixes;
10927 }
10928 }
10929 if (posixes || nposixes) {
10930 if (! DEPENDS_SEMANTICS) {
10931
10932 /* For everything but /d, we can just add the current 'posixes' and
10933 * 'nposixes' to the main list */
10934 if (posixes) {
10935 if (cp_list) {
10936 _invlist_union(cp_list, posixes, &cp_list);
10937 SvREFCNT_dec_NN(posixes);
10938 }
10939 else {
10940 cp_list = posixes;
10941 }
10942 }
10943 if (nposixes) {
10944 if (cp_list) {
10945 _invlist_union(cp_list, nposixes, &cp_list);
10946 SvREFCNT_dec_NN(nposixes);
10947 }
10948 else {
10949 cp_list = nposixes;
10950 }
10951 }
10952 }
10953 else {
10954 /* Under /d, things like \w match upper Latin1 characters only if
10955 * the target string is in UTF-8. But things like \W match all the
10956 * upper Latin1 characters if the target string is not in UTF-8.
10957 *
10958 * Handle the case with something like \W separately */
10959 if (nposixes) {
10960 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10961
10962 /* A complemented posix class matches all upper Latin1
10963 * characters if not in UTF-8. And it matches just certain
10964 * ones when in UTF-8. That means those certain ones are
10965 * matched regardless, so can just be added to the
10966 * unconditional list */
10967 if (cp_list) {
10968 _invlist_union(cp_list, nposixes, &cp_list);
10969 SvREFCNT_dec_NN(nposixes);
10970 nposixes = NULL;
10971 }
10972 else {
10973 cp_list = nposixes;
10974 }
10975
10976 /* Likewise for 'posixes' */
10977 _invlist_union(posixes, cp_list, &cp_list);
10978 SvREFCNT_dec(posixes);
10979
10980 /* Likewise for anything else in the range that matched only
10981 * under UTF-8 */
10982 if (upper_latin1_only_utf8_matches) {
10983 _invlist_union(cp_list,
10984 upper_latin1_only_utf8_matches,
10985 &cp_list);
10986 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10987 upper_latin1_only_utf8_matches = NULL;
10988 }
10989
10990 /* If we don't match all the upper Latin1 characters regardless
10991 * of UTF-8ness, we have to set a flag to match the rest when
10992 * not in UTF-8 */
10993 _invlist_subtract(only_non_utf8_list, cp_list,
10994 &only_non_utf8_list);
10995 if (_invlist_len(only_non_utf8_list) != 0) {
10996 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10997 }
10998 SvREFCNT_dec_NN(only_non_utf8_list);
10999 }
11000 else {
11001 /* Here there were no complemented posix classes. That means
11002 * the upper Latin1 characters in 'posixes' match only when the
11003 * target string is in UTF-8. So we have to add them to the
11004 * list of those types of code points, while adding the
11005 * remainder to the unconditional list.
11006 *
11007 * First calculate what they are */
11008 SV* nonascii_but_latin1_properties = NULL;
11009 _invlist_intersection(posixes, PL_UpperLatin1,
11010 &nonascii_but_latin1_properties);
11011
11012 /* And add them to the final list of such characters. */
11013 _invlist_union(upper_latin1_only_utf8_matches,
11014 nonascii_but_latin1_properties,
11015 &upper_latin1_only_utf8_matches);
11016
11017 /* Remove them from what now becomes the unconditional list */
11018 _invlist_subtract(posixes, nonascii_but_latin1_properties,
11019 &posixes);
11020
11021 /* And add those unconditional ones to the final list */
11022 if (cp_list) {
11023 _invlist_union(cp_list, posixes, &cp_list);
11024 SvREFCNT_dec_NN(posixes);
11025 posixes = NULL;
11026 }
11027 else {
11028 cp_list = posixes;
11029 }
11030
11031 SvREFCNT_dec(nonascii_but_latin1_properties);
11032
11033 /* Get rid of any characters from the conditional list that we
11034 * now know are matched unconditionally, which may make that
11035 * list empty */
11036 _invlist_subtract(upper_latin1_only_utf8_matches,
11037 cp_list,
11038 &upper_latin1_only_utf8_matches);
11039 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
11040 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11041 upper_latin1_only_utf8_matches = NULL;
11042 }
11043 }
11044 }
11045 }
11046
11047 /* And combine the result (if any) with any inversion list from properties.
11048 * The lists are kept separate up to now so that we can distinguish the two
11049 * in regards to matching above-Unicode. A run-time warning is generated
11050 * if a Unicode property is matched against a non-Unicode code point. But,
11051 * we allow user-defined properties to match anything, without any warning,
11052 * and we also suppress the warning if there is a portion of the character
11053 * class that isn't a Unicode property, and which matches above Unicode, \W
11054 * or [\x{110000}] for example.
11055 * (Note that in this case, unlike the Posix one above, there is no
11056 * <upper_latin1_only_utf8_matches>, because having a Unicode property
11057 * forces Unicode semantics */
11058 if (properties) {
11059 if (cp_list) {
11060
11061 /* If it matters to the final outcome, see if a non-property
11062 * component of the class matches above Unicode. If so, the
11063 * warning gets suppressed. This is true even if just a single
11064 * such code point is specified, as, though not strictly correct if
11065 * another such code point is matched against, the fact that they
11066 * are using above-Unicode code points indicates they should know
11067 * the issues involved */
11068 if (warn_super) {
11069 warn_super = ! (invert
11070 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
11071 }
11072
11073 _invlist_union(properties, cp_list, &cp_list);
11074 SvREFCNT_dec_NN(properties);
11075 }
11076 else {
11077 cp_list = properties;
11078 }
11079
11080 if (warn_super) {
11081 anyof_flags |= ANYOF_WARN_SUPER__shared;
11082
11083 /* Because an ANYOF node is the only one that warns, this node
11084 * can't be optimized into something else */
11085 optimizable = FALSE;
11086 }
11087 }
11088
11089 /* Here, we have calculated what code points should be in the character
11090 * class.
11091 *
11092 * Now we can see about various optimizations. Fold calculation (which we
11093 * did above) needs to take place before inversion. Otherwise /[^k]/i
11094 * would invert to include K, which under /i would match k, which it
11095 * shouldn't. Therefore we can't invert folded locale now, as it won't be
11096 * folded until runtime */
11097
11098 /* If we didn't do folding, it's because some information isn't available
11099 * until runtime; set the run-time fold flag for these We know to set the
11100 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11101 * at least one 0-255 range code point */
11102 if (LOC && FOLD) {
11103
11104 /* Some things on the list might be unconditionally included because of
11105 * other components. Remove them, and clean up the list if it goes to
11106 * 0 elements */
11107 if (only_utf8_locale_list && cp_list) {
11108 _invlist_subtract(only_utf8_locale_list, cp_list,
11109 &only_utf8_locale_list);
11110
11111 if (_invlist_len(only_utf8_locale_list) == 0) {
11112 SvREFCNT_dec_NN(only_utf8_locale_list);
11113 only_utf8_locale_list = NULL;
11114 }
11115 }
11116 if ( only_utf8_locale_list
11117 || ( cp_list
11118 && ( _invlist_contains_cp(cp_list,
11119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11120 || _invlist_contains_cp(cp_list,
11121 LATIN_SMALL_LETTER_DOTLESS_I))))
11122 {
11123 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11124 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11125 }
11126 else if (cp_list && invlist_lowest(cp_list) < 256) {
11127 /* If nothing is below 256, has no locale dependency; otherwise it
11128 * does */
11129 anyof_flags |= ANYOFL_FOLD;
11130 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11131
11132 /* In a Turkish locale these could match, notify the run-time code
11133 * to check for that */
11134 if ( _invlist_contains_cp(cp_list, 'I')
11135 || _invlist_contains_cp(cp_list, 'i'))
11136 {
11137 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11138 }
11139 }
11140 }
11141 else if ( DEPENDS_SEMANTICS
11142 && ( upper_latin1_only_utf8_matches
11143 || ( anyof_flags
11144 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11145 {
11146 RExC_seen_d_op = TRUE;
11147 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11148 }
11149
11150 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11151 * compile time. */
11152 if ( cp_list
11153 && invert
11154 && ! has_runtime_dependency)
11155 {
11156 _invlist_invert(cp_list);
11157
11158 /* Clear the invert flag since have just done it here */
11159 invert = FALSE;
11160 }
11161
11162 /* All possible optimizations below still have these characteristics.
11163 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11164 * routine) */
11165 *flagp |= HASWIDTH|SIMPLE;
11166
11167 if (ret_invlist) {
11168 *ret_invlist = cp_list;
11169
11170 return (cp_list) ? RExC_emit : 0;
11171 }
11172
11173 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11174 RExC_contains_locale = 1;
11175 }
11176
11177 if (optimizable) {
11178
11179 /* Some character classes are equivalent to other nodes. Such nodes
11180 * take up less room, and some nodes require fewer operations to
11181 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
11182 * adjacent nodes to improve efficiency. */
11183 op = optimize_regclass(pRExC_state, cp_list,
11184 only_utf8_locale_list,
11185 upper_latin1_only_utf8_matches,
11186 has_runtime_dependency,
11187 posixl,
11188 &anyof_flags, &invert, &ret, flagp);
11189 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11190
11191 /* If optimized to something else and emitted, clean up and return */
11192 if (ret >= 0) {
11193 SvREFCNT_dec(cp_list);
11194 SvREFCNT_dec(only_utf8_locale_list);
11195 SvREFCNT_dec(upper_latin1_only_utf8_matches);
11196 return ret;
11197 }
11198
11199 /* If no optimization was found, an END was returned and we will now
11200 * emit an ANYOF */
11201 if (op == END) {
11202 op = ANYOF;
11203 }
11204 }
11205
11206 /* Here are going to emit an ANYOF; set the particular type */
11207 if (op == ANYOF) {
11208 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11209 op = ANYOFD;
11210 }
11211 else if (posixl) {
11212 op = ANYOFPOSIXL;
11213 }
11214 else if (LOC) {
11215 op = ANYOFL;
11216 }
11217 }
11218
11219 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11220 FILL_NODE(ret, op); /* We set the argument later */
11221 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11222 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11223
11224 /* Here, <cp_list> contains all the code points we can determine at
11225 * compile time that match under all conditions. Go through it, and
11226 * for things that belong in the bitmap, put them there, and delete from
11227 * <cp_list>. While we are at it, see if everything above 255 is in the
11228 * list, and if so, set a flag to speed up execution */
11229
11230 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11231
11232 if (posixl) {
11233 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11234 }
11235
11236 if (invert) {
11237 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11238 }
11239
11240 /* Here, the bitmap has been populated with all the Latin1 code points that
11241 * always match. Can now add to the overall list those that match only
11242 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11243 * */
11244 if (upper_latin1_only_utf8_matches) {
11245 if (cp_list) {
11246 _invlist_union(cp_list,
11247 upper_latin1_only_utf8_matches,
11248 &cp_list);
11249 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11250 }
11251 else {
11252 cp_list = upper_latin1_only_utf8_matches;
11253 }
11254 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11255 }
11256
11257 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11258 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11259 ? listsv
11260 : NULL,
11261 only_utf8_locale_list);
11262
11263 SvREFCNT_dec(cp_list);
11264 SvREFCNT_dec(only_utf8_locale_list);
11265 return ret;
11266 }
11267
11268 STATIC U8
S_optimize_regclass(pTHX_ RExC_state_t * pRExC_state,SV * cp_list,SV * only_utf8_locale_list,SV * upper_latin1_only_utf8_matches,const U32 has_runtime_dependency,const U32 posixl,U8 * anyof_flags,bool * invert,regnode_offset * ret,I32 * flagp)11269 S_optimize_regclass(pTHX_
11270 RExC_state_t *pRExC_state,
11271 SV * cp_list,
11272 SV* only_utf8_locale_list,
11273 SV* upper_latin1_only_utf8_matches,
11274 const U32 has_runtime_dependency,
11275 const U32 posixl,
11276 U8 * anyof_flags,
11277 bool * invert,
11278 regnode_offset * ret,
11279 I32 *flagp
11280 )
11281 {
11282 /* This function exists just to make S_regclass() smaller. It extracts out
11283 * the code that looks for potential optimizations away from a full generic
11284 * ANYOF node. The parameter names are the same as the corresponding
11285 * variables in S_regclass.
11286 *
11287 * It returns the new op (the impossible END one if no optimization found)
11288 * and sets *ret to any created regnode. If the new op is sufficiently
11289 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11290 *
11291 * Certain of the parameters may be updated as a result of the changes
11292 * herein */
11293
11294 U8 op = END; /* The returned node-type, initialized to an impossible
11295 one. */
11296 UV value = 0;
11297 PERL_UINT_FAST8_T i;
11298 UV partial_cp_count = 0;
11299 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11300 UV end[MAX_FOLD_FROMS+1] = { 0 };
11301 bool single_range = FALSE;
11302 UV lowest_cp = 0, highest_cp = 0;
11303
11304 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11305
11306 if (cp_list) { /* Count the code points in enough ranges that we would see
11307 all the ones possible in any fold in this version of
11308 Unicode */
11309
11310 invlist_iterinit(cp_list);
11311 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11312 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11313 break;
11314 }
11315 partial_cp_count += end[i] - start[i] + 1;
11316 }
11317
11318 if (i == 1) {
11319 single_range = TRUE;
11320 }
11321 invlist_iterfinish(cp_list);
11322
11323 /* If we know at compile time that this matches every possible code
11324 * point, any run-time dependencies don't matter */
11325 if (start[0] == 0 && end[0] == UV_MAX) {
11326 if (*invert) {
11327 goto return_OPFAIL;
11328 }
11329 else {
11330 goto return_SANY;
11331 }
11332 }
11333
11334 /* Use a clearer mnemonic for below */
11335 lowest_cp = start[0];
11336
11337 highest_cp = invlist_highest(cp_list);
11338 }
11339
11340 /* Similarly, for /l posix classes, if both a class and its complement
11341 * match, any run-time dependencies don't matter */
11342 if (posixl) {
11343 int namedclass;
11344 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11345 if ( POSIXL_TEST(posixl, namedclass) /* class */
11346 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11347 {
11348 if (*invert) {
11349 goto return_OPFAIL;
11350 }
11351 goto return_SANY;
11352 }
11353 }
11354
11355 /* For well-behaved locales, some classes are subsets of others, so
11356 * complementing the subset and including the non-complemented superset
11357 * should match everything, like [\D[:alnum:]], and
11358 * [[:^alpha:][:alnum:]], but some implementations of locales are
11359 * buggy, and khw thinks its a bad idea to have optimization change
11360 * behavior, even if it avoids an OS bug in a given case */
11361
11362 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11363
11364 /* If is a single posix /l class, can optimize to just that op. Such a
11365 * node will not match anything in the Latin1 range, as that is not
11366 * determinable until runtime, but will match whatever the class does
11367 * outside that range. (Note that some classes won't match anything
11368 * outside the range, like [:ascii:]) */
11369 if ( isSINGLE_BIT_SET(posixl)
11370 && (partial_cp_count == 0 || lowest_cp > 255))
11371 {
11372 U8 classnum;
11373 SV * class_above_latin1 = NULL;
11374 bool already_inverted;
11375 bool are_equivalent;
11376
11377
11378 namedclass = single_1bit_pos32(posixl);
11379 classnum = namedclass_to_classnum(namedclass);
11380
11381 /* The named classes are such that the inverted number is one
11382 * larger than the non-inverted one */
11383 already_inverted = namedclass - classnum_to_namedclass(classnum);
11384
11385 /* Create an inversion list of the official property, inverted if
11386 * the constructed node list is inverted, and restricted to only
11387 * the above latin1 code points, which are the only ones known at
11388 * compile time */
11389 _invlist_intersection_maybe_complement_2nd(
11390 PL_AboveLatin1,
11391 PL_XPosix_ptrs[classnum],
11392 already_inverted,
11393 &class_above_latin1);
11394 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11395 SvREFCNT_dec_NN(class_above_latin1);
11396
11397 if (are_equivalent) {
11398
11399 /* Resolve the run-time inversion flag with this possibly
11400 * inverted class */
11401 *invert = *invert ^ already_inverted;
11402
11403 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11404 *ret = reg_node(pRExC_state, op);
11405 FLAGS(REGNODE_p(*ret)) = classnum;
11406 return op;
11407 }
11408 }
11409 }
11410
11411 /* khw can't think of any other possible transformation involving these. */
11412 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11413 return END;
11414 }
11415
11416 if (! has_runtime_dependency) {
11417
11418 /* If the list is empty, nothing matches. This happens, for example,
11419 * when a Unicode property that doesn't match anything is the only
11420 * element in the character class (perluniprops.pod notes such
11421 * properties). */
11422 if (partial_cp_count == 0) {
11423 if (*invert) {
11424 goto return_SANY;
11425 }
11426 else {
11427 goto return_OPFAIL;
11428 }
11429 }
11430
11431 /* If matches everything but \n */
11432 if ( start[0] == 0 && end[0] == '\n' - 1
11433 && start[1] == '\n' + 1 && end[1] == UV_MAX)
11434 {
11435 assert (! *invert);
11436 op = REG_ANY;
11437 *ret = reg_node(pRExC_state, op);
11438 MARK_NAUGHTY(1);
11439 return op;
11440 }
11441 }
11442
11443 /* Next see if can optimize classes that contain just a few code points
11444 * into an EXACTish node. The reason to do this is to let the optimizer
11445 * join this node with adjacent EXACTish ones, and ANYOF nodes require
11446 * runtime conversion to code point from UTF-8, which we'd like to avoid.
11447 *
11448 * An EXACTFish node can be generated even if not under /i, and vice versa.
11449 * But care must be taken. An EXACTFish node has to be such that it only
11450 * matches precisely the code points in the class, but we want to generate
11451 * the least restrictive one that does that, to increase the odds of being
11452 * able to join with an adjacent node. For example, if the class contains
11453 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11454 * from matching. Whether we are under /i or not is irrelevant in this
11455 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
11456 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11457 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
11458 * is no simple fold that includes \X{02BC}, there is a multi-char fold
11459 * that does, and so the node generated for it must be an EXACTFish one.
11460 * On the other hand qr/:/i should generate a plain EXACT node since the
11461 * colon participates in no fold whatsoever, and having it be EXACT tells
11462 * the optimizer the target string cannot match unless it has a colon in
11463 * it. */
11464 if ( ! posixl
11465 && ! *invert
11466
11467 /* Only try if there are no more code points in the class than in
11468 * the max possible fold */
11469 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11470 {
11471 /* We can always make a single code point class into an EXACTish node.
11472 * */
11473 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11474 if (LOC) {
11475
11476 /* Here is /l: Use EXACTL, except if there is a fold not known
11477 * until runtime so shows as only a single code point here.
11478 * For code points above 255, we know which can cause problems
11479 * by having a potential fold to the Latin1 range. */
11480 if ( ! FOLD
11481 || ( lowest_cp > 255
11482 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11483 {
11484 op = EXACTL;
11485 }
11486 else {
11487 op = EXACTFL;
11488 }
11489 }
11490 else if (! FOLD) { /* Not /l and not /i */
11491 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11492 }
11493 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11494 small */
11495
11496 /* Under /i, it gets a little tricky. A code point that
11497 * doesn't participate in a fold should be an EXACT node. We
11498 * know this one isn't the result of a simple fold, or there'd
11499 * be more than one code point in the list, but it could be
11500 * part of a multi-character fold. In that case we better not
11501 * create an EXACT node, as we would wrongly be telling the
11502 * optimizer that this code point must be in the target string,
11503 * and that is wrong. This is because if the sequence around
11504 * this code point forms a multi-char fold, what needs to be in
11505 * the string could be the code point that folds to the
11506 * sequence.
11507 *
11508 * This handles the case of below-255 code points, as we have
11509 * an easy look up for those. The next clause handles the
11510 * above-256 one */
11511 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11512 ? EXACTFU
11513 : EXACT;
11514 }
11515 else { /* /i, larger code point. Since we are under /i, and have
11516 just this code point, we know that it can't fold to
11517 something else, so PL_InMultiCharFold applies to it */
11518 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11519 ? EXACTFU_REQ8
11520 : EXACT_REQ8;
11521 }
11522
11523 value = lowest_cp;
11524 }
11525 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11526 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11527 {
11528 /* Here, the only runtime dependency, if any, is from /d, and the
11529 * class matches more than one code point, and the lowest code
11530 * point participates in some fold. It might be that the other
11531 * code points are /i equivalent to this one, and hence they would
11532 * be representable by an EXACTFish node. Above, we eliminated
11533 * classes that contain too many code points to be EXACTFish, with
11534 * the test for MAX_FOLD_FROMS
11535 *
11536 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
11537 * do this because we have EXACTFAA at our disposal for the ASCII
11538 * range */
11539 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11540
11541 /* The only ASCII characters that participate in folds are
11542 * alphabetics */
11543 assert(isALPHA(lowest_cp));
11544 if ( end[0] == start[0] /* First range is a single
11545 character, so 2nd exists */
11546 && isALPHA_FOLD_EQ(start[0], start[1]))
11547 {
11548 /* Here, is part of an ASCII fold pair */
11549
11550 if ( ASCII_FOLD_RESTRICTED
11551 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11552 {
11553 /* If the second clause just above was true, it means
11554 * we can't be under /i, or else the list would have
11555 * included more than this fold pair. Therefore we
11556 * have to exclude the possibility of whatever else it
11557 * is that folds to these, by using EXACTFAA */
11558 op = EXACTFAA;
11559 }
11560 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11561
11562 /* Here, there's no simple fold that lowest_cp is part
11563 * of, but there is a multi-character one. If we are
11564 * not under /i, we want to exclude that possibility;
11565 * if under /i, we want to include it */
11566 op = (FOLD) ? EXACTFU : EXACTFAA;
11567 }
11568 else {
11569
11570 /* Here, the only possible fold lowest_cp participates in
11571 * is with start[1]. /i or not isn't relevant */
11572 op = EXACTFU;
11573 }
11574
11575 value = toFOLD(lowest_cp);
11576 }
11577 }
11578 else if ( ! upper_latin1_only_utf8_matches
11579 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
11580 && PL_fold_latin1[
11581 invlist_highest(upper_latin1_only_utf8_matches)]
11582 == lowest_cp))
11583 {
11584 /* Here, the smallest character is non-ascii or there are more
11585 * than 2 code points matched by this node. Also, we either
11586 * don't have /d UTF-8 dependent matches, or if we do, they
11587 * look like they could be a single character that is the fold
11588 * of the lowest one is in the always-match list. This test
11589 * quickly excludes most of the false positives when there are
11590 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
11591 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11592 * iff the target string is UTF-8. (We don't have to worry
11593 * above about exceeding the array bounds of PL_fold_latin1[]
11594 * because any code point in 'upper_latin1_only_utf8_matches'
11595 * is below 256.)
11596 *
11597 * EXACTFAA would apply only to pairs (hence exactly 2 code
11598 * points) in the ASCII range, so we can't use it here to
11599 * artificially restrict the fold domain, so we check if the
11600 * class does or does not match some EXACTFish node. Further,
11601 * if we aren't under /i, and and the folded-to character is
11602 * part of a multi-character fold, we can't do this
11603 * optimization, as the sequence around it could be that
11604 * multi-character fold, and we don't here know the context, so
11605 * we have to assume it is that multi-char fold, to prevent
11606 * potential bugs.
11607 *
11608 * To do the general case, we first find the fold of the lowest
11609 * code point (which may be higher than that lowest unfolded
11610 * one), then find everything that folds to it. (The data
11611 * structure we have only maps from the folded code points, so
11612 * we have to do the earlier step.) */
11613
11614 Size_t foldlen;
11615 U8 foldbuf[UTF8_MAXBYTES_CASE];
11616 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11617 U32 first_fold;
11618 const U32 * remaining_folds;
11619 Size_t folds_to_this_cp_count = _inverse_folds(
11620 folded,
11621 &first_fold,
11622 &remaining_folds);
11623 Size_t folds_count = folds_to_this_cp_count + 1;
11624 SV * fold_list = _new_invlist(folds_count);
11625 unsigned int i;
11626
11627 /* If there are UTF-8 dependent matches, create a temporary
11628 * list of what this node matches, including them. */
11629 SV * all_cp_list = NULL;
11630 SV ** use_this_list = &cp_list;
11631
11632 if (upper_latin1_only_utf8_matches) {
11633 all_cp_list = _new_invlist(0);
11634 use_this_list = &all_cp_list;
11635 _invlist_union(cp_list,
11636 upper_latin1_only_utf8_matches,
11637 use_this_list);
11638 }
11639
11640 /* Having gotten everything that participates in the fold
11641 * containing the lowest code point, we turn that into an
11642 * inversion list, making sure everything is included. */
11643 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11644 fold_list = add_cp_to_invlist(fold_list, folded);
11645 if (folds_to_this_cp_count > 0) {
11646 fold_list = add_cp_to_invlist(fold_list, first_fold);
11647 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11648 fold_list = add_cp_to_invlist(fold_list,
11649 remaining_folds[i]);
11650 }
11651 }
11652
11653 /* If the fold list is identical to what's in this ANYOF node,
11654 * the node can be represented by an EXACTFish one instead */
11655 if (_invlistEQ(*use_this_list, fold_list,
11656 0 /* Don't complement */ )
11657 ) {
11658
11659 /* But, we have to be careful, as mentioned above. Just
11660 * the right sequence of characters could match this if it
11661 * is part of a multi-character fold. That IS what we want
11662 * if we are under /i. But it ISN'T what we want if not
11663 * under /i, as it could match when it shouldn't. So, when
11664 * we aren't under /i and this character participates in a
11665 * multi-char fold, we don't optimize into an EXACTFish
11666 * node. So, for each case below we have to check if we
11667 * are folding, and if not, if it is not part of a
11668 * multi-char fold. */
11669 if (lowest_cp > 255) { /* Highish code point */
11670 if (FOLD || ! _invlist_contains_cp(
11671 PL_InMultiCharFold, folded))
11672 {
11673 op = (LOC)
11674 ? EXACTFLU8
11675 : (ASCII_FOLD_RESTRICTED)
11676 ? EXACTFAA
11677 : EXACTFU_REQ8;
11678 value = folded;
11679 }
11680 } /* Below, the lowest code point < 256 */
11681 else if ( FOLD
11682 && folded == 's'
11683 && DEPENDS_SEMANTICS)
11684 { /* An EXACTF node containing a single character 's',
11685 can be an EXACTFU if it doesn't get joined with an
11686 adjacent 's' */
11687 op = EXACTFU_S_EDGE;
11688 value = folded;
11689 }
11690 else if ( FOLD
11691 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11692 {
11693 if (upper_latin1_only_utf8_matches) {
11694 op = EXACTF;
11695
11696 /* We can't use the fold, as that only matches
11697 * under UTF-8 */
11698 value = lowest_cp;
11699 }
11700 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
11701 && ! UTF)
11702 { /* EXACTFUP is a special node for this character */
11703 op = (ASCII_FOLD_RESTRICTED)
11704 ? EXACTFAA
11705 : EXACTFUP;
11706 value = MICRO_SIGN;
11707 }
11708 else if ( ASCII_FOLD_RESTRICTED
11709 && ! isASCII(lowest_cp))
11710 { /* For ASCII under /iaa, we can use EXACTFU below
11711 */
11712 op = EXACTFAA;
11713 value = folded;
11714 }
11715 else {
11716 op = EXACTFU;
11717 value = folded;
11718 }
11719 }
11720 }
11721
11722 SvREFCNT_dec_NN(fold_list);
11723 SvREFCNT_dec(all_cp_list);
11724 }
11725 }
11726
11727 if (op != END) {
11728 U8 len;
11729
11730 /* Here, we have calculated what EXACTish node to use. Have to
11731 * convert to UTF-8 if not already there */
11732 if (value > 255) {
11733 if (! UTF) {
11734 SvREFCNT_dec(cp_list);
11735 REQUIRE_UTF8(flagp);
11736 }
11737
11738 /* This is a kludge to the special casing issues with this
11739 * ligature under /aa. FB05 should fold to FB06, but the call
11740 * above to _to_uni_fold_flags() didn't find this, as it didn't
11741 * use the /aa restriction in order to not miss other folds
11742 * that would be affected. This is the only instance likely to
11743 * ever be a problem in all of Unicode. So special case it. */
11744 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
11745 && ASCII_FOLD_RESTRICTED)
11746 {
11747 value = LATIN_SMALL_LIGATURE_ST;
11748 }
11749 }
11750
11751 len = (UTF) ? UVCHR_SKIP(value) : 1;
11752
11753 *ret = REGNODE_GUTS(pRExC_state, op, len);
11754 FILL_NODE(*ret, op);
11755 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11756 setSTR_LEN(REGNODE_p(*ret), len);
11757 if (len == 1) {
11758 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11759 }
11760 else {
11761 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11762 }
11763
11764 return op;
11765 }
11766 }
11767
11768 if (! has_runtime_dependency) {
11769
11770 /* See if this can be turned into an ANYOFM node. Think about the bit
11771 * patterns in two different bytes. In some positions, the bits in
11772 * each will be 1; and in other positions both will be 0; and in some
11773 * positions the bit will be 1 in one byte, and 0 in the other. Let
11774 * 'n' be the number of positions where the bits differ. We create a
11775 * mask which has exactly 'n' 0 bits, each in a position where the two
11776 * bytes differ. Now take the set of all bytes that when ANDed with
11777 * the mask yield the same result. That set has 2**n elements, and is
11778 * representable by just two 8 bit numbers: the result and the mask.
11779 * Importantly, matching the set can be vectorized by creating a word
11780 * full of the result bytes, and a word full of the mask bytes,
11781 * yielding a significant speed up. Here, see if this node matches
11782 * such a set. As a concrete example consider [01], and the byte
11783 * representing '0' which is 0x30 on ASCII machines. It has the bits
11784 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
11785 * that mask we get 0x30. Any other bytes ANDed yield something else.
11786 * So [01], which is a common usage, is optimizable into ANYOFM, and
11787 * can benefit from the speed up. We can only do this on UTF-8
11788 * invariant bytes, because they have the same bit patterns under UTF-8
11789 * as not. */
11790 PERL_UINT_FAST8_T inverted = 0;
11791
11792 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11793 * EBCDIC */
11794 const PERL_UINT_FAST8_T max_permissible
11795 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11796
11797 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
11798 * that works we will instead later generate an NANYOFM, and invert
11799 * back when through */
11800 if (highest_cp > max_permissible) {
11801 _invlist_invert(cp_list);
11802 inverted = 1;
11803 }
11804
11805 if (invlist_highest(cp_list) <= max_permissible) {
11806 UV this_start, this_end;
11807 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
11808 U8 bits_differing = 0;
11809 Size_t full_cp_count = 0;
11810 bool first_time = TRUE;
11811
11812 /* Go through the bytes and find the bit positions that differ */
11813 invlist_iterinit(cp_list);
11814 while (invlist_iternext(cp_list, &this_start, &this_end)) {
11815 unsigned int i = this_start;
11816
11817 if (first_time) {
11818 if (! UVCHR_IS_INVARIANT(i)) {
11819 goto done_anyofm;
11820 }
11821
11822 first_time = FALSE;
11823 lowest_cp = this_start;
11824
11825 /* We have set up the code point to compare with. Don't
11826 * compare it with itself */
11827 i++;
11828 }
11829
11830 /* Find the bit positions that differ from the lowest code
11831 * point in the node. Keep track of all such positions by
11832 * OR'ing */
11833 for (; i <= this_end; i++) {
11834 if (! UVCHR_IS_INVARIANT(i)) {
11835 goto done_anyofm;
11836 }
11837
11838 bits_differing |= i ^ lowest_cp;
11839 }
11840
11841 full_cp_count += this_end - this_start + 1;
11842 }
11843
11844 /* At the end of the loop, we count how many bits differ from the
11845 * bits in lowest code point, call the count 'd'. If the set we
11846 * found contains 2**d elements, it is the closure of all code
11847 * points that differ only in those bit positions. To convince
11848 * yourself of that, first note that the number in the closure must
11849 * be a power of 2, which we test for. The only way we could have
11850 * that count and it be some differing set, is if we got some code
11851 * points that don't differ from the lowest code point in any
11852 * position, but do differ from each other in some other position.
11853 * That means one code point has a 1 in that position, and another
11854 * has a 0. But that would mean that one of them differs from the
11855 * lowest code point in that position, which possibility we've
11856 * already excluded. */
11857 if ( (inverted || full_cp_count > 1)
11858 && full_cp_count == 1U << PL_bitcount[bits_differing])
11859 {
11860 U8 ANYOFM_mask;
11861
11862 op = ANYOFM + inverted;
11863
11864 /* We need to make the bits that differ be 0's */
11865 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11866
11867 /* The argument is the lowest code point */
11868 *ret = reg1node(pRExC_state, op, lowest_cp);
11869 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11870 }
11871
11872 done_anyofm:
11873 invlist_iterfinish(cp_list);
11874 }
11875
11876 if (inverted) {
11877 _invlist_invert(cp_list);
11878 }
11879
11880 if (op != END) {
11881 return op;
11882 }
11883
11884 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11885 * were invariants, it wasn't inverted, and there is a single range.
11886 * This would be faster than some of the posix nodes we create below
11887 * like /\d/a, but would be twice the size. Without having actually
11888 * measured the gain, khw doesn't think the tradeoff is really worth it
11889 * */
11890 }
11891
11892 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11893 PERL_UINT_FAST8_T type;
11894 SV * intersection = NULL;
11895 SV* d_invlist = NULL;
11896
11897 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
11898 * ones are about the same speed as ANYOF ops, but take less room; the
11899 * ones that have above-Latin1 code point matches are somewhat faster
11900 * than ANYOF. */
11901
11902 for (type = POSIXA; type >= POSIXD; type--) {
11903 int posix_class;
11904
11905 if (type == POSIXL) { /* But not /l posix classes */
11906 continue;
11907 }
11908
11909 for (posix_class = 0;
11910 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11911 posix_class++)
11912 {
11913 SV** our_code_points = &cp_list;
11914 SV** official_code_points;
11915 int try_inverted;
11916
11917 if (type == POSIXA) {
11918 official_code_points = &PL_Posix_ptrs[posix_class];
11919 }
11920 else {
11921 official_code_points = &PL_XPosix_ptrs[posix_class];
11922 }
11923
11924 /* Skip non-existent classes of this type. e.g. \v only has an
11925 * entry in PL_XPosix_ptrs */
11926 if (! *official_code_points) {
11927 continue;
11928 }
11929
11930 /* Try both the regular class, and its inversion */
11931 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11932 bool this_inverted = *invert ^ try_inverted;
11933
11934 if (type != POSIXD) {
11935
11936 /* This class that isn't /d can't match if we have /d
11937 * dependencies */
11938 if (has_runtime_dependency
11939 & HAS_D_RUNTIME_DEPENDENCY)
11940 {
11941 continue;
11942 }
11943 }
11944 else /* is /d */ if (! this_inverted) {
11945
11946 /* /d classes don't match anything non-ASCII below 256
11947 * unconditionally (which cp_list contains) */
11948 _invlist_intersection(cp_list, PL_UpperLatin1,
11949 &intersection);
11950 if (_invlist_len(intersection) != 0) {
11951 continue;
11952 }
11953
11954 SvREFCNT_dec(d_invlist);
11955 d_invlist = invlist_clone(cp_list, NULL);
11956
11957 /* But under UTF-8 it turns into using /u rules. Add
11958 * the things it matches under these conditions so that
11959 * we check below that these are identical to what the
11960 * tested class should match */
11961 if (upper_latin1_only_utf8_matches) {
11962 _invlist_union(
11963 d_invlist,
11964 upper_latin1_only_utf8_matches,
11965 &d_invlist);
11966 }
11967 our_code_points = &d_invlist;
11968 }
11969 else { /* POSIXD, inverted. If this doesn't have this
11970 flag set, it isn't /d. */
11971 if (! ( *anyof_flags
11972 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11973 {
11974 continue;
11975 }
11976
11977 our_code_points = &cp_list;
11978 }
11979
11980 /* Here, have weeded out some things. We want to see if
11981 * the list of characters this node contains
11982 * ('*our_code_points') precisely matches those of the
11983 * class we are currently checking against
11984 * ('*official_code_points'). */
11985 if (_invlistEQ(*our_code_points,
11986 *official_code_points,
11987 try_inverted))
11988 {
11989 /* Here, they precisely match. Optimize this ANYOF
11990 * node into its equivalent POSIX one of the correct
11991 * type, possibly inverted.
11992 *
11993 * Some of these nodes match a single range of
11994 * characters (or [:alpha:] matches two parallel ranges
11995 * on ASCII platforms). The array lookup at execution
11996 * time could be replaced by a range check for such
11997 * nodes. But regnodes are a finite resource, and the
11998 * possible performance boost isn't large, so this
11999 * hasn't been done. An attempt to use just one node
12000 * (and its inverse) to encompass all such cases was
12001 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
12002 * But the shifting/masking it used ended up being
12003 * slower than the array look up, so it was reverted */
12004 op = (try_inverted)
12005 ? type + NPOSIXA - POSIXA
12006 : type;
12007 *ret = reg_node(pRExC_state, op);
12008 FLAGS(REGNODE_p(*ret)) = posix_class;
12009 SvREFCNT_dec(d_invlist);
12010 SvREFCNT_dec(intersection);
12011 return op;
12012 }
12013 }
12014 }
12015 }
12016 SvREFCNT_dec(d_invlist);
12017 SvREFCNT_dec(intersection);
12018 }
12019
12020 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
12021 * in size and speed. Currently, a 20 bit range base (smallest code point
12022 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
12023 * This allows for using it on all of the Unicode code points except for
12024 * the highest plane, which is only for private use code points. khw
12025 * doubts that a bigger delta is likely in real world applications */
12026 if ( single_range
12027 && ! has_runtime_dependency
12028 && *anyof_flags == 0
12029 && start[0] < (1 << ANYOFR_BASE_BITS)
12030 && end[0] - start[0]
12031 < ((1U << (sizeof(ARG1u_LOC(NULL))
12032 * CHARBITS - ANYOFR_BASE_BITS))))
12033
12034 {
12035 U8 low_utf8[UTF8_MAXBYTES+1];
12036 U8 high_utf8[UTF8_MAXBYTES+1];
12037
12038 op = ANYOFR;
12039 *ret = reg1node(pRExC_state, op,
12040 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
12041
12042 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
12043 * efficient ruling out at run time of many possible inputs. */
12044 (void) uvchr_to_utf8(low_utf8, start[0]);
12045 (void) uvchr_to_utf8(high_utf8, end[0]);
12046
12047 /* If all code points share the same first byte, this can be an
12048 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
12049 * quickly rule out many inputs at run-time without having to compute
12050 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
12051 * transformation would not rule out nearly so many things */
12052 if (low_utf8[0] == high_utf8[0]) {
12053 op = ANYOFRb;
12054 OP(REGNODE_p(*ret)) = op;
12055 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
12056 }
12057 else {
12058 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
12059 }
12060
12061 return op;
12062 }
12063
12064 /* If didn't find an optimization and there is no need for a bitmap,
12065 * of the lowest code points, optimize to indicate that */
12066 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
12067 && ! LOC
12068 && ! upper_latin1_only_utf8_matches
12069 && *anyof_flags == 0)
12070 {
12071 U8 low_utf8[UTF8_MAXBYTES+1];
12072 UV highest_cp = invlist_highest(cp_list);
12073
12074 /* Currently the maximum allowed code point by the system is IV_MAX.
12075 * Higher ones are reserved for future internal use. This particular
12076 * regnode can be used for higher ones, but we can't calculate the code
12077 * point of those. IV_MAX suffices though, as it will be a large first
12078 * byte */
12079 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
12080 - low_utf8;
12081
12082 /* We store the lowest possible first byte of the UTF-8 representation,
12083 * using the flags field. This allows for quick ruling out of some
12084 * inputs without having to convert from UTF-8 to code point. For
12085 * EBCDIC, we use I8, as not doing that transformation would not rule
12086 * out nearly so many things */
12087 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
12088
12089 op = ANYOFH;
12090
12091 /* If the first UTF-8 start byte for the highest code point in the
12092 * range is suitably small, we may be able to get an upper bound as
12093 * well */
12094 if (highest_cp <= IV_MAX) {
12095 U8 high_utf8[UTF8_MAXBYTES+1];
12096 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
12097
12098 /* If the lowest and highest are the same, we can get an exact
12099 * first byte instead of a just minimum or even a sequence of exact
12100 * leading bytes. We signal these with different regnodes */
12101 if (low_utf8[0] == high_utf8[0]) {
12102 Size_t len = find_first_differing_byte_pos(low_utf8,
12103 high_utf8,
12104 MIN(low_len, high_len));
12105 if (len == 1) {
12106
12107 /* No need to convert to I8 for EBCDIC as this is an exact
12108 * match */
12109 *anyof_flags = low_utf8[0];
12110
12111 if (high_len == 2) {
12112 /* If the elements matched all have a 2-byte UTF-8
12113 * representation, with the first byte being the same,
12114 * we can use a compact, fast regnode. capable of
12115 * matching any combination of continuation byte
12116 * patterns.
12117 *
12118 * (A similar regnode could be created for the Latin1
12119 * range; the complication being that it could match
12120 * non-UTF8 targets. The internal bitmap would serve
12121 * both cases; with some extra code in regexec.c) */
12122 op = ANYOFHbbm;
12123 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12124 FILL_NODE(*ret, op);
12125 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12126
12127 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12128 * directly based on the continuation byte, without
12129 * needing to convert to code point */
12130 populate_bitmap_from_invlist(
12131 cp_list,
12132
12133 /* The base code point is from the start byte */
12134 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12135 UTF_CONTINUATION_MARK | 0),
12136
12137 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12138 REGNODE_BBM_BITMAP_LEN);
12139 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12140 return op;
12141 }
12142 else {
12143 op = ANYOFHb;
12144 }
12145 }
12146 else {
12147 op = ANYOFHs;
12148 *ret = REGNODE_GUTS(pRExC_state, op,
12149 REGNODE_ARG_LEN(op) + STR_SZ(len));
12150 FILL_NODE(*ret, op);
12151 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12152 = len;
12153 Copy(low_utf8, /* Add the common bytes */
12154 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12155 len, U8);
12156 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12157 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12158 NULL, only_utf8_locale_list);
12159 return op;
12160 }
12161 }
12162 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12163
12164 /* Here, the high byte is not the same as the low, but is small
12165 * enough that its reasonable to have a loose upper bound,
12166 * which is packed in with the strict lower bound. See
12167 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
12168 * platforms, I8 is used. On ASCII platforms I8 is the same
12169 * thing as UTF-8 */
12170
12171 U8 bits = 0;
12172 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12173 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12174 - *anyof_flags;
12175
12176 if (range_diff <= max_range_diff / 8) {
12177 bits = 3;
12178 }
12179 else if (range_diff <= max_range_diff / 4) {
12180 bits = 2;
12181 }
12182 else if (range_diff <= max_range_diff / 2) {
12183 bits = 1;
12184 }
12185 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12186 op = ANYOFHr;
12187 }
12188 }
12189 }
12190
12191 return op;
12192
12193 return_OPFAIL:
12194 op = OPFAIL;
12195 *flagp &= ~(SIMPLE|HASWIDTH);
12196 *ret = reg1node(pRExC_state, op, 0);
12197 return op;
12198
12199 return_SANY:
12200 op = SANY;
12201 *ret = reg_node(pRExC_state, op);
12202 MARK_NAUGHTY(1);
12203 return op;
12204 }
12205
12206 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12207
12208 #ifdef PERL_RE_BUILD_AUX
12209 void
Perl_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)12210 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12211 regnode* const node,
12212 SV* const cp_list,
12213 SV* const runtime_defns,
12214 SV* const only_utf8_locale_list)
12215 {
12216 /* Sets the arg field of an ANYOF-type node 'node', using information about
12217 * the node passed-in. If only the bitmap is needed to determine what
12218 * matches, the arg is set appropriately to either
12219 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12220 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12221 *
12222 * Otherwise, it sets the argument to the count returned by reg_add_data(),
12223 * having allocated and stored an array, av, as follows:
12224 * av[0] stores the inversion list defining this class as far as known at
12225 * this time, or PL_sv_undef if nothing definite is now known.
12226 * av[1] stores the inversion list of code points that match only if the
12227 * current locale is UTF-8, or if none, PL_sv_undef if there is an
12228 * av[2], or no entry otherwise.
12229 * av[2] stores the list of user-defined properties whose subroutine
12230 * definitions aren't known at this time, or no entry if none. */
12231
12232 UV n;
12233
12234 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12235
12236 /* If this is set, the final disposition won't be known until runtime, so
12237 * we can't do any of the compile time optimizations */
12238 if (! runtime_defns) {
12239
12240 /* On plain ANYOF nodes without the possibility of a runtime locale
12241 * making a difference, maybe there's no information to be gleaned
12242 * except for what's in the bitmap */
12243 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12244
12245 /* There are two such cases:
12246 * 1) there is no list of code points matched outside the bitmap
12247 */
12248 if (! cp_list) {
12249 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12250 return;
12251 }
12252
12253 /* 2) the list indicates everything outside the bitmap matches */
12254 if ( invlist_highest(cp_list) == UV_MAX
12255 && invlist_highest_range_start(cp_list)
12256 <= NUM_ANYOF_CODE_POINTS)
12257 {
12258 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12259 return;
12260 }
12261
12262 /* In all other cases there are things outside the bitmap that we
12263 * may need to check at runtime. */
12264 }
12265
12266 /* Here, we have resolved all the possible run-time matches, and they
12267 * are stored in one or both of two possible lists. (While some match
12268 * only under certain runtime circumstances, we know all the possible
12269 * ones for each such circumstance.)
12270 *
12271 * It may very well be that the pattern being compiled contains an
12272 * identical class, already encountered. Reusing that class here saves
12273 * space. Look through all classes so far encountered. */
12274 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12275 for (unsigned int i = 0; i < existing_items; i++) {
12276
12277 /* Only look at auxiliary data of this type */
12278 if (RExC_rxi->data->what[i] != 's') {
12279 continue;
12280 }
12281
12282 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12283 AV * const av = MUTABLE_AV(SvRV(rv));
12284
12285 /* If the already encountered class has data that won't be known
12286 * until runtime (stored in the final element of the array), we
12287 * can't share */
12288 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12289 continue;
12290 }
12291
12292 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12293 false /* no lvalue */);
12294
12295 /* The new and the existing one both have to have or both not
12296 * have this element, for this one to duplicate that one */
12297 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12298 continue;
12299 }
12300
12301 /* If the inversion lists aren't equivalent, can't share */
12302 if (cp_list && ! _invlistEQ(cp_list,
12303 *stored_cp_list_ptr,
12304 FALSE /* don't complement */))
12305 {
12306 continue;
12307 }
12308
12309 /* Similarly for the other list */
12310 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12311 av,
12312 ONLY_LOCALE_MATCHES_INDEX,
12313 false /* no lvalue */);
12314 if ( cBOOL(only_utf8_locale_list)
12315 != cBOOL(stored_only_utf8_locale_list_ptr))
12316 {
12317 continue;
12318 }
12319
12320 if (only_utf8_locale_list && ! _invlistEQ(
12321 only_utf8_locale_list,
12322 *stored_only_utf8_locale_list_ptr,
12323 FALSE /* don't complement */))
12324 {
12325 continue;
12326 }
12327
12328 /* Here, the existence and contents of both compile-time lists
12329 * are identical between the new and existing data. Re-use the
12330 * existing one */
12331 ARG1u_SET(node, i);
12332 return;
12333 } /* end of loop through existing classes */
12334 }
12335
12336 /* Here, we need to create a new auxiliary data element; either because
12337 * this doesn't duplicate an existing one, or we can't tell at this time if
12338 * it eventually will */
12339
12340 AV * const av = newAV();
12341 SV *rv;
12342
12343 if (cp_list) {
12344 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12345 }
12346
12347 /* (Note that if any of this changes, the size calculations in
12348 * S_optimize_regclass() might need to be updated.) */
12349
12350 if (only_utf8_locale_list) {
12351 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12352 SvREFCNT_inc_NN(only_utf8_locale_list));
12353 }
12354
12355 if (runtime_defns) {
12356 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12357 SvREFCNT_inc_NN(runtime_defns));
12358 }
12359
12360 rv = newRV_noinc(MUTABLE_SV(av));
12361 n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12362 RExC_rxi->data->data[n] = (void*)rv;
12363 ARG1u_SET(node, n);
12364 }
12365 #endif /* PERL_RE_BUILD_AUX */
12366
12367 SV *
12368
12369 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_aux_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)12370 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12371 #else
12372 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12373 #endif
12374
12375 {
12376 /* For internal core use only.
12377 * Returns the inversion list for the input 'node' in the regex 'prog'.
12378 * If <doinit> is 'true', will attempt to create the inversion list if not
12379 * already done. If it is created, it will add to the normal inversion
12380 * list any that comes from user-defined properties. It croaks if this
12381 * is called before such a list is ready to be generated, that is when a
12382 * user-defined property has been declared, buyt still not yet defined.
12383 * If <listsvp> is non-null, will return the printable contents of the
12384 * property definition. This can be used to get debugging information
12385 * even before the inversion list exists, by calling this function with
12386 * 'doinit' set to false, in which case the components that will be used
12387 * to eventually create the inversion list are returned (in a printable
12388 * form).
12389 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12390 * store an inversion list of code points that should match only if the
12391 * execution-time locale is a UTF-8 one.
12392 * If <output_invlist> is not NULL, it is where this routine is to store an
12393 * inversion list of the code points that would be instead returned in
12394 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
12395 * when this parameter is used, is just the non-code point data that
12396 * will go into creating the inversion list. This currently should be just
12397 * user-defined properties whose definitions were not known at compile
12398 * time. Using this parameter allows for easier manipulation of the
12399 * inversion list's data by the caller. It is illegal to call this
12400 * function with this parameter set, but not <listsvp>
12401 *
12402 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
12403 * that, in spite of this function's name, the inversion list it returns
12404 * may include the bitmap data as well */
12405
12406 SV *si = NULL; /* Input initialization string */
12407 SV* invlist = NULL;
12408
12409 RXi_GET_DECL_NULL(prog, progi);
12410 const struct reg_data * const data = prog ? progi->data : NULL;
12411
12412 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12413 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12414 #else
12415 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12416 #endif
12417 assert(! output_invlist || listsvp);
12418
12419 if (data && data->count) {
12420 const U32 n = ARG1u(node);
12421
12422 if (data->what[n] == 's') {
12423 SV * const rv = MUTABLE_SV(data->data[n]);
12424 AV * const av = MUTABLE_AV(SvRV(rv));
12425 SV **const ary = AvARRAY(av);
12426
12427 invlist = ary[INVLIST_INDEX];
12428
12429 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12430 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12431 }
12432
12433 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12434 si = ary[DEFERRED_USER_DEFINED_INDEX];
12435 }
12436
12437 if (doinit && (si || invlist)) {
12438 if (si) {
12439 bool user_defined;
12440 SV * msg = newSVpvs_flags("", SVs_TEMP);
12441
12442 SV * prop_definition = handle_user_defined_property(
12443 "", 0, FALSE, /* There is no \p{}, \P{} */
12444 SvPVX_const(si)[1] - '0', /* /i or not has been
12445 stored here for just
12446 this occasion */
12447 TRUE, /* run time */
12448 FALSE, /* This call must find the defn */
12449 si, /* The property definition */
12450 &user_defined,
12451 msg,
12452 0 /* base level call */
12453 );
12454
12455 if (SvCUR(msg)) {
12456 assert(prop_definition == NULL);
12457
12458 Perl_croak(aTHX_ "%" UTF8f,
12459 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12460 }
12461
12462 if (invlist) {
12463 _invlist_union(invlist, prop_definition, &invlist);
12464 SvREFCNT_dec_NN(prop_definition);
12465 }
12466 else {
12467 invlist = prop_definition;
12468 }
12469
12470 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12471 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12472
12473 ary[INVLIST_INDEX] = invlist;
12474 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12475 ? ONLY_LOCALE_MATCHES_INDEX
12476 : INVLIST_INDEX);
12477 si = NULL;
12478 }
12479 }
12480 }
12481 }
12482
12483 /* If requested, return a printable version of what this ANYOF node matches
12484 * */
12485 if (listsvp) {
12486 SV* matches_string = NULL;
12487
12488 /* This function can be called at compile-time, before everything gets
12489 * resolved, in which case we return the currently best available
12490 * information, which is the string that will eventually be used to do
12491 * that resolving, 'si' */
12492 if (si) {
12493 /* Here, we only have 'si' (and possibly some passed-in data in
12494 * 'invlist', which is handled below) If the caller only wants
12495 * 'si', use that. */
12496 if (! output_invlist) {
12497 matches_string = newSVsv(si);
12498 }
12499 else {
12500 /* But if the caller wants an inversion list of the node, we
12501 * need to parse 'si' and place as much as possible in the
12502 * desired output inversion list, making 'matches_string' only
12503 * contain the currently unresolvable things */
12504 const char *si_string = SvPVX(si);
12505 STRLEN remaining = SvCUR(si);
12506 UV prev_cp = 0;
12507 U8 count = 0;
12508
12509 /* Ignore everything before and including the first new-line */
12510 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12511 assert (si_string != NULL);
12512 si_string++;
12513 remaining = SvPVX(si) + SvCUR(si) - si_string;
12514
12515 while (remaining > 0) {
12516
12517 /* The data consists of just strings defining user-defined
12518 * property names, but in prior incarnations, and perhaps
12519 * somehow from pluggable regex engines, it could still
12520 * hold hex code point definitions, all of which should be
12521 * legal (or it wouldn't have gotten this far). Each
12522 * component of a range would be separated by a tab, and
12523 * each range by a new-line. If these are found, instead
12524 * add them to the inversion list */
12525 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
12526 |PERL_SCAN_SILENT_NON_PORTABLE;
12527 STRLEN len = remaining;
12528 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12529
12530 /* If the hex decode routine found something, it should go
12531 * up to the next \n */
12532 if ( *(si_string + len) == '\n') {
12533 if (count) { /* 2nd code point on line */
12534 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12535 }
12536 else {
12537 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12538 }
12539 count = 0;
12540 goto prepare_for_next_iteration;
12541 }
12542
12543 /* If the hex decode was instead for the lower range limit,
12544 * save it, and go parse the upper range limit */
12545 if (*(si_string + len) == '\t') {
12546 assert(count == 0);
12547
12548 prev_cp = cp;
12549 count = 1;
12550 prepare_for_next_iteration:
12551 si_string += len + 1;
12552 remaining -= len + 1;
12553 continue;
12554 }
12555
12556 /* Here, didn't find a legal hex number. Just add the text
12557 * from here up to the next \n, omitting any trailing
12558 * markers. */
12559
12560 remaining -= len;
12561 len = strcspn(si_string,
12562 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12563 remaining -= len;
12564 if (matches_string) {
12565 sv_catpvn(matches_string, si_string, len);
12566 }
12567 else {
12568 matches_string = newSVpvn(si_string, len);
12569 }
12570 sv_catpvs(matches_string, " ");
12571
12572 si_string += len;
12573 if ( remaining
12574 && UCHARAT(si_string)
12575 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12576 {
12577 si_string++;
12578 remaining--;
12579 }
12580 if (remaining && UCHARAT(si_string) == '\n') {
12581 si_string++;
12582 remaining--;
12583 }
12584 } /* end of loop through the text */
12585
12586 assert(matches_string);
12587 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
12588 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12589 }
12590 } /* end of has an 'si' */
12591 }
12592
12593 /* Add the stuff that's already known */
12594 if (invlist) {
12595
12596 /* Again, if the caller doesn't want the output inversion list, put
12597 * everything in 'matches-string' */
12598 if (! output_invlist) {
12599 if ( ! matches_string) {
12600 matches_string = newSVpvs("\n");
12601 }
12602 sv_catsv(matches_string, invlist_contents(invlist,
12603 TRUE /* traditional style */
12604 ));
12605 }
12606 else if (! *output_invlist) {
12607 *output_invlist = invlist_clone(invlist, NULL);
12608 }
12609 else {
12610 _invlist_union(*output_invlist, invlist, output_invlist);
12611 }
12612 }
12613
12614 *listsvp = matches_string;
12615 }
12616
12617 return invlist;
12618 }
12619
12620 /* reg_skipcomment()
12621
12622 Absorbs an /x style # comment from the input stream,
12623 returning a pointer to the first character beyond the comment, or if the
12624 comment terminates the pattern without anything following it, this returns
12625 one past the final character of the pattern (in other words, RExC_end) and
12626 sets the REG_RUN_ON_COMMENT_SEEN flag.
12627
12628 Note it's the callers responsibility to ensure that we are
12629 actually in /x mode
12630
12631 */
12632
12633 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)12634 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12635 {
12636 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12637
12638 assert(*p == '#');
12639
12640 while (p < RExC_end) {
12641 if (*(++p) == '\n') {
12642 return p+1;
12643 }
12644 }
12645
12646 /* we ran off the end of the pattern without ending the comment, so we have
12647 * to add an \n when wrapping */
12648 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12649 return p;
12650 }
12651
12652 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)12653 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12654 char ** p,
12655 const bool force_to_xmod
12656 )
12657 {
12658 /* If the text at the current parse position '*p' is a '(?#...)' comment,
12659 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12660 * is /x whitespace, advance '*p' so that on exit it points to the first
12661 * byte past all such white space and comments */
12662
12663 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12664
12665 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12666
12667 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12668
12669 for (;;) {
12670 if (RExC_end - (*p) >= 3
12671 && *(*p) == '('
12672 && *(*p + 1) == '?'
12673 && *(*p + 2) == '#')
12674 {
12675 while (*(*p) != ')') {
12676 if ((*p) == RExC_end)
12677 FAIL("Sequence (?#... not terminated");
12678 (*p)++;
12679 }
12680 (*p)++;
12681 continue;
12682 }
12683
12684 if (use_xmod) {
12685 const char * save_p = *p;
12686 while ((*p) < RExC_end) {
12687 STRLEN len;
12688 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12689 (*p) += len;
12690 }
12691 else if (*(*p) == '#') {
12692 (*p) = reg_skipcomment(pRExC_state, (*p));
12693 }
12694 else {
12695 break;
12696 }
12697 }
12698 if (*p != save_p) {
12699 continue;
12700 }
12701 }
12702
12703 break;
12704 }
12705
12706 return;
12707 }
12708
12709 /* nextchar()
12710
12711 Advances the parse position by one byte, unless that byte is the beginning
12712 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
12713 those two cases, the parse position is advanced beyond all such comments and
12714 white space.
12715
12716 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12717 */
12718
12719 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)12720 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12721 {
12722 PERL_ARGS_ASSERT_NEXTCHAR;
12723
12724 if (RExC_parse < RExC_end) {
12725 assert( ! UTF
12726 || UTF8_IS_INVARIANT(*RExC_parse)
12727 || UTF8_IS_START(*RExC_parse));
12728
12729 RExC_parse_inc_safe();
12730
12731 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12732 FALSE /* Don't force /x */ );
12733 }
12734 }
12735
12736 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)12737 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12738 {
12739 /* 'size' is the delta number of smallest regnode equivalents to add or
12740 * subtract from the current memory allocated to the regex engine being
12741 * constructed. */
12742
12743 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12744
12745 RExC_size += size;
12746
12747 Renewc(RExC_rxi,
12748 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12749 /* +1 for REG_MAGIC */
12750 char,
12751 regexp_internal);
12752 if ( RExC_rxi == NULL )
12753 FAIL("Regexp out of space");
12754 RXi_SET(RExC_rx, RExC_rxi);
12755
12756 RExC_emit_start = RExC_rxi->program;
12757 if (size > 0) {
12758 Zero(REGNODE_p(RExC_emit), size, regnode);
12759 }
12760 }
12761
12762 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const STRLEN extra_size)12763 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12764 {
12765 /* Allocate a regnode that is (1 + extra_size) times as big as the
12766 * smallest regnode worth of space, and also aligns and increments
12767 * RExC_size appropriately.
12768 *
12769 * It returns the regnode's offset into the regex engine program */
12770
12771 const regnode_offset ret = RExC_emit;
12772
12773 PERL_ARGS_ASSERT_REGNODE_GUTS;
12774
12775 SIZE_ALIGN(RExC_size);
12776 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12777 NODE_ALIGN_FILL(REGNODE_p(ret));
12778 return(ret);
12779 }
12780
12781 #ifdef DEBUGGING
12782
12783 STATIC regnode_offset
S_regnode_guts_debug(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size)12784 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12785 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12786 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12787 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12788 }
12789
12790 #endif
12791
12792
12793
12794 /*
12795 - reg_node - emit a node
12796 */
12797 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)12798 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12799 {
12800 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12801 regnode_offset ptr = ret;
12802
12803 PERL_ARGS_ASSERT_REG_NODE;
12804
12805 assert(REGNODE_ARG_LEN(op) == 0);
12806
12807 FILL_ADVANCE_NODE(ptr, op);
12808 RExC_emit = ptr;
12809 return(ret);
12810 }
12811
12812 /*
12813 - reg1node - emit a node with an argument
12814 */
12815 STATIC regnode_offset /* Location. */
S_reg1node(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)12816 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12817 {
12818 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12819 regnode_offset ptr = ret;
12820
12821 PERL_ARGS_ASSERT_REG1NODE;
12822
12823 /* ANYOF are special cased to allow non-length 1 args */
12824 assert(REGNODE_ARG_LEN(op) == 1);
12825
12826 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12827 RExC_emit = ptr;
12828 return(ret);
12829 }
12830
12831 /*
12832 - regpnode - emit a temporary node with a SV* argument
12833 */
12834 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)12835 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12836 {
12837 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12838 regnode_offset ptr = ret;
12839
12840 PERL_ARGS_ASSERT_REGPNODE;
12841
12842 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12843 RExC_emit = ptr;
12844 return(ret);
12845 }
12846
12847 STATIC regnode_offset
S_reg2node(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)12848 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12849 {
12850 /* emit a node with U32 and I32 arguments */
12851
12852 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12853 regnode_offset ptr = ret;
12854
12855 PERL_ARGS_ASSERT_REG2NODE;
12856
12857 assert(REGNODE_ARG_LEN(op) == 2);
12858
12859 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12860 RExC_emit = ptr;
12861 return(ret);
12862 }
12863
12864 /*
12865 - reginsert - insert an operator in front of already-emitted operand
12866 *
12867 * That means that on exit 'operand' is the offset of the newly inserted
12868 * operator, and the original operand has been relocated.
12869 *
12870 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
12871 * set up NEXT_OFF() of the inserted node if needed. Something like this:
12872 *
12873 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12874 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12875 *
12876 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12877 */
12878 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)12879 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12880 const regnode_offset operand, const U32 depth)
12881 {
12882 regnode *src;
12883 regnode *dst;
12884 regnode *place;
12885 const int offset = REGNODE_ARG_LEN((U8)op);
12886 const int size = NODE_STEP_REGNODE + offset;
12887 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12888
12889 PERL_ARGS_ASSERT_REGINSERT;
12890 PERL_UNUSED_CONTEXT;
12891 PERL_UNUSED_ARG(depth);
12892 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12893 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12894 studying. If this is wrong then we need to adjust RExC_recurse
12895 below like we do with RExC_open_parens/RExC_close_parens. */
12896 change_engine_size(pRExC_state, (Ptrdiff_t) size);
12897 src = REGNODE_p(RExC_emit);
12898 RExC_emit += size;
12899 dst = REGNODE_p(RExC_emit);
12900
12901 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12902 * and [perl #133871] shows this can lead to problems, so skip this
12903 * realignment of parens until a later pass when they are reliable */
12904 if (! IN_PARENS_PASS && RExC_open_parens) {
12905 int paren;
12906 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12907 /* remember that RExC_npar is rex->nparens + 1,
12908 * iow it is 1 more than the number of parens seen in
12909 * the pattern so far. */
12910 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12911 /* note, RExC_open_parens[0] is the start of the
12912 * regex, it can't move. RExC_close_parens[0] is the end
12913 * of the regex, it *can* move. */
12914 if ( paren && RExC_open_parens[paren] >= operand ) {
12915 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12916 RExC_open_parens[paren] += size;
12917 } else {
12918 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12919 }
12920 if ( RExC_close_parens[paren] >= operand ) {
12921 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12922 RExC_close_parens[paren] += size;
12923 } else {
12924 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12925 }
12926 }
12927 }
12928 if (RExC_end_op)
12929 RExC_end_op += size;
12930
12931 while (src > REGNODE_p(operand)) {
12932 StructCopy(--src, --dst, regnode);
12933 }
12934
12935 place = REGNODE_p(operand); /* Op node, where operand used to be. */
12936 src = place + 1; /* NOT REGNODE_AFTER! */
12937 FLAGS(place) = 0;
12938 FILL_NODE(operand, op);
12939
12940 /* Zero out any arguments in the new node */
12941 Zero(src, offset, regnode);
12942 }
12943
12944 /*
12945 - regtail - set the next-pointer at the end of a node chain of p to val. If
12946 that value won't fit in the space available, instead returns FALSE.
12947 (Except asserts if we can't fit in the largest space the regex
12948 engine is designed for.)
12949 - SEE ALSO: regtail_study
12950 */
12951 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)12952 S_regtail(pTHX_ RExC_state_t * pRExC_state,
12953 const regnode_offset p,
12954 const regnode_offset val,
12955 const U32 depth)
12956 {
12957 regnode_offset scan;
12958 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12959
12960 PERL_ARGS_ASSERT_REGTAIL;
12961 #ifndef DEBUGGING
12962 PERL_UNUSED_ARG(depth);
12963 #endif
12964
12965 /* The final node in the chain is the first one with a nonzero next pointer
12966 * */
12967 scan = (regnode_offset) p;
12968 for (;;) {
12969 regnode * const temp = regnext(REGNODE_p(scan));
12970 DEBUG_PARSE_r({
12971 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12972 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12973 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
12974 SvPV_nolen_const(RExC_mysv), scan,
12975 (temp == NULL ? "->" : ""),
12976 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12977 );
12978 });
12979 if (temp == NULL)
12980 break;
12981 scan = REGNODE_OFFSET(temp);
12982 }
12983
12984 /* Populate this node's next pointer */
12985 assert(val >= scan);
12986 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12987 assert((UV) (val - scan) <= U32_MAX);
12988 ARG1u_SET(REGNODE_p(scan), val - scan);
12989 }
12990 else {
12991 if (val - scan > U16_MAX) {
12992 /* Populate this with something that won't loop and will likely
12993 * lead to a crash if the caller ignores the failure return, and
12994 * execution continues */
12995 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12996 return FALSE;
12997 }
12998 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12999 }
13000
13001 return TRUE;
13002 }
13003
13004 #ifdef DEBUGGING
13005 /*
13006 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13007 - Look for optimizable sequences at the same time.
13008 - currently only looks for EXACT chains.
13009
13010 This is experimental code. The idea is to use this routine to perform
13011 in place optimizations on branches and groups as they are constructed,
13012 with the long term intention of removing optimization from study_chunk so
13013 that it is purely analytical.
13014
13015 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13016 to control which is which.
13017
13018 This used to return a value that was ignored. It was a problem that it is
13019 #ifdef'd to be another function that didn't return a value. khw has changed it
13020 so both currently return a pass/fail return.
13021
13022 */
13023 /* TODO: All four parms should be const */
13024
13025 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)13026 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
13027 const regnode_offset val, U32 depth)
13028 {
13029 regnode_offset scan;
13030 U8 exact = PSEUDO;
13031 #ifdef EXPERIMENTAL_INPLACESCAN
13032 I32 min = 0;
13033 #endif
13034 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13035
13036 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13037
13038
13039 /* Find last node. */
13040
13041 scan = p;
13042 for (;;) {
13043 regnode * const temp = regnext(REGNODE_p(scan));
13044 #ifdef EXPERIMENTAL_INPLACESCAN
13045 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
13046 bool unfolded_multi_char; /* Unexamined in this routine */
13047 if (join_exact(pRExC_state, scan, &min,
13048 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
13049 return TRUE; /* Was return EXACT */
13050 }
13051 #endif
13052 if ( exact ) {
13053 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
13054 if (exact == PSEUDO )
13055 exact= OP(REGNODE_p(scan));
13056 else if (exact != OP(REGNODE_p(scan)) )
13057 exact= 0;
13058 }
13059 else if (OP(REGNODE_p(scan)) != NOTHING) {
13060 exact= 0;
13061 }
13062 }
13063 DEBUG_PARSE_r({
13064 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13065 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
13066 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
13067 SvPV_nolen_const(RExC_mysv),
13068 scan,
13069 REGNODE_NAME(exact));
13070 });
13071 if (temp == NULL)
13072 break;
13073 scan = REGNODE_OFFSET(temp);
13074 }
13075 DEBUG_PARSE_r({
13076 DEBUG_PARSE_MSG("");
13077 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
13078 Perl_re_printf( aTHX_
13079 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
13080 SvPV_nolen_const(RExC_mysv),
13081 (IV)val,
13082 (IV)(val - scan)
13083 );
13084 });
13085 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
13086 assert((UV) (val - scan) <= U32_MAX);
13087 ARG1u_SET(REGNODE_p(scan), val - scan);
13088 }
13089 else {
13090 if (val - scan > U16_MAX) {
13091 /* Populate this with something that won't loop and will likely
13092 * lead to a crash if the caller ignores the failure return, and
13093 * execution continues */
13094 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
13095 return FALSE;
13096 }
13097 NEXT_OFF(REGNODE_p(scan)) = val - scan;
13098 }
13099
13100 return TRUE; /* Was 'return exact' */
13101 }
13102 #endif
13103
13104
13105 #ifdef PERL_RE_BUILD_AUX
13106 SV*
Perl_get_ANYOFM_contents(pTHX_ const regnode * n)13107 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13108
13109 /* Returns an inversion list of all the code points matched by the
13110 * ANYOFM/NANYOFM node 'n' */
13111
13112 SV * cp_list = _new_invlist(-1);
13113 const U8 lowest = (U8) ARG1u(n);
13114 unsigned int i;
13115 U8 count = 0;
13116 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13117
13118 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13119
13120 /* Starting with the lowest code point, any code point that ANDed with the
13121 * mask yields the lowest code point is in the set */
13122 for (i = lowest; i <= 0xFF; i++) {
13123 if ((i & FLAGS(n)) == ARG1u(n)) {
13124 cp_list = add_cp_to_invlist(cp_list, i);
13125 count++;
13126
13127 /* We know how many code points (a power of two) that are in the
13128 * set. No use looking once we've got that number */
13129 if (count >= needed) break;
13130 }
13131 }
13132
13133 if (OP(n) == NANYOFM) {
13134 _invlist_invert(cp_list);
13135 }
13136 return cp_list;
13137 }
13138
13139 SV *
Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n)13140 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13141 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13142
13143 SV * cp_list = NULL;
13144 populate_invlist_from_bitmap(
13145 ((struct regnode_bbm *) n)->bitmap,
13146 REGNODE_BBM_BITMAP_LEN * CHARBITS,
13147 &cp_list,
13148
13149 /* The base cp is from the start byte plus a zero continuation */
13150 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13151 UTF_CONTINUATION_MARK | 0));
13152 return cp_list;
13153 }
13154 #endif /* PERL_RE_BUILD_AUX */
13155
13156
13157 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)13158 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13159 { /* Assume that RE_INTUIT is set */
13160 /* Returns an SV containing a string that must appear in the target for it
13161 * to match, or NULL if nothing is known that must match.
13162 *
13163 * CAUTION: the SV can be freed during execution of the regex engine */
13164
13165 struct regexp *const prog = ReANY(r);
13166 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13167
13168 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13169 PERL_UNUSED_CONTEXT;
13170
13171 DEBUG_COMPILE_r(
13172 {
13173 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13174 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13175 ? prog->check_utf8 : prog->check_substr);
13176
13177 if (!PL_colorset) reginitcolors();
13178 Perl_re_printf( aTHX_
13179 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13180 PL_colors[4],
13181 RX_UTF8(r) ? "utf8 " : "",
13182 PL_colors[5], PL_colors[0],
13183 s,
13184 PL_colors[1],
13185 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13186 }
13187 } );
13188
13189 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13190 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13191 }
13192
13193 /*
13194 pregfree()
13195
13196 handles refcounting and freeing the perl core regexp structure. When
13197 it is necessary to actually free the structure the first thing it
13198 does is call the 'free' method of the regexp_engine associated to
13199 the regexp, allowing the handling of the void *pprivate; member
13200 first. (This routine is not overridable by extensions, which is why
13201 the extensions free is called first.)
13202
13203 See regdupe and regdupe_internal if you change anything here.
13204 */
13205 #ifndef PERL_IN_XSUB_RE
13206 void
Perl_pregfree(pTHX_ REGEXP * r)13207 Perl_pregfree(pTHX_ REGEXP *r)
13208 {
13209 SvREFCNT_dec(r);
13210 }
13211
13212 void
Perl_pregfree2(pTHX_ REGEXP * rx)13213 Perl_pregfree2(pTHX_ REGEXP *rx)
13214 {
13215 struct regexp *const r = ReANY(rx);
13216 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13217
13218 PERL_ARGS_ASSERT_PREGFREE2;
13219
13220 if (! r)
13221 return;
13222
13223 if (r->mother_re) {
13224 ReREFCNT_dec(r->mother_re);
13225 } else {
13226 CALLREGFREE_PVT(rx); /* free the private data */
13227 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13228 }
13229 if (r->substrs) {
13230 int i;
13231 for (i = 0; i < 2; i++) {
13232 SvREFCNT_dec(r->substrs->data[i].substr);
13233 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13234 }
13235 Safefree(r->substrs);
13236 }
13237 RX_MATCH_COPY_FREE(rx);
13238 #ifdef PERL_ANY_COW
13239 SvREFCNT_dec(r->saved_copy);
13240 #endif
13241 Safefree(RXp_OFFSp(r));
13242 if (r->logical_to_parno) {
13243 Safefree(r->logical_to_parno);
13244 Safefree(r->parno_to_logical);
13245 Safefree(r->parno_to_logical_next);
13246 }
13247
13248 SvREFCNT_dec(r->qr_anoncv);
13249 if (r->recurse_locinput)
13250 Safefree(r->recurse_locinput);
13251 }
13252
13253
13254 /* reg_temp_copy()
13255
13256 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13257 except that dsv will be created if NULL.
13258
13259 This function is used in two main ways. First to implement
13260 $r = qr/....; $s = $$r;
13261
13262 Secondly, it is used as a hacky workaround to the structural issue of
13263 match results
13264 being stored in the regexp structure which is in turn stored in
13265 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13266 could be PL_curpm in multiple contexts, and could require multiple
13267 result sets being associated with the pattern simultaneously, such
13268 as when doing a recursive match with (??{$qr})
13269
13270 The solution is to make a lightweight copy of the regexp structure
13271 when a qr// is returned from the code executed by (??{$qr}) this
13272 lightweight copy doesn't actually own any of its data except for
13273 the starp/end and the actual regexp structure itself.
13274
13275 */
13276
13277
13278 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)13279 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13280 {
13281 struct regexp *drx;
13282 struct regexp *const srx = ReANY(ssv);
13283 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13284
13285 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13286
13287 if (!dsv)
13288 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13289 else {
13290 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13291
13292 /* our only valid caller, sv_setsv_flags(), should have done
13293 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13294 assert(!SvOOK(dsv));
13295 assert(!SvIsCOW(dsv));
13296 assert(!SvROK(dsv));
13297
13298 if (SvPVX_const(dsv)) {
13299 if (SvLEN(dsv))
13300 Safefree(SvPVX(dsv));
13301 SvPVX(dsv) = NULL;
13302 }
13303 SvLEN_set(dsv, 0);
13304 SvCUR_set(dsv, 0);
13305 SvOK_off((SV *)dsv);
13306
13307 if (islv) {
13308 /* For PVLVs, the head (sv_any) points to an XPVLV, while
13309 * the LV's xpvlenu_rx will point to a regexp body, which
13310 * we allocate here */
13311 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13312 assert(!SvPVX(dsv));
13313 /* We "steal" the body from the newly allocated SV temp, changing
13314 * the pointer in its HEAD to NULL. We then change its type to
13315 * SVt_NULL so that when we immediately release its only reference,
13316 * no memory deallocation happens.
13317 *
13318 * The body will eventually be freed (from the PVLV) either in
13319 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13320 * the regexp body needs to be removed)
13321 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13322 * the PVLV itself is deallocated). */
13323 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13324 temp->sv_any = NULL;
13325 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13326 SvREFCNT_dec_NN(temp);
13327 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13328 ing below will not set it. */
13329 SvCUR_set(dsv, SvCUR(ssv));
13330 }
13331 }
13332 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13333 sv_force_normal(sv) is called. */
13334 SvFAKE_on(dsv);
13335 drx = ReANY(dsv);
13336
13337 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13338 SvPV_set(dsv, RX_WRAPPED(ssv));
13339 /* We share the same string buffer as the original regexp, on which we
13340 hold a reference count, incremented when mother_re is set below.
13341 The string pointer is copied here, being part of the regexp struct.
13342 */
13343 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13344 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13345
13346 if (!islv)
13347 SvLEN_set(dsv, 0);
13348 if (RXp_OFFSp(srx)) {
13349 const I32 npar = srx->nparens+1;
13350 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13351 }
13352 if (srx->substrs) {
13353 int i;
13354 Newx(drx->substrs, 1, struct reg_substr_data);
13355 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13356
13357 for (i = 0; i < 2; i++) {
13358 SvREFCNT_inc_void(drx->substrs->data[i].substr);
13359 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13360 }
13361
13362 /* check_substr and check_utf8, if non-NULL, point to either their
13363 anchored or float namesakes, and don't hold a second reference. */
13364 }
13365 if (srx->logical_to_parno) {
13366 NewCopy(srx->logical_to_parno,
13367 drx->logical_to_parno,
13368 srx->nparens+1, I32);
13369 NewCopy(srx->parno_to_logical,
13370 drx->parno_to_logical,
13371 srx->nparens+1, I32);
13372 NewCopy(srx->parno_to_logical_next,
13373 drx->parno_to_logical_next,
13374 srx->nparens+1, I32);
13375 } else {
13376 drx->logical_to_parno = NULL;
13377 drx->parno_to_logical = NULL;
13378 drx->parno_to_logical_next = NULL;
13379 }
13380 drx->logical_nparens = srx->logical_nparens;
13381
13382 RX_MATCH_COPIED_off(dsv);
13383 #ifdef PERL_ANY_COW
13384 RXp_SAVED_COPY(drx) = NULL;
13385 #endif
13386 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13387 SvREFCNT_inc_void(drx->qr_anoncv);
13388 if (srx->recurse_locinput)
13389 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13390
13391 return dsv;
13392 }
13393 #endif
13394
13395
13396 /* regfree_internal()
13397
13398 Free the private data in a regexp. This is overloadable by
13399 extensions. Perl takes care of the regexp structure in pregfree(),
13400 this covers the *pprivate pointer which technically perl doesn't
13401 know about, however of course we have to handle the
13402 regexp_internal structure when no extension is in use.
13403
13404 Note this is called before freeing anything in the regexp
13405 structure.
13406 */
13407
13408 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)13409 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13410 {
13411 struct regexp *const r = ReANY(rx);
13412 RXi_GET_DECL(r, ri);
13413 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13414
13415 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13416
13417 if (! ri) {
13418 return;
13419 }
13420
13421 DEBUG_COMPILE_r({
13422 if (!PL_colorset)
13423 reginitcolors();
13424 {
13425 SV *dsv= sv_newmortal();
13426 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13427 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13428 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13429 PL_colors[4], PL_colors[5], s);
13430 }
13431 });
13432
13433 if (ri->code_blocks)
13434 S_free_codeblocks(aTHX_ ri->code_blocks);
13435
13436 if (ri->data) {
13437 int n = ri->data->count;
13438
13439 while (--n >= 0) {
13440 /* If you add a ->what type here, update the comment in regcomp.h */
13441 switch (ri->data->what[n]) {
13442 case 'a':
13443 case 'r':
13444 case 's':
13445 case 'S':
13446 case 'u':
13447 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13448 break;
13449 case 'f':
13450 Safefree(ri->data->data[n]);
13451 break;
13452 case 'l':
13453 case 'L':
13454 break;
13455 case 'T':
13456 { /* Aho Corasick add-on structure for a trie node.
13457 Used in stclass optimization only */
13458 U32 refcount;
13459 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13460 OP_REFCNT_LOCK;
13461 refcount = --aho->refcount;
13462 OP_REFCNT_UNLOCK;
13463 if ( !refcount ) {
13464 PerlMemShared_free(aho->states);
13465 PerlMemShared_free(aho->fail);
13466 /* do this last!!!! */
13467 PerlMemShared_free(ri->data->data[n]);
13468 /* we should only ever get called once, so
13469 * assert as much, and also guard the free
13470 * which /might/ happen twice. At the least
13471 * it will make code anlyzers happy and it
13472 * doesn't cost much. - Yves */
13473 assert(ri->regstclass);
13474 if (ri->regstclass) {
13475 PerlMemShared_free(ri->regstclass);
13476 ri->regstclass = 0;
13477 }
13478 }
13479 }
13480 break;
13481 case 't':
13482 {
13483 /* trie structure. */
13484 U32 refcount;
13485 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13486 OP_REFCNT_LOCK;
13487 refcount = --trie->refcount;
13488 OP_REFCNT_UNLOCK;
13489 if ( !refcount ) {
13490 PerlMemShared_free(trie->charmap);
13491 PerlMemShared_free(trie->states);
13492 PerlMemShared_free(trie->trans);
13493 if (trie->bitmap)
13494 PerlMemShared_free(trie->bitmap);
13495 if (trie->jump)
13496 PerlMemShared_free(trie->jump);
13497 if (trie->j_before_paren)
13498 PerlMemShared_free(trie->j_before_paren);
13499 if (trie->j_after_paren)
13500 PerlMemShared_free(trie->j_after_paren);
13501 PerlMemShared_free(trie->wordinfo);
13502 /* do this last!!!! */
13503 PerlMemShared_free(ri->data->data[n]);
13504 }
13505 }
13506 break;
13507 case '%':
13508 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13509 * always returns non-zero, this should only ever happen in the
13510 * 0 index */
13511 assert(n==0);
13512 break;
13513 default:
13514 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13515 ri->data->what[n]);
13516 }
13517 }
13518 Safefree(ri->data->what);
13519 Safefree(ri->data);
13520 }
13521
13522 Safefree(ri);
13523 }
13524
13525 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
13526
13527 /*
13528 =for apidoc re_dup_guts
13529 Duplicate a regexp.
13530
13531 This routine is expected to clone a given regexp structure. It is only
13532 compiled under USE_ITHREADS.
13533
13534 After all of the core data stored in struct regexp is duplicated
13535 the C<regexp_engine.dupe> method is used to copy any private data
13536 stored in the *pprivate pointer. This allows extensions to handle
13537 any duplication they need to do.
13538
13539 =cut
13540
13541 See pregfree() and regfree_internal() if you change anything here.
13542 */
13543 #if defined(USE_ITHREADS)
13544 #ifndef PERL_IN_XSUB_RE
13545 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)13546 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13547 {
13548 I32 npar;
13549 const struct regexp *r = ReANY(sstr);
13550 struct regexp *ret = ReANY(dstr);
13551
13552 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13553
13554 npar = r->nparens+1;
13555 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13556
13557 if (ret->substrs) {
13558 /* Do it this way to avoid reading from *r after the StructCopy().
13559 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13560 cache, it doesn't matter. */
13561 int i;
13562 const bool anchored = r->check_substr
13563 ? r->check_substr == r->substrs->data[0].substr
13564 : r->check_utf8 == r->substrs->data[0].utf8_substr;
13565 Newx(ret->substrs, 1, struct reg_substr_data);
13566 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13567
13568 for (i = 0; i < 2; i++) {
13569 ret->substrs->data[i].substr =
13570 sv_dup_inc(ret->substrs->data[i].substr, param);
13571 ret->substrs->data[i].utf8_substr =
13572 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13573 }
13574
13575 /* check_substr and check_utf8, if non-NULL, point to either their
13576 anchored or float namesakes, and don't hold a second reference. */
13577
13578 if (ret->check_substr) {
13579 if (anchored) {
13580 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13581
13582 ret->check_substr = ret->substrs->data[0].substr;
13583 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13584 } else {
13585 assert(r->check_substr == r->substrs->data[1].substr);
13586 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
13587
13588 ret->check_substr = ret->substrs->data[1].substr;
13589 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13590 }
13591 } else if (ret->check_utf8) {
13592 if (anchored) {
13593 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13594 } else {
13595 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13596 }
13597 }
13598 }
13599
13600 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13601 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13602 if (r->recurse_locinput)
13603 Newx(ret->recurse_locinput, r->nparens + 1, char *);
13604
13605 if (ret->pprivate)
13606 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13607
13608 if (RX_MATCH_COPIED(dstr))
13609 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13610 else
13611 RXp_SUBBEG(ret) = NULL;
13612 #ifdef PERL_ANY_COW
13613 RXp_SAVED_COPY(ret) = NULL;
13614 #endif
13615
13616 if (r->logical_to_parno) {
13617 /* we use total_parens for all three just for symmetry */
13618 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13619 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13620 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13621 } else {
13622 ret->logical_to_parno = NULL;
13623 ret->parno_to_logical = NULL;
13624 ret->parno_to_logical_next = NULL;
13625 }
13626
13627 ret->logical_nparens = r->logical_nparens;
13628
13629 /* Whether mother_re be set or no, we need to copy the string. We
13630 cannot refrain from copying it when the storage points directly to
13631 our mother regexp, because that's
13632 1: a buffer in a different thread
13633 2: something we no longer hold a reference on
13634 so we need to copy it locally. */
13635 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13636 /* set malloced length to a non-zero value so it will be freed
13637 * (otherwise in combination with SVf_FAKE it looks like an alien
13638 * buffer). It doesn't have to be the actual malloced size, since it
13639 * should never be grown */
13640 SvLEN_set(dstr, SvCUR(sstr)+1);
13641 ret->mother_re = NULL;
13642 }
13643 #endif /* PERL_IN_XSUB_RE */
13644
13645 /*
13646 regdupe_internal()
13647
13648 This is the internal complement to regdupe() which is used to copy
13649 the structure pointed to by the *pprivate pointer in the regexp.
13650 This is the core version of the extension overridable cloning hook.
13651 The regexp structure being duplicated will be copied by perl prior
13652 to this and will be provided as the regexp *r argument, however
13653 with the /old/ structures pprivate pointer value. Thus this routine
13654 may override any copying normally done by perl.
13655
13656 It returns a pointer to the new regexp_internal structure.
13657 */
13658
13659 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)13660 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13661 {
13662 struct regexp *const r = ReANY(rx);
13663 regexp_internal *reti;
13664 int len;
13665 RXi_GET_DECL(r, ri);
13666
13667 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13668
13669 len = ProgLen(ri);
13670
13671 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13672 char, regexp_internal);
13673 Copy(ri->program, reti->program, len+1, regnode);
13674
13675
13676 if (ri->code_blocks) {
13677 int n;
13678 Newx(reti->code_blocks, 1, struct reg_code_blocks);
13679 Newx(reti->code_blocks->cb, ri->code_blocks->count,
13680 struct reg_code_block);
13681 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13682 ri->code_blocks->count, struct reg_code_block);
13683 for (n = 0; n < ri->code_blocks->count; n++)
13684 reti->code_blocks->cb[n].src_regex = (REGEXP*)
13685 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13686 reti->code_blocks->count = ri->code_blocks->count;
13687 reti->code_blocks->refcnt = 1;
13688 }
13689 else
13690 reti->code_blocks = NULL;
13691
13692 reti->regstclass = NULL;
13693
13694 if (ri->data) {
13695 struct reg_data *d;
13696 const int count = ri->data->count;
13697 int i;
13698
13699 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13700 char, struct reg_data);
13701 Newx(d->what, count, U8);
13702
13703 d->count = count;
13704 for (i = 0; i < count; i++) {
13705 d->what[i] = ri->data->what[i];
13706 switch (d->what[i]) {
13707 /* see also regcomp.h and regfree_internal() */
13708 case 'a': /* actually an AV, but the dup function is identical.
13709 values seem to be "plain sv's" generally. */
13710 case 'r': /* a compiled regex (but still just another SV) */
13711 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13712 this use case should go away, the code could have used
13713 'a' instead - see S_set_ANYOF_arg() for array contents. */
13714 case 'S': /* actually an SV, but the dup function is identical. */
13715 case 'u': /* actually an HV, but the dup function is identical.
13716 values are "plain sv's" */
13717 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13718 break;
13719 case 'f':
13720 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13721 * patterns which could start with several different things. Pre-TRIE
13722 * this was more important than it is now, however this still helps
13723 * in some places, for instance /x?a+/ might produce a SSC equivalent
13724 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13725 * in regexec.c
13726 */
13727 /* This is cheating. */
13728 Newx(d->data[i], 1, regnode_ssc);
13729 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13730 reti->regstclass = (regnode*)d->data[i];
13731 break;
13732 case 'T':
13733 /* AHO-CORASICK fail table */
13734 /* Trie stclasses are readonly and can thus be shared
13735 * without duplication. We free the stclass in pregfree
13736 * when the corresponding reg_ac_data struct is freed.
13737 */
13738 reti->regstclass= ri->regstclass;
13739 /* FALLTHROUGH */
13740 case 't':
13741 /* TRIE transition table */
13742 OP_REFCNT_LOCK;
13743 ((reg_trie_data*)ri->data->data[i])->refcount++;
13744 OP_REFCNT_UNLOCK;
13745 /* FALLTHROUGH */
13746 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13747 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13748 is not from another regexp */
13749 d->data[i] = ri->data->data[i];
13750 break;
13751 case '%':
13752 /* this is a placeholder type, it exists purely so that
13753 * reg_add_data always returns a non-zero value, this type of
13754 * entry should ONLY be present in the 0 slot of the array */
13755 assert(i == 0);
13756 d->data[i]= ri->data->data[i];
13757 break;
13758 default:
13759 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13760 ri->data->what[i]);
13761 }
13762 }
13763
13764 reti->data = d;
13765 }
13766 else
13767 reti->data = NULL;
13768
13769 if (ri->regstclass && !reti->regstclass) {
13770 /* Assume that the regstclass is a regnode which is inside of the
13771 * program which we have to copy over */
13772 regnode *node= ri->regstclass;
13773 assert(node >= ri->program && (node - ri->program) < len);
13774 reti->regstclass = reti->program + (node - ri->program);
13775 }
13776
13777
13778 reti->name_list_idx = ri->name_list_idx;
13779
13780 SetProgLen(reti, len);
13781
13782 return (void*)reti;
13783 }
13784
13785 #endif /* USE_ITHREADS */
13786
13787 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)13788 S_re_croak(pTHX_ bool utf8, const char* pat,...)
13789 {
13790 va_list args;
13791 STRLEN len = strlen(pat);
13792 char buf[512];
13793 SV *msv;
13794 const char *message;
13795
13796 PERL_ARGS_ASSERT_RE_CROAK;
13797
13798 if (len > 510)
13799 len = 510;
13800 Copy(pat, buf, len , char);
13801 buf[len] = '\n';
13802 buf[len + 1] = '\0';
13803 va_start(args, pat);
13804 msv = vmess(buf, &args);
13805 va_end(args);
13806 message = SvPV_const(msv, len);
13807 if (len > 512)
13808 len = 512;
13809 Copy(message, buf, len , char);
13810 /* len-1 to avoid \n */
13811 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13812 }
13813
13814 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13815
13816 #ifndef PERL_IN_XSUB_RE
13817 void
Perl_save_re_context(pTHX)13818 Perl_save_re_context(pTHX)
13819 {
13820 I32 nparens = -1;
13821 I32 i;
13822
13823 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13824
13825 if (PL_curpm) {
13826 const REGEXP * const rx = PM_GETRE(PL_curpm);
13827 if (rx)
13828 nparens = RX_NPARENS(rx);
13829 }
13830
13831 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13832 * that PL_curpm will be null, but that utf8.pm and the modules it
13833 * loads will only use $1..$3.
13834 * The t/porting/re_context.t test file checks this assumption.
13835 */
13836 if (nparens == -1)
13837 nparens = 3;
13838
13839 for (i = 1; i <= nparens; i++) {
13840 char digits[TYPE_CHARS(long)];
13841 const STRLEN len = my_snprintf(digits, sizeof(digits),
13842 "%lu", (long)i);
13843 GV *const *const gvp
13844 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13845
13846 if (gvp) {
13847 GV * const gv = *gvp;
13848 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13849 save_scalar(gv);
13850 }
13851 }
13852 }
13853 #endif
13854
13855 #ifndef PERL_IN_XSUB_RE
13856
13857 # include "uni_keywords.h"
13858
13859 void
Perl_init_uniprops(pTHX)13860 Perl_init_uniprops(pTHX)
13861 {
13862
13863 # ifdef DEBUGGING
13864 char * dump_len_string;
13865
13866 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13867 if ( ! dump_len_string
13868 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13869 {
13870 PL_dump_re_max_len = 60; /* A reasonable default */
13871 }
13872 # endif
13873
13874 PL_user_def_props = newHV();
13875
13876 # ifdef USE_ITHREADS
13877
13878 HvSHAREKEYS_off(PL_user_def_props);
13879 PL_user_def_props_aTHX = aTHX;
13880
13881 # endif
13882
13883 /* Set up the inversion list interpreter-level variables */
13884
13885 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13886 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13887 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13888 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13889 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13890 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13891 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13892 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13893 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13894 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13895 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13896 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13897 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13898 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13899 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13900 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13901
13902 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13903 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13904 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13905 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13906 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13907 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13908 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13909 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13910 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13911 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13912 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13913 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13914 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13915 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13916 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13917 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13918
13919 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13920 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13921 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13922 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13923 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13924
13925 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13926 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13927 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13928 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13929
13930 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13931
13932 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13933 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13934
13935 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13936 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13937
13938 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13939 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13940 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13941 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13942 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13943 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13944 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13945 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13946 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13947 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13948 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13949 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13950 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13951 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13952
13953 # ifdef UNI_XIDC
13954 /* The below are used only by deprecated functions. They could be removed */
13955 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13956 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13957 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13958 # endif
13959 }
13960
13961 /* These four functions are compiled only in regcomp.c, where they have access
13962 * to the data they return. They are a way for re_comp.c to get access to that
13963 * data without having to compile the whole data structures. */
13964
13965 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)13966 Perl_do_uniprop_match(const char * const key, const U16 key_len)
13967 {
13968 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13969
13970 return match_uniprop((U8 *) key, key_len);
13971 }
13972
13973 SV *
Perl_get_prop_definition(pTHX_ const int table_index)13974 Perl_get_prop_definition(pTHX_ const int table_index)
13975 {
13976 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13977
13978 /* Create and return the inversion list */
13979 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13980 }
13981
13982 const char * const *
Perl_get_prop_values(const int table_index)13983 Perl_get_prop_values(const int table_index)
13984 {
13985 PERL_ARGS_ASSERT_GET_PROP_VALUES;
13986
13987 return UNI_prop_value_ptrs[table_index];
13988 }
13989
13990 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)13991 Perl_get_deprecated_property_msg(const Size_t warning_offset)
13992 {
13993 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13994
13995 return deprecated_property_msgs[warning_offset];
13996 }
13997
13998 # if 0
13999
14000 This code was mainly added for backcompat to give a warning for non-portable
14001 code points in user-defined properties. But experiments showed that the
14002 warning in earlier perls were only omitted on overflow, which should be an
14003 error, so there really isnt a backcompat issue, and actually adding the
14004 warning when none was present before might cause breakage, for little gain. So
14005 khw left this code in, but not enabled. Tests were never added.
14006
14007 embed.fnc entry:
14008 Ei |const char *|get_extended_utf8_msg|const UV cp
14009
14010 PERL_STATIC_INLINE const char *
14011 S_get_extended_utf8_msg(pTHX_ const UV cp)
14012 {
14013 U8 dummy[UTF8_MAXBYTES + 1];
14014 HV *msgs;
14015 SV **msg;
14016
14017 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
14018 &msgs);
14019
14020 msg = hv_fetchs(msgs, "text", 0);
14021 assert(msg);
14022
14023 (void) sv_2mortal((SV *) msgs);
14024
14025 return SvPVX(*msg);
14026 }
14027
14028 # endif
14029 #endif /* end of ! PERL_IN_XSUB_RE */
14030
14031 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)14032 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
14033 const bool ignore_case)
14034 {
14035 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
14036 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
14037 * because nothing outside of ASCII will match. Use /m because the input
14038 * string may be a bunch of lines strung together.
14039 *
14040 * Also sets up the debugging info */
14041
14042 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
14043 U32 rx_flags;
14044 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
14045 REGEXP * subpattern_re;
14046 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14047
14048 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
14049
14050 if (ignore_case) {
14051 flags |= PMf_FOLD;
14052 }
14053 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
14054
14055 /* Like in op.c, we copy the compile time pm flags to the rx ones */
14056 rx_flags = flags & RXf_PMf_COMPILETIME;
14057
14058 #ifndef PERL_IN_XSUB_RE
14059 /* Use the core engine if this file is regcomp.c. That means no
14060 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
14061 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14062 &PL_core_reg_engine,
14063 NULL, NULL,
14064 rx_flags, flags);
14065 #else
14066 if (isDEBUG_WILDCARD) {
14067 /* Use the special debugging engine if this file is re_comp.c and wants
14068 * to output the wildcard matching. This uses whatever
14069 * 'use re "Debug ..." is in effect */
14070 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14071 &my_reg_engine,
14072 NULL, NULL,
14073 rx_flags, flags);
14074 }
14075 else {
14076 /* Use the special wildcard engine if this file is re_comp.c and
14077 * doesn't want to output the wildcard matching. This uses whatever
14078 * 'use re "Debug ..." is in effect for compilation, but this engine
14079 * structure has been set up so that it uses the core engine for
14080 * execution, so no execution debugging as a result of re.pm will be
14081 * displayed. */
14082 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
14083 &wild_reg_engine,
14084 NULL, NULL,
14085 rx_flags, flags);
14086 /* XXX The above has the effect that any user-supplied regex engine
14087 * won't be called for matching wildcards. That might be good, or bad.
14088 * It could be changed in several ways. The reason it is done the
14089 * current way is to avoid having to save and restore
14090 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
14091 * could be used. Another suggestion is to keep the authoritative
14092 * value of the debug flags in a thread-local variable and add set/get
14093 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
14094 * Still another is to pass a flag, say in the engine's intflags that
14095 * would be checked each time before doing the debug output */
14096 }
14097 #endif
14098
14099 assert(subpattern_re); /* Should have died if didn't compile successfully */
14100 return subpattern_re;
14101 }
14102
14103 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)14104 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14105 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14106 {
14107 I32 result;
14108 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14109
14110 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14111
14112 ENTER;
14113
14114 /* The compilation has set things up so that if the program doesn't want to
14115 * see the wildcard matching procedure, it will get the core execution
14116 * engine, which is subject only to -Dr. So we have to turn that off
14117 * around this procedure */
14118 if (! isDEBUG_WILDCARD) {
14119 /* Note! Casts away 'volatile' */
14120 SAVEI32(PL_debug);
14121 PL_debug &= ~ DEBUG_r_FLAG;
14122 }
14123
14124 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14125 NULL, nosave);
14126 LEAVE;
14127
14128 return result;
14129 }
14130
14131 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)14132 S_handle_user_defined_property(pTHX_
14133
14134 /* Parses the contents of a user-defined property definition; returning the
14135 * expanded definition if possible. If so, the return is an inversion
14136 * list.
14137 *
14138 * If there are subroutines that are part of the expansion and which aren't
14139 * known at the time of the call to this function, this returns what
14140 * parse_uniprop_string() returned for the first one encountered.
14141 *
14142 * If an error was found, NULL is returned, and 'msg' gets a suitable
14143 * message appended to it. (Appending allows the back trace of how we got
14144 * to the faulty definition to be displayed through nested calls of
14145 * user-defined subs.)
14146 *
14147 * The caller IS responsible for freeing any returned SV.
14148 *
14149 * The syntax of the contents is pretty much described in perlunicode.pod,
14150 * but we also allow comments on each line */
14151
14152 const char * name, /* Name of property */
14153 const STRLEN name_len, /* The name's length in bytes */
14154 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14155 const bool to_fold, /* ? Is this under /i */
14156 const bool runtime, /* ? Are we in compile- or run-time */
14157 const bool deferrable, /* Is it ok for this property's full definition
14158 to be deferred until later? */
14159 SV* contents, /* The property's definition */
14160 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
14161 getting called unless this is thought to be
14162 a user-defined property */
14163 SV * msg, /* Any error or warning msg(s) are appended to
14164 this */
14165 const STRLEN level) /* Recursion level of this call */
14166 {
14167 STRLEN len;
14168 const char * string = SvPV_const(contents, len);
14169 const char * const e = string + len;
14170 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14171 const STRLEN msgs_length_on_entry = SvCUR(msg);
14172
14173 const char * s0 = string; /* Points to first byte in the current line
14174 being parsed in 'string' */
14175 const char overflow_msg[] = "Code point too large in \"";
14176 SV* running_definition = NULL;
14177
14178 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14179
14180 *user_defined_ptr = TRUE;
14181
14182 /* Look at each line */
14183 while (s0 < e) {
14184 const char * s; /* Current byte */
14185 char op = '+'; /* Default operation is 'union' */
14186 IV min = 0; /* range begin code point */
14187 IV max = -1; /* and range end */
14188 SV* this_definition;
14189
14190 /* Skip comment lines */
14191 if (*s0 == '#') {
14192 s0 = strchr(s0, '\n');
14193 if (s0 == NULL) {
14194 break;
14195 }
14196 s0++;
14197 continue;
14198 }
14199
14200 /* For backcompat, allow an empty first line */
14201 if (*s0 == '\n') {
14202 s0++;
14203 continue;
14204 }
14205
14206 /* First character in the line may optionally be the operation */
14207 if ( *s0 == '+'
14208 || *s0 == '!'
14209 || *s0 == '-'
14210 || *s0 == '&')
14211 {
14212 op = *s0++;
14213 }
14214
14215 /* If the line is one or two hex digits separated by blank space, its
14216 * a range; otherwise it is either another user-defined property or an
14217 * error */
14218
14219 s = s0;
14220
14221 if (! isXDIGIT(*s)) {
14222 goto check_if_property;
14223 }
14224
14225 do { /* Each new hex digit will add 4 bits. */
14226 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14227 s = strchr(s, '\n');
14228 if (s == NULL) {
14229 s = e;
14230 }
14231 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14232 sv_catpv(msg, overflow_msg);
14233 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14234 UTF8fARG(is_contents_utf8, s - s0, s0));
14235 sv_catpvs(msg, "\"");
14236 goto return_failure;
14237 }
14238
14239 /* Accumulate this digit into the value */
14240 min = (min << 4) + READ_XDIGIT(s);
14241 } while (isXDIGIT(*s));
14242
14243 while (isBLANK(*s)) { s++; }
14244
14245 /* We allow comments at the end of the line */
14246 if (*s == '#') {
14247 s = strchr(s, '\n');
14248 if (s == NULL) {
14249 s = e;
14250 }
14251 s++;
14252 }
14253 else if (s < e && *s != '\n') {
14254 if (! isXDIGIT(*s)) {
14255 goto check_if_property;
14256 }
14257
14258 /* Look for the high point of the range */
14259 max = 0;
14260 do {
14261 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14262 s = strchr(s, '\n');
14263 if (s == NULL) {
14264 s = e;
14265 }
14266 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14267 sv_catpv(msg, overflow_msg);
14268 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14269 UTF8fARG(is_contents_utf8, s - s0, s0));
14270 sv_catpvs(msg, "\"");
14271 goto return_failure;
14272 }
14273
14274 max = (max << 4) + READ_XDIGIT(s);
14275 } while (isXDIGIT(*s));
14276
14277 while (isBLANK(*s)) { s++; }
14278
14279 if (*s == '#') {
14280 s = strchr(s, '\n');
14281 if (s == NULL) {
14282 s = e;
14283 }
14284 }
14285 else if (s < e && *s != '\n') {
14286 goto check_if_property;
14287 }
14288 }
14289
14290 if (max == -1) { /* The line only had one entry */
14291 max = min;
14292 }
14293 else if (max < min) {
14294 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14295 sv_catpvs(msg, "Illegal range in \"");
14296 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14297 UTF8fARG(is_contents_utf8, s - s0, s0));
14298 sv_catpvs(msg, "\"");
14299 goto return_failure;
14300 }
14301
14302 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
14303
14304 if ( UNICODE_IS_PERL_EXTENDED(min)
14305 || UNICODE_IS_PERL_EXTENDED(max))
14306 {
14307 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14308
14309 /* If both code points are non-portable, warn only on the lower
14310 * one. */
14311 sv_catpv(msg, get_extended_utf8_msg(
14312 (UNICODE_IS_PERL_EXTENDED(min))
14313 ? min : max));
14314 sv_catpvs(msg, " in \"");
14315 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14316 UTF8fARG(is_contents_utf8, s - s0, s0));
14317 sv_catpvs(msg, "\"");
14318 }
14319
14320 # endif
14321
14322 /* Here, this line contains a legal range */
14323 this_definition = sv_2mortal(_new_invlist(2));
14324 this_definition = _add_range_to_invlist(this_definition, min, max);
14325 goto calculate;
14326
14327 check_if_property:
14328
14329 /* Here it isn't a legal range line. See if it is a legal property
14330 * line. First find the end of the meat of the line */
14331 s = strpbrk(s, "#\n");
14332 if (s == NULL) {
14333 s = e;
14334 }
14335
14336 /* Ignore trailing blanks in keeping with the requirements of
14337 * parse_uniprop_string() */
14338 s--;
14339 while (s > s0 && isBLANK_A(*s)) {
14340 s--;
14341 }
14342 s++;
14343
14344 this_definition = parse_uniprop_string(s0, s - s0,
14345 is_utf8, to_fold, runtime,
14346 deferrable,
14347 NULL,
14348 user_defined_ptr, msg,
14349 (name_len == 0)
14350 ? level /* Don't increase level
14351 if input is empty */
14352 : level + 1
14353 );
14354 if (this_definition == NULL) {
14355 goto return_failure; /* 'msg' should have had the reason
14356 appended to it by the above call */
14357 }
14358
14359 if (! is_invlist(this_definition)) { /* Unknown at this time */
14360 return newSVsv(this_definition);
14361 }
14362
14363 if (*s != '\n') {
14364 s = strchr(s, '\n');
14365 if (s == NULL) {
14366 s = e;
14367 }
14368 }
14369
14370 calculate:
14371
14372 switch (op) {
14373 case '+':
14374 _invlist_union(running_definition, this_definition,
14375 &running_definition);
14376 break;
14377 case '-':
14378 _invlist_subtract(running_definition, this_definition,
14379 &running_definition);
14380 break;
14381 case '&':
14382 _invlist_intersection(running_definition, this_definition,
14383 &running_definition);
14384 break;
14385 case '!':
14386 _invlist_union_complement_2nd(running_definition,
14387 this_definition, &running_definition);
14388 break;
14389 default:
14390 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14391 __FILE__, __LINE__, op);
14392 break;
14393 }
14394
14395 /* Position past the '\n' */
14396 s0 = s + 1;
14397 } /* End of loop through the lines of 'contents' */
14398
14399 /* Here, we processed all the lines in 'contents' without error. If we
14400 * didn't add any warnings, simply return success */
14401 if (msgs_length_on_entry == SvCUR(msg)) {
14402
14403 /* If the expansion was empty, the answer isn't nothing: its an empty
14404 * inversion list */
14405 if (running_definition == NULL) {
14406 running_definition = _new_invlist(1);
14407 }
14408
14409 return running_definition;
14410 }
14411
14412 /* Otherwise, add some explanatory text, but we will return success */
14413 goto return_msg;
14414
14415 return_failure:
14416 running_definition = NULL;
14417
14418 return_msg:
14419
14420 if (name_len > 0) {
14421 sv_catpvs(msg, " in expansion of ");
14422 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14423 }
14424
14425 return running_definition;
14426 }
14427
14428 /* As explained below, certain operations need to take place in the first
14429 * thread created. These macros switch contexts */
14430 # ifdef USE_ITHREADS
14431 # define DECLARATION_FOR_GLOBAL_CONTEXT \
14432 PerlInterpreter * save_aTHX = aTHX;
14433 # define SWITCH_TO_GLOBAL_CONTEXT \
14434 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14435 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
14436 # define CUR_CONTEXT aTHX
14437 # define ORIGINAL_CONTEXT save_aTHX
14438 # else
14439 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
14440 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
14441 # define RESTORE_CONTEXT NOOP
14442 # define CUR_CONTEXT NULL
14443 # define ORIGINAL_CONTEXT NULL
14444 # endif
14445
14446 STATIC void
S_delete_recursion_entry(pTHX_ void * key)14447 S_delete_recursion_entry(pTHX_ void *key)
14448 {
14449 /* Deletes the entry used to detect recursion when expanding user-defined
14450 * properties. This is a function so it can be set up to be called even if
14451 * the program unexpectedly quits */
14452
14453 SV ** current_entry;
14454 const STRLEN key_len = strlen((const char *) key);
14455 DECLARATION_FOR_GLOBAL_CONTEXT;
14456
14457 SWITCH_TO_GLOBAL_CONTEXT;
14458
14459 /* If the entry is one of these types, it is a permanent entry, and not the
14460 * one used to detect recursions. This function should delete only the
14461 * recursion entry */
14462 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14463 if ( current_entry
14464 && ! is_invlist(*current_entry)
14465 && ! SvPOK(*current_entry))
14466 {
14467 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14468 G_DISCARD);
14469 }
14470
14471 RESTORE_CONTEXT;
14472 }
14473
14474 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)14475 S_get_fq_name(pTHX_
14476 const char * const name, /* The first non-blank in the \p{}, \P{} */
14477 const Size_t name_len, /* Its length in bytes, not including any trailing space */
14478 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14479 const bool has_colon_colon
14480 )
14481 {
14482 /* Returns a mortal SV containing the fully qualified version of the input
14483 * name */
14484
14485 SV * fq_name;
14486
14487 fq_name = newSVpvs_flags("", SVs_TEMP);
14488
14489 /* Use the current package if it wasn't included in our input */
14490 if (! has_colon_colon) {
14491 const HV * pkg = (IN_PERL_COMPILETIME)
14492 ? PL_curstash
14493 : CopSTASH(PL_curcop);
14494 const char* pkgname = HvNAME(pkg);
14495
14496 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14497 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14498 sv_catpvs(fq_name, "::");
14499 }
14500
14501 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14502 UTF8fARG(is_utf8, name_len, name));
14503 return fq_name;
14504 }
14505
14506 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)14507 S_parse_uniprop_string(pTHX_
14508
14509 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
14510 * now. If so, the return is an inversion list.
14511 *
14512 * If the property is user-defined, it is a subroutine, which in turn
14513 * may call other subroutines. This function will call the whole nest of
14514 * them to get the definition they return; if some aren't known at the time
14515 * of the call to this function, the fully qualified name of the highest
14516 * level sub is returned. It is an error to call this function at runtime
14517 * without every sub defined.
14518 *
14519 * If an error was found, NULL is returned, and 'msg' gets a suitable
14520 * message appended to it. (Appending allows the back trace of how we got
14521 * to the faulty definition to be displayed through nested calls of
14522 * user-defined subs.)
14523 *
14524 * The caller should NOT try to free any returned inversion list.
14525 *
14526 * Other parameters will be set on return as described below */
14527
14528 const char * const name, /* The first non-blank in the \p{}, \P{} */
14529 Size_t name_len, /* Its length in bytes, not including any
14530 trailing space */
14531 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14532 const bool to_fold, /* ? Is this under /i */
14533 const bool runtime, /* TRUE if this is being called at run time */
14534 const bool deferrable, /* TRUE if it's ok for the definition to not be
14535 known at this call */
14536 AV ** strings, /* To return string property values, like named
14537 sequences */
14538 bool *user_defined_ptr, /* Upon return from this function it will be
14539 set to TRUE if any component is a
14540 user-defined property */
14541 SV * msg, /* Any error or warning msg(s) are appended to
14542 this */
14543 const STRLEN level) /* Recursion level of this call */
14544 {
14545 char* lookup_name; /* normalized name for lookup in our tables */
14546 unsigned lookup_len; /* Its length */
14547 enum { Not_Strict = 0, /* Some properties have stricter name */
14548 Strict, /* normalization rules, which we decide */
14549 As_Is /* upon based on parsing */
14550 } stricter = Not_Strict;
14551
14552 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14553 * (though it requires extra effort to download them from Unicode and
14554 * compile perl to know about them) */
14555 bool is_nv_type = FALSE;
14556
14557 unsigned int i = 0, i_zero = 0, j = 0;
14558 int equals_pos = -1; /* Where the '=' is found, or negative if none */
14559 int slash_pos = -1; /* Where the '/' is found, or negative if none */
14560 int table_index = 0; /* The entry number for this property in the table
14561 of all Unicode property names */
14562 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
14563 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
14564 the normalized name in certain situations */
14565 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
14566 part of a package name */
14567 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
14568 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
14569 property rather than a Unicode
14570 one. */
14571 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
14572 if an error. If it is an inversion list,
14573 it is the definition. Otherwise it is a
14574 string containing the fully qualified sub
14575 name of 'name' */
14576 SV * fq_name = NULL; /* For user-defined properties, the fully
14577 qualified name */
14578 bool invert_return = FALSE; /* ? Do we need to complement the result before
14579 returning it */
14580 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14581 explicit utf8:: package that we strip
14582 off */
14583 /* The expansion of properties that could be either user-defined or
14584 * official unicode ones is deferred until runtime, including a marker for
14585 * those that might be in the latter category. This boolean indicates if
14586 * we've seen that marker. If not, what we're parsing can't be such an
14587 * official Unicode property whose expansion was deferred */
14588 bool could_be_deferred_official = FALSE;
14589
14590 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14591
14592 /* The input will be normalized into 'lookup_name' */
14593 Newx(lookup_name, name_len, char);
14594 SAVEFREEPV(lookup_name);
14595
14596 /* Parse the input. */
14597 for (i = 0; i < name_len; i++) {
14598 char cur = name[i];
14599
14600 /* Most of the characters in the input will be of this ilk, being parts
14601 * of a name */
14602 if (isIDCONT_A(cur)) {
14603
14604 /* Case differences are ignored. Our lookup routine assumes
14605 * everything is lowercase, so normalize to that */
14606 if (isUPPER_A(cur)) {
14607 lookup_name[j++] = toLOWER_A(cur);
14608 continue;
14609 }
14610
14611 if (cur == '_') { /* Don't include these in the normalized name */
14612 continue;
14613 }
14614
14615 lookup_name[j++] = cur;
14616
14617 /* The first character in a user-defined name must be of this type.
14618 * */
14619 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14620 could_be_user_defined = FALSE;
14621 }
14622
14623 continue;
14624 }
14625
14626 /* Here, the character is not something typically in a name, But these
14627 * two types of characters (and the '_' above) can be freely ignored in
14628 * most situations. Later it may turn out we shouldn't have ignored
14629 * them, and we have to reparse, but we don't have enough information
14630 * yet to make that decision */
14631 if (cur == '-' || isSPACE_A(cur)) {
14632 could_be_user_defined = FALSE;
14633 continue;
14634 }
14635
14636 /* An equals sign or single colon mark the end of the first part of
14637 * the property name */
14638 if ( cur == '='
14639 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14640 {
14641 lookup_name[j++] = '='; /* Treat the colon as an '=' */
14642 equals_pos = j; /* Note where it occurred in the input */
14643 could_be_user_defined = FALSE;
14644 break;
14645 }
14646
14647 /* If this looks like it is a marker we inserted at compile time,
14648 * set a flag and otherwise ignore it. If it isn't in the final
14649 * position, keep it as it would have been user input. */
14650 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14651 && ! deferrable
14652 && could_be_user_defined
14653 && i == name_len - 1)
14654 {
14655 name_len--;
14656 could_be_deferred_official = TRUE;
14657 continue;
14658 }
14659
14660 /* Otherwise, this character is part of the name. */
14661 lookup_name[j++] = cur;
14662
14663 /* Here it isn't a single colon, so if it is a colon, it must be a
14664 * double colon */
14665 if (cur == ':') {
14666
14667 /* A double colon should be a package qualifier. We note its
14668 * position and continue. Note that one could have
14669 * pkg1::pkg2::...::foo
14670 * so that the position at the end of the loop will be just after
14671 * the final qualifier */
14672
14673 i++;
14674 non_pkg_begin = i + 1;
14675 lookup_name[j++] = ':';
14676 lun_non_pkg_begin = j;
14677 }
14678 else { /* Only word chars (and '::') can be in a user-defined name */
14679 could_be_user_defined = FALSE;
14680 }
14681 } /* End of parsing through the lhs of the property name (or all of it if
14682 no rhs) */
14683
14684 /* If there is a single package name 'utf8::', it is ambiguous. It could
14685 * be for a user-defined property, or it could be a Unicode property, as
14686 * all of them are considered to be for that package. For the purposes of
14687 * parsing the rest of the property, strip it off */
14688 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14689 lookup_name += STRLENs("utf8::");
14690 j -= STRLENs("utf8::");
14691 equals_pos -= STRLENs("utf8::");
14692 i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse
14693 from the beginning, it has to be
14694 set past what we're stripping
14695 off */
14696 stripped_utf8_pkg = TRUE;
14697 }
14698
14699 /* Here, we are either done with the whole property name, if it was simple;
14700 * or are positioned just after the '=' if it is compound. */
14701
14702 if (equals_pos >= 0) {
14703 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14704
14705 /* Space immediately after the '=' is ignored */
14706 i++;
14707 for (; i < name_len; i++) {
14708 if (! isSPACE_A(name[i])) {
14709 break;
14710 }
14711 }
14712
14713 /* Most punctuation after the equals indicates a subpattern, like
14714 * \p{foo=/bar/} */
14715 if ( isPUNCT_A(name[i])
14716 && name[i] != '-'
14717 && name[i] != '+'
14718 && name[i] != '_'
14719 && name[i] != '{'
14720 /* A backslash means the real delimiter is the next character,
14721 * but it must be punctuation */
14722 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14723 {
14724 bool special_property = memEQs(lookup_name, j - 1, "name")
14725 || memEQs(lookup_name, j - 1, "na");
14726 if (! special_property) {
14727 /* Find the property. The table includes the equals sign, so
14728 * we use 'j' as-is */
14729 table_index = do_uniprop_match(lookup_name, j);
14730 }
14731 if (special_property || table_index) {
14732 REGEXP * subpattern_re;
14733 char open = name[i++];
14734 char close;
14735 const char * pos_in_brackets;
14736 const char * const * prop_values;
14737 bool escaped = 0;
14738
14739 /* Backslash => delimiter is the character following. We
14740 * already checked that it is punctuation */
14741 if (open == '\\') {
14742 open = name[i++];
14743 escaped = 1;
14744 }
14745
14746 /* This data structure is constructed so that the matching
14747 * closing bracket is 3 past its matching opening. The second
14748 * set of closing is so that if the opening is something like
14749 * ']', the closing will be that as well. Something similar is
14750 * done in toke.c */
14751 pos_in_brackets = memCHRs("([<)]>)]>", open);
14752 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14753
14754 if ( i >= name_len
14755 || name[name_len-1] != close
14756 || (escaped && name[name_len-2] != '\\')
14757 /* Also make sure that there are enough characters.
14758 * e.g., '\\\' would show up incorrectly as legal even
14759 * though it is too short */
14760 || (SSize_t) (name_len - i - 1 - escaped) < 0)
14761 {
14762 sv_catpvs(msg, "Unicode property wildcard not terminated");
14763 goto append_name_to_msg;
14764 }
14765
14766 Perl_ck_warner_d(aTHX_
14767 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14768 "The Unicode property wildcards feature is experimental");
14769
14770 if (special_property) {
14771 const char * error_msg;
14772 const char * revised_name = name + i;
14773 Size_t revised_name_len = name_len - (i + 1 + escaped);
14774
14775 /* Currently, the only 'special_property' is name, which we
14776 * lookup in _charnames.pm */
14777
14778 if (! load_charnames(newSVpvs("placeholder"),
14779 revised_name, revised_name_len,
14780 &error_msg))
14781 {
14782 sv_catpv(msg, error_msg);
14783 goto append_name_to_msg;
14784 }
14785
14786 /* Farm this out to a function just to make the current
14787 * function less unwieldy */
14788 if (handle_names_wildcard(revised_name, revised_name_len,
14789 &prop_definition,
14790 strings))
14791 {
14792 return prop_definition;
14793 }
14794
14795 goto failed;
14796 }
14797
14798 prop_values = get_prop_values(table_index);
14799
14800 /* Now create and compile the wildcard subpattern. Use /i
14801 * because the property values are supposed to match with case
14802 * ignored. */
14803 subpattern_re = compile_wildcard(name + i,
14804 name_len - i - 1 - escaped,
14805 TRUE /* /i */
14806 );
14807
14808 /* For each legal property value, see if the supplied pattern
14809 * matches it. */
14810 while (*prop_values) {
14811 const char * const entry = *prop_values;
14812 const Size_t len = strlen(entry);
14813 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14814
14815 if (execute_wildcard(subpattern_re,
14816 (char *) entry,
14817 (char *) entry + len,
14818 (char *) entry, 0,
14819 entry_sv,
14820 0))
14821 { /* Here, matched. Add to the returned list */
14822 Size_t total_len = j + len;
14823 SV * sub_invlist = NULL;
14824 char * this_string;
14825
14826 /* We know this is a legal \p{property=value}. Call
14827 * the function to return the list of code points that
14828 * match it */
14829 Newxz(this_string, total_len + 1, char);
14830 Copy(lookup_name, this_string, j, char);
14831 my_strlcat(this_string, entry, total_len + 1);
14832 SAVEFREEPV(this_string);
14833 sub_invlist = parse_uniprop_string(this_string,
14834 total_len,
14835 is_utf8,
14836 to_fold,
14837 runtime,
14838 deferrable,
14839 NULL,
14840 user_defined_ptr,
14841 msg,
14842 level + 1);
14843 _invlist_union(prop_definition, sub_invlist,
14844 &prop_definition);
14845 }
14846
14847 prop_values++; /* Next iteration, look at next propvalue */
14848 } /* End of looking through property values; (the data
14849 structure is terminated by a NULL ptr) */
14850
14851 SvREFCNT_dec_NN(subpattern_re);
14852
14853 if (prop_definition) {
14854 return prop_definition;
14855 }
14856
14857 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14858 goto append_name_to_msg;
14859 }
14860
14861 /* Here's how khw thinks we should proceed to handle the properties
14862 * not yet done: Bidi Mirroring Glyph can map to ""
14863 Bidi Paired Bracket can map to ""
14864 Case Folding (both full and simple)
14865 Shouldn't /i be good enough for Full
14866 Decomposition Mapping
14867 Equivalent Unified Ideograph can map to ""
14868 Lowercase Mapping (both full and simple)
14869 NFKC Case Fold can map to ""
14870 Titlecase Mapping (both full and simple)
14871 Uppercase Mapping (both full and simple)
14872 * Handle these the same way Name is done, using say, _wild.pm, but
14873 * having both loose and full, like in charclass_invlists.h.
14874 * Perhaps move block and script to that as they are somewhat large
14875 * in charclass_invlists.h.
14876 * For properties where the default is the code point itself, such
14877 * as any of the case changing mappings, the string would otherwise
14878 * consist of all Unicode code points in UTF-8 strung together.
14879 * This would be impractical. So instead, examine their compiled
14880 * pattern, looking at the ssc. If none, reject the pattern as an
14881 * error. Otherwise run the pattern against every code point in
14882 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
14883 * And it might be good to create an API to return the ssc.
14884 * Or handle them like the algorithmic names are done
14885 */
14886 } /* End of is a wildcard subppattern */
14887
14888 /* \p{name=...} is handled specially. Instead of using the normal
14889 * mechanism involving charclass_invlists.h, it uses _charnames.pm
14890 * which has the necessary (huge) data accessible to it, and which
14891 * doesn't get loaded unless necessary. The legal syntax for names is
14892 * somewhat different than other properties due both to the vagaries of
14893 * a few outlier official names, and the fact that only a few ASCII
14894 * characters are permitted in them */
14895 if ( memEQs(lookup_name, j - 1, "name")
14896 || memEQs(lookup_name, j - 1, "na"))
14897 {
14898 dSP;
14899 HV * table;
14900 SV * character;
14901 const char * error_msg;
14902 CV* lookup_loose;
14903 SV * character_name;
14904 STRLEN character_len;
14905 UV cp;
14906
14907 stricter = As_Is;
14908
14909 /* Since the RHS (after skipping initial space) is passed unchanged
14910 * to charnames, and there are different criteria for what are
14911 * legal characters in the name, just parse it here. A character
14912 * name must begin with an ASCII alphabetic */
14913 if (! isALPHA(name[i])) {
14914 goto failed;
14915 }
14916 lookup_name[j++] = name[i];
14917
14918 for (++i; i < name_len; i++) {
14919 /* Official names can only be in the ASCII range, and only
14920 * certain characters */
14921 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14922 goto failed;
14923 }
14924 lookup_name[j++] = name[i];
14925 }
14926
14927 /* Finished parsing, save the name into an SV */
14928 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14929
14930 /* Make sure _charnames is loaded. (The parameters give context
14931 * for any errors generated */
14932 table = load_charnames(character_name, name, name_len, &error_msg);
14933 if (table == NULL) {
14934 sv_catpv(msg, error_msg);
14935 goto append_name_to_msg;
14936 }
14937
14938 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14939 if (! lookup_loose) {
14940 Perl_croak(aTHX_
14941 "panic: Can't find '_charnames::_loose_regcomp_lookup");
14942 }
14943
14944 PUSHSTACKi(PERLSI_REGCOMP);
14945 ENTER ;
14946 SAVETMPS;
14947 save_re_context();
14948
14949 PUSHMARK(SP) ;
14950 XPUSHs(character_name);
14951 PUTBACK;
14952 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14953
14954 SPAGAIN ;
14955
14956 character = POPs;
14957 SvREFCNT_inc_simple_void_NN(character);
14958
14959 PUTBACK ;
14960 FREETMPS ;
14961 LEAVE ;
14962 POPSTACK;
14963
14964 if (! SvOK(character)) {
14965 goto failed;
14966 }
14967
14968 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14969 if (character_len == SvCUR(character)) {
14970 prop_definition = add_cp_to_invlist(NULL, cp);
14971 }
14972 else {
14973 AV * this_string;
14974
14975 /* First of the remaining characters in the string. */
14976 char * remaining = SvPVX(character) + character_len;
14977
14978 if (strings == NULL) {
14979 goto failed; /* XXX Perhaps a specific msg instead, like
14980 'not available here' */
14981 }
14982
14983 if (*strings == NULL) {
14984 *strings = newAV();
14985 }
14986
14987 this_string = newAV();
14988 av_push_simple(this_string, newSVuv(cp));
14989
14990 do {
14991 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14992 av_push_simple(this_string, newSVuv(cp));
14993 remaining += character_len;
14994 } while (remaining < SvEND(character));
14995
14996 av_push_simple(*strings, (SV *) this_string);
14997 }
14998
14999 return prop_definition;
15000 }
15001
15002 /* Certain properties whose values are numeric need special handling.
15003 * They may optionally be prefixed by 'is'. Ignore that prefix for the
15004 * purposes of checking if this is one of those properties */
15005 if (memBEGINPs(lookup_name, j, "is")) {
15006 lookup_offset = 2;
15007 }
15008
15009 /* Then check if it is one of these specially-handled properties. The
15010 * possibilities are hard-coded because easier this way, and the list
15011 * is unlikely to change.
15012 *
15013 * All numeric value type properties are of this ilk, and are also
15014 * special in a different way later on. So find those first. There
15015 * are several numeric value type properties in the Unihan DB (which is
15016 * unlikely to be compiled with perl, but we handle it here in case it
15017 * does get compiled). They all end with 'numeric'. The interiors
15018 * aren't checked for the precise property. This would stop working if
15019 * a cjk property were to be created that ended with 'numeric' and
15020 * wasn't a numeric type */
15021 is_nv_type = memEQs(lookup_name + lookup_offset,
15022 j - 1 - lookup_offset, "numericvalue")
15023 || memEQs(lookup_name + lookup_offset,
15024 j - 1 - lookup_offset, "nv")
15025 || ( memENDPs(lookup_name + lookup_offset,
15026 j - 1 - lookup_offset, "numeric")
15027 && ( memBEGINPs(lookup_name + lookup_offset,
15028 j - 1 - lookup_offset, "cjk")
15029 || memBEGINPs(lookup_name + lookup_offset,
15030 j - 1 - lookup_offset, "k")));
15031 if ( is_nv_type
15032 || memEQs(lookup_name + lookup_offset,
15033 j - 1 - lookup_offset, "canonicalcombiningclass")
15034 || memEQs(lookup_name + lookup_offset,
15035 j - 1 - lookup_offset, "ccc")
15036 || memEQs(lookup_name + lookup_offset,
15037 j - 1 - lookup_offset, "age")
15038 || memEQs(lookup_name + lookup_offset,
15039 j - 1 - lookup_offset, "in")
15040 || memEQs(lookup_name + lookup_offset,
15041 j - 1 - lookup_offset, "presentin"))
15042 {
15043 unsigned int k;
15044
15045 /* Since the stuff after the '=' is a number, we can't throw away
15046 * '-' willy-nilly, as those could be a minus sign. Other stricter
15047 * rules also apply. However, these properties all can have the
15048 * rhs not be a number, in which case they contain at least one
15049 * alphabetic. In those cases, the stricter rules don't apply.
15050 * But the numeric type properties can have the alphas [Ee] to
15051 * signify an exponent, and it is still a number with stricter
15052 * rules. So look for an alpha that signifies not-strict */
15053 stricter = Strict;
15054 for (k = i; k < name_len; k++) {
15055 if ( isALPHA_A(name[k])
15056 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
15057 {
15058 stricter = Not_Strict;
15059 break;
15060 }
15061 }
15062 }
15063
15064 if (stricter) {
15065
15066 /* A number may have a leading '+' or '-'. The latter is retained
15067 * */
15068 if (name[i] == '+') {
15069 i++;
15070 }
15071 else if (name[i] == '-') {
15072 lookup_name[j++] = '-';
15073 i++;
15074 }
15075
15076 /* Skip leading zeros including single underscores separating the
15077 * zeros, or between the final leading zero and the first other
15078 * digit */
15079 for (; i < name_len - 1; i++) {
15080 if ( name[i] != '0'
15081 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15082 {
15083 break;
15084 }
15085 }
15086
15087 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
15088 * underling libc implementation. */
15089 if ( i == name_len - 1
15090 && name[name_len-1] == '0'
15091 && lookup_name[j-1] == '-')
15092 {
15093 j--;
15094 }
15095 }
15096 }
15097 else { /* No '=' */
15098
15099 /* Only a few properties without an '=' should be parsed with stricter
15100 * rules. The list is unlikely to change. */
15101 if ( memBEGINPs(lookup_name, j, "perl")
15102 && memNEs(lookup_name + 4, j - 4, "space")
15103 && memNEs(lookup_name + 4, j - 4, "word"))
15104 {
15105 stricter = Strict;
15106
15107 /* We set the inputs back to 0 and the code below will reparse,
15108 * using strict */
15109 i = i_zero;
15110 j = 0;
15111 }
15112 }
15113
15114 /* Here, we have either finished the property, or are positioned to parse
15115 * the remainder, and we know if stricter rules apply. Finish out, if not
15116 * already done */
15117 for (; i < name_len; i++) {
15118 char cur = name[i];
15119
15120 /* In all instances, case differences are ignored, and we normalize to
15121 * lowercase */
15122 if (isUPPER_A(cur)) {
15123 lookup_name[j++] = toLOWER(cur);
15124 continue;
15125 }
15126
15127 /* An underscore is skipped, but not under strict rules unless it
15128 * separates two digits */
15129 if (cur == '_') {
15130 if ( stricter
15131 && ( i == i_zero || (int) i == equals_pos || i == name_len- 1
15132 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15133 {
15134 lookup_name[j++] = '_';
15135 }
15136 continue;
15137 }
15138
15139 /* Hyphens are skipped except under strict */
15140 if (cur == '-' && ! stricter) {
15141 continue;
15142 }
15143
15144 /* XXX Bug in documentation. It says white space skipped adjacent to
15145 * non-word char. Maybe we should, but shouldn't skip it next to a dot
15146 * in a number */
15147 if (isSPACE_A(cur) && ! stricter) {
15148 continue;
15149 }
15150
15151 lookup_name[j++] = cur;
15152
15153 /* Unless this is a non-trailing slash, we are done with it */
15154 if (i >= name_len - 1 || cur != '/') {
15155 continue;
15156 }
15157
15158 slash_pos = j;
15159
15160 /* A slash in the 'numeric value' property indicates that what follows
15161 * is a denominator. It can have a leading '+' and '0's that should be
15162 * skipped. But we have never allowed a negative denominator, so treat
15163 * a minus like every other character. (No need to rule out a second
15164 * '/', as that won't match anything anyway */
15165 if (is_nv_type) {
15166 i++;
15167 if (i < name_len && name[i] == '+') {
15168 i++;
15169 }
15170
15171 /* Skip leading zeros including underscores separating digits */
15172 for (; i < name_len - 1; i++) {
15173 if ( name[i] != '0'
15174 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15175 {
15176 break;
15177 }
15178 }
15179
15180 /* Store the first real character in the denominator */
15181 if (i < name_len) {
15182 lookup_name[j++] = name[i];
15183 }
15184 }
15185 }
15186
15187 /* Here are completely done parsing the input 'name', and 'lookup_name'
15188 * contains a copy, normalized.
15189 *
15190 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15191 * different from without the underscores. */
15192 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
15193 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15194 && UNLIKELY(name[name_len-1] == '_'))
15195 {
15196 lookup_name[j++] = '&';
15197 }
15198
15199 /* If the original input began with 'In' or 'Is', it could be a subroutine
15200 * call to a user-defined property instead of a Unicode property name. */
15201 if ( name_len - non_pkg_begin > 2
15202 && name[non_pkg_begin+0] == 'I'
15203 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15204 {
15205 /* Names that start with In have different characteristics than those
15206 * that start with Is */
15207 if (name[non_pkg_begin+1] == 's') {
15208 starts_with_Is = TRUE;
15209 }
15210 }
15211 else {
15212 could_be_user_defined = FALSE;
15213 }
15214
15215 if (could_be_user_defined) {
15216 CV* user_sub;
15217
15218 /* If the user defined property returns the empty string, it could
15219 * easily be because the pattern is being compiled before the data it
15220 * actually needs to compile is available. This could be argued to be
15221 * a bug in the perl code, but this is a change of behavior for Perl,
15222 * so we handle it. This means that intentionally returning nothing
15223 * will not be resolved until runtime */
15224 bool empty_return = FALSE;
15225
15226 /* Here, the name could be for a user defined property, which are
15227 * implemented as subs. */
15228 user_sub = get_cvn_flags(name, name_len, 0);
15229 if (! user_sub) {
15230
15231 /* Here, the property name could be a user-defined one, but there
15232 * is no subroutine to handle it (as of now). Defer handling it
15233 * until runtime. Otherwise, a block defined by Unicode in a later
15234 * release would get the synonym InFoo added for it, and existing
15235 * code that used that name would suddenly break if it referred to
15236 * the property before the sub was declared. See [perl #134146] */
15237 if (deferrable) {
15238 goto definition_deferred;
15239 }
15240
15241 /* Here, we are at runtime, and didn't find the user property. It
15242 * could be an official property, but only if no package was
15243 * specified, or just the utf8:: package. */
15244 if (could_be_deferred_official) {
15245 lookup_name += lun_non_pkg_begin;
15246 j -= lun_non_pkg_begin;
15247 }
15248 else if (! stripped_utf8_pkg) {
15249 goto unknown_user_defined;
15250 }
15251
15252 /* Drop down to look up in the official properties */
15253 }
15254 else {
15255 const char insecure[] = "Insecure user-defined property";
15256
15257 /* Here, there is a sub by the correct name. Normally we call it
15258 * to get the property definition */
15259 dSP;
15260 SV * user_sub_sv = MUTABLE_SV(user_sub);
15261 SV * error; /* Any error returned by calling 'user_sub' */
15262 SV * key; /* The key into the hash of user defined sub names
15263 */
15264 SV * placeholder;
15265 SV ** saved_user_prop_ptr; /* Hash entry for this property */
15266
15267 /* How many times to retry when another thread is in the middle of
15268 * expanding the same definition we want */
15269 PERL_INT_FAST8_T retry_countdown = 10;
15270
15271 DECLARATION_FOR_GLOBAL_CONTEXT;
15272
15273 /* If we get here, we know this property is user-defined */
15274 *user_defined_ptr = TRUE;
15275
15276 /* We refuse to call a potentially tainted subroutine; returning an
15277 * error instead */
15278 if (TAINT_get) {
15279 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15280 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15281 goto append_name_to_msg;
15282 }
15283
15284 /* In principal, we only call each subroutine property definition
15285 * once during the life of the program. This guarantees that the
15286 * property definition never changes. The results of the single
15287 * sub call are stored in a hash, which is used instead for future
15288 * references to this property. The property definition is thus
15289 * immutable. But, to allow the user to have a /i-dependent
15290 * definition, we call the sub once for non-/i, and once for /i,
15291 * should the need arise, passing the /i status as a parameter.
15292 *
15293 * We start by constructing the hash key name, consisting of the
15294 * fully qualified subroutine name, preceded by the /i status, so
15295 * that there is a key for /i and a different key for non-/i */
15296 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15297 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15298 non_pkg_begin != 0);
15299 sv_catsv(key, fq_name);
15300
15301 /* We only call the sub once throughout the life of the program
15302 * (with the /i, non-/i exception noted above). That means the
15303 * hash must be global and accessible to all threads. It is
15304 * created at program start-up, before any threads are created, so
15305 * is accessible to all children. But this creates some
15306 * complications.
15307 *
15308 * 1) The keys can't be shared, or else problems arise; sharing is
15309 * turned off at hash creation time
15310 * 2) All SVs in it are there for the remainder of the life of the
15311 * program, and must be created in the same interpreter context
15312 * as the hash, or else they will be freed from the wrong pool
15313 * at global destruction time. This is handled by switching to
15314 * the hash's context to create each SV going into it, and then
15315 * immediately switching back
15316 * 3) All accesses to the hash must be controlled by a mutex, to
15317 * prevent two threads from getting an unstable state should
15318 * they simultaneously be accessing it. The code below is
15319 * crafted so that the mutex is locked whenever there is an
15320 * access and unlocked only when the next stable state is
15321 * achieved.
15322 *
15323 * The hash stores either the definition of the property if it was
15324 * valid, or, if invalid, the error message that was raised. We
15325 * use the type of SV to distinguish.
15326 *
15327 * There's also the need to guard against the definition expansion
15328 * from infinitely recursing. This is handled by storing the aTHX
15329 * of the expanding thread during the expansion. Again the SV type
15330 * is used to distinguish this from the other two cases. If we
15331 * come to here and the hash entry for this property is our aTHX,
15332 * it means we have recursed, and the code assumes that we would
15333 * infinitely recurse, so instead stops and raises an error.
15334 * (Any recursion has always been treated as infinite recursion in
15335 * this feature.)
15336 *
15337 * If instead, the entry is for a different aTHX, it means that
15338 * that thread has gotten here first, and hasn't finished expanding
15339 * the definition yet. We just have to wait until it is done. We
15340 * sleep and retry a few times, returning an error if the other
15341 * thread doesn't complete. */
15342
15343 re_fetch:
15344 USER_PROP_MUTEX_LOCK;
15345
15346 /* If we have an entry for this key, the subroutine has already
15347 * been called once with this /i status. */
15348 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15349 SvPVX(key), SvCUR(key), 0);
15350 if (saved_user_prop_ptr) {
15351
15352 /* If the saved result is an inversion list, it is the valid
15353 * definition of this property */
15354 if (is_invlist(*saved_user_prop_ptr)) {
15355 prop_definition = *saved_user_prop_ptr;
15356
15357 /* The SV in the hash won't be removed until global
15358 * destruction, so it is stable and we can unlock */
15359 USER_PROP_MUTEX_UNLOCK;
15360
15361 /* The caller shouldn't try to free this SV */
15362 return prop_definition;
15363 }
15364
15365 /* Otherwise, if it is a string, it is the error message
15366 * that was returned when we first tried to evaluate this
15367 * property. Fail, and append the message */
15368 if (SvPOK(*saved_user_prop_ptr)) {
15369 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15370 sv_catsv(msg, *saved_user_prop_ptr);
15371
15372 /* The SV in the hash won't be removed until global
15373 * destruction, so it is stable and we can unlock */
15374 USER_PROP_MUTEX_UNLOCK;
15375
15376 return NULL;
15377 }
15378
15379 assert(SvIOK(*saved_user_prop_ptr));
15380
15381 /* Here, we have an unstable entry in the hash. Either another
15382 * thread is in the middle of expanding the property's
15383 * definition, or we are ourselves recursing. We use the aTHX
15384 * in it to distinguish */
15385 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15386
15387 /* Here, it's another thread doing the expanding. We've
15388 * looked as much as we are going to at the contents of the
15389 * hash entry. It's safe to unlock. */
15390 USER_PROP_MUTEX_UNLOCK;
15391
15392 /* Retry a few times */
15393 if (retry_countdown-- > 0) {
15394 PerlProc_sleep(1);
15395 goto re_fetch;
15396 }
15397
15398 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15399 sv_catpvs(msg, "Timeout waiting for another thread to "
15400 "define");
15401 goto append_name_to_msg;
15402 }
15403
15404 /* Here, we are recursing; don't dig any deeper */
15405 USER_PROP_MUTEX_UNLOCK;
15406
15407 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15408 sv_catpvs(msg,
15409 "Infinite recursion in user-defined property");
15410 goto append_name_to_msg;
15411 }
15412
15413 /* Here, this thread has exclusive control, and there is no entry
15414 * for this property in the hash. So we have the go ahead to
15415 * expand the definition ourselves. */
15416
15417 PUSHSTACKi(PERLSI_REGCOMP);
15418 ENTER;
15419
15420 /* Create a temporary placeholder in the hash to detect recursion
15421 * */
15422 SWITCH_TO_GLOBAL_CONTEXT;
15423 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15424 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15425 RESTORE_CONTEXT;
15426
15427 /* Now that we have a placeholder, we can let other threads
15428 * continue */
15429 USER_PROP_MUTEX_UNLOCK;
15430
15431 /* Make sure the placeholder always gets destroyed */
15432 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15433
15434 PUSHMARK(SP);
15435 SAVETMPS;
15436
15437 /* Call the user's function, with the /i status as a parameter.
15438 * Note that we have gone to a lot of trouble to keep this call
15439 * from being within the locked mutex region. */
15440 XPUSHs(boolSV(to_fold));
15441 PUTBACK;
15442
15443 /* The following block was taken from swash_init(). Presumably
15444 * they apply to here as well, though we no longer use a swash --
15445 * khw */
15446 SAVEHINTS();
15447 save_re_context();
15448 /* We might get here via a subroutine signature which uses a utf8
15449 * parameter name, at which point PL_subname will have been set
15450 * but not yet used. */
15451 save_item(PL_subname);
15452
15453 /* G_SCALAR guarantees a single return value */
15454 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15455
15456 SPAGAIN;
15457
15458 error = ERRSV;
15459 if (TAINT_get || SvTRUE(error)) {
15460 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15461 if (SvTRUE(error)) {
15462 sv_catpvs(msg, "Error \"");
15463 sv_catsv(msg, error);
15464 sv_catpvs(msg, "\"");
15465 }
15466 if (TAINT_get) {
15467 if (SvTRUE(error)) sv_catpvs(msg, "; ");
15468 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15469 }
15470
15471 if (name_len > 0) {
15472 sv_catpvs(msg, " in expansion of ");
15473 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15474 name_len,
15475 name));
15476 }
15477
15478 (void) POPs;
15479 prop_definition = NULL;
15480 }
15481 else {
15482 SV * contents = POPs;
15483
15484 /* The contents is supposed to be the expansion of the property
15485 * definition. If the definition is deferrable, and we got an
15486 * empty string back, set a flag to later defer it (after clean
15487 * up below). */
15488 if ( deferrable
15489 && (! SvPOK(contents) || SvCUR(contents) == 0))
15490 {
15491 empty_return = TRUE;
15492 }
15493 else { /* Otherwise, call a function to check for valid syntax,
15494 and handle it */
15495
15496 prop_definition = handle_user_defined_property(
15497 name, name_len,
15498 is_utf8, to_fold, runtime,
15499 deferrable,
15500 contents, user_defined_ptr,
15501 msg,
15502 level);
15503 }
15504 }
15505
15506 /* Here, we have the results of the expansion. Delete the
15507 * placeholder, and if the definition is now known, replace it with
15508 * that definition. We need exclusive access to the hash, and we
15509 * can't let anyone else in, between when we delete the placeholder
15510 * and add the permanent entry */
15511 USER_PROP_MUTEX_LOCK;
15512
15513 S_delete_recursion_entry(aTHX_ SvPVX(key));
15514
15515 if ( ! empty_return
15516 && (! prop_definition || is_invlist(prop_definition)))
15517 {
15518 /* If we got success we use the inversion list defining the
15519 * property; otherwise use the error message */
15520 SWITCH_TO_GLOBAL_CONTEXT;
15521 (void) hv_store_ent(PL_user_def_props,
15522 key,
15523 ((prop_definition)
15524 ? newSVsv(prop_definition)
15525 : newSVsv(msg)),
15526 0);
15527 RESTORE_CONTEXT;
15528 }
15529
15530 /* All done, and the hash now has a permanent entry for this
15531 * property. Give up exclusive control */
15532 USER_PROP_MUTEX_UNLOCK;
15533
15534 FREETMPS;
15535 LEAVE;
15536 POPSTACK;
15537
15538 if (empty_return) {
15539 goto definition_deferred;
15540 }
15541
15542 if (prop_definition) {
15543
15544 /* If the definition is for something not known at this time,
15545 * we toss it, and go return the main property name, as that's
15546 * the one the user will be aware of */
15547 if (! is_invlist(prop_definition)) {
15548 SvREFCNT_dec_NN(prop_definition);
15549 goto definition_deferred;
15550 }
15551
15552 sv_2mortal(prop_definition);
15553 }
15554
15555 /* And return */
15556 return prop_definition;
15557
15558 } /* End of calling the subroutine for the user-defined property */
15559 } /* End of it could be a user-defined property */
15560
15561 /* Here it wasn't a user-defined property that is known at this time. See
15562 * if it is a Unicode property */
15563
15564 lookup_len = j; /* This is a more mnemonic name than 'j' */
15565
15566 /* Get the index into our pointer table of the inversion list corresponding
15567 * to the property */
15568 table_index = do_uniprop_match(lookup_name, lookup_len);
15569
15570 /* If it didn't find the property ... */
15571 if (table_index == 0) {
15572
15573 /* Try again stripping off any initial 'Is'. This is because we
15574 * promise that an initial Is is optional. The same isn't true of
15575 * names that start with 'In'. Those can match only blocks, and the
15576 * lookup table already has those accounted for. The lookup table also
15577 * has already accounted for Perl extensions (without and = sign)
15578 * starting with 'i's'. */
15579 if (starts_with_Is && equals_pos >= 0) {
15580 lookup_name += 2;
15581 lookup_len -= 2;
15582 equals_pos -= 2;
15583 slash_pos -= 2;
15584
15585 table_index = do_uniprop_match(lookup_name, lookup_len);
15586 }
15587
15588 if (table_index == 0) {
15589 char * canonical;
15590
15591 /* Here, we didn't find it. If not a numeric type property, and
15592 * can't be a user-defined one, it isn't a legal property */
15593 if (! is_nv_type) {
15594 if (! could_be_user_defined) {
15595 goto failed;
15596 }
15597
15598 /* Here, the property name is legal as a user-defined one. At
15599 * compile time, it might just be that the subroutine for that
15600 * property hasn't been encountered yet, but at runtime, it's
15601 * an error to try to use an undefined one */
15602 if (! deferrable) {
15603 goto unknown_user_defined;
15604 }
15605
15606 goto definition_deferred;
15607 } /* End of isn't a numeric type property */
15608
15609 /* The numeric type properties need more work to decide. What we
15610 * do is make sure we have the number in canonical form and look
15611 * that up. */
15612
15613 if (slash_pos < 0) { /* No slash */
15614
15615 /* When it isn't a rational, take the input, convert it to a
15616 * NV, then create a canonical string representation of that
15617 * NV. */
15618
15619 NV value;
15620 SSize_t value_len = lookup_len - equals_pos;
15621
15622 /* Get the value */
15623 if ( value_len <= 0
15624 || my_atof3(lookup_name + equals_pos, &value,
15625 value_len)
15626 != lookup_name + lookup_len)
15627 {
15628 goto failed;
15629 }
15630
15631 /* If the value is an integer, the canonical value is integral
15632 * */
15633 if (Perl_ceil(value) == value) {
15634 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15635 equals_pos, lookup_name, value);
15636 }
15637 else { /* Otherwise, it is %e with a known precision */
15638 char * exp_ptr;
15639
15640 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15641 equals_pos, lookup_name,
15642 PL_E_FORMAT_PRECISION, value);
15643
15644 /* The exponent generated is expecting two digits, whereas
15645 * %e on some systems will generate three. Remove leading
15646 * zeros in excess of 2 from the exponent. We start
15647 * looking for them after the '=' */
15648 exp_ptr = strchr(canonical + equals_pos, 'e');
15649 if (exp_ptr) {
15650 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15651 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15652
15653 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15654
15655 if (excess_exponent_len > 0) {
15656 SSize_t leading_zeros = strspn(cur_ptr, "0");
15657 SSize_t excess_leading_zeros
15658 = MIN(leading_zeros, excess_exponent_len);
15659 if (excess_leading_zeros > 0) {
15660 Move(cur_ptr + excess_leading_zeros,
15661 cur_ptr,
15662 strlen(cur_ptr) - excess_leading_zeros
15663 + 1, /* Copy the NUL as well */
15664 char);
15665 }
15666 }
15667 }
15668 }
15669 }
15670 else { /* Has a slash. Create a rational in canonical form */
15671 UV numerator, denominator, gcd, trial;
15672 const char * end_ptr;
15673 const char * sign = "";
15674
15675 /* We can't just find the numerator, denominator, and do the
15676 * division, then use the method above, because that is
15677 * inexact. And the input could be a rational that is within
15678 * epsilon (given our precision) of a valid rational, and would
15679 * then incorrectly compare valid.
15680 *
15681 * We're only interested in the part after the '=' */
15682 const char * this_lookup_name = lookup_name + equals_pos;
15683 lookup_len -= equals_pos;
15684 slash_pos -= equals_pos;
15685
15686 /* Handle any leading minus */
15687 if (this_lookup_name[0] == '-') {
15688 sign = "-";
15689 this_lookup_name++;
15690 lookup_len--;
15691 slash_pos--;
15692 }
15693
15694 /* Convert the numerator to numeric */
15695 end_ptr = this_lookup_name + slash_pos;
15696 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15697 goto failed;
15698 }
15699
15700 /* It better have included all characters before the slash */
15701 if (*end_ptr != '/') {
15702 goto failed;
15703 }
15704
15705 /* Set to look at just the denominator */
15706 this_lookup_name += slash_pos;
15707 lookup_len -= slash_pos;
15708 end_ptr = this_lookup_name + lookup_len;
15709
15710 /* Convert the denominator to numeric */
15711 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15712 goto failed;
15713 }
15714
15715 /* It better be the rest of the characters, and don't divide by
15716 * 0 */
15717 if ( end_ptr != this_lookup_name + lookup_len
15718 || denominator == 0)
15719 {
15720 goto failed;
15721 }
15722
15723 /* Get the greatest common denominator using
15724 https://en.wikipedia.org/wiki/Euclidean_algorithm */
15725 gcd = numerator;
15726 trial = denominator;
15727 while (trial != 0) {
15728 UV temp = trial;
15729 trial = gcd % trial;
15730 gcd = temp;
15731 }
15732
15733 /* If already in lowest possible terms, we have already tried
15734 * looking this up */
15735 if (gcd == 1) {
15736 goto failed;
15737 }
15738
15739 /* Reduce the rational, which should put it in canonical form
15740 * */
15741 numerator /= gcd;
15742 denominator /= gcd;
15743
15744 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15745 equals_pos, lookup_name, sign, numerator, denominator);
15746 }
15747
15748 /* Here, we have the number in canonical form. Try that */
15749 table_index = do_uniprop_match(canonical, strlen(canonical));
15750 if (table_index == 0) {
15751 goto failed;
15752 }
15753 } /* End of still didn't find the property in our table */
15754 } /* End of didn't find the property in our table */
15755
15756 /* Here, we have a non-zero return, which is an index into a table of ptrs.
15757 * A negative return signifies that the real index is the absolute value,
15758 * but the result needs to be inverted */
15759 if (table_index < 0) {
15760 invert_return = TRUE;
15761 table_index = -table_index;
15762 }
15763
15764 /* Out-of band indices indicate a deprecated property. The proper index is
15765 * modulo it with the table size. And dividing by the table size yields
15766 * an offset into a table constructed by regen/mk_invlists.pl to contain
15767 * the corresponding warning message */
15768 if (table_index > MAX_UNI_KEYWORD_INDEX) {
15769 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15770 table_index %= MAX_UNI_KEYWORD_INDEX;
15771 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15772 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15773 (int) name_len, name,
15774 get_deprecated_property_msg(warning_offset));
15775 }
15776
15777 /* In a few properties, a different property is used under /i. These are
15778 * unlikely to change, so are hard-coded here. */
15779 if (to_fold) {
15780 if ( table_index == UNI_XPOSIXUPPER
15781 || table_index == UNI_XPOSIXLOWER
15782 || table_index == UNI_TITLE)
15783 {
15784 table_index = UNI_CASED;
15785 }
15786 else if ( table_index == UNI_UPPERCASELETTER
15787 || table_index == UNI_LOWERCASELETTER
15788 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
15789 || table_index == UNI_TITLECASELETTER
15790 # endif
15791 ) {
15792 table_index = UNI_CASEDLETTER;
15793 }
15794 else if ( table_index == UNI_POSIXUPPER
15795 || table_index == UNI_POSIXLOWER)
15796 {
15797 table_index = UNI_POSIXALPHA;
15798 }
15799 }
15800
15801 /* Create and return the inversion list */
15802 prop_definition = get_prop_definition(table_index);
15803 sv_2mortal(prop_definition);
15804
15805 /* See if there is a private use override to add to this definition */
15806 {
15807 COPHH * hinthash = (IN_PERL_COMPILETIME)
15808 ? CopHINTHASH_get(&PL_compiling)
15809 : CopHINTHASH_get(PL_curcop);
15810 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15811
15812 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15813
15814 /* See if there is an element in the hints hash for this table */
15815 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15816 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15817
15818 if (pos) {
15819 bool dummy;
15820 SV * pu_definition;
15821 SV * pu_invlist;
15822 SV * expanded_prop_definition =
15823 sv_2mortal(invlist_clone(prop_definition, NULL));
15824
15825 /* If so, it's definition is the string from here to the next
15826 * \a character. And its format is the same as a user-defined
15827 * property */
15828 pos += SvCUR(pu_lookup);
15829 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15830 pu_invlist = handle_user_defined_property(lookup_name,
15831 lookup_len,
15832 0, /* Not UTF-8 */
15833 0, /* Not folded */
15834 runtime,
15835 deferrable,
15836 pu_definition,
15837 &dummy,
15838 msg,
15839 level);
15840 if (TAINT_get) {
15841 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15842 sv_catpvs(msg, "Insecure private-use override");
15843 goto append_name_to_msg;
15844 }
15845
15846 /* For now, as a safety measure, make sure that it doesn't
15847 * override non-private use code points */
15848 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15849
15850 /* Add it to the list to be returned */
15851 _invlist_union(prop_definition, pu_invlist,
15852 &expanded_prop_definition);
15853 prop_definition = expanded_prop_definition;
15854 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15855 }
15856 }
15857 }
15858
15859 if (invert_return) {
15860 _invlist_invert(prop_definition);
15861 }
15862 return prop_definition;
15863
15864 unknown_user_defined:
15865 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15866 sv_catpvs(msg, "Unknown user-defined property name");
15867 goto append_name_to_msg;
15868
15869 failed:
15870 if (non_pkg_begin != 0) {
15871 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15872 sv_catpvs(msg, "Illegal user-defined property name");
15873 }
15874 else {
15875 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15876 sv_catpvs(msg, "Can't find Unicode property definition");
15877 }
15878 /* FALLTHROUGH */
15879
15880 append_name_to_msg:
15881 {
15882 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
15883 const char * suffix = (runtime && level == 0) ? "}" : "\"";
15884
15885 sv_catpv(msg, prefix);
15886 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15887 sv_catpv(msg, suffix);
15888 }
15889
15890 return NULL;
15891
15892 definition_deferred:
15893
15894 {
15895 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
15896
15897 /* Here it could yet to be defined, so defer evaluation of this until
15898 * its needed at runtime. We need the fully qualified property name to
15899 * avoid ambiguity */
15900 if (! fq_name) {
15901 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15902 is_qualified);
15903 }
15904
15905 /* If it didn't come with a package, or the package is utf8::, this
15906 * actually could be an official Unicode property whose inclusion we
15907 * are deferring until runtime to make sure that it isn't overridden by
15908 * a user-defined property of the same name (which we haven't
15909 * encountered yet). Add a marker to indicate this possibility, for
15910 * use at such time when we first need the definition during pattern
15911 * matching execution */
15912 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15913 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15914 }
15915
15916 /* We also need a trailing newline */
15917 sv_catpvs(fq_name, "\n");
15918
15919 *user_defined_ptr = TRUE;
15920 return fq_name;
15921 }
15922 }
15923
15924 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)15925 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15926 const STRLEN wname_len, /* Its length */
15927 SV ** prop_definition,
15928 AV ** strings)
15929 {
15930 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15931 * any matches, adding them to prop_definition */
15932
15933 dSP;
15934
15935 CV * get_names_info; /* entry to charnames.pm to get info we need */
15936 SV * names_string; /* Contains all character names, except algo */
15937 SV * algorithmic_names; /* Contains info about algorithmically
15938 generated character names */
15939 REGEXP * subpattern_re; /* The user's pattern to match with */
15940 struct regexp * prog; /* The compiled pattern */
15941 char * all_names_start; /* lib/unicore/Name.pl string of every
15942 (non-algorithmic) character name */
15943 char * cur_pos; /* We match, effectively using /gc; this is
15944 where we are now */
15945 bool found_matches = FALSE; /* Did any name match so far? */
15946 SV * empty; /* For matching zero length names */
15947 SV * must_sv; /* Contains the substring, if any, that must be
15948 in a name for the subpattern to match */
15949 const char * must; /* The PV of 'must' */
15950 STRLEN must_len; /* And its length */
15951 SV * syllable_name = NULL; /* For Hangul syllables */
15952 const char hangul_prefix[] = "HANGUL SYLLABLE ";
15953 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15954
15955 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15956 * syllable name, and these are immutable and guaranteed by the Unicode
15957 * standard to never be extended */
15958 const STRLEN syl_max_len = hangul_prefix_len + 7;
15959
15960 IV i;
15961
15962 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15963
15964 /* Make sure _charnames is loaded. (The parameters give context
15965 * for any errors generated */
15966 get_names_info = get_cv("_charnames::_get_names_info", 0);
15967 if (! get_names_info) {
15968 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15969 }
15970
15971 /* Get the charnames data */
15972 PUSHSTACKi(PERLSI_REGCOMP);
15973 ENTER ;
15974 SAVETMPS;
15975 save_re_context();
15976
15977 PUSHMARK(SP) ;
15978 PUTBACK;
15979
15980 /* Special _charnames entry point that returns the info this routine
15981 * requires */
15982 call_sv(MUTABLE_SV(get_names_info), G_LIST);
15983
15984 SPAGAIN ;
15985
15986 /* Data structure for names which end in their very own code points */
15987 algorithmic_names = POPs;
15988 SvREFCNT_inc_simple_void_NN(algorithmic_names);
15989
15990 /* The lib/unicore/Name.pl string */
15991 names_string = POPs;
15992 SvREFCNT_inc_simple_void_NN(names_string);
15993
15994 PUTBACK ;
15995 FREETMPS ;
15996 LEAVE ;
15997 POPSTACK;
15998
15999 if ( ! SvROK(names_string)
16000 || ! SvROK(algorithmic_names))
16001 { /* Perhaps should panic instead XXX */
16002 SvREFCNT_dec(names_string);
16003 SvREFCNT_dec(algorithmic_names);
16004 return FALSE;
16005 }
16006
16007 names_string = sv_2mortal(SvRV(names_string));
16008 all_names_start = SvPVX(names_string);
16009 cur_pos = all_names_start;
16010
16011 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
16012
16013 /* Compile the subpattern consisting of the name being looked for */
16014 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
16015
16016 must_sv = re_intuit_string(subpattern_re);
16017 if (must_sv) {
16018 /* regexec.c can free the re_intuit_string() return. GH #17734 */
16019 must_sv = sv_2mortal(newSVsv(must_sv));
16020 must = SvPV(must_sv, must_len);
16021 }
16022 else {
16023 must = "";
16024 must_len = 0;
16025 }
16026
16027 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
16028 * This works because the NUL causes the function to return early, thus
16029 * showing that there are characters in it other than the acceptable ones,
16030 * which is our desired result.) */
16031
16032 prog = ReANY(subpattern_re);
16033
16034 /* If only nothing is matched, skip to where empty names are looked for */
16035 if (prog->maxlen == 0) {
16036 goto check_empty;
16037 }
16038
16039 /* And match against the string of all names /gc. Don't even try if it
16040 * must match a character not found in any name. */
16041 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
16042 {
16043 while (execute_wildcard(subpattern_re,
16044 cur_pos,
16045 SvEND(names_string),
16046 all_names_start, 0,
16047 names_string,
16048 0))
16049 { /* Here, matched. */
16050
16051 /* Note the string entries look like
16052 * 00001\nSTART OF HEADING\n\n
16053 * so we could match anywhere in that string. We have to rule out
16054 * matching a code point line */
16055 char * this_name_start = all_names_start
16056 + RX_OFFS_START(subpattern_re,0);
16057 char * this_name_end = all_names_start
16058 + RX_OFFS_END(subpattern_re,0);
16059 char * cp_start;
16060 char * cp_end;
16061 UV cp = 0; /* Silences some compilers */
16062 AV * this_string = NULL;
16063 bool is_multi = FALSE;
16064
16065 /* If matched nothing, advance to next possible match */
16066 if (this_name_start == this_name_end) {
16067 cur_pos = (char *) memchr(this_name_end + 1, '\n',
16068 SvEND(names_string) - this_name_end);
16069 if (cur_pos == NULL) {
16070 break;
16071 }
16072 }
16073 else {
16074 /* Position the next match to start beyond the current returned
16075 * entry */
16076 cur_pos = (char *) memchr(this_name_end, '\n',
16077 SvEND(names_string) - this_name_end);
16078 }
16079
16080 /* Back up to the \n just before the beginning of the character. */
16081 cp_end = (char *) my_memrchr(all_names_start,
16082 '\n',
16083 this_name_start - all_names_start);
16084
16085 /* If we didn't find a \n, it means it matched somewhere in the
16086 * initial '00000' in the string, so isn't a real match */
16087 if (cp_end == NULL) {
16088 continue;
16089 }
16090
16091 this_name_start = cp_end + 1; /* The name starts just after */
16092 cp_end--; /* the \n, and the code point */
16093 /* ends just before it */
16094
16095 /* All code points are 5 digits long */
16096 cp_start = cp_end - 4;
16097
16098 /* This shouldn't happen, as we found a \n, and the first \n is
16099 * further along than what we subtracted */
16100 assert(cp_start >= all_names_start);
16101
16102 if (cp_start == all_names_start) {
16103 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16104 continue;
16105 }
16106
16107 /* If the character is a blank, we either have a named sequence, or
16108 * something is wrong */
16109 if (*(cp_start - 1) == ' ') {
16110 cp_start = (char *) my_memrchr(all_names_start,
16111 '\n',
16112 cp_start - all_names_start);
16113 cp_start++;
16114 }
16115
16116 assert(cp_start != NULL && cp_start >= all_names_start + 2);
16117
16118 /* Except for the first line in the string, the sequence before the
16119 * code point is \n\n. If that isn't the case here, we didn't
16120 * match the name of a character. (We could have matched a named
16121 * sequence, not currently handled */
16122 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16123 continue;
16124 }
16125
16126 /* We matched! Add this to the list */
16127 found_matches = TRUE;
16128
16129 /* Loop through all the code points in the sequence */
16130 while (cp_start < cp_end) {
16131
16132 /* Calculate this code point from its 5 digits */
16133 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16134 + (XDIGIT_VALUE(cp_start[1]) << 12)
16135 + (XDIGIT_VALUE(cp_start[2]) << 8)
16136 + (XDIGIT_VALUE(cp_start[3]) << 4)
16137 + XDIGIT_VALUE(cp_start[4]);
16138
16139 cp_start += 6; /* Go past any blank */
16140
16141 if (cp_start < cp_end || is_multi) {
16142 if (this_string == NULL) {
16143 this_string = newAV();
16144 }
16145
16146 is_multi = TRUE;
16147 av_push_simple(this_string, newSVuv(cp));
16148 }
16149 }
16150
16151 if (is_multi) { /* Was more than one code point */
16152 if (*strings == NULL) {
16153 *strings = newAV();
16154 }
16155
16156 av_push_simple(*strings, (SV *) this_string);
16157 }
16158 else { /* Only a single code point */
16159 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16160 }
16161 } /* End of loop through the non-algorithmic names string */
16162 }
16163
16164 /* There are also character names not in 'names_string'. These are
16165 * algorithmically generatable. Try this pattern on each possible one.
16166 * (khw originally planned to leave this out given the large number of
16167 * matches attempted; but the speed turned out to be quite acceptable
16168 *
16169 * There are plenty of opportunities to optimize to skip many of the tests.
16170 * beyond the rudimentary ones already here */
16171
16172 /* First see if the subpattern matches any of the algorithmic generatable
16173 * Hangul syllable names.
16174 *
16175 * We know none of these syllable names will match if the input pattern
16176 * requires more bytes than any syllable has, or if the input pattern only
16177 * matches an empty name, or if the pattern has something it must match and
16178 * one of the characters in that isn't in any Hangul syllable. */
16179 if ( prog->minlen <= (SSize_t) syl_max_len
16180 && prog->maxlen > 0
16181 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16182 {
16183 /* These constants, names, values, and algorithm are adapted from the
16184 * Unicode standard, version 5.1, section 3.12, and should never
16185 * change. */
16186 const char * JamoL[] = {
16187 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16188 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16189 };
16190 const int LCount = C_ARRAY_LENGTH(JamoL);
16191
16192 const char * JamoV[] = {
16193 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16194 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16195 "I"
16196 };
16197 const int VCount = C_ARRAY_LENGTH(JamoV);
16198
16199 const char * JamoT[] = {
16200 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16201 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16202 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16203 };
16204 const int TCount = C_ARRAY_LENGTH(JamoT);
16205
16206 int L, V, T;
16207
16208 /* This is the initial Hangul syllable code point; each time through the
16209 * inner loop, it maps to the next higher code point. For more info,
16210 * see the Hangul syllable section of the Unicode standard. */
16211 int cp = 0xAC00;
16212
16213 syllable_name = sv_2mortal(newSV(syl_max_len));
16214 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16215
16216 for (L = 0; L < LCount; L++) {
16217 for (V = 0; V < VCount; V++) {
16218 for (T = 0; T < TCount; T++) {
16219
16220 /* Truncate back to the prefix, which is unvarying */
16221 SvCUR_set(syllable_name, hangul_prefix_len);
16222
16223 sv_catpv(syllable_name, JamoL[L]);
16224 sv_catpv(syllable_name, JamoV[V]);
16225 sv_catpv(syllable_name, JamoT[T]);
16226
16227 if (execute_wildcard(subpattern_re,
16228 SvPVX(syllable_name),
16229 SvEND(syllable_name),
16230 SvPVX(syllable_name), 0,
16231 syllable_name,
16232 0))
16233 {
16234 *prop_definition = add_cp_to_invlist(*prop_definition,
16235 cp);
16236 found_matches = TRUE;
16237 }
16238
16239 cp++;
16240 }
16241 }
16242 }
16243 }
16244
16245 /* The rest of the algorithmically generatable names are of the form
16246 * "PREFIX-code_point". The prefixes and the code point limits of each
16247 * were returned to us in the array 'algorithmic_names' from data in
16248 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
16249 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16250 IV j;
16251
16252 /* Each element of the array is a hash, giving the details for the
16253 * series of names it covers. There is the base name of the characters
16254 * in the series, and the low and high code points in the series. And,
16255 * for optimization purposes a string containing all the legal
16256 * characters that could possibly be in a name in this series. */
16257 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16258 SV * prefix = * hv_fetchs(this_series, "name", 0);
16259 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16260 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16261 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16262
16263 /* Pre-allocate an SV with enough space */
16264 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16265 SvPVX(prefix)));
16266 if (high >= 0x10000) {
16267 sv_catpvs(algo_name, "0");
16268 }
16269
16270 /* This series can be skipped entirely if the pattern requires
16271 * something longer than any name in the series, or can only match an
16272 * empty name, or contains a character not found in any name in the
16273 * series */
16274 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
16275 && prog->maxlen > 0
16276 && (strspn(must, legal) == must_len))
16277 {
16278 for (j = low; j <= high; j++) { /* For each code point in the series */
16279
16280 /* Get its name, and see if it matches the subpattern */
16281 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16282 (unsigned) j);
16283
16284 if (execute_wildcard(subpattern_re,
16285 SvPVX(algo_name),
16286 SvEND(algo_name),
16287 SvPVX(algo_name), 0,
16288 algo_name,
16289 0))
16290 {
16291 *prop_definition = add_cp_to_invlist(*prop_definition, j);
16292 found_matches = TRUE;
16293 }
16294 }
16295 }
16296 }
16297
16298 check_empty:
16299 /* Finally, see if the subpattern matches an empty string */
16300 empty = newSVpvs("");
16301 if (execute_wildcard(subpattern_re,
16302 SvPVX(empty),
16303 SvEND(empty),
16304 SvPVX(empty), 0,
16305 empty,
16306 0))
16307 {
16308 /* Many code points have empty names. Currently these are the \p{GC=C}
16309 * ones, minus CC and CF */
16310
16311 SV * empty_names_ref = get_prop_definition(UNI_C);
16312 SV * empty_names = invlist_clone(empty_names_ref, NULL);
16313
16314 SV * subtract = get_prop_definition(UNI_CC);
16315
16316 _invlist_subtract(empty_names, subtract, &empty_names);
16317 SvREFCNT_dec_NN(empty_names_ref);
16318 SvREFCNT_dec_NN(subtract);
16319
16320 subtract = get_prop_definition(UNI_CF);
16321 _invlist_subtract(empty_names, subtract, &empty_names);
16322 SvREFCNT_dec_NN(subtract);
16323
16324 _invlist_union(*prop_definition, empty_names, prop_definition);
16325 found_matches = TRUE;
16326 SvREFCNT_dec_NN(empty_names);
16327 }
16328 SvREFCNT_dec_NN(empty);
16329
16330 #if 0
16331 /* If we ever were to accept aliases for, say private use names, we would
16332 * need to do something fancier to find empty names. The code below works
16333 * (at the time it was written), and is slower than the above */
16334 const char empties_pat[] = "^.";
16335 if (strNE(name, empties_pat)) {
16336 SV * empty = newSVpvs("");
16337 if (execute_wildcard(subpattern_re,
16338 SvPVX(empty),
16339 SvEND(empty),
16340 SvPVX(empty), 0,
16341 empty,
16342 0))
16343 {
16344 SV * empties = NULL;
16345
16346 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16347
16348 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16349 SvREFCNT_dec_NN(empties);
16350
16351 found_matches = TRUE;
16352 }
16353 SvREFCNT_dec_NN(empty);
16354 }
16355 #endif
16356
16357 SvREFCNT_dec_NN(subpattern_re);
16358 return found_matches;
16359 }
16360
16361 /*
16362 * ex: set ts=8 sts=4 sw=4 et:
16363 */
16364