1 /* regcomp.c
2 */
3
4 /*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32 */
33
34 /*
35 * pregcomp and pregexec -- regsub and regerror are not used in perl
36 *
37 * Copyright (c) 1986 by University of Toronto.
38 * Written by Henry Spencer. Not derived from licensed software.
39 *
40 * Permission is granted to anyone to use this software for any
41 * purpose on any computer system, and to redistribute it freely,
42 * subject to the following restrictions:
43 *
44 * 1. The author is not responsible for the consequences of use of
45 * this software, no matter how awful, even if they arise
46 * from defects in it.
47 *
48 * 2. The origin of this software must not be misrepresented, either
49 * by explicit claim or by omission.
50 *
51 * 3. Altered versions must be plainly marked as such, and must not
52 * be misrepresented as being the original software.
53 *
54 *
55 **** Alterations to Henry's code are...
56 ****
57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59 **** by Larry Wall and others
60 ****
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
63
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69
70 /* Note on debug output:
71 *
72 * This is set up so that -Dr turns on debugging like all other flags that are
73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
74 * all regular expressions encountered in a program, and gives a huge amount of
75 * output for all but the shortest programs.
76 *
77 * The ability to output pattern debugging information lexically, and with much
78 * finer grained control was added, with 'use re qw(Debug ....);' available even
79 * in non-DEBUGGING builds. This is accomplished by copying the contents of
80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81 * Those files are compiled and linked into the perl executable, and they are
82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83 * to re.pm.
84 *
85 * That would normally mean linking errors when two functions of the same name
86 * are attempted to be placed into the same executable. That is solved in one
87 * of four ways:
88 * 1) Static functions aren't known outside the file they are in, so for the
89 * many functions of that type in this file, it just isn't a problem.
90 * 2) Most externally known functions are enclosed in
91 * #ifndef PERL_IN_XSUB_RE
92 * ...
93 * #endif
94 * blocks, so there is only one definition for them in the whole
95 * executable, the one in regcomp.c (or regexec.c). The implication of
96 * that is any debugging info that comes from them is controlled only by
97 * -Dr. Further, any static function they call will also be the version
98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
100 * have different names, so that what gets loaded in the executable is
101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104 * versions and their callees are under control of re.pm. The catch is
105 * that references to all these go through the regexp_engine structure,
106 * which is initialized in regcomp.h to the Perl_foo versions, and
107 * substituted out in lexical scopes where 'use re' is in effect to the
108 * 'my_foo' ones. That structure is public API, so it would be a hard
109 * sell to add any additional members.
110 * 4) For functions in regcomp.c and re_comp.c that are called only from,
111 * respectively, regexec.c and re_exec.c, they can have two different
112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113 * embed.fnc.
114 *
115 * The bottom line is that if you add code to one of the public functions
116 * listed in ext/re/re_top.h, debugging automagically works. But if you write
117 * a new function that needs to do debugging or there is a chain of calls from
118 * it that need to do debugging, all functions in the chain should use options
119 * 2) or 4) above.
120 *
121 * A function may have to be split so that debugging stuff is static, but it
122 * calls out to some other function that only gets compiled in regcomp.c to
123 * access data that we don't want to duplicate.
124 */
125
126 #ifdef PERL_EXT_RE_BUILD
127 #include "re_top.h"
128 #endif
129
130 #include "EXTERN.h"
131 #define PERL_IN_REGEX_ENGINE
132 #define PERL_IN_REGCOMP_ANY
133 #define PERL_IN_REGCOMP_C
134 #include "perl.h"
135
136 #ifdef PERL_IN_XSUB_RE
137 # include "re_comp.h"
138 EXTERN_C const struct regexp_engine my_reg_engine;
139 EXTERN_C const struct regexp_engine wild_reg_engine;
140 #else
141 # include "regcomp.h"
142 #endif
143
144 #include "invlist_inline.h"
145 #include "unicode_constants.h"
146 #include "regcomp_internal.h"
147
148 /* =========================================================
149 * BEGIN edit_distance stuff.
150 *
151 * This calculates how many single character changes of any type are needed to
152 * transform a string into another one. It is taken from version 3.1 of
153 *
154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155 */
156
157 /* Our unsorted dictionary linked list. */
158 /* Note we use UVs, not chars. */
159
160 struct dictionary{
161 UV key;
162 UV value;
163 struct dictionary* next;
164 };
165 typedef struct dictionary item;
166
167
168 PERL_STATIC_INLINE item*
push(UV key,item * curr)169 push(UV key, item* curr)
170 {
171 item* head;
172 Newx(head, 1, item);
173 head->key = key;
174 head->value = 0;
175 head->next = curr;
176 return head;
177 }
178
179
180 PERL_STATIC_INLINE item*
find(item * head,UV key)181 find(item* head, UV key)
182 {
183 item* iterator = head;
184 while (iterator){
185 if (iterator->key == key){
186 return iterator;
187 }
188 iterator = iterator->next;
189 }
190
191 return NULL;
192 }
193
194 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)195 uniquePush(item* head, UV key)
196 {
197 item* iterator = head;
198
199 while (iterator){
200 if (iterator->key == key) {
201 return head;
202 }
203 iterator = iterator->next;
204 }
205
206 return push(key, head);
207 }
208
209 PERL_STATIC_INLINE void
dict_free(item * head)210 dict_free(item* head)
211 {
212 item* iterator = head;
213
214 while (iterator) {
215 item* temp = iterator;
216 iterator = iterator->next;
217 Safefree(temp);
218 }
219
220 head = NULL;
221 }
222
223 /* End of Dictionary Stuff */
224
225 /* All calculations/work are done here */
226 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)227 S_edit_distance(const UV* src,
228 const UV* tgt,
229 const STRLEN x, /* length of src[] */
230 const STRLEN y, /* length of tgt[] */
231 const SSize_t maxDistance
232 )
233 {
234 item *head = NULL;
235 UV swapCount, swapScore, targetCharCount, i, j;
236 UV *scores;
237 UV score_ceil = x + y;
238
239 PERL_ARGS_ASSERT_EDIT_DISTANCE;
240
241 /* initialize matrix start values */
242 Newx(scores, ( (x + 2) * (y + 2)), UV);
243 scores[0] = score_ceil;
244 scores[1 * (y + 2) + 0] = score_ceil;
245 scores[0 * (y + 2) + 1] = score_ceil;
246 scores[1 * (y + 2) + 1] = 0;
247 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248
249 /* work loops */
250 /* i = src index */
251 /* j = tgt index */
252 for (i=1;i<=x;i++) {
253 if (i < x)
254 head = uniquePush(head, src[i]);
255 scores[(i+1) * (y + 2) + 1] = i;
256 scores[(i+1) * (y + 2) + 0] = score_ceil;
257 swapCount = 0;
258
259 for (j=1;j<=y;j++) {
260 if (i == 1) {
261 if(j < y)
262 head = uniquePush(head, tgt[j]);
263 scores[1 * (y + 2) + (j + 1)] = j;
264 scores[0 * (y + 2) + (j + 1)] = score_ceil;
265 }
266
267 targetCharCount = find(head, tgt[j-1])->value;
268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269
270 if (src[i-1] != tgt[j-1]){
271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272 }
273 else {
274 swapCount = j;
275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276 }
277 }
278
279 find(head, src[i-1])->value = i;
280 }
281
282 {
283 IV score = scores[(x+1) * (y + 2) + (y + 1)];
284 dict_free(head);
285 Safefree(scores);
286 return (maxDistance != 0 && maxDistance < score)?(-1):score;
287 }
288 }
289
290 /* END of edit_distance() stuff
291 * ========================================================= */
292
293 /* add a data member to the struct reg_data attached to this regex, it should
294 * always return a non-zero return. the 's' argument is the type of the items
295 * being added and the n is the number of items. The length of 's' should match
296 * the number of items. */
297 U32
Perl_reg_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)298 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
299 {
300 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
301
302 PERL_ARGS_ASSERT_REG_ADD_DATA;
303
304 /* in the below expression we have (count + n - 1), the minus one is there
305 * because the struct that we allocate already contains a slot for 1 data
306 * item, so we do not need to allocate it the first time. IOW, the
307 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
308 * to allocate. See struct reg_data in regcomp.h
309 */
310 Renewc(RExC_rxi->data,
311 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
312 char, struct reg_data);
313 /* however in the data->what expression we use (count + n) and do not
314 * subtract one from the result because the data structure contains a
315 * pointer to an array, and does not allocate the first element as part of
316 * the data struct. */
317 if (count > 1)
318 Renew(RExC_rxi->data->what, (count + n), U8);
319 else {
320 /* when count == 1 it means we have not initialized anything.
321 * we always fill the 0 slot of the data array with a '%' entry, which
322 * means "zero" (all the other types are letters) which exists purely
323 * so the return from reg_add_data is ALWAYS true, so we can tell it apart
324 * from a "no value" idx=0 in places where we would return an index
325 * into reg_add_data. This is particularly important with the new "single
326 * pass, usually, but not always" strategy that we use, where the code
327 * will use a 0 to represent "not able to compute this yet".
328 */
329 Newx(RExC_rxi->data->what, n+1, U8);
330 /* fill in the placeholder slot of 0 with a what of '%', we use
331 * this because it sorta looks like a zero (0/0) and it is not a letter
332 * like any of the other "whats", this type should never be created
333 * any other way but here. '%' happens to also not appear in this
334 * file for any other reason (at the time of writing this comment)*/
335 RExC_rxi->data->what[0]= '%';
336 RExC_rxi->data->data[0]= NULL;
337 }
338 RExC_rxi->data->count = count + n;
339 Copy(s, RExC_rxi->data->what + count, n, U8);
340 assert(count>0);
341 return count;
342 }
343
344 /*XXX: todo make this not included in a non debugging perl, but appears to be
345 * used anyway there, in 'use re' */
346 #ifndef PERL_IN_XSUB_RE
347 void
Perl_reginitcolors(pTHX)348 Perl_reginitcolors(pTHX)
349 {
350 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
351 if (s) {
352 char *t = savepv(s);
353 int i = 0;
354 PL_colors[0] = t;
355 while (++i < 6) {
356 t = strchr(t, '\t');
357 if (t) {
358 *t = '\0';
359 PL_colors[i] = ++t;
360 }
361 else
362 PL_colors[i] = t = (char *)"";
363 }
364 } else {
365 int i = 0;
366 while (i < 6)
367 PL_colors[i++] = (char *)"";
368 }
369 PL_colorset = 1;
370 }
371 #endif
372
373
374 #ifdef TRIE_STUDY_OPT
375 /* search for "restudy" in this file for a detailed explanation */
376 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
377 STMT_START { \
378 if ( \
379 (data.flags & SCF_TRIE_RESTUDY) \
380 && ! restudied++ \
381 ) { \
382 dOsomething; \
383 goto reStudy; \
384 } \
385 } STMT_END
386 #else
387 #define CHECK_RESTUDY_GOTO_butfirst
388 #endif
389
390 /*
391 * pregcomp - compile a regular expression into internal code
392 *
393 * Decides which engine's compiler to call based on the hint currently in
394 * scope
395 */
396
397 #ifndef PERL_IN_XSUB_RE
398
399 /* return the currently in-scope regex engine (or the default if none) */
400
401 regexp_engine const *
Perl_current_re_engine(pTHX)402 Perl_current_re_engine(pTHX)
403 {
404 if (IN_PERL_COMPILETIME) {
405 HV * const table = GvHV(PL_hintgv);
406 SV **ptr;
407
408 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
409 return &PL_core_reg_engine;
410 ptr = hv_fetchs(table, "regcomp", FALSE);
411 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
412 return &PL_core_reg_engine;
413 return INT2PTR(regexp_engine*, SvIV(*ptr));
414 }
415 else {
416 SV *ptr;
417 if (!PL_curcop->cop_hints_hash)
418 return &PL_core_reg_engine;
419 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
420 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
421 return &PL_core_reg_engine;
422 return INT2PTR(regexp_engine*, SvIV(ptr));
423 }
424 }
425
426
427 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)428 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
429 {
430 regexp_engine const *eng = current_re_engine();
431 DECLARE_AND_GET_RE_DEBUG_FLAGS;
432
433 PERL_ARGS_ASSERT_PREGCOMP;
434
435 /* Dispatch a request to compile a regexp to correct regexp engine. */
436 DEBUG_COMPILE_r({
437 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
438 PTR2UV(eng));
439 });
440 return CALLREGCOMP_ENG(eng, pattern, flags);
441 }
442 #endif
443
444 /*
445 =for apidoc re_compile
446
447 Compile the regular expression pattern C<pattern>, returning a pointer to the
448 compiled object for later matching with the internal regex engine.
449
450 This function is typically used by a custom regexp engine C<.comp()> function
451 to hand off to the core regexp engine those patterns it doesn't want to handle
452 itself (typically passing through the same flags it was called with). In
453 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
454 to compile using the currently active regexp engine.
455
456 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
457 pointer to the input. Otherwise the PV is extracted and treated like a string
458 representing a pattern. See L<perlre>.
459
460 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
461 all begin with C<RXf_>.
462
463 =cut
464
465 * public entry point for the perl core's own regex compiling code.
466 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
467 * pattern rather than a list of OPs, and uses the internal engine rather
468 * than the current one */
469
470 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)471 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
472 {
473 SV *pat = pattern; /* defeat constness! */
474
475 PERL_ARGS_ASSERT_RE_COMPILE;
476
477 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
478 #ifdef PERL_IN_XSUB_RE
479 &my_reg_engine,
480 #else
481 &PL_core_reg_engine,
482 #endif
483 NULL, NULL, rx_flags, 0);
484 }
485
486 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)487 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
488 {
489 int n;
490
491 if (--cbs->refcnt > 0)
492 return;
493 for (n = 0; n < cbs->count; n++) {
494 REGEXP *rx = cbs->cb[n].src_regex;
495 if (rx) {
496 cbs->cb[n].src_regex = NULL;
497 SvREFCNT_dec_NN(rx);
498 }
499 }
500 Safefree(cbs->cb);
501 Safefree(cbs);
502 }
503
504
505 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)506 S_alloc_code_blocks(pTHX_ int ncode)
507 {
508 struct reg_code_blocks *cbs;
509 Newx(cbs, 1, struct reg_code_blocks);
510 cbs->count = ncode;
511 cbs->refcnt = 1;
512 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
513 if (ncode)
514 Newx(cbs->cb, ncode, struct reg_code_block);
515 else
516 cbs->cb = NULL;
517 return cbs;
518 }
519
520
521 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
522 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
523 * point to the realloced string and length.
524 *
525 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
526 * stuff added */
527
528 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)529 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
530 char **pat_p, STRLEN *plen_p, int num_code_blocks)
531 {
532 U8 *const src = (U8*)*pat_p;
533 U8 *dst, *d;
534 int n=0;
535 STRLEN s = 0;
536 bool do_end = 0;
537 DECLARE_AND_GET_RE_DEBUG_FLAGS;
538
539 DEBUG_PARSE_r(Perl_re_printf( aTHX_
540 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
541
542 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
543 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
544 d = dst;
545
546 while (s < *plen_p) {
547 append_utf8_from_native_byte(src[s], &d);
548
549 if (n < num_code_blocks) {
550 assert(pRExC_state->code_blocks);
551 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
552 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
553 assert(*(d - 1) == '(');
554 do_end = 1;
555 }
556 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
557 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
558 assert(*(d - 1) == ')');
559 do_end = 0;
560 n++;
561 }
562 }
563 s++;
564 }
565 *d = '\0';
566 *plen_p = d - dst;
567 *pat_p = (char*) dst;
568 SAVEFREEPV(*pat_p);
569 RExC_orig_utf8 = RExC_utf8 = 1;
570 }
571
572
573
574 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
575 * while recording any code block indices, and handling overloading,
576 * nested qr// objects etc. If pat is null, it will allocate a new
577 * string, or just return the first arg, if there's only one.
578 *
579 * Returns the malloced/updated pat.
580 * patternp and pat_count is the array of SVs to be concatted;
581 * oplist is the optional list of ops that generated the SVs;
582 * recompile_p is a pointer to a boolean that will be set if
583 * the regex will need to be recompiled.
584 * delim, if non-null is an SV that will be inserted between each element
585 */
586
587 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)588 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
589 SV *pat, SV ** const patternp, int pat_count,
590 OP *oplist, bool *recompile_p, SV *delim)
591 {
592 SV **svp;
593 int n = 0;
594 bool use_delim = FALSE;
595 bool alloced = FALSE;
596
597 /* if we know we have at least two args, create an empty string,
598 * then concatenate args to that. For no args, return an empty string */
599 if (!pat && pat_count != 1) {
600 pat = newSVpvs("");
601 SAVEFREESV(pat);
602 alloced = TRUE;
603 }
604
605 for (svp = patternp; svp < patternp + pat_count; svp++) {
606 SV *sv;
607 SV *rx = NULL;
608 STRLEN orig_patlen = 0;
609 bool code = 0;
610 SV *msv = use_delim ? delim : *svp;
611 if (!msv) msv = &PL_sv_undef;
612
613 /* if we've got a delimiter, we go round the loop twice for each
614 * svp slot (except the last), using the delimiter the second
615 * time round */
616 if (use_delim) {
617 svp--;
618 use_delim = FALSE;
619 }
620 else if (delim)
621 use_delim = TRUE;
622
623 if (SvTYPE(msv) == SVt_PVAV) {
624 /* we've encountered an interpolated array within
625 * the pattern, e.g. /...@a..../. Expand the list of elements,
626 * then recursively append elements.
627 * The code in this block is based on S_pushav() */
628
629 AV *const av = (AV*)msv;
630 const SSize_t maxarg = AvFILL(av) + 1;
631 SV **array;
632
633 if (oplist) {
634 assert(oplist->op_type == OP_PADAV
635 || oplist->op_type == OP_RV2AV);
636 oplist = OpSIBLING(oplist);
637 }
638
639 if (SvRMAGICAL(av)) {
640 SSize_t i;
641
642 Newx(array, maxarg, SV*);
643 SAVEFREEPV(array);
644 for (i=0; i < maxarg; i++) {
645 SV ** const svp = av_fetch(av, i, FALSE);
646 array[i] = svp ? *svp : &PL_sv_undef;
647 }
648 }
649 else
650 array = AvARRAY(av);
651
652 if (maxarg > 0) {
653 pat = S_concat_pat(aTHX_ pRExC_state, pat,
654 array, maxarg, NULL, recompile_p,
655 /* $" */
656 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
657 }
658 else if (!pat) {
659 pat = newSVpvs_flags("", SVs_TEMP);
660 }
661
662 continue;
663 }
664
665
666 /* we make the assumption here that each op in the list of
667 * op_siblings maps to one SV pushed onto the stack,
668 * except for code blocks, with have both an OP_NULL and
669 * an OP_CONST.
670 * This allows us to match up the list of SVs against the
671 * list of OPs to find the next code block.
672 *
673 * Note that PUSHMARK PADSV PADSV ..
674 * is optimised to
675 * PADRANGE PADSV PADSV ..
676 * so the alignment still works. */
677
678 if (oplist) {
679 if (oplist->op_type == OP_NULL
680 && (oplist->op_flags & OPf_SPECIAL))
681 {
682 assert(n < pRExC_state->code_blocks->count);
683 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
684 pRExC_state->code_blocks->cb[n].block = oplist;
685 pRExC_state->code_blocks->cb[n].src_regex = NULL;
686 n++;
687 code = 1;
688 oplist = OpSIBLING(oplist); /* skip CONST */
689 assert(oplist);
690 }
691 oplist = OpSIBLING(oplist);;
692 }
693
694 /* apply magic and QR overloading to arg */
695
696 SvGETMAGIC(msv);
697 if (SvROK(msv) && SvAMAGIC(msv)) {
698 SV *sv = AMG_CALLunary(msv, regexp_amg);
699 if (sv) {
700 if (SvROK(sv))
701 sv = SvRV(sv);
702 if (SvTYPE(sv) != SVt_REGEXP)
703 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
704 msv = sv;
705 }
706 }
707
708 /* try concatenation overload ... */
709 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
710 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
711 {
712 sv_setsv(pat, sv);
713 /* overloading involved: all bets are off over literal
714 * code. Pretend we haven't seen it */
715 if (n)
716 pRExC_state->code_blocks->count -= n;
717 n = 0;
718 }
719 else {
720 /* ... or failing that, try "" overload */
721 while (SvAMAGIC(msv)
722 && (sv = AMG_CALLunary(msv, string_amg))
723 && sv != msv
724 && !( SvROK(msv)
725 && SvROK(sv)
726 && SvRV(msv) == SvRV(sv))
727 ) {
728 msv = sv;
729 SvGETMAGIC(msv);
730 }
731 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
732 msv = SvRV(msv);
733
734 if (pat) {
735 /* this is a partially unrolled
736 * sv_catsv_nomg(pat, msv);
737 * that allows us to adjust code block indices if
738 * needed */
739 STRLEN dlen;
740 char *dst = SvPV_force_nomg(pat, dlen);
741 orig_patlen = dlen;
742 if (SvUTF8(msv) && !SvUTF8(pat)) {
743 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
744 sv_setpvn(pat, dst, dlen);
745 SvUTF8_on(pat);
746 }
747 sv_catsv_nomg(pat, msv);
748 rx = msv;
749 }
750 else {
751 /* We have only one SV to process, but we need to verify
752 * it is properly null terminated or we will fail asserts
753 * later. In theory we probably shouldn't get such SV's,
754 * but if we do we should handle it gracefully. */
755 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
756 /* not a string, or a string with a trailing null */
757 pat = msv;
758 } else {
759 /* a string with no trailing null, we need to copy it
760 * so it has a trailing null */
761 pat = sv_2mortal(newSVsv(msv));
762 }
763 }
764
765 if (code)
766 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
767 }
768
769 /* extract any code blocks within any embedded qr//'s */
770 if (rx && SvTYPE(rx) == SVt_REGEXP
771 && RX_ENGINE((REGEXP*)rx)->op_comp)
772 {
773
774 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
775 if (ri->code_blocks && ri->code_blocks->count) {
776 int i;
777 /* the presence of an embedded qr// with code means
778 * we should always recompile: the text of the
779 * qr// may not have changed, but it may be a
780 * different closure than last time */
781 *recompile_p = 1;
782 if (pRExC_state->code_blocks) {
783 int new_count = pRExC_state->code_blocks->count
784 + ri->code_blocks->count;
785 Renew(pRExC_state->code_blocks->cb,
786 new_count, struct reg_code_block);
787 pRExC_state->code_blocks->count = new_count;
788 }
789 else
790 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
791 ri->code_blocks->count);
792
793 for (i=0; i < ri->code_blocks->count; i++) {
794 struct reg_code_block *src, *dst;
795 STRLEN offset = orig_patlen
796 + ReANY((REGEXP *)rx)->pre_prefix;
797 assert(n < pRExC_state->code_blocks->count);
798 src = &ri->code_blocks->cb[i];
799 dst = &pRExC_state->code_blocks->cb[n];
800 dst->start = src->start + offset;
801 dst->end = src->end + offset;
802 dst->block = src->block;
803 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
804 src->src_regex
805 ? src->src_regex
806 : (REGEXP*)rx);
807 n++;
808 }
809 }
810 }
811 }
812 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
813 if (alloced)
814 SvSETMAGIC(pat);
815
816 return pat;
817 }
818
819
820
821 /* see if there are any run-time code blocks in the pattern.
822 * False positives are allowed */
823
824 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)825 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
826 char *pat, STRLEN plen)
827 {
828 int n = 0;
829 STRLEN s;
830
831 PERL_UNUSED_CONTEXT;
832
833 for (s = 0; s < plen; s++) {
834 if ( pRExC_state->code_blocks
835 && n < pRExC_state->code_blocks->count
836 && s == pRExC_state->code_blocks->cb[n].start)
837 {
838 s = pRExC_state->code_blocks->cb[n].end;
839 n++;
840 continue;
841 }
842 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
843 * positives here */
844 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
845 (pat[s+2] == '{'
846 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
847 )
848 return 1;
849 }
850 return 0;
851 }
852
853 /* Handle run-time code blocks. We will already have compiled any direct
854 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
855 * copy of it, but with any literal code blocks blanked out and
856 * appropriate chars escaped; then feed it into
857 *
858 * eval "qr'modified_pattern'"
859 *
860 * For example,
861 *
862 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
863 *
864 * becomes
865 *
866 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
867 *
868 * After eval_sv()-ing that, grab any new code blocks from the returned qr
869 * and merge them with any code blocks of the original regexp.
870 *
871 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
872 * instead, just save the qr and return FALSE; this tells our caller that
873 * the original pattern needs upgrading to utf8.
874 */
875
876 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)877 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
878 char *pat, STRLEN plen)
879 {
880 SV *qr;
881
882 DECLARE_AND_GET_RE_DEBUG_FLAGS;
883
884 if (pRExC_state->runtime_code_qr) {
885 /* this is the second time we've been called; this should
886 * only happen if the main pattern got upgraded to utf8
887 * during compilation; re-use the qr we compiled first time
888 * round (which should be utf8 too)
889 */
890 qr = pRExC_state->runtime_code_qr;
891 pRExC_state->runtime_code_qr = NULL;
892 assert(RExC_utf8 && SvUTF8(qr));
893 }
894 else {
895 int n = 0;
896 STRLEN s;
897 char *p, *newpat;
898 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
899 SV *sv, *qr_ref;
900 dSP;
901
902 /* determine how many extra chars we need for ' and \ escaping */
903 for (s = 0; s < plen; s++) {
904 if (pat[s] == '\'' || pat[s] == '\\')
905 newlen++;
906 }
907
908 Newx(newpat, newlen, char);
909 p = newpat;
910 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
911
912 for (s = 0; s < plen; s++) {
913 if ( pRExC_state->code_blocks
914 && n < pRExC_state->code_blocks->count
915 && s == pRExC_state->code_blocks->cb[n].start)
916 {
917 /* blank out literal code block so that they aren't
918 * recompiled: eg change from/to:
919 * /(?{xyz})/
920 * /(?=====)/
921 * and
922 * /(??{xyz})/
923 * /(?======)/
924 * and
925 * /(?(?{xyz}))/
926 * /(?(?=====))/
927 */
928 assert(pat[s] == '(');
929 assert(pat[s+1] == '?');
930 *p++ = '(';
931 *p++ = '?';
932 s += 2;
933 while (s < pRExC_state->code_blocks->cb[n].end) {
934 *p++ = '=';
935 s++;
936 }
937 *p++ = ')';
938 n++;
939 continue;
940 }
941 if (pat[s] == '\'' || pat[s] == '\\')
942 *p++ = '\\';
943 *p++ = pat[s];
944 }
945 *p++ = '\'';
946 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
947 *p++ = 'x';
948 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
949 *p++ = 'x';
950 }
951 }
952 *p++ = '\0';
953 DEBUG_COMPILE_r({
954 Perl_re_printf( aTHX_
955 "%sre-parsing pattern for runtime code:%s %s\n",
956 PL_colors[4], PL_colors[5], newpat);
957 });
958
959 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
960 Safefree(newpat);
961
962 ENTER;
963 SAVETMPS;
964 save_re_context();
965 PUSHSTACKi(PERLSI_REQUIRE);
966 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
967 * parsing qr''; normally only q'' does this. It also alters
968 * hints handling */
969 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
970 SvREFCNT_dec_NN(sv);
971 SPAGAIN;
972 qr_ref = POPs;
973 PUTBACK;
974 {
975 SV * const errsv = ERRSV;
976 if (SvTRUE_NN(errsv))
977 /* use croak_sv ? */
978 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
979 }
980 assert(SvROK(qr_ref));
981 qr = SvRV(qr_ref);
982 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
983 /* the leaving below frees the tmp qr_ref.
984 * Give qr a life of its own */
985 SvREFCNT_inc(qr);
986 POPSTACK;
987 FREETMPS;
988 LEAVE;
989
990 }
991
992 if (!RExC_utf8 && SvUTF8(qr)) {
993 /* first time through; the pattern got upgraded; save the
994 * qr for the next time through */
995 assert(!pRExC_state->runtime_code_qr);
996 pRExC_state->runtime_code_qr = qr;
997 return 0;
998 }
999
1000
1001 /* extract any code blocks within the returned qr// */
1002
1003
1004 /* merge the main (r1) and run-time (r2) code blocks into one */
1005 {
1006 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1007 struct reg_code_block *new_block, *dst;
1008 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1009 int i1 = 0, i2 = 0;
1010 int r1c, r2c;
1011
1012 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1013 {
1014 SvREFCNT_dec_NN(qr);
1015 return 1;
1016 }
1017
1018 if (!r1->code_blocks)
1019 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1020
1021 r1c = r1->code_blocks->count;
1022 r2c = r2->code_blocks->count;
1023
1024 Newx(new_block, r1c + r2c, struct reg_code_block);
1025
1026 dst = new_block;
1027
1028 while (i1 < r1c || i2 < r2c) {
1029 struct reg_code_block *src;
1030 bool is_qr = 0;
1031
1032 if (i1 == r1c) {
1033 src = &r2->code_blocks->cb[i2++];
1034 is_qr = 1;
1035 }
1036 else if (i2 == r2c)
1037 src = &r1->code_blocks->cb[i1++];
1038 else if ( r1->code_blocks->cb[i1].start
1039 < r2->code_blocks->cb[i2].start)
1040 {
1041 src = &r1->code_blocks->cb[i1++];
1042 assert(src->end < r2->code_blocks->cb[i2].start);
1043 }
1044 else {
1045 assert( r1->code_blocks->cb[i1].start
1046 > r2->code_blocks->cb[i2].start);
1047 src = &r2->code_blocks->cb[i2++];
1048 is_qr = 1;
1049 assert(src->end < r1->code_blocks->cb[i1].start);
1050 }
1051
1052 assert(pat[src->start] == '(');
1053 assert(pat[src->end] == ')');
1054 dst->start = src->start;
1055 dst->end = src->end;
1056 dst->block = src->block;
1057 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1058 : src->src_regex;
1059 dst++;
1060 }
1061 r1->code_blocks->count += r2c;
1062 Safefree(r1->code_blocks->cb);
1063 r1->code_blocks->cb = new_block;
1064 }
1065
1066 SvREFCNT_dec_NN(qr);
1067 return 1;
1068 }
1069
1070
1071 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)1072 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1073 struct reg_substr_datum *rsd,
1074 struct scan_data_substrs *sub,
1075 STRLEN longest_length)
1076 {
1077 /* This is the common code for setting up the floating and fixed length
1078 * string data extracted from Perl_re_op_compile() below. Returns a boolean
1079 * as to whether succeeded or not */
1080
1081 I32 t;
1082 SSize_t ml;
1083 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
1084 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1085
1086 if (! (longest_length
1087 || (eol /* Can't have SEOL and MULTI */
1088 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1089 )
1090 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1091 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1092 {
1093 return FALSE;
1094 }
1095
1096 /* copy the information about the longest from the reg_scan_data
1097 over to the program. */
1098 if (SvUTF8(sub->str)) {
1099 rsd->substr = NULL;
1100 rsd->utf8_substr = sub->str;
1101 } else {
1102 rsd->substr = sub->str;
1103 rsd->utf8_substr = NULL;
1104 }
1105 /* end_shift is how many chars that must be matched that
1106 follow this item. We calculate it ahead of time as once the
1107 lookbehind offset is added in we lose the ability to correctly
1108 calculate it.*/
1109 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1110 rsd->end_shift = ml - sub->min_offset
1111 - longest_length
1112 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1113 * intead? - DAPM
1114 + (SvTAIL(sub->str) != 0)
1115 */
1116 + sub->lookbehind;
1117
1118 t = (eol/* Can't have SEOL and MULTI */
1119 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1120 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1121
1122 return TRUE;
1123 }
1124
1125 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)1126 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1127 {
1128 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1129 * properly wrapped with the right modifiers */
1130
1131 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1132 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1133 != REGEX_DEPENDS_CHARSET);
1134
1135 /* The caret is output if there are any defaults: if not all the STD
1136 * flags are set, or if no character set specifier is needed */
1137 bool has_default =
1138 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1139 || ! has_charset);
1140 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1141 == REG_RUN_ON_COMMENT_SEEN);
1142 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1143 >> RXf_PMf_STD_PMMOD_SHIFT);
1144 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
1145 char *p;
1146 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1147
1148 /* We output all the necessary flags; we never output a minus, as all
1149 * those are defaults, so are
1150 * covered by the caret */
1151 const STRLEN wraplen = pat_len + has_p + has_runon
1152 + has_default /* If needs a caret */
1153 + PL_bitcount[reganch] /* 1 char for each set standard flag */
1154
1155 /* If needs a character set specifier */
1156 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1157 + (sizeof("(?:)") - 1);
1158
1159 PERL_ARGS_ASSERT_SET_REGEX_PV;
1160
1161 /* make sure PL_bitcount bounds not exceeded */
1162 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1163
1164 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1165 SvPOK_on(Rx);
1166 if (RExC_utf8)
1167 SvFLAGS(Rx) |= SVf_UTF8;
1168 *p++='('; *p++='?';
1169
1170 /* If a default, cover it using the caret */
1171 if (has_default) {
1172 *p++= DEFAULT_PAT_MOD;
1173 }
1174 if (has_charset) {
1175 STRLEN len;
1176 const char* name;
1177
1178 name = get_regex_charset_name(RExC_rx->extflags, &len);
1179 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
1180 assert(RExC_utf8);
1181 name = UNICODE_PAT_MODS;
1182 len = sizeof(UNICODE_PAT_MODS) - 1;
1183 }
1184 Copy(name, p, len, char);
1185 p += len;
1186 }
1187 if (has_p)
1188 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1189 {
1190 char ch;
1191 while((ch = *fptr++)) {
1192 if(reganch & 1)
1193 *p++ = ch;
1194 reganch >>= 1;
1195 }
1196 }
1197
1198 *p++ = ':';
1199 Copy(RExC_precomp, p, pat_len, char);
1200 assert ((RX_WRAPPED(Rx) - p) < 16);
1201 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1202 p += pat_len;
1203
1204 /* Adding a trailing \n causes this to compile properly:
1205 my $R = qr / A B C # D E/x; /($R)/
1206 Otherwise the parens are considered part of the comment */
1207 if (has_runon)
1208 *p++ = '\n';
1209 *p++ = ')';
1210 *p = 0;
1211 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1212 }
1213
1214 /*
1215 * Perl_re_op_compile - the perl internal RE engine's function to compile a
1216 * regular expression into internal code.
1217 * The pattern may be passed either as:
1218 * a list of SVs (patternp plus pat_count)
1219 * a list of OPs (expr)
1220 * If both are passed, the SV list is used, but the OP list indicates
1221 * which SVs are actually pre-compiled code blocks
1222 *
1223 * The SVs in the list have magic and qr overloading applied to them (and
1224 * the list may be modified in-place with replacement SVs in the latter
1225 * case).
1226 *
1227 * If the pattern hasn't changed from old_re, then old_re will be
1228 * returned.
1229 *
1230 * eng is the current engine. If that engine has an op_comp method, then
1231 * handle directly (i.e. we assume that op_comp was us); otherwise, just
1232 * do the initial concatenation of arguments and pass on to the external
1233 * engine.
1234 *
1235 * If is_bare_re is not null, set it to a boolean indicating whether the
1236 * arg list reduced (after overloading) to a single bare regex which has
1237 * been returned (i.e. /$qr/).
1238 *
1239 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1240 *
1241 * pm_flags contains the PMf_* flags, typically based on those from the
1242 * pm_flags field of the related PMOP. Currently we're only interested in
1243 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1244 *
1245 * For many years this code had an initial sizing pass that calculated
1246 * (sometimes incorrectly, leading to security holes) the size needed for the
1247 * compiled pattern. That was changed by commit
1248 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1249 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
1250 * references to this sizing pass.
1251 *
1252 * Now, an initial crude guess as to the size needed is made, based on the
1253 * length of the pattern. Patches welcome to improve that guess. That amount
1254 * of space is malloc'd and then immediately freed, and then clawed back node
1255 * by node. This design is to minimize, to the extent possible, memory churn
1256 * when doing the reallocs.
1257 *
1258 * A separate parentheses counting pass may be needed in some cases.
1259 * (Previously the sizing pass did this.) Patches welcome to reduce the number
1260 * of these cases.
1261 *
1262 * The existence of a sizing pass necessitated design decisions that are no
1263 * longer needed. There are potential areas of simplification.
1264 *
1265 * Beware that the optimization-preparation code in here knows about some
1266 * of the structure of the compiled regexp. [I'll say.]
1267 */
1268
1269 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)1270 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1271 OP *expr, const regexp_engine* eng, REGEXP *old_re,
1272 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1273 {
1274 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
1275 STRLEN plen;
1276 char *exp;
1277 regnode *scan;
1278 I32 flags;
1279 SSize_t minlen = 0;
1280 U32 rx_flags;
1281 SV *pat;
1282 SV** new_patternp = patternp;
1283
1284 /* these are all flags - maybe they should be turned
1285 * into a single int with different bit masks */
1286 I32 sawlookahead = 0;
1287 I32 sawplus = 0;
1288 I32 sawopen = 0;
1289 I32 sawminmod = 0;
1290
1291 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1292 bool recompile = 0;
1293 bool runtime_code = 0;
1294 scan_data_t data;
1295 RExC_state_t RExC_state;
1296 RExC_state_t * const pRExC_state = &RExC_state;
1297 #ifdef TRIE_STUDY_OPT
1298 /* search for "restudy" in this file for a detailed explanation */
1299 int restudied = 0;
1300 RExC_state_t copyRExC_state;
1301 #endif
1302 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1303
1304 PERL_ARGS_ASSERT_RE_OP_COMPILE;
1305
1306 DEBUG_r(if (!PL_colorset) reginitcolors());
1307
1308
1309 pRExC_state->warn_text = NULL;
1310 pRExC_state->unlexed_names = NULL;
1311 pRExC_state->code_blocks = NULL;
1312
1313 if (is_bare_re)
1314 *is_bare_re = FALSE;
1315
1316 if (expr && (expr->op_type == OP_LIST ||
1317 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1318 /* allocate code_blocks if needed */
1319 OP *o;
1320 int ncode = 0;
1321
1322 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1323 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1324 ncode++; /* count of DO blocks */
1325
1326 if (ncode)
1327 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1328 }
1329
1330 if (!pat_count) {
1331 /* compile-time pattern with just OP_CONSTs and DO blocks */
1332
1333 int n;
1334 OP *o;
1335
1336 /* find how many CONSTs there are */
1337 assert(expr);
1338 n = 0;
1339 if (expr->op_type == OP_CONST)
1340 n = 1;
1341 else
1342 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1343 if (o->op_type == OP_CONST)
1344 n++;
1345 }
1346
1347 /* fake up an SV array */
1348
1349 assert(!new_patternp);
1350 Newx(new_patternp, n, SV*);
1351 SAVEFREEPV(new_patternp);
1352 pat_count = n;
1353
1354 n = 0;
1355 if (expr->op_type == OP_CONST)
1356 new_patternp[n] = cSVOPx_sv(expr);
1357 else
1358 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1359 if (o->op_type == OP_CONST)
1360 new_patternp[n++] = cSVOPo_sv;
1361 }
1362
1363 }
1364
1365 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1366 "Assembling pattern from %d elements%s\n", pat_count,
1367 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1368
1369 /* set expr to the first arg op */
1370
1371 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1372 && expr->op_type != OP_CONST)
1373 {
1374 expr = cLISTOPx(expr)->op_first;
1375 assert( expr->op_type == OP_PUSHMARK
1376 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1377 || expr->op_type == OP_PADRANGE);
1378 expr = OpSIBLING(expr);
1379 }
1380
1381 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1382 expr, &recompile, NULL);
1383
1384 /* handle bare (possibly after overloading) regex: foo =~ $re */
1385 {
1386 SV *re = pat;
1387 if (SvROK(re))
1388 re = SvRV(re);
1389 if (SvTYPE(re) == SVt_REGEXP) {
1390 if (is_bare_re)
1391 *is_bare_re = TRUE;
1392 SvREFCNT_inc(re);
1393 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1394 "Precompiled pattern%s\n",
1395 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1396
1397 return (REGEXP*)re;
1398 }
1399 }
1400
1401 exp = SvPV_nomg(pat, plen);
1402
1403 if (!eng->op_comp) {
1404 if ((SvUTF8(pat) && IN_BYTES)
1405 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1406 {
1407 /* make a temporary copy; either to convert to bytes,
1408 * or to avoid repeating get-magic / overloaded stringify */
1409 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1410 (IN_BYTES ? 0 : SvUTF8(pat)));
1411 }
1412 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1413 }
1414
1415 /* ignore the utf8ness if the pattern is 0 length */
1416 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1417 RExC_uni_semantics = 0;
1418 RExC_contains_locale = 0;
1419 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1420 RExC_in_script_run = 0;
1421 RExC_study_started = 0;
1422 pRExC_state->runtime_code_qr = NULL;
1423 RExC_frame_head= NULL;
1424 RExC_frame_last= NULL;
1425 RExC_frame_count= 0;
1426 RExC_latest_warn_offset = 0;
1427 RExC_use_BRANCHJ = 0;
1428 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
1429 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
1430 RExC_logical_total_parens = 0;
1431 RExC_total_parens = 0;
1432 RExC_logical_to_parno = NULL;
1433 RExC_parno_to_logical = NULL;
1434 RExC_open_parens = NULL;
1435 RExC_close_parens = NULL;
1436 RExC_paren_names = NULL;
1437 RExC_size = 0;
1438 RExC_seen_d_op = FALSE;
1439 #ifdef DEBUGGING
1440 RExC_paren_name_list = NULL;
1441 #endif
1442
1443 DEBUG_r({
1444 RExC_mysv1= sv_newmortal();
1445 RExC_mysv2= sv_newmortal();
1446 });
1447
1448 DEBUG_COMPILE_r({
1449 SV *dsv= sv_newmortal();
1450 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1451 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
1452 PL_colors[4], PL_colors[5], s);
1453 });
1454
1455 /* we jump here if we have to recompile, e.g., from upgrading the pattern
1456 * to utf8 */
1457
1458 if ((pm_flags & PMf_USE_RE_EVAL)
1459 /* this second condition covers the non-regex literal case,
1460 * i.e. $foo =~ '(?{})'. */
1461 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1462 )
1463 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1464
1465 redo_parse:
1466 /* return old regex if pattern hasn't changed */
1467 /* XXX: note in the below we have to check the flags as well as the
1468 * pattern.
1469 *
1470 * Things get a touch tricky as we have to compare the utf8 flag
1471 * independently from the compile flags. */
1472
1473 if ( old_re
1474 && !recompile
1475 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1476 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1477 && RX_PRELEN(old_re) == plen
1478 && memEQ(RX_PRECOMP(old_re), exp, plen)
1479 && !runtime_code /* with runtime code, always recompile */ )
1480 {
1481 DEBUG_COMPILE_r({
1482 SV *dsv= sv_newmortal();
1483 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1484 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
1485 PL_colors[4], PL_colors[5], s);
1486 });
1487 return old_re;
1488 }
1489
1490 /* Allocate the pattern's SV */
1491 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1492 RExC_rx = ReANY(Rx);
1493 if ( RExC_rx == NULL )
1494 FAIL("Regexp out of space");
1495
1496 rx_flags = orig_rx_flags;
1497
1498 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
1499 && initial_charset == REGEX_DEPENDS_CHARSET)
1500 {
1501
1502 /* Set to use unicode semantics if the pattern is in utf8 and has the
1503 * 'depends' charset specified, as it means unicode when utf8 */
1504 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1505 RExC_uni_semantics = 1;
1506 }
1507
1508 RExC_pm_flags = pm_flags;
1509
1510 if (runtime_code) {
1511 assert(TAINTING_get || !TAINT_get);
1512 if (TAINT_get)
1513 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1514
1515 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1516 /* whoops, we have a non-utf8 pattern, whilst run-time code
1517 * got compiled as utf8. Try again with a utf8 pattern */
1518 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1519 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1520 goto redo_parse;
1521 }
1522 }
1523 assert(!pRExC_state->runtime_code_qr);
1524
1525 RExC_sawback = 0;
1526
1527 RExC_seen = 0;
1528 RExC_maxlen = 0;
1529 RExC_in_lookaround = 0;
1530 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1531 RExC_recode_x_to_native = 0;
1532 RExC_in_multi_char_class = 0;
1533
1534 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1535 RExC_precomp_end = RExC_end = exp + plen;
1536 RExC_nestroot = 0;
1537 RExC_whilem_seen = 0;
1538 RExC_end_op = NULL;
1539 RExC_recurse = NULL;
1540 RExC_study_chunk_recursed = NULL;
1541 RExC_study_chunk_recursed_bytes= 0;
1542 RExC_recurse_count = 0;
1543 RExC_sets_depth = 0;
1544 pRExC_state->code_index = 0;
1545
1546 /* Initialize the string in the compiled pattern. This is so that there is
1547 * something to output if necessary */
1548 set_regex_pv(pRExC_state, Rx);
1549
1550 DEBUG_PARSE_r({
1551 Perl_re_printf( aTHX_
1552 "Starting parse and generation\n");
1553 RExC_lastnum=0;
1554 RExC_lastparse=NULL;
1555 });
1556
1557 /* Allocate space and zero-initialize. Note, the two step process
1558 of zeroing when in debug mode, thus anything assigned has to
1559 happen after that */
1560 if (! RExC_size) {
1561
1562 /* On the first pass of the parse, we guess how big this will be. Then
1563 * we grow in one operation to that amount and then give it back. As
1564 * we go along, we re-allocate what we need.
1565 *
1566 * XXX Currently the guess is essentially that the pattern will be an
1567 * EXACT node with one byte input, one byte output. This is crude, and
1568 * better heuristics are welcome.
1569 *
1570 * On any subsequent passes, we guess what we actually computed in the
1571 * latest earlier pass. Such a pass probably didn't complete so is
1572 * missing stuff. We could improve those guesses by knowing where the
1573 * parse stopped, and use the length so far plus apply the above
1574 * assumption to what's left. */
1575 RExC_size = STR_SZ(RExC_end - RExC_start);
1576 }
1577
1578 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1579 if ( RExC_rxi == NULL )
1580 FAIL("Regexp out of space");
1581
1582 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1583 RXi_SET( RExC_rx, RExC_rxi );
1584
1585 /* We start from 0 (over from 0 in the case this is a reparse. The first
1586 * node parsed will give back any excess memory we have allocated so far).
1587 * */
1588 RExC_size = 0;
1589
1590 /* non-zero initialization begins here */
1591 RExC_rx->engine= eng;
1592 RExC_rx->extflags = rx_flags;
1593 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1594
1595 if (pm_flags & PMf_IS_QR) {
1596 RExC_rxi->code_blocks = pRExC_state->code_blocks;
1597 if (RExC_rxi->code_blocks) {
1598 RExC_rxi->code_blocks->refcnt++;
1599 }
1600 }
1601
1602 RExC_rx->intflags = 0;
1603
1604 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
1605 RExC_parse_set(exp);
1606
1607 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1608 * code makes sure the final byte is an uncounted NUL. But should this
1609 * ever not be the case, lots of things could read beyond the end of the
1610 * buffer: loops like
1611 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1612 * strchr(RExC_parse, "foo");
1613 * etc. So it is worth noting. */
1614 assert(*RExC_end == '\0');
1615
1616 RExC_naughty = 0;
1617 RExC_npar = 1;
1618 RExC_logical_npar = 1;
1619 RExC_parens_buf_size = 0;
1620 RExC_emit_start = RExC_rxi->program;
1621 pRExC_state->code_index = 0;
1622
1623 *((char*) RExC_emit_start) = (char) REG_MAGIC;
1624 RExC_emit = NODE_STEP_REGNODE;
1625
1626 /* Do the parse */
1627 if (reg(pRExC_state, 0, &flags, 1)) {
1628
1629 /* Success!, But we may need to redo the parse knowing how many parens
1630 * there actually are */
1631 if (IN_PARENS_PASS) {
1632 flags |= RESTART_PARSE;
1633 }
1634
1635 /* We have that number in RExC_npar */
1636 RExC_total_parens = RExC_npar;
1637 RExC_logical_total_parens = RExC_logical_npar;
1638 }
1639 else if (! MUST_RESTART(flags)) {
1640 ReREFCNT_dec(Rx);
1641 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1642 }
1643
1644 /* Here, we either have success, or we have to redo the parse for some reason */
1645 if (MUST_RESTART(flags)) {
1646
1647 /* It's possible to write a regexp in ascii that represents Unicode
1648 codepoints outside of the byte range, such as via \x{100}. If we
1649 detect such a sequence we have to convert the entire pattern to utf8
1650 and then recompile, as our sizing calculation will have been based
1651 on 1 byte == 1 character, but we will need to use utf8 to encode
1652 at least some part of the pattern, and therefore must convert the whole
1653 thing.
1654 -- dmq */
1655 if (flags & NEED_UTF8) {
1656
1657 /* We have stored the offset of the final warning output so far.
1658 * That must be adjusted. Any variant characters between the start
1659 * of the pattern and this warning count for 2 bytes in the final,
1660 * so just add them again */
1661 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1662 RExC_latest_warn_offset +=
1663 variant_under_utf8_count((U8 *) exp, (U8 *) exp
1664 + RExC_latest_warn_offset);
1665 }
1666 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1667 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1668 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1669 }
1670 else {
1671 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1672 }
1673
1674 if (ALL_PARENS_COUNTED) {
1675 /* Make enough room for all the known parens, and zero it */
1676 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1677 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1678 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
1679
1680 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1681 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1682 /* we do NOT reinitialize RExC_logical_to_parno and
1683 * RExC_parno_to_logical here. We need their data on the second
1684 * pass */
1685 }
1686 else { /* Parse did not complete. Reinitialize the parentheses
1687 structures */
1688 RExC_total_parens = 0;
1689 if (RExC_open_parens) {
1690 Safefree(RExC_open_parens);
1691 RExC_open_parens = NULL;
1692 }
1693 if (RExC_close_parens) {
1694 Safefree(RExC_close_parens);
1695 RExC_close_parens = NULL;
1696 }
1697 if (RExC_logical_to_parno) {
1698 Safefree(RExC_logical_to_parno);
1699 RExC_logical_to_parno = NULL;
1700 }
1701 if (RExC_parno_to_logical) {
1702 Safefree(RExC_parno_to_logical);
1703 RExC_parno_to_logical = NULL;
1704 }
1705 }
1706
1707 /* Clean up what we did in this parse */
1708 SvREFCNT_dec_NN(RExC_rx_sv);
1709
1710 goto redo_parse;
1711 }
1712
1713 /* Here, we have successfully parsed and generated the pattern's program
1714 * for the regex engine. We are ready to finish things up and look for
1715 * optimizations. */
1716
1717 /* Update the string to compile, with correct modifiers, etc */
1718 set_regex_pv(pRExC_state, Rx);
1719
1720 RExC_rx->nparens = RExC_total_parens - 1;
1721 RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1722
1723 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1724 if (RExC_whilem_seen > 15)
1725 RExC_whilem_seen = 15;
1726
1727 DEBUG_PARSE_r({
1728 Perl_re_printf( aTHX_
1729 "Required size %" IVdf " nodes\n", (IV)RExC_size);
1730 RExC_lastnum=0;
1731 RExC_lastparse=NULL;
1732 });
1733
1734 SetProgLen(RExC_rxi,RExC_size);
1735
1736 DEBUG_DUMP_PRE_OPTIMIZE_r({
1737 SV * const sv = sv_newmortal();
1738 RXi_GET_DECL(RExC_rx, ri);
1739 DEBUG_RExC_seen();
1740 Perl_re_printf( aTHX_ "Program before optimization:\n");
1741
1742 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1743 sv, 0, 0);
1744 });
1745
1746 DEBUG_OPTIMISE_r(
1747 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
1748 );
1749
1750 /* XXXX To minimize changes to RE engine we always allocate
1751 3-units-long substrs field. */
1752 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1753 if (RExC_recurse_count) {
1754 Newx(RExC_recurse, RExC_recurse_count, regnode *);
1755 SAVEFREEPV(RExC_recurse);
1756 }
1757
1758 if (RExC_seen & REG_RECURSE_SEEN) {
1759 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1760 * So its 1 if there are no parens. */
1761 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1762 ((RExC_total_parens & 0x07) != 0);
1763 Newx(RExC_study_chunk_recursed,
1764 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1765 SAVEFREEPV(RExC_study_chunk_recursed);
1766 }
1767
1768 reStudy:
1769 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1770 DEBUG_r(
1771 RExC_study_chunk_recursed_count= 0;
1772 );
1773 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1774 if (RExC_study_chunk_recursed) {
1775 Zero(RExC_study_chunk_recursed,
1776 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1777 }
1778
1779
1780 #ifdef TRIE_STUDY_OPT
1781 /* search for "restudy" in this file for a detailed explanation */
1782 if (!restudied) {
1783 StructCopy(&zero_scan_data, &data, scan_data_t);
1784 copyRExC_state = RExC_state;
1785 } else {
1786 U32 seen=RExC_seen;
1787 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1788
1789 RExC_state = copyRExC_state;
1790 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1791 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1792 else
1793 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1794 StructCopy(&zero_scan_data, &data, scan_data_t);
1795 }
1796 #else
1797 StructCopy(&zero_scan_data, &data, scan_data_t);
1798 #endif
1799
1800 /* Dig out information for optimizations. */
1801 RExC_rx->extflags = RExC_flags; /* was pm_op */
1802 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1803
1804 if (UTF)
1805 SvUTF8_on(Rx); /* Unicode in it? */
1806 RExC_rxi->regstclass = NULL;
1807 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
1808 RExC_rx->intflags |= PREGf_NAUGHTY;
1809 scan = RExC_rxi->program + 1; /* First BRANCH. */
1810
1811 /* testing for BRANCH here tells us whether there is "must appear"
1812 data in the pattern. If there is then we can use it for optimisations */
1813 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
1814 */
1815 SSize_t fake_deltap;
1816 STRLEN longest_length[2];
1817 regnode_ssc ch_class; /* pointed to by data */
1818 int stclass_flag;
1819 SSize_t last_close = 0; /* pointed to by data */
1820 regnode *first= scan;
1821 regnode *first_next= regnext(first);
1822 regnode *last_close_op= NULL;
1823 int i;
1824
1825 /*
1826 * Skip introductions and multiplicators >= 1
1827 * so that we can extract the 'meat' of the pattern that must
1828 * match in the large if() sequence following.
1829 * NOTE that EXACT is NOT covered here, as it is normally
1830 * picked up by the optimiser separately.
1831 *
1832 * This is unfortunate as the optimiser isnt handling lookahead
1833 * properly currently.
1834 *
1835 */
1836 while (1)
1837 {
1838 if (OP(first) == OPEN)
1839 sawopen = 1;
1840 else
1841 if (OP(first) == IFMATCH && !FLAGS(first))
1842 /* for now we can't handle lookbehind IFMATCH */
1843 sawlookahead = 1;
1844 else
1845 if (OP(first) == PLUS)
1846 sawplus = 1;
1847 else
1848 if (OP(first) == MINMOD)
1849 sawminmod = 1;
1850 else
1851 if (!(
1852 /* An OR of *one* alternative - should not happen now. */
1853 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1854 /* An {n,m} with n>0 */
1855 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1856 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1857 )){
1858 break;
1859 }
1860
1861 first = REGNODE_AFTER(first);
1862 first_next= regnext(first);
1863 }
1864
1865 /* Starting-point info. */
1866 again:
1867 DEBUG_PEEP("first:", first, 0, 0);
1868 /* Ignore EXACT as we deal with it later. */
1869 if (REGNODE_TYPE(OP(first)) == EXACT) {
1870 if (! isEXACTFish(OP(first))) {
1871 NOOP; /* Empty, get anchored substr later. */
1872 }
1873 else
1874 RExC_rxi->regstclass = first;
1875 }
1876 #ifdef TRIE_STCLASS
1877 else if (REGNODE_TYPE(OP(first)) == TRIE &&
1878 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1879 {
1880 /* this can happen only on restudy
1881 * Search for "restudy" in this file to find
1882 * a comment with details. */
1883 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1884 }
1885 #endif
1886 else if (REGNODE_SIMPLE(OP(first)))
1887 RExC_rxi->regstclass = first;
1888 else if (REGNODE_TYPE(OP(first)) == BOUND ||
1889 REGNODE_TYPE(OP(first)) == NBOUND)
1890 RExC_rxi->regstclass = first;
1891 else if (REGNODE_TYPE(OP(first)) == BOL) {
1892 RExC_rx->intflags |= (OP(first) == MBOL
1893 ? PREGf_ANCH_MBOL
1894 : PREGf_ANCH_SBOL);
1895 first = REGNODE_AFTER(first);
1896 goto again;
1897 }
1898 else if (OP(first) == GPOS) {
1899 RExC_rx->intflags |= PREGf_ANCH_GPOS;
1900 first = REGNODE_AFTER_type(first,tregnode_GPOS);
1901 goto again;
1902 }
1903 else if ((!sawopen || !RExC_sawback) &&
1904 !sawlookahead &&
1905 (OP(first) == STAR &&
1906 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
1907 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
1908 {
1909 /* turn .* into ^.* with an implied $*=1 */
1910 const int type =
1911 (OP(REGNODE_AFTER(first)) == REG_ANY)
1912 ? PREGf_ANCH_MBOL
1913 : PREGf_ANCH_SBOL;
1914 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
1915 first = REGNODE_AFTER(first);
1916 goto again;
1917 }
1918 if (sawplus && !sawminmod && !sawlookahead
1919 && (!sawopen || !RExC_sawback)
1920 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
1921 /* x+ must match at the 1st pos of run of x's */
1922 RExC_rx->intflags |= PREGf_SKIP;
1923
1924 /* Scan is after the zeroth branch, first is atomic matcher. */
1925 #ifdef TRIE_STUDY_OPT
1926 /* search for "restudy" in this file for a detailed explanation */
1927 DEBUG_PARSE_r(
1928 if (!restudied)
1929 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1930 (IV)(first - scan + 1))
1931 );
1932 #else
1933 DEBUG_PARSE_r(
1934 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1935 (IV)(first - scan + 1))
1936 );
1937 #endif
1938
1939
1940 /*
1941 * If there's something expensive in the r.e., find the
1942 * longest literal string that must appear and make it the
1943 * regmust. Resolve ties in favor of later strings, since
1944 * the regstart check works with the beginning of the r.e.
1945 * and avoiding duplication strengthens checking. Not a
1946 * strong reason, but sufficient in the absence of others.
1947 * [Now we resolve ties in favor of the earlier string if
1948 * it happens that c_offset_min has been invalidated, since the
1949 * earlier string may buy us something the later one won't.]
1950 */
1951
1952 data.substrs[0].str = newSVpvs("");
1953 data.substrs[1].str = newSVpvs("");
1954 data.last_found = newSVpvs("");
1955 data.cur_is_floating = 0; /* initially any found substring is fixed */
1956 ENTER_with_name("study_chunk");
1957 SAVEFREESV(data.substrs[0].str);
1958 SAVEFREESV(data.substrs[1].str);
1959 SAVEFREESV(data.last_found);
1960 first = scan;
1961 if (!RExC_rxi->regstclass) {
1962 ssc_init(pRExC_state, &ch_class);
1963 data.start_class = &ch_class;
1964 stclass_flag = SCF_DO_STCLASS_AND;
1965 } else /* XXXX Check for BOUND? */
1966 stclass_flag = 0;
1967 data.last_closep = &last_close;
1968 data.last_close_opp = &last_close_op;
1969
1970 DEBUG_RExC_seen();
1971 /*
1972 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
1973 * (NO top level branches)
1974 */
1975 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
1976 scan + RExC_size, /* Up to end */
1977 &data, -1, 0, NULL,
1978 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
1979 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
1980 0, TRUE);
1981 /* search for "restudy" in this file for a detailed explanation
1982 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
1983
1984
1985 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
1986
1987
1988 if ( RExC_total_parens == 1 && !data.cur_is_floating
1989 && data.last_start_min == 0 && data.last_end > 0
1990 && !RExC_seen_zerolen
1991 && !(RExC_seen & REG_VERBARG_SEEN)
1992 && !(RExC_seen & REG_GPOS_SEEN)
1993 ){
1994 RExC_rx->extflags |= RXf_CHECK_ALL;
1995 }
1996 scan_commit(pRExC_state, &data,&minlen, 0);
1997
1998
1999 /* XXX this is done in reverse order because that's the way the
2000 * code was before it was parameterised. Don't know whether it
2001 * actually needs doing in reverse order. DAPM */
2002 for (i = 1; i >= 0; i--) {
2003 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2004
2005 if ( !( i
2006 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
2007 && data.substrs[0].min_offset
2008 == data.substrs[1].min_offset
2009 && SvCUR(data.substrs[0].str)
2010 == SvCUR(data.substrs[1].str)
2011 )
2012 && S_setup_longest (aTHX_ pRExC_state,
2013 &(RExC_rx->substrs->data[i]),
2014 &(data.substrs[i]),
2015 longest_length[i]))
2016 {
2017 RExC_rx->substrs->data[i].min_offset =
2018 data.substrs[i].min_offset - data.substrs[i].lookbehind;
2019
2020 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2021 /* Don't offset infinity */
2022 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2023 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2024 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2025 }
2026 else {
2027 RExC_rx->substrs->data[i].substr = NULL;
2028 RExC_rx->substrs->data[i].utf8_substr = NULL;
2029 longest_length[i] = 0;
2030 }
2031 }
2032
2033 LEAVE_with_name("study_chunk");
2034
2035 if (RExC_rxi->regstclass
2036 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2037 RExC_rxi->regstclass = NULL;
2038
2039 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2040 || RExC_rx->substrs->data[0].min_offset)
2041 && stclass_flag
2042 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2043 && is_ssc_worth_it(pRExC_state, data.start_class))
2044 {
2045 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2046
2047 ssc_finalize(pRExC_state, data.start_class);
2048
2049 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2050 StructCopy(data.start_class,
2051 (regnode_ssc*)RExC_rxi->data->data[n],
2052 regnode_ssc);
2053 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2054 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2055 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2056 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2057 Perl_re_printf( aTHX_
2058 "synthetic stclass \"%s\".\n",
2059 SvPVX_const(sv));});
2060 data.start_class = NULL;
2061 }
2062
2063 /* A temporary algorithm prefers floated substr to fixed one of
2064 * same length to dig more info. */
2065 i = (longest_length[0] <= longest_length[1]);
2066 RExC_rx->substrs->check_ix = i;
2067 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
2068 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
2069 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
2070 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2071 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2072 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2073 RExC_rx->intflags |= PREGf_NOSCAN;
2074
2075 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2076 RExC_rx->extflags |= RXf_USE_INTUIT;
2077 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2078 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2079 }
2080
2081 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2082 if ( (STRLEN)minlen < longest_length[1] )
2083 minlen= longest_length[1];
2084 if ( (STRLEN)minlen < longest_length[0] )
2085 minlen= longest_length[0];
2086 */
2087 }
2088 else {
2089 /* Several toplevels. Best we can is to set minlen. */
2090 SSize_t fake_deltap;
2091 regnode_ssc ch_class;
2092 SSize_t last_close = 0;
2093 regnode *last_close_op = NULL;
2094
2095 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
2096
2097 scan = RExC_rxi->program + 1;
2098 ssc_init(pRExC_state, &ch_class);
2099 data.start_class = &ch_class;
2100 data.last_closep = &last_close;
2101 data.last_close_opp = &last_close_op;
2102
2103 DEBUG_RExC_seen();
2104 /*
2105 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2106 * (patterns WITH top level branches)
2107 */
2108 minlen = study_chunk(pRExC_state,
2109 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2110 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2111 ? SCF_TRIE_DOING_RESTUDY
2112 : 0),
2113 0, TRUE);
2114 /* search for "restudy" in this file for a detailed explanation
2115 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2116
2117 CHECK_RESTUDY_GOTO_butfirst(NOOP);
2118
2119 RExC_rx->check_substr = NULL;
2120 RExC_rx->check_utf8 = NULL;
2121 RExC_rx->substrs->data[0].substr = NULL;
2122 RExC_rx->substrs->data[0].utf8_substr = NULL;
2123 RExC_rx->substrs->data[1].substr = NULL;
2124 RExC_rx->substrs->data[1].utf8_substr = NULL;
2125
2126 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2127 && is_ssc_worth_it(pRExC_state, data.start_class))
2128 {
2129 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2130
2131 ssc_finalize(pRExC_state, data.start_class);
2132
2133 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2134 StructCopy(data.start_class,
2135 (regnode_ssc*)RExC_rxi->data->data[n],
2136 regnode_ssc);
2137 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2138 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2139 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2140 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2141 Perl_re_printf( aTHX_
2142 "synthetic stclass \"%s\".\n",
2143 SvPVX_const(sv));});
2144 data.start_class = NULL;
2145 }
2146 }
2147
2148 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2149 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2150 RExC_rx->maxlen = REG_INFTY;
2151 }
2152 else {
2153 RExC_rx->maxlen = RExC_maxlen;
2154 }
2155
2156 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2157 the "real" pattern. */
2158 DEBUG_OPTIMISE_r({
2159 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2160 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2161 });
2162 RExC_rx->minlenret = minlen;
2163 if (RExC_rx->minlen < minlen)
2164 RExC_rx->minlen = minlen;
2165
2166 if (RExC_seen & REG_RECURSE_SEEN ) {
2167 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2168 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2169 }
2170 if (RExC_seen & REG_GPOS_SEEN)
2171 RExC_rx->intflags |= PREGf_GPOS_SEEN;
2172
2173 if (RExC_seen & REG_PESSIMIZE_SEEN)
2174 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2175
2176 if (RExC_seen & REG_LOOKBEHIND_SEEN)
2177 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2178 lookbehind */
2179 if (pRExC_state->code_blocks)
2180 RExC_rx->extflags |= RXf_EVAL_SEEN;
2181
2182 if (RExC_seen & REG_VERBARG_SEEN) {
2183 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2184 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2185 }
2186
2187 if (RExC_seen & REG_CUTGROUP_SEEN)
2188 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2189
2190 if (pm_flags & PMf_USE_RE_EVAL)
2191 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2192
2193 if (RExC_paren_names)
2194 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2195 else
2196 RXp_PAREN_NAMES(RExC_rx) = NULL;
2197
2198 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2199 * so it can be used in pp.c */
2200 if (RExC_rx->intflags & PREGf_ANCH)
2201 RExC_rx->extflags |= RXf_IS_ANCHORED;
2202
2203
2204 {
2205 /* this is used to identify "special" patterns that might result
2206 * in Perl NOT calling the regex engine and instead doing the match "itself",
2207 * particularly special cases in split//. By having the regex compiler
2208 * do this pattern matching at a regop level (instead of by inspecting the pattern)
2209 * we avoid weird issues with equivalent patterns resulting in different behavior,
2210 * AND we allow non Perl engines to get the same optimizations by the setting the
2211 * flags appropriately - Yves */
2212 regnode *first = RExC_rxi->program + 1;
2213 U8 fop = OP(first);
2214 regnode *next = NULL;
2215 U8 nop = 0;
2216 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2217 next = REGNODE_AFTER(first);
2218 nop = OP(next);
2219 }
2220 /* It's safe to read through *next only if OP(first) is a regop of
2221 * the right type (not EXACT, for example).
2222 */
2223 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2224 RExC_rx->extflags |= RXf_NULL;
2225 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2226 /* when fop is SBOL first->flags will be true only when it was
2227 * produced by parsing /\A/, and not when parsing /^/. This is
2228 * very important for the split code as there we want to
2229 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2230 * See rt #122761 for more details. -- Yves */
2231 RExC_rx->extflags |= RXf_START_ONLY;
2232 else if (fop == PLUS
2233 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2234 && OP(regnext(first)) == END)
2235 RExC_rx->extflags |= RXf_WHITE;
2236 else if ( RExC_rx->extflags & RXf_SPLIT
2237 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2238 && STR_LEN(first) == 1
2239 && *(STRING(first)) == ' '
2240 && OP(regnext(first)) == END )
2241 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2242
2243 }
2244
2245 if (RExC_contains_locale) {
2246 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2247 }
2248
2249 #ifdef DEBUGGING
2250 if (RExC_paren_names) {
2251 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2252 RExC_rxi->data->data[RExC_rxi->name_list_idx]
2253 = (void*)SvREFCNT_inc(RExC_paren_name_list);
2254 } else
2255 #endif
2256 RExC_rxi->name_list_idx = 0;
2257
2258 while ( RExC_recurse_count > 0 ) {
2259 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2260 /*
2261 * This data structure is set up in study_chunk() and is used
2262 * to calculate the distance between a GOSUB regopcode and
2263 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2264 * it refers to.
2265 *
2266 * If for some reason someone writes code that optimises
2267 * away a GOSUB opcode then the assert should be changed to
2268 * an if(scan) to guard the ARG2i_SET() - Yves
2269 *
2270 */
2271 assert(scan && OP(scan) == GOSUB);
2272 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2273 }
2274 if (RExC_logical_total_parens != RExC_total_parens) {
2275 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2276 /* we rebuild this below */
2277 Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2278 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2279 int logical_parno= RExC_parno_to_logical[parno];
2280 assert(logical_parno);
2281 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2282 RExC_logical_to_parno[logical_parno] = parno;
2283 }
2284 RExC_rx->logical_to_parno = RExC_logical_to_parno;
2285 RExC_rx->parno_to_logical = RExC_parno_to_logical;
2286 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2287 RExC_logical_to_parno = NULL;
2288 RExC_parno_to_logical = NULL;
2289 RExC_parno_to_logical_next = NULL;
2290 } else {
2291 RExC_rx->logical_to_parno = NULL;
2292 RExC_rx->parno_to_logical = NULL;
2293 RExC_rx->parno_to_logical_next = NULL;
2294 }
2295
2296 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2297 /* assume we don't need to swap parens around before we match */
2298 DEBUG_TEST_r({
2299 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2300 (unsigned long)RExC_study_chunk_recursed_count);
2301 });
2302 DEBUG_DUMP_r({
2303 DEBUG_RExC_seen();
2304 Perl_re_printf( aTHX_ "Final program:\n");
2305 regdump(RExC_rx);
2306 });
2307
2308 if (RExC_open_parens) {
2309 Safefree(RExC_open_parens);
2310 RExC_open_parens = NULL;
2311 }
2312 if (RExC_close_parens) {
2313 Safefree(RExC_close_parens);
2314 RExC_close_parens = NULL;
2315 }
2316 if (RExC_logical_to_parno) {
2317 Safefree(RExC_logical_to_parno);
2318 RExC_logical_to_parno = NULL;
2319 }
2320 if (RExC_parno_to_logical) {
2321 Safefree(RExC_parno_to_logical);
2322 RExC_parno_to_logical = NULL;
2323 }
2324
2325 #ifdef USE_ITHREADS
2326 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2327 * by setting the regexp SV to readonly-only instead. If the
2328 * pattern's been recompiled, the USEDness should remain. */
2329 if (old_re && SvREADONLY(old_re))
2330 SvREADONLY_on(Rx);
2331 #endif
2332 return Rx;
2333 }
2334
2335
2336
2337 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)2338 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2339 {
2340 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2341 PERL_UNUSED_ARG(rx);
2342 if (0)
2343 return NULL;
2344 else
2345 return newSVpvs("Regexp");
2346 }
2347
2348 /* Scans the name of a named buffer from the pattern.
2349 * If flags is REG_RSN_RETURN_NULL returns null.
2350 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2351 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2352 * to the parsed name as looked up in the RExC_paren_names hash.
2353 * If there is an error throws a vFAIL().. type exception.
2354 */
2355
2356 #define REG_RSN_RETURN_NULL 0
2357 #define REG_RSN_RETURN_NAME 1
2358 #define REG_RSN_RETURN_DATA 2
2359
2360 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)2361 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2362 {
2363 char *name_start = RExC_parse;
2364 SV* sv_name;
2365
2366 PERL_ARGS_ASSERT_REG_SCAN_NAME;
2367
2368 assert (RExC_parse <= RExC_end);
2369 if (RExC_parse == RExC_end) NOOP;
2370 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2371 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
2372 * using do...while */
2373 if (UTF)
2374 do {
2375 RExC_parse_inc_utf8();
2376 } while ( RExC_parse < RExC_end
2377 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2378 else
2379 do {
2380 RExC_parse_inc_by(1);
2381 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2382 } else {
2383 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2384 character */
2385 vFAIL("Group name must start with a non-digit word character");
2386 }
2387 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2388 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2389 if ( flags == REG_RSN_RETURN_NAME)
2390 return sv_name;
2391 else if (flags==REG_RSN_RETURN_DATA) {
2392 HE *he_str = NULL;
2393 SV *sv_dat = NULL;
2394 if ( ! sv_name ) /* should not happen*/
2395 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2396 if (RExC_paren_names)
2397 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2398 if ( he_str )
2399 sv_dat = HeVAL(he_str);
2400 if ( ! sv_dat ) { /* Didn't find group */
2401
2402 /* It might be a forward reference; we can't fail until we
2403 * know, by completing the parse to get all the groups, and
2404 * then reparsing */
2405 if (ALL_PARENS_COUNTED) {
2406 vFAIL("Reference to nonexistent named group");
2407 }
2408 else {
2409 REQUIRE_PARENS_PASS;
2410 }
2411 }
2412 return sv_dat;
2413 }
2414
2415 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2416 (unsigned long) flags);
2417 }
2418
2419 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
2420 if (RExC_lastparse!=RExC_parse) { \
2421 Perl_re_printf( aTHX_ "%s", \
2422 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
2423 RExC_end - RExC_parse, 16, \
2424 "", "", \
2425 PERL_PV_ESCAPE_UNI_DETECT | \
2426 PERL_PV_PRETTY_ELLIPSES | \
2427 PERL_PV_PRETTY_LTGT | \
2428 PERL_PV_ESCAPE_RE | \
2429 PERL_PV_PRETTY_EXACTSIZE \
2430 ) \
2431 ); \
2432 } else \
2433 Perl_re_printf( aTHX_ "%16s",""); \
2434 \
2435 if (RExC_lastnum!=RExC_emit) \
2436 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
2437 else \
2438 Perl_re_printf( aTHX_ "|%4s",""); \
2439 Perl_re_printf( aTHX_ "|%*s%-4s", \
2440 (int)((depth*2)), "", \
2441 (funcname) \
2442 ); \
2443 RExC_lastnum=RExC_emit; \
2444 RExC_lastparse=RExC_parse; \
2445 })
2446
2447
2448
2449 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
2450 DEBUG_PARSE_MSG((funcname)); \
2451 Perl_re_printf( aTHX_ "%4s","\n"); \
2452 })
2453 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
2454 DEBUG_PARSE_MSG((funcname)); \
2455 Perl_re_printf( aTHX_ fmt "\n",args); \
2456 })
2457
2458
2459 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)2460 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2461 {
2462 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2463 * constructs, and updates RExC_flags with them. On input, RExC_parse
2464 * should point to the first flag; it is updated on output to point to the
2465 * final ')' or ':'. There needs to be at least one flag, or this will
2466 * abort */
2467
2468 /* for (?g), (?gc), and (?o) warnings; warning
2469 about (?c) will warn about (?g) -- japhy */
2470
2471 #define WASTED_O 0x01
2472 #define WASTED_G 0x02
2473 #define WASTED_C 0x04
2474 #define WASTED_GC (WASTED_G|WASTED_C)
2475 I32 wastedflags = 0x00;
2476 U32 posflags = 0, negflags = 0;
2477 U32 *flagsp = &posflags;
2478 char has_charset_modifier = '\0';
2479 regex_charset cs;
2480 bool has_use_defaults = FALSE;
2481 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2482 int x_mod_count = 0;
2483
2484 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2485
2486 /* '^' as an initial flag sets certain defaults */
2487 if (UCHARAT(RExC_parse) == '^') {
2488 RExC_parse_inc_by(1);
2489 has_use_defaults = TRUE;
2490 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2491 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2492 ? REGEX_UNICODE_CHARSET
2493 : REGEX_DEPENDS_CHARSET;
2494 set_regex_charset(&RExC_flags, cs);
2495 }
2496 else {
2497 cs = get_regex_charset(RExC_flags);
2498 if ( cs == REGEX_DEPENDS_CHARSET
2499 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2500 {
2501 cs = REGEX_UNICODE_CHARSET;
2502 }
2503 }
2504
2505 while (RExC_parse < RExC_end) {
2506 /* && memCHRs("iogcmsx", *RExC_parse) */
2507 /* (?g), (?gc) and (?o) are useless here
2508 and must be globally applied -- japhy */
2509 if ((RExC_pm_flags & PMf_WILDCARD)) {
2510 if (flagsp == & negflags) {
2511 if (*RExC_parse == 'm') {
2512 RExC_parse_inc_by(1);
2513 /* diag_listed_as: Use of %s is not allowed in Unicode
2514 property wildcard subpatterns in regex; marked by <--
2515 HERE in m/%s/ */
2516 vFAIL("Use of modifier '-m' is not allowed in Unicode"
2517 " property wildcard subpatterns");
2518 }
2519 }
2520 else {
2521 if (*RExC_parse == 's') {
2522 goto modifier_illegal_in_wildcard;
2523 }
2524 }
2525 }
2526
2527 switch (*RExC_parse) {
2528
2529 /* Code for the imsxn flags */
2530 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2531
2532 case LOCALE_PAT_MOD:
2533 if (has_charset_modifier) {
2534 goto excess_modifier;
2535 }
2536 else if (flagsp == &negflags) {
2537 goto neg_modifier;
2538 }
2539 cs = REGEX_LOCALE_CHARSET;
2540 has_charset_modifier = LOCALE_PAT_MOD;
2541 break;
2542 case UNICODE_PAT_MOD:
2543 if (has_charset_modifier) {
2544 goto excess_modifier;
2545 }
2546 else if (flagsp == &negflags) {
2547 goto neg_modifier;
2548 }
2549 cs = REGEX_UNICODE_CHARSET;
2550 has_charset_modifier = UNICODE_PAT_MOD;
2551 break;
2552 case ASCII_RESTRICT_PAT_MOD:
2553 if (flagsp == &negflags) {
2554 goto neg_modifier;
2555 }
2556 if (has_charset_modifier) {
2557 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2558 goto excess_modifier;
2559 }
2560 /* Doubled modifier implies more restricted */
2561 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2562 }
2563 else {
2564 cs = REGEX_ASCII_RESTRICTED_CHARSET;
2565 }
2566 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2567 break;
2568 case DEPENDS_PAT_MOD:
2569 if (has_use_defaults) {
2570 goto fail_modifiers;
2571 }
2572 else if (flagsp == &negflags) {
2573 goto neg_modifier;
2574 }
2575 else if (has_charset_modifier) {
2576 goto excess_modifier;
2577 }
2578
2579 /* The dual charset means unicode semantics if the
2580 * pattern (or target, not known until runtime) are
2581 * utf8, or something in the pattern indicates unicode
2582 * semantics */
2583 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2584 ? REGEX_UNICODE_CHARSET
2585 : REGEX_DEPENDS_CHARSET;
2586 has_charset_modifier = DEPENDS_PAT_MOD;
2587 break;
2588 excess_modifier:
2589 RExC_parse_inc_by(1);
2590 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2591 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2592 }
2593 else if (has_charset_modifier == *(RExC_parse - 1)) {
2594 vFAIL2("Regexp modifier \"%c\" may not appear twice",
2595 *(RExC_parse - 1));
2596 }
2597 else {
2598 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2599 }
2600 NOT_REACHED; /*NOTREACHED*/
2601 neg_modifier:
2602 RExC_parse_inc_by(1);
2603 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2604 *(RExC_parse - 1));
2605 NOT_REACHED; /*NOTREACHED*/
2606 case GLOBAL_PAT_MOD: /* 'g' */
2607 if (RExC_pm_flags & PMf_WILDCARD) {
2608 goto modifier_illegal_in_wildcard;
2609 }
2610 /*FALLTHROUGH*/
2611 case ONCE_PAT_MOD: /* 'o' */
2612 if (ckWARN(WARN_REGEXP)) {
2613 const I32 wflagbit = *RExC_parse == 'o'
2614 ? WASTED_O
2615 : WASTED_G;
2616 if (! (wastedflags & wflagbit) ) {
2617 wastedflags |= wflagbit;
2618 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2619 vWARN5(
2620 RExC_parse + 1,
2621 "Useless (%s%c) - %suse /%c modifier",
2622 flagsp == &negflags ? "?-" : "?",
2623 *RExC_parse,
2624 flagsp == &negflags ? "don't " : "",
2625 *RExC_parse
2626 );
2627 }
2628 }
2629 break;
2630
2631 case CONTINUE_PAT_MOD: /* 'c' */
2632 if (RExC_pm_flags & PMf_WILDCARD) {
2633 goto modifier_illegal_in_wildcard;
2634 }
2635 if (ckWARN(WARN_REGEXP)) {
2636 if (! (wastedflags & WASTED_C) ) {
2637 wastedflags |= WASTED_GC;
2638 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2639 vWARN3(
2640 RExC_parse + 1,
2641 "Useless (%sc) - %suse /gc modifier",
2642 flagsp == &negflags ? "?-" : "?",
2643 flagsp == &negflags ? "don't " : ""
2644 );
2645 }
2646 }
2647 break;
2648 case KEEPCOPY_PAT_MOD: /* 'p' */
2649 if (RExC_pm_flags & PMf_WILDCARD) {
2650 goto modifier_illegal_in_wildcard;
2651 }
2652 if (flagsp == &negflags) {
2653 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2654 } else {
2655 *flagsp |= RXf_PMf_KEEPCOPY;
2656 }
2657 break;
2658 case '-':
2659 /* A flag is a default iff it is following a minus, so
2660 * if there is a minus, it means will be trying to
2661 * re-specify a default which is an error */
2662 if (has_use_defaults || flagsp == &negflags) {
2663 goto fail_modifiers;
2664 }
2665 flagsp = &negflags;
2666 wastedflags = 0; /* reset so (?g-c) warns twice */
2667 x_mod_count = 0;
2668 break;
2669 case ':':
2670 case ')':
2671
2672 if ( (RExC_pm_flags & PMf_WILDCARD)
2673 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2674 {
2675 RExC_parse_inc_by(1);
2676 /* diag_listed_as: Use of %s is not allowed in Unicode
2677 property wildcard subpatterns in regex; marked by <--
2678 HERE in m/%s/ */
2679 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2680 " property wildcard subpatterns",
2681 has_charset_modifier);
2682 }
2683
2684 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2685 negflags |= RXf_PMf_EXTENDED_MORE;
2686 }
2687 RExC_flags |= posflags;
2688
2689 if (negflags & RXf_PMf_EXTENDED) {
2690 negflags |= RXf_PMf_EXTENDED_MORE;
2691 }
2692 RExC_flags &= ~negflags;
2693 set_regex_charset(&RExC_flags, cs);
2694
2695 return;
2696 default:
2697 fail_modifiers:
2698 RExC_parse_inc_if_char();
2699 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2700 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2701 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2702 NOT_REACHED; /*NOTREACHED*/
2703 }
2704
2705 RExC_parse_inc();
2706 }
2707
2708 vFAIL("Sequence (?... not terminated");
2709
2710 modifier_illegal_in_wildcard:
2711 RExC_parse_inc_by(1);
2712 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2713 subpatterns in regex; marked by <-- HERE in m/%s/ */
2714 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2715 " subpatterns", *(RExC_parse - 1));
2716 }
2717
2718 /*
2719 - reg - regular expression, i.e. main body or parenthesized thing
2720 *
2721 * Caller must absorb opening parenthesis.
2722 *
2723 * Combining parenthesis handling with the base level of regular expression
2724 * is a trifle forced, but the need to tie the tails of the branches to what
2725 * follows makes it hard to avoid.
2726 */
2727
2728 STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * backref_parse_start,char ch)2729 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2730 I32 *flagp,
2731 char * backref_parse_start,
2732 char ch
2733 )
2734 {
2735 regnode_offset ret;
2736 char* name_start = RExC_parse;
2737 U32 num = 0;
2738 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2739 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2740
2741 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2742
2743 if (RExC_parse != name_start && ch == '}') {
2744 while (isBLANK(*RExC_parse)) {
2745 RExC_parse_inc_by(1);
2746 }
2747 }
2748 if (RExC_parse == name_start || *RExC_parse != ch) {
2749 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2750 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2751 }
2752
2753 if (sv_dat) {
2754 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2755 RExC_rxi->data->data[num]=(void*)sv_dat;
2756 SvREFCNT_inc_simple_void_NN(sv_dat);
2757 }
2758 RExC_sawback = 1;
2759 ret = reg2node(pRExC_state,
2760 ((! FOLD)
2761 ? REFN
2762 : (ASCII_FOLD_RESTRICTED)
2763 ? REFFAN
2764 : (AT_LEAST_UNI_SEMANTICS)
2765 ? REFFUN
2766 : (LOC)
2767 ? REFFLN
2768 : REFFN),
2769 num, RExC_nestroot);
2770 if (RExC_nestroot && num >= (U32)RExC_nestroot)
2771 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2772 *flagp |= HASWIDTH;
2773
2774 nextchar(pRExC_state);
2775 return ret;
2776 }
2777
2778 /* reg_la_NOTHING()
2779 *
2780 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2781 * NOTHING regop when the construct is empty.
2782 *
2783 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2784 *
2785 * Checks for unterminated constructs and throws a "not terminated" error
2786 * with the appropriate type if necessary
2787 *
2788 * Assuming it does not throw an exception increments RExC_seen_zerolen.
2789 *
2790 * If the construct is empty generates a NOTHING op and returns its
2791 * regnode_offset, which the caller would then return to its caller.
2792 *
2793 * If the construct is not empty increments RExC_in_lookaround, and turns
2794 * on any flags provided in RExC_seen, and then returns 0 to signify
2795 * that parsing should continue.
2796 *
2797 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2798 * any use of it would have had to be broken onto multiple lines, hence
2799 * the abbreviation.
2800 */
2801 STATIC regnode_offset
S_reg_la_NOTHING(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2802 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2803 const char *type)
2804 {
2805
2806 PERL_ARGS_ASSERT_REG_LA_NOTHING;
2807
2808 /* false below so we do not force /x */
2809 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2810
2811 if (RExC_parse >= RExC_end)
2812 vFAIL2("Sequence (%s... not terminated", type);
2813
2814 /* Always increment as NOTHING regops are zerolen */
2815 RExC_seen_zerolen++;
2816
2817 if (*RExC_parse == ')') {
2818 regnode_offset ret= reg_node(pRExC_state, NOTHING);
2819 nextchar(pRExC_state);
2820 return ret;
2821 }
2822
2823 RExC_seen |= flags;
2824 RExC_in_lookaround++;
2825 return 0; /* keep parsing! */
2826 }
2827
2828 /* reg_la_OPFAIL()
2829 *
2830 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2831 * OPFAIL regop when the construct is empty.
2832 *
2833 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2834 *
2835 * Checks for unterminated constructs and throws a "not terminated" error
2836 * if necessary.
2837 *
2838 * If the construct is empty generates an OPFAIL op and returns its
2839 * regnode_offset which the caller should then return to its caller.
2840 *
2841 * If the construct is not empty increments RExC_in_lookaround, and also
2842 * increments RExC_seen_zerolen, and turns on the flags provided in
2843 * RExC_seen, and then returns 0 to signify that parsing should continue.
2844 *
2845 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2846 * any use of it would have had to be broken onto multiple lines, hence
2847 * the abbreviation.
2848 */
2849
2850 STATIC regnode_offset
S_reg_la_OPFAIL(pTHX_ RExC_state_t * pRExC_state,U32 flags,const char * type)2851 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2852 const char *type)
2853 {
2854
2855 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2856
2857 /* FALSE so we don't force to /x below */;
2858 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2859
2860 if (RExC_parse >= RExC_end)
2861 vFAIL2("Sequence (%s... not terminated", type);
2862
2863 if (*RExC_parse == ')') {
2864 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2865 nextchar(pRExC_state);
2866 return ret; /* return produced regop */
2867 }
2868
2869 /* only increment zerolen *after* we check if we produce an OPFAIL
2870 * as an OPFAIL does not match a zero length construct, as it
2871 * does not match ever. */
2872 RExC_seen_zerolen++;
2873 RExC_seen |= flags;
2874 RExC_in_lookaround++;
2875 return 0; /* keep parsing! */
2876 }
2877
2878 /* Below are the main parsing routines.
2879 *
2880 * S_reg() parses a whole pattern or subpattern. It itself handles things
2881 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2882 * alternation '|' in the '...' pattern.
2883 * S_regbranch() effectively implements the concatenation operator, handling
2884 * one alternative of '|', repeatedly calling S_regpiece on each
2885 * segment of the input.
2886 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2887 * and then adds any quantifier for that chunk.
2888 * S_regatom() parses the next chunk of the input, returning when it
2889 * determines it has found a complete atomic chunk. The chunk may
2890 * be a nested subpattern, in which case S_reg is called
2891 * recursively
2892 *
2893 * The functions generate regnodes as they go along, appending each to the
2894 * pattern data structure so far. They return the offset of the current final
2895 * node into that structure, or 0 on failure.
2896 *
2897 * There are three parameters common to all of them:
2898 * pRExC_state is a structure with much information about the current
2899 * state of the parse. It's easy to add new elements to
2900 * convey new information, but beware that an error return may
2901 * require clearing the element.
2902 * flagp is a pointer to bit flags set in a lower level to pass up
2903 * to higher levels information, such as the cause of a
2904 * failure, or some characteristic about the generated node
2905 * depth is roughly the recursion depth, mostly unused except for
2906 * pretty printing debugging info.
2907 *
2908 * There are ancillary functions that these may farm work out to, using the
2909 * same parameters.
2910 *
2911 * The protocol for handling flags is that each function will, before
2912 * returning, add into *flagp the flags it needs to pass up. Each function has
2913 * a second flags variable, typically named 'flags', which it sets and clears
2914 * at will. Flag bits in it are used in that function, and it calls the next
2915 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
2916 * 'flags' will contain whatever it had before the call, plus whatever that
2917 * function passed up. If it wants to pass any of these up to its caller, it
2918 * has to add them to its *flagp. This means that it takes extra steps to keep
2919 * passing a flag upwards, and otherwise the flag bit is cleared for higher
2920 * functions.
2921 */
2922
2923 /* On success, returns the offset at which any next node should be placed into
2924 * the regex engine program being compiled.
2925 *
2926 * Returns 0 otherwise, with *flagp set to indicate why:
2927 * TRYAGAIN at the end of (?) that only sets flags.
2928 * RESTART_PARSE if the parse needs to be restarted, or'd with
2929 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
2930 * Otherwise would only return 0 if regbranch() returns 0, which cannot
2931 * happen. */
2932 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)2933 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
2934 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
2935 * 2 is like 1, but indicates that nextchar() has been called to advance
2936 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
2937 * this flag alerts us to the need to check for that */
2938 {
2939 regnode_offset ret = 0; /* Will be the head of the group. */
2940 regnode_offset br;
2941 regnode_offset lastbr;
2942 regnode_offset ender = 0;
2943 I32 logical_parno = 0;
2944 I32 parno = 0;
2945 I32 flags;
2946 U32 oregflags = RExC_flags;
2947 bool have_branch = 0;
2948 bool is_open = 0;
2949 I32 freeze_paren = 0;
2950 I32 after_freeze = 0;
2951 I32 num; /* numeric backreferences */
2952 SV * max_open; /* Max number of unclosed parens */
2953 I32 was_in_lookaround = RExC_in_lookaround;
2954 I32 fake_eval = 0; /* matches paren */
2955
2956 /* The difference between the following variables can be seen with *
2957 * the broken pattern /(?:foo/ where segment_parse_start will point *
2958 * at the 'f', and reg_parse_start will point at the '(' */
2959
2960 /* the following is used for unmatched '(' errors */
2961 char * const reg_parse_start = RExC_parse;
2962
2963 /* the following is used to track where various segments of
2964 * the pattern that we parse out started. */
2965 char * segment_parse_start = RExC_parse;
2966
2967 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2968
2969 PERL_ARGS_ASSERT_REG;
2970 DEBUG_PARSE("reg ");
2971
2972 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
2973 assert(max_open);
2974 if (!SvIOK(max_open)) {
2975 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
2976 }
2977 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
2978 open paren */
2979 vFAIL("Too many nested open parens");
2980 }
2981
2982 *flagp = 0; /* Initialize. */
2983
2984 /* Having this true makes it feasible to have a lot fewer tests for the
2985 * parse pointer being in scope. For example, we can write
2986 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2987 * instead of
2988 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2989 */
2990 assert(*RExC_end == '\0');
2991
2992 /* Make an OPEN node, if parenthesized. */
2993 if (paren) {
2994
2995 /* Under /x, space and comments can be gobbled up between the '(' and
2996 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
2997 * intervening space, as the sequence is a token, and a token should be
2998 * indivisible */
2999 bool has_intervening_patws = (paren == 2)
3000 && *(RExC_parse - 1) != '(';
3001
3002 if (RExC_parse >= RExC_end) {
3003 vFAIL("Unmatched (");
3004 }
3005
3006 if (paren == 'r') { /* Atomic script run */
3007 paren = '>';
3008 goto parse_rest;
3009 }
3010 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3011 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3012 fake_eval = '{';
3013 goto handle_qmark;
3014 }
3015
3016 char *start_verb = RExC_parse + 1;
3017 STRLEN verb_len;
3018 char *start_arg = NULL;
3019 unsigned char op = 0;
3020 int arg_required = 0;
3021 int internal_argval = -1; /* if > -1 no argument allowed */
3022 bool has_upper = FALSE;
3023 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3024
3025 if (has_intervening_patws) {
3026 RExC_parse_inc_by(1); /* past the '*' */
3027
3028 /* For strict backwards compatibility, don't change the message
3029 * now that we also have lowercase operands */
3030 if (isUPPER(*RExC_parse)) {
3031 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3032 }
3033 else {
3034 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3035 }
3036 }
3037 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3038 if ( *RExC_parse == ':' ) {
3039 start_arg = RExC_parse + 1;
3040 break;
3041 }
3042 else if (! UTF) {
3043 if (isUPPER(*RExC_parse)) {
3044 has_upper = TRUE;
3045 }
3046 RExC_parse_inc_by(1);
3047 }
3048 else {
3049 RExC_parse_inc_utf8();
3050 }
3051 }
3052 verb_len = RExC_parse - start_verb;
3053 if ( start_arg ) {
3054 if (RExC_parse >= RExC_end) {
3055 goto unterminated_verb_pattern;
3056 }
3057
3058 RExC_parse_inc();
3059 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3060 RExC_parse_inc();
3061 }
3062 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3063 unterminated_verb_pattern:
3064 if (has_upper) {
3065 vFAIL("Unterminated verb pattern argument");
3066 }
3067 else {
3068 vFAIL("Unterminated '(*...' argument");
3069 }
3070 }
3071 } else {
3072 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3073 if (has_upper) {
3074 vFAIL("Unterminated verb pattern");
3075 }
3076 else {
3077 vFAIL("Unterminated '(*...' construct");
3078 }
3079 }
3080 }
3081
3082 /* Here, we know that RExC_parse < RExC_end */
3083
3084 switch ( *start_verb ) {
3085 case 'A': /* (*ACCEPT) */
3086 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3087 op = ACCEPT;
3088 internal_argval = RExC_nestroot;
3089 }
3090 break;
3091 case 'C': /* (*COMMIT) */
3092 if ( memEQs(start_verb, verb_len,"COMMIT") )
3093 op = COMMIT;
3094 break;
3095 case 'F': /* (*FAIL) */
3096 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3097 op = OPFAIL;
3098 }
3099 break;
3100 case ':': /* (*:NAME) */
3101 case 'M': /* (*MARK:NAME) */
3102 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3103 op = MARKPOINT;
3104 arg_required = 1;
3105 }
3106 break;
3107 case 'P': /* (*PRUNE) */
3108 if ( memEQs(start_verb, verb_len,"PRUNE") )
3109 op = PRUNE;
3110 break;
3111 case 'S': /* (*SKIP) */
3112 if ( memEQs(start_verb, verb_len,"SKIP") )
3113 op = SKIP;
3114 break;
3115 case 'T': /* (*THEN) */
3116 /* [19:06] <TimToady> :: is then */
3117 if ( memEQs(start_verb, verb_len,"THEN") ) {
3118 op = CUTGROUP;
3119 RExC_seen |= REG_CUTGROUP_SEEN;
3120 }
3121 break;
3122 case 'a':
3123 if ( memEQs(start_verb, verb_len, "asr")
3124 || memEQs(start_verb, verb_len, "atomic_script_run"))
3125 {
3126 paren = 'r'; /* Mnemonic: recursed run */
3127 goto script_run;
3128 }
3129 else if (memEQs(start_verb, verb_len, "atomic")) {
3130 paren = 't'; /* AtOMIC */
3131 goto alpha_assertions;
3132 }
3133 break;
3134 case 'p':
3135 if ( memEQs(start_verb, verb_len, "plb")
3136 || memEQs(start_verb, verb_len, "positive_lookbehind"))
3137 {
3138 paren = 'b';
3139 goto lookbehind_alpha_assertions;
3140 }
3141 else if ( memEQs(start_verb, verb_len, "pla")
3142 || memEQs(start_verb, verb_len, "positive_lookahead"))
3143 {
3144 paren = 'a';
3145 goto alpha_assertions;
3146 }
3147 break;
3148 case 'n':
3149 if ( memEQs(start_verb, verb_len, "nlb")
3150 || memEQs(start_verb, verb_len, "negative_lookbehind"))
3151 {
3152 paren = 'B';
3153 goto lookbehind_alpha_assertions;
3154 }
3155 else if ( memEQs(start_verb, verb_len, "nla")
3156 || memEQs(start_verb, verb_len, "negative_lookahead"))
3157 {
3158 paren = 'A';
3159 goto alpha_assertions;
3160 }
3161 break;
3162 case 's':
3163 if ( memEQs(start_verb, verb_len, "sr")
3164 || memEQs(start_verb, verb_len, "script_run"))
3165 {
3166 regnode_offset atomic;
3167
3168 paren = 's';
3169
3170 script_run:
3171
3172 /* This indicates Unicode rules. */
3173 REQUIRE_UNI_RULES(flagp, 0);
3174
3175 if (! start_arg) {
3176 goto no_colon;
3177 }
3178
3179 RExC_parse_set(start_arg);
3180
3181 if (RExC_in_script_run) {
3182
3183 /* Nested script runs are treated as no-ops, because
3184 * if the nested one fails, the outer one must as
3185 * well. It could fail sooner, and avoid (??{} with
3186 * side effects, but that is explicitly documented as
3187 * undefined behavior. */
3188
3189 ret = 0;
3190
3191 if (paren == 's') {
3192 paren = ':';
3193 goto parse_rest;
3194 }
3195
3196 /* But, the atomic part of a nested atomic script run
3197 * isn't a no-op, but can be treated just like a '(?>'
3198 * */
3199 paren = '>';
3200 goto parse_rest;
3201 }
3202
3203 if (paren == 's') {
3204 /* Here, we're starting a new regular script run */
3205 ret = reg_node(pRExC_state, SROPEN);
3206 RExC_in_script_run = 1;
3207 is_open = 1;
3208 goto parse_rest;
3209 }
3210
3211 /* Here, we are starting an atomic script run. This is
3212 * handled by recursing to deal with the atomic portion
3213 * separately, enclosed in SROPEN ... SRCLOSE nodes */
3214
3215 ret = reg_node(pRExC_state, SROPEN);
3216
3217 RExC_in_script_run = 1;
3218
3219 atomic = reg(pRExC_state, 'r', &flags, depth);
3220 if (flags & (RESTART_PARSE|NEED_UTF8)) {
3221 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3222 return 0;
3223 }
3224
3225 if (! REGTAIL(pRExC_state, ret, atomic)) {
3226 REQUIRE_BRANCHJ(flagp, 0);
3227 }
3228
3229 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3230 SRCLOSE)))
3231 {
3232 REQUIRE_BRANCHJ(flagp, 0);
3233 }
3234
3235 RExC_in_script_run = 0;
3236 return ret;
3237 }
3238
3239 break;
3240
3241 lookbehind_alpha_assertions:
3242 seen_flag_set = REG_LOOKBEHIND_SEEN;
3243 /*FALLTHROUGH*/
3244
3245 alpha_assertions:
3246
3247 if ( !start_arg ) {
3248 goto no_colon;
3249 }
3250
3251 if ( RExC_parse == start_arg ) {
3252 if ( paren == 'A' || paren == 'B' ) {
3253 /* An empty negative lookaround assertion is failure.
3254 * See also: S_reg_la_OPFAIL() */
3255
3256 /* Note: OPFAIL is *not* zerolen. */
3257 ret = reg1node(pRExC_state, OPFAIL, 0);
3258 nextchar(pRExC_state);
3259 return ret;
3260 }
3261 else
3262 if ( paren == 'a' || paren == 'b' ) {
3263 /* An empty positive lookaround assertion is success.
3264 * See also: S_reg_la_NOTHING() */
3265
3266 /* Note: NOTHING is zerolen, so increment here */
3267 RExC_seen_zerolen++;
3268 ret = reg_node(pRExC_state, NOTHING);
3269 nextchar(pRExC_state);
3270 return ret;
3271 }
3272 }
3273
3274 RExC_seen_zerolen++;
3275 RExC_in_lookaround++;
3276 RExC_seen |= seen_flag_set;
3277
3278 RExC_parse_set(start_arg);
3279 goto parse_rest;
3280
3281 no_colon:
3282 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3283 UTF8fARG(UTF, verb_len, start_verb));
3284 NOT_REACHED; /*NOTREACHED*/
3285
3286 } /* End of switch */
3287 if ( ! op ) {
3288 RExC_parse_inc_safe();
3289 if (has_upper || verb_len == 0) {
3290 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3291 UTF8fARG(UTF, verb_len, start_verb));
3292 }
3293 else {
3294 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3295 UTF8fARG(UTF, verb_len, start_verb));
3296 }
3297 }
3298 if ( RExC_parse == start_arg ) {
3299 start_arg = NULL;
3300 }
3301 if ( arg_required && !start_arg ) {
3302 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3303 (int) verb_len, start_verb);
3304 }
3305 if (internal_argval == -1) {
3306 ret = reg1node(pRExC_state, op, 0);
3307 } else {
3308 ret = reg2node(pRExC_state, op, 0, internal_argval);
3309 }
3310 RExC_seen |= REG_VERBARG_SEEN;
3311 if (start_arg) {
3312 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3313 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3314 STR_WITH_LEN("S"));
3315 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3316 FLAGS(REGNODE_p(ret)) = 1;
3317 } else {
3318 FLAGS(REGNODE_p(ret)) = 0;
3319 }
3320 if ( internal_argval != -1 )
3321 ARG2i_SET(REGNODE_p(ret), internal_argval);
3322 nextchar(pRExC_state);
3323 return ret;
3324 }
3325 else if (*RExC_parse == '?') { /* (?...) */
3326 handle_qmark:
3327 ; /* make sure the label has a statement associated with it*/
3328 bool is_logical = 0, is_optimistic = 0;
3329 const char * const seqstart = RExC_parse;
3330 const char * endptr;
3331 const char non_existent_group_msg[]
3332 = "Reference to nonexistent group";
3333 const char impossible_group[] = "Invalid reference to group";
3334
3335 if (has_intervening_patws) {
3336 RExC_parse_inc_by(1);
3337 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3338 }
3339
3340 RExC_parse_inc_by(1); /* past the '?' */
3341 if (!fake_eval) {
3342 paren = *RExC_parse; /* might be a trailing NUL, if not
3343 well-formed */
3344 is_optimistic = 0;
3345 } else {
3346 is_optimistic = 1;
3347 paren = fake_eval;
3348 }
3349 RExC_parse_inc();
3350 if (RExC_parse > RExC_end) {
3351 paren = '\0';
3352 }
3353 ret = 0; /* For look-ahead/behind. */
3354 switch (paren) {
3355
3356 case 'P': /* (?P...) variants for those used to PCRE/Python */
3357 paren = *RExC_parse;
3358 if ( paren == '<') { /* (?P<...>) named capture */
3359 RExC_parse_inc_by(1);
3360 if (RExC_parse >= RExC_end) {
3361 vFAIL("Sequence (?P<... not terminated");
3362 }
3363 goto named_capture;
3364 }
3365 else if (paren == '>') { /* (?P>name) named recursion */
3366 RExC_parse_inc_by(1);
3367 if (RExC_parse >= RExC_end) {
3368 vFAIL("Sequence (?P>... not terminated");
3369 }
3370 goto named_recursion;
3371 }
3372 else if (paren == '=') { /* (?P=...) named backref */
3373 RExC_parse_inc_by(1);
3374 return handle_named_backref(pRExC_state, flagp,
3375 segment_parse_start, ')');
3376 }
3377 RExC_parse_inc_if_char();
3378 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3379 vFAIL3("Sequence (%.*s...) not recognized",
3380 (int) (RExC_parse - seqstart), seqstart);
3381 NOT_REACHED; /*NOTREACHED*/
3382 case '<': /* (?<...) */
3383 /* If you want to support (?<*...), first reconcile with GH #17363 */
3384 if (*RExC_parse == '!') {
3385 paren = ','; /* negative lookbehind (?<! ... ) */
3386 RExC_parse_inc_by(1);
3387 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3388 return ret;
3389 break;
3390 }
3391 else
3392 if (*RExC_parse == '=') {
3393 /* paren = '<' - negative lookahead (?<= ... ) */
3394 RExC_parse_inc_by(1);
3395 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3396 return ret;
3397 break;
3398 }
3399 else
3400 named_capture:
3401 { /* (?<...>) */
3402 char *name_start;
3403 SV *svname;
3404 paren= '>';
3405 /* FALLTHROUGH */
3406 case '\'': /* (?'...') */
3407 name_start = RExC_parse;
3408 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3409 if ( RExC_parse == name_start
3410 || RExC_parse >= RExC_end
3411 || *RExC_parse != paren)
3412 {
3413 vFAIL2("Sequence (?%c... not terminated",
3414 paren=='>' ? '<' : (char) paren);
3415 }
3416 {
3417 HE *he_str;
3418 SV *sv_dat = NULL;
3419 if (!svname) /* shouldn't happen */
3420 Perl_croak(aTHX_
3421 "panic: reg_scan_name returned NULL");
3422 if (!RExC_paren_names) {
3423 RExC_paren_names= newHV();
3424 sv_2mortal(MUTABLE_SV(RExC_paren_names));
3425 #ifdef DEBUGGING
3426 RExC_paren_name_list= newAV();
3427 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3428 #endif
3429 }
3430 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3431 if ( he_str )
3432 sv_dat = HeVAL(he_str);
3433 if ( ! sv_dat ) {
3434 /* croak baby croak */
3435 Perl_croak(aTHX_
3436 "panic: paren_name hash element allocation failed");
3437 } else if ( SvPOK(sv_dat) ) {
3438 /* (?|...) can mean we have dupes so scan to check
3439 its already been stored. Maybe a flag indicating
3440 we are inside such a construct would be useful,
3441 but the arrays are likely to be quite small, so
3442 for now we punt -- dmq */
3443 IV count = SvIV(sv_dat);
3444 I32 *pv = (I32*)SvPVX(sv_dat);
3445 IV i;
3446 for ( i = 0 ; i < count ; i++ ) {
3447 if ( pv[i] == RExC_npar ) {
3448 count = 0;
3449 break;
3450 }
3451 }
3452 if ( count ) {
3453 pv = (I32*)SvGROW(sv_dat,
3454 SvCUR(sv_dat) + sizeof(I32)+1);
3455 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3456 pv[count] = RExC_npar;
3457 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3458 }
3459 } else {
3460 (void)SvUPGRADE(sv_dat, SVt_PVNV);
3461 sv_setpvn(sv_dat, (char *)&(RExC_npar),
3462 sizeof(I32));
3463 SvIOK_on(sv_dat);
3464 SvIV_set(sv_dat, 1);
3465 }
3466 #ifdef DEBUGGING
3467 /* No, this does not cause a memory leak under
3468 * debugging. RExC_paren_name_list is freed later
3469 * on in the dump process. - Yves
3470 */
3471 if (!av_store(RExC_paren_name_list,
3472 RExC_npar, SvREFCNT_inc_NN(svname)))
3473 SvREFCNT_dec_NN(svname);
3474 #endif
3475
3476 }
3477 nextchar(pRExC_state);
3478 paren = 1;
3479 goto capturing_parens;
3480 }
3481 NOT_REACHED; /*NOTREACHED*/
3482 case '=': /* (?=...) */
3483 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3484 return ret;
3485 break;
3486 case '!': /* (?!...) */
3487 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3488 return ret;
3489 break;
3490 case '|': /* (?|...) */
3491 /* branch reset, behave like a (?:...) except that
3492 buffers in alternations share the same numbers */
3493 paren = ':';
3494 after_freeze = freeze_paren = RExC_logical_npar;
3495
3496 /* XXX This construct currently requires an extra pass.
3497 * Investigation would be required to see if that could be
3498 * changed */
3499 REQUIRE_PARENS_PASS;
3500 break;
3501 case ':': /* (?:...) */
3502 case '>': /* (?>...) */
3503 break;
3504 case '$': /* (?$...) */
3505 case '@': /* (?@...) */
3506 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3507 break;
3508 case '0' : /* (?0) */
3509 case 'R' : /* (?R) */
3510 if (RExC_parse == RExC_end || *RExC_parse != ')')
3511 FAIL("Sequence (?R) not terminated");
3512 num = 0;
3513 RExC_seen |= REG_RECURSE_SEEN;
3514
3515 /* XXX These constructs currently require an extra pass.
3516 * It probably could be changed */
3517 REQUIRE_PARENS_PASS;
3518
3519 *flagp |= POSTPONED;
3520 goto gen_recurse_regop;
3521 /*notreached*/
3522 /* named and numeric backreferences */
3523 case '&': /* (?&NAME) */
3524 segment_parse_start = RExC_parse - 1;
3525 named_recursion:
3526 {
3527 SV *sv_dat = reg_scan_name(pRExC_state,
3528 REG_RSN_RETURN_DATA);
3529 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3530 }
3531 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3532 vFAIL("Sequence (?&... not terminated");
3533 goto gen_recurse_regop;
3534 /* NOTREACHED */
3535 case '+':
3536 if (! inRANGE(RExC_parse[0], '1', '9')) {
3537 RExC_parse_inc_by(1);
3538 vFAIL("Illegal pattern");
3539 }
3540 goto parse_recursion;
3541 /* NOTREACHED*/
3542 case '-': /* (?-1) */
3543 if (! inRANGE(RExC_parse[0], '1', '9')) {
3544 RExC_parse--; /* rewind to let it be handled later */
3545 goto parse_flags;
3546 }
3547 /* FALLTHROUGH */
3548 case '1': case '2': case '3': case '4': /* (?1) */
3549 case '5': case '6': case '7': case '8': case '9':
3550 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
3551 parse_recursion:
3552 {
3553 bool is_neg = FALSE;
3554 UV unum;
3555 segment_parse_start = RExC_parse - 1;
3556 if (*RExC_parse == '-') {
3557 RExC_parse_inc_by(1);
3558 is_neg = TRUE;
3559 }
3560 endptr = RExC_end;
3561 if (grok_atoUV(RExC_parse, &unum, &endptr)
3562 && unum <= I32_MAX
3563 ) {
3564 num = (I32)unum;
3565 RExC_parse_set((char*)endptr);
3566 }
3567 else { /* Overflow, or something like that. Position
3568 beyond all digits for the message */
3569 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
3570 RExC_parse_inc_by(1);
3571 }
3572 vFAIL(impossible_group);
3573 }
3574 if (is_neg) {
3575 /* -num is always representable on 1 and 2's complement
3576 * machines */
3577 num = -num;
3578 }
3579 }
3580 if (*RExC_parse!=')')
3581 vFAIL("Expecting close bracket");
3582
3583 if (paren == '-' || paren == '+') {
3584
3585 /* Don't overflow */
3586 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3587 RExC_parse_inc_by(1);
3588 vFAIL(impossible_group);
3589 }
3590
3591 /*
3592 Diagram of capture buffer numbering.
3593 Top line is the normal capture buffer numbers
3594 Bottom line is the negative indexing as from
3595 the X (the (?-2))
3596
3597 1 2 3 4 5 X Y 6 7
3598 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3599 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3600 - 5 4 3 2 1 X Y x x
3601
3602 Resolve to absolute group. Recall that RExC_npar is +1 of
3603 the actual parenthesis group number. For lookahead, we
3604 have to compensate for that. Using the above example, when
3605 we get to Y in the parse, num is 2 and RExC_npar is 6. We
3606 want 7 for +2, and 4 for -2.
3607 */
3608 if ( paren == '+' ) {
3609 num--;
3610 }
3611
3612 num += RExC_npar;
3613
3614 if (paren == '-' && num < 1) {
3615 RExC_parse_inc_by(1);
3616 vFAIL(non_existent_group_msg);
3617 }
3618 }
3619 else
3620 if (num && num < RExC_logical_npar) {
3621 num = RExC_logical_to_parno[num];
3622 }
3623 else
3624 if (ALL_PARENS_COUNTED) {
3625 if (num < RExC_logical_total_parens) {
3626 num = RExC_logical_to_parno[num];
3627 }
3628 else {
3629 RExC_parse_inc_by(1);
3630 vFAIL(non_existent_group_msg);
3631 }
3632 }
3633 else {
3634 REQUIRE_PARENS_PASS;
3635 }
3636
3637
3638 gen_recurse_regop:
3639 if (num >= RExC_npar) {
3640
3641 /* It might be a forward reference; we can't fail until we
3642 * know, by completing the parse to get all the groups, and
3643 * then reparsing */
3644 if (ALL_PARENS_COUNTED) {
3645 if (num >= RExC_total_parens) {
3646 RExC_parse_inc_by(1);
3647 vFAIL(non_existent_group_msg);
3648 }
3649 }
3650 else {
3651 REQUIRE_PARENS_PASS;
3652 }
3653 }
3654
3655 /* We keep track how many GOSUB items we have produced.
3656 To start off the ARG2i() of the GOSUB holds its "id",
3657 which is used later in conjunction with RExC_recurse
3658 to calculate the offset we need to jump for the GOSUB,
3659 which it will store in the final representation.
3660 We have to defer the actual calculation until much later
3661 as the regop may move.
3662 */
3663 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3664 RExC_recurse_count++;
3665 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3666 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3667 22, "| |", (int)(depth * 2 + 1), "",
3668 (UV)ARG1u(REGNODE_p(ret)),
3669 (IV)ARG2i(REGNODE_p(ret))));
3670 RExC_seen |= REG_RECURSE_SEEN;
3671
3672 *flagp |= POSTPONED;
3673 assert(*RExC_parse == ')');
3674 nextchar(pRExC_state);
3675 return ret;
3676
3677 /* NOTREACHED */
3678
3679 case '?': /* (??...) */
3680 is_logical = 1;
3681 if (*RExC_parse != '{') {
3682 RExC_parse_inc_if_char();
3683 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3684 vFAIL2utf8f(
3685 "Sequence (%" UTF8f "...) not recognized",
3686 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3687 NOT_REACHED; /*NOTREACHED*/
3688 }
3689 *flagp |= POSTPONED;
3690 paren = '{';
3691 RExC_parse_inc_by(1);
3692 /* FALLTHROUGH */
3693 case '{': /* (?{...}) */
3694 {
3695 U32 n = 0;
3696 struct reg_code_block *cb;
3697 OP * o;
3698
3699 RExC_seen_zerolen++;
3700
3701 if ( !pRExC_state->code_blocks
3702 || pRExC_state->code_index
3703 >= pRExC_state->code_blocks->count
3704 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3705 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3706 - RExC_start)
3707 ) {
3708 if (RExC_pm_flags & PMf_USE_RE_EVAL)
3709 FAIL("panic: Sequence (?{...}): no code block found\n");
3710 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3711 }
3712 /* this is a pre-compiled code block (?{...}) */
3713 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3714 RExC_parse_set(RExC_start + cb->end);
3715 o = cb->block;
3716 if (cb->src_regex) {
3717 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3718 RExC_rxi->data->data[n] =
3719 (void*)SvREFCNT_inc((SV*)cb->src_regex);
3720 RExC_rxi->data->data[n+1] = (void*)o;
3721 }
3722 else {
3723 n = reg_add_data(pRExC_state,
3724 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3725 RExC_rxi->data->data[n] = (void*)o;
3726 }
3727 pRExC_state->code_index++;
3728 nextchar(pRExC_state);
3729 if (!is_optimistic)
3730 RExC_seen |= REG_PESSIMIZE_SEEN;
3731
3732 if (is_logical) {
3733 regnode_offset eval;
3734 ret = reg_node(pRExC_state, LOGICAL);
3735 FLAGS(REGNODE_p(ret)) = 2;
3736
3737 eval = reg2node(pRExC_state, EVAL,
3738 n,
3739
3740 /* for later propagation into (??{})
3741 * return value */
3742 RExC_flags & RXf_PMf_COMPILETIME
3743 );
3744 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3745 if (! REGTAIL(pRExC_state, ret, eval)) {
3746 REQUIRE_BRANCHJ(flagp, 0);
3747 }
3748 return ret;
3749 }
3750 ret = reg2node(pRExC_state, EVAL, n, 0);
3751 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3752
3753 return ret;
3754 }
3755 case '(': /* (?(?{...})...) and (?(?=...)...) */
3756 {
3757 int is_define= 0;
3758 const int DEFINE_len = sizeof("DEFINE") - 1;
3759 if ( RExC_parse < RExC_end - 1
3760 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
3761 && ( RExC_parse[1] == '='
3762 || RExC_parse[1] == '!'
3763 || RExC_parse[1] == '<'
3764 || RExC_parse[1] == '{'))
3765 || ( RExC_parse[0] == '*' /* (?(*...)) */
3766 && ( RExC_parse[1] == '{'
3767 || ( memBEGINs(RExC_parse + 1,
3768 (Size_t) (RExC_end - (RExC_parse + 1)),
3769 "pla:")
3770 || memBEGINs(RExC_parse + 1,
3771 (Size_t) (RExC_end - (RExC_parse + 1)),
3772 "plb:")
3773 || memBEGINs(RExC_parse + 1,
3774 (Size_t) (RExC_end - (RExC_parse + 1)),
3775 "nla:")
3776 || memBEGINs(RExC_parse + 1,
3777 (Size_t) (RExC_end - (RExC_parse + 1)),
3778 "nlb:")
3779 || memBEGINs(RExC_parse + 1,
3780 (Size_t) (RExC_end - (RExC_parse + 1)),
3781 "positive_lookahead:")
3782 || memBEGINs(RExC_parse + 1,
3783 (Size_t) (RExC_end - (RExC_parse + 1)),
3784 "positive_lookbehind:")
3785 || memBEGINs(RExC_parse + 1,
3786 (Size_t) (RExC_end - (RExC_parse + 1)),
3787 "negative_lookahead:")
3788 || memBEGINs(RExC_parse + 1,
3789 (Size_t) (RExC_end - (RExC_parse + 1)),
3790 "negative_lookbehind:")))))
3791 ) { /* Lookahead or eval. */
3792 I32 flag;
3793 regnode_offset tail;
3794
3795 ret = reg_node(pRExC_state, LOGICAL);
3796 FLAGS(REGNODE_p(ret)) = 1;
3797
3798 tail = reg(pRExC_state, 1, &flag, depth+1);
3799 RETURN_FAIL_ON_RESTART(flag, flagp);
3800 if (! REGTAIL(pRExC_state, ret, tail)) {
3801 REQUIRE_BRANCHJ(flagp, 0);
3802 }
3803 goto insert_if;
3804 }
3805 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
3806 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3807 {
3808 char ch = RExC_parse[0] == '<' ? '>' : '\'';
3809 char *name_start= RExC_parse;
3810 RExC_parse_inc_by(1);
3811 U32 num = 0;
3812 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3813 if ( RExC_parse == name_start
3814 || RExC_parse >= RExC_end
3815 || *RExC_parse != ch)
3816 {
3817 vFAIL2("Sequence (?(%c... not terminated",
3818 (ch == '>' ? '<' : ch));
3819 }
3820 RExC_parse_inc_by(1);
3821 if (sv_dat) {
3822 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3823 RExC_rxi->data->data[num]=(void*)sv_dat;
3824 SvREFCNT_inc_simple_void_NN(sv_dat);
3825 }
3826 ret = reg1node(pRExC_state, GROUPPN, num);
3827 goto insert_if_check_paren;
3828 }
3829 else if (memBEGINs(RExC_parse,
3830 (STRLEN) (RExC_end - RExC_parse),
3831 "DEFINE"))
3832 {
3833 ret = reg1node(pRExC_state, DEFINEP, 0);
3834 RExC_parse_inc_by(DEFINE_len);
3835 is_define = 1;
3836 goto insert_if_check_paren;
3837 }
3838 else if (RExC_parse[0] == 'R') {
3839 RExC_parse_inc_by(1);
3840 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
3841 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3842 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3843 */
3844 parno = 0;
3845 if (RExC_parse[0] == '0') {
3846 parno = 1;
3847 RExC_parse_inc_by(1);
3848 }
3849 else if (inRANGE(RExC_parse[0], '1', '9')) {
3850 UV uv;
3851 endptr = RExC_end;
3852 if (grok_atoUV(RExC_parse, &uv, &endptr)
3853 && uv <= I32_MAX
3854 ) {
3855 parno = (I32)uv + 1;
3856 RExC_parse_set((char*)endptr);
3857 }
3858 /* else "Switch condition not recognized" below */
3859 } else if (RExC_parse[0] == '&') {
3860 SV *sv_dat;
3861 RExC_parse_inc_by(1);
3862 sv_dat = reg_scan_name(pRExC_state,
3863 REG_RSN_RETURN_DATA);
3864 if (sv_dat)
3865 parno = 1 + *((I32 *)SvPVX(sv_dat));
3866 }
3867 ret = reg1node(pRExC_state, INSUBP, parno);
3868 goto insert_if_check_paren;
3869 }
3870 else if (inRANGE(RExC_parse[0], '1', '9')) {
3871 /* (?(1)...) */
3872 char c;
3873 UV uv;
3874 endptr = RExC_end;
3875 if (grok_atoUV(RExC_parse, &uv, &endptr)
3876 && uv <= I32_MAX
3877 ) {
3878 parno = (I32)uv;
3879 RExC_parse_set((char*)endptr);
3880 }
3881 else {
3882 vFAIL("panic: grok_atoUV returned FALSE");
3883 }
3884 ret = reg1node(pRExC_state, GROUPP, parno);
3885
3886 insert_if_check_paren:
3887 if (UCHARAT(RExC_parse) != ')') {
3888 RExC_parse_inc_safe();
3889 vFAIL("Switch condition not recognized");
3890 }
3891 nextchar(pRExC_state);
3892 insert_if:
3893 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3894 IFTHEN, 0)))
3895 {
3896 REQUIRE_BRANCHJ(flagp, 0);
3897 }
3898 br = regbranch(pRExC_state, &flags, 1, depth+1);
3899 if (br == 0) {
3900 RETURN_FAIL_ON_RESTART(flags,flagp);
3901 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3902 (UV) flags);
3903 } else
3904 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
3905 LONGJMP, 0)))
3906 {
3907 REQUIRE_BRANCHJ(flagp, 0);
3908 }
3909 c = UCHARAT(RExC_parse);
3910 nextchar(pRExC_state);
3911 if (flags&HASWIDTH)
3912 *flagp |= HASWIDTH;
3913 if (c == '|') {
3914 if (is_define)
3915 vFAIL("(?(DEFINE)....) does not allow branches");
3916
3917 /* Fake one for optimizer. */
3918 lastbr = reg1node(pRExC_state, IFTHEN, 0);
3919
3920 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
3921 RETURN_FAIL_ON_RESTART(flags, flagp);
3922 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3923 (UV) flags);
3924 }
3925 if (! REGTAIL(pRExC_state, ret, lastbr)) {
3926 REQUIRE_BRANCHJ(flagp, 0);
3927 }
3928 if (flags&HASWIDTH)
3929 *flagp |= HASWIDTH;
3930 c = UCHARAT(RExC_parse);
3931 nextchar(pRExC_state);
3932 }
3933 else
3934 lastbr = 0;
3935 if (c != ')') {
3936 if (RExC_parse >= RExC_end)
3937 vFAIL("Switch (?(condition)... not terminated");
3938 else
3939 vFAIL("Switch (?(condition)... contains too many branches");
3940 }
3941 ender = reg_node(pRExC_state, TAIL);
3942 if (! REGTAIL(pRExC_state, br, ender)) {
3943 REQUIRE_BRANCHJ(flagp, 0);
3944 }
3945 if (lastbr) {
3946 if (! REGTAIL(pRExC_state, lastbr, ender)) {
3947 REQUIRE_BRANCHJ(flagp, 0);
3948 }
3949 if (! REGTAIL(pRExC_state,
3950 REGNODE_OFFSET(
3951 REGNODE_AFTER(REGNODE_p(lastbr))),
3952 ender))
3953 {
3954 REQUIRE_BRANCHJ(flagp, 0);
3955 }
3956 }
3957 else
3958 if (! REGTAIL(pRExC_state, ret, ender)) {
3959 REQUIRE_BRANCHJ(flagp, 0);
3960 }
3961 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
3962 RExC_size++; /* XXX WHY do we need this?!!
3963 For large programs it seems to be required
3964 but I can't figure out why. -- dmq*/
3965 #endif
3966 return ret;
3967 }
3968 RExC_parse_inc_safe();
3969 vFAIL("Unknown switch condition (?(...))");
3970 }
3971 case '[': /* (?[ ... ]) */
3972 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
3973 case 0: /* A NUL */
3974 RExC_parse--; /* for vFAIL to print correctly */
3975 vFAIL("Sequence (? incomplete");
3976 break;
3977
3978 case ')':
3979 if (RExC_strict) { /* [perl #132851] */
3980 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
3981 }
3982 /* FALLTHROUGH */
3983 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
3984 /* FALLTHROUGH */
3985 default: /* e.g., (?i) */
3986 RExC_parse_set((char *) seqstart + 1);
3987 parse_flags:
3988 parse_lparen_question_flags(pRExC_state);
3989 if (UCHARAT(RExC_parse) != ':') {
3990 if (RExC_parse < RExC_end)
3991 nextchar(pRExC_state);
3992 *flagp = TRYAGAIN;
3993 return 0;
3994 }
3995 paren = ':';
3996 nextchar(pRExC_state);
3997 ret = 0;
3998 goto parse_rest;
3999 } /* end switch */
4000 }
4001 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
4002 capturing_parens:
4003 parno = RExC_npar;
4004 RExC_npar++;
4005 if (RExC_npar >= U16_MAX)
4006 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4007
4008 logical_parno = RExC_logical_npar;
4009 RExC_logical_npar++;
4010 if (! ALL_PARENS_COUNTED) {
4011 /* If we are in our first pass through (and maybe only pass),
4012 * we need to allocate memory for the capturing parentheses
4013 * data structures.
4014 */
4015
4016 if (!RExC_parens_buf_size) {
4017 /* first guess at number of parens we might encounter */
4018 RExC_parens_buf_size = 10;
4019
4020 /* setup RExC_open_parens, which holds the address of each
4021 * OPEN tag, and to make things simpler for the 0 index the
4022 * start of the program - this is used later for offsets */
4023 Newxz(RExC_open_parens, RExC_parens_buf_size,
4024 regnode_offset);
4025 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
4026
4027 /* setup RExC_close_parens, which holds the address of each
4028 * CLOSE tag, and to make things simpler for the 0 index
4029 * the end of the program - this is used later for offsets
4030 * */
4031 Newxz(RExC_close_parens, RExC_parens_buf_size,
4032 regnode_offset);
4033 /* we don't know where end op starts yet, so we don't need to
4034 * set RExC_close_parens[0] like we do RExC_open_parens[0]
4035 * above */
4036
4037 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4038 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4039 }
4040 else if (RExC_npar > RExC_parens_buf_size) {
4041 I32 old_size = RExC_parens_buf_size;
4042
4043 RExC_parens_buf_size *= 2;
4044
4045 Renew(RExC_open_parens, RExC_parens_buf_size,
4046 regnode_offset);
4047 Zero(RExC_open_parens + old_size,
4048 RExC_parens_buf_size - old_size, regnode_offset);
4049
4050 Renew(RExC_close_parens, RExC_parens_buf_size,
4051 regnode_offset);
4052 Zero(RExC_close_parens + old_size,
4053 RExC_parens_buf_size - old_size, regnode_offset);
4054
4055 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4056 Zero(RExC_logical_to_parno + old_size,
4057 RExC_parens_buf_size - old_size, I32);
4058
4059 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4060 Zero(RExC_parno_to_logical + old_size,
4061 RExC_parens_buf_size - old_size, I32);
4062 }
4063 }
4064
4065 ret = reg1node(pRExC_state, OPEN, parno);
4066 if (!RExC_nestroot)
4067 RExC_nestroot = parno;
4068 if (RExC_open_parens && !RExC_open_parens[parno])
4069 {
4070 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4071 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4072 22, "| |", (int)(depth * 2 + 1), "",
4073 (IV)parno, ret));
4074 RExC_open_parens[parno]= ret;
4075 }
4076 if (RExC_parno_to_logical) {
4077 RExC_parno_to_logical[parno] = logical_parno;
4078 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4079 RExC_logical_to_parno[logical_parno] = parno;
4080 }
4081 is_open = 1;
4082 } else {
4083 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4084 paren = ':';
4085 ret = 0;
4086 }
4087 }
4088 else /* ! paren */
4089 ret = 0;
4090
4091 parse_rest:
4092 /* Pick up the branches, linking them together. */
4093 segment_parse_start = RExC_parse;
4094 I32 npar_before_regbranch = RExC_npar - 1;
4095 br = regbranch(pRExC_state, &flags, 1, depth+1);
4096
4097 /* branch_len = (paren != 0); */
4098
4099 if (br == 0) {
4100 RETURN_FAIL_ON_RESTART(flags, flagp);
4101 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4102 }
4103 if (*RExC_parse == '|') {
4104 if (RExC_use_BRANCHJ) {
4105 reginsert(pRExC_state, BRANCHJ, br, depth+1);
4106 ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4107 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4108 }
4109 else {
4110 reginsert(pRExC_state, BRANCH, br, depth+1);
4111 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4112 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4113 }
4114 have_branch = 1;
4115 }
4116 else if (paren == ':') {
4117 *flagp |= flags&SIMPLE;
4118 }
4119 if (is_open) { /* Starts with OPEN. */
4120 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
4121 REQUIRE_BRANCHJ(flagp, 0);
4122 }
4123 }
4124 else if (paren != '?') /* Not Conditional */
4125 ret = br;
4126 *flagp |= flags & (HASWIDTH | POSTPONED);
4127 lastbr = br;
4128 while (*RExC_parse == '|') {
4129 if (RExC_use_BRANCHJ) {
4130 bool shut_gcc_up;
4131
4132 ender = reg1node(pRExC_state, LONGJMP, 0);
4133
4134 /* Append to the previous. */
4135 shut_gcc_up = REGTAIL(pRExC_state,
4136 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4137 ender);
4138 PERL_UNUSED_VAR(shut_gcc_up);
4139 }
4140 nextchar(pRExC_state);
4141 if (freeze_paren) {
4142 if (RExC_logical_npar > after_freeze)
4143 after_freeze = RExC_logical_npar;
4144 RExC_logical_npar = freeze_paren;
4145 }
4146 br = regbranch(pRExC_state, &flags, 0, depth+1);
4147
4148 if (br == 0) {
4149 RETURN_FAIL_ON_RESTART(flags, flagp);
4150 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4151 }
4152 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
4153 REQUIRE_BRANCHJ(flagp, 0);
4154 }
4155 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4156 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4157 if (OP(REGNODE_p(br)) == BRANCH) {
4158 if (OP(REGNODE_p(lastbr)) == BRANCH)
4159 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4160 else
4161 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4162 }
4163 else
4164 if (OP(REGNODE_p(br)) == BRANCHJ) {
4165 if (OP(REGNODE_p(lastbr)) == BRANCH)
4166 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4167 else
4168 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4169 }
4170
4171 lastbr = br;
4172 *flagp |= flags & (HASWIDTH | POSTPONED);
4173 }
4174
4175 if (have_branch || paren != ':') {
4176 regnode * br;
4177
4178 /* Make a closing node, and hook it on the end. */
4179 switch (paren) {
4180 case ':':
4181 ender = reg_node(pRExC_state, TAIL);
4182 break;
4183 case 1: case 2:
4184 ender = reg1node(pRExC_state, CLOSE, parno);
4185 if ( RExC_close_parens ) {
4186 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4187 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4188 22, "| |", (int)(depth * 2 + 1), "",
4189 (IV)parno, ender));
4190 RExC_close_parens[parno]= ender;
4191 if (RExC_nestroot == parno)
4192 RExC_nestroot = 0;
4193 }
4194 break;
4195 case 's':
4196 ender = reg_node(pRExC_state, SRCLOSE);
4197 RExC_in_script_run = 0;
4198 break;
4199 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4200 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4201 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4202 case '<': /* (?<= ... ) */
4203 case ',': /* (?<! ... ) */
4204 *flagp &= ~HASWIDTH;
4205 ender = reg_node(pRExC_state, LOOKBEHIND_END);
4206 break;
4207 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4208 case 'a':
4209 case 'A':
4210 case '=':
4211 case '!':
4212 *flagp &= ~HASWIDTH;
4213 /* FALLTHROUGH */
4214 case 't': /* aTomic */
4215 case '>':
4216 ender = reg_node(pRExC_state, SUCCEED);
4217 break;
4218 case 0:
4219 ender = reg_node(pRExC_state, END);
4220 assert(!RExC_end_op); /* there can only be one! */
4221 RExC_end_op = REGNODE_p(ender);
4222 if (RExC_close_parens) {
4223 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4224 "%*s%*s Setting close paren #0 (END) to %zu\n",
4225 22, "| |", (int)(depth * 2 + 1), "",
4226 ender));
4227
4228 RExC_close_parens[0]= ender;
4229 }
4230 break;
4231 }
4232 DEBUG_PARSE_r({
4233 DEBUG_PARSE_MSG("lsbr");
4234 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4235 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4236 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4237 SvPV_nolen_const(RExC_mysv1),
4238 (IV)lastbr,
4239 SvPV_nolen_const(RExC_mysv2),
4240 (IV)ender,
4241 (IV)(ender - lastbr)
4242 );
4243 });
4244 if (OP(REGNODE_p(lastbr)) == BRANCH) {
4245 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4246 }
4247 else
4248 if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4249 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4250 }
4251
4252 if (! REGTAIL(pRExC_state, lastbr, ender)) {
4253 REQUIRE_BRANCHJ(flagp, 0);
4254 }
4255
4256 if (have_branch) {
4257 char is_nothing= 1;
4258 if (depth==1)
4259 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4260
4261 /* Hook the tails of the branches to the closing node. */
4262 for (br = REGNODE_p(ret); br; br = regnext(br)) {
4263 const U8 op = REGNODE_TYPE(OP(br));
4264 regnode *nextoper = REGNODE_AFTER(br);
4265 if (op == BRANCH) {
4266 if (! REGTAIL_STUDY(pRExC_state,
4267 REGNODE_OFFSET(nextoper),
4268 ender))
4269 {
4270 REQUIRE_BRANCHJ(flagp, 0);
4271 }
4272 if ( OP(nextoper) != NOTHING
4273 || regnext(nextoper) != REGNODE_p(ender))
4274 is_nothing= 0;
4275 }
4276 else if (op == BRANCHJ) {
4277 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4278 REGNODE_OFFSET(nextoper),
4279 ender);
4280 PERL_UNUSED_VAR(shut_gcc_up);
4281 /* for now we always disable this optimisation * /
4282 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4283 if ( OP(nopr) != NOTHING
4284 || regnext(nopr) != REGNODE_p(ender))
4285 */
4286 is_nothing= 0;
4287 }
4288 }
4289 if (is_nothing) {
4290 regnode * ret_as_regnode = REGNODE_p(ret);
4291 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4292 ? regnext(ret_as_regnode)
4293 : ret_as_regnode;
4294 DEBUG_PARSE_r({
4295 DEBUG_PARSE_MSG("NADA");
4296 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4297 NULL, pRExC_state);
4298 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4299 NULL, pRExC_state);
4300 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4301 SvPV_nolen_const(RExC_mysv1),
4302 (IV)REG_NODE_NUM(ret_as_regnode),
4303 SvPV_nolen_const(RExC_mysv2),
4304 (IV)ender,
4305 (IV)(ender - ret)
4306 );
4307 });
4308 OP(br)= NOTHING;
4309 if (OP(REGNODE_p(ender)) == TAIL) {
4310 NEXT_OFF(br)= 0;
4311 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4312 } else {
4313 regnode *opt;
4314 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4315 OP(opt)= OPTIMIZED;
4316 NEXT_OFF(br)= REGNODE_p(ender) - br;
4317 }
4318 }
4319 }
4320 }
4321
4322 {
4323 const char *p;
4324 /* Even/odd or x=don't care: 010101x10x */
4325 static const char parens[] = "=!aA<,>Bbt";
4326 /* flag below is set to 0 up through 'A'; 1 for larger */
4327
4328 if (paren && (p = strchr(parens, paren))) {
4329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4330 int flag = (p - parens) > 3;
4331
4332 if (paren == '>' || paren == 't') {
4333 node = SUSPEND, flag = 0;
4334 }
4335
4336 reginsert(pRExC_state, node, ret, depth+1);
4337 FLAGS(REGNODE_p(ret)) = flag;
4338 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4339 {
4340 REQUIRE_BRANCHJ(flagp, 0);
4341 }
4342 }
4343 }
4344
4345 /* Check for proper termination. */
4346 if (paren) {
4347 /* restore original flags, but keep (?p) and, if we've encountered
4348 * something in the parse that changes /d rules into /u, keep the /u */
4349 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4350 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4351 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4352 }
4353 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4354 RExC_parse_set(reg_parse_start);
4355 vFAIL("Unmatched (");
4356 }
4357 nextchar(pRExC_state);
4358 }
4359 else if (!paren && RExC_parse < RExC_end) {
4360 if (*RExC_parse == ')') {
4361 RExC_parse_inc_by(1);
4362 vFAIL("Unmatched )");
4363 }
4364 else
4365 FAIL("Junk on end of regexp"); /* "Can't happen". */
4366 NOT_REACHED; /* NOTREACHED */
4367 }
4368
4369 if (after_freeze > RExC_logical_npar)
4370 RExC_logical_npar = after_freeze;
4371
4372 RExC_in_lookaround = was_in_lookaround;
4373
4374 return(ret);
4375 }
4376
4377 /*
4378 - regbranch - one alternative of an | operator
4379 *
4380 * Implements the concatenation operator.
4381 *
4382 * On success, returns the offset at which any next node should be placed into
4383 * the regex engine program being compiled.
4384 *
4385 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4386 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4387 * UTF-8
4388 */
4389 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)4390 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4391 {
4392 regnode_offset ret;
4393 regnode_offset chain = 0;
4394 regnode_offset latest;
4395 regnode *branch_node = NULL;
4396 I32 flags = 0, c = 0;
4397 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4398
4399 PERL_ARGS_ASSERT_REGBRANCH;
4400
4401 DEBUG_PARSE("brnc");
4402
4403 if (first)
4404 ret = 0;
4405 else {
4406 if (RExC_use_BRANCHJ) {
4407 ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4408 branch_node = REGNODE_p(ret);
4409 ARG2a_SET(branch_node, (U16)RExC_npar-1);
4410 } else {
4411 ret = reg1node(pRExC_state, BRANCH, 0);
4412 branch_node = REGNODE_p(ret);
4413 ARG1a_SET(branch_node, (U16)RExC_npar-1);
4414 }
4415 }
4416
4417 *flagp = 0; /* Initialize. */
4418
4419 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4420 FALSE /* Don't force to /x */ );
4421 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4422 flags &= ~TRYAGAIN;
4423 latest = regpiece(pRExC_state, &flags, depth+1);
4424 if (latest == 0) {
4425 if (flags & TRYAGAIN)
4426 continue;
4427 RETURN_FAIL_ON_RESTART(flags, flagp);
4428 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4429 }
4430 else if (ret == 0)
4431 ret = latest;
4432 *flagp |= flags&(HASWIDTH|POSTPONED);
4433 if (chain != 0) {
4434 /* FIXME adding one for every branch after the first is probably
4435 * excessive now we have TRIE support. (hv) */
4436 MARK_NAUGHTY(1);
4437 if (! REGTAIL(pRExC_state, chain, latest)) {
4438 /* XXX We could just redo this branch, but figuring out what
4439 * bookkeeping needs to be reset is a pain, and it's likely
4440 * that other branches that goto END will also be too large */
4441 REQUIRE_BRANCHJ(flagp, 0);
4442 }
4443 }
4444 chain = latest;
4445 c++;
4446 }
4447 if (chain == 0) { /* Loop ran zero times. */
4448 chain = reg_node(pRExC_state, NOTHING);
4449 if (ret == 0)
4450 ret = chain;
4451 }
4452 if (c == 1) {
4453 *flagp |= flags & SIMPLE;
4454 }
4455 return ret;
4456 }
4457
4458 #define RBRACE 0
4459 #define MIN_S 1
4460 #define MIN_E 2
4461 #define MAX_S 3
4462 #define MAX_E 4
4463
4464 #ifndef PERL_IN_XSUB_RE
4465 bool
Perl_regcurly(const char * s,const char * e,const char * result[5])4466 Perl_regcurly(const char *s, const char *e, const char * result[5])
4467 {
4468 /* This function matches a {m,n} quantifier. When called with a NULL final
4469 * argument, it simply parses the input from 's' up through 'e-1', and
4470 * returns a boolean as to whether or not this input is syntactically a
4471 * {m,n} quantifier.
4472 *
4473 * When called with a non-NULL final parameter, and when the function
4474 * returns TRUE, it additionally stores information into the array
4475 * specified by that parameter about what it found in the parse. The
4476 * parameter must be a pointer into a 5 element array of 'const char *'
4477 * elements. The returned information is as follows:
4478 * result[RBRACE] points to the closing brace
4479 * result[MIN_S] points to the first byte of the lower bound
4480 * result[MIN_E] points to one beyond the final byte of the lower bound
4481 * result[MAX_S] points to the first byte of the upper bound
4482 * result[MAX_E] points to one beyond the final byte of the upper bound
4483 *
4484 * If the quantifier is of the form {m,} (meaning an infinite upper
4485 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4486 * to is irrelevant, just that it's the same place
4487 *
4488 * If instead the quantifier is of the form {m} there is actually only
4489 * one bound, and both the upper and lower result[] elements are set to
4490 * point to it.
4491 *
4492 * This function checks only for syntactic validity; it leaves checking for
4493 * semantic validity and raising any diagnostics to the caller. This
4494 * function is called in multiple places to check for syntax, but only from
4495 * one for semantics. It makes it as simple as possible for the
4496 * syntax-only callers, while furnishing just enough information for the
4497 * semantic caller.
4498 */
4499
4500 const char * min_start = NULL;
4501 const char * max_start = NULL;
4502 const char * min_end = NULL;
4503 const char * max_end = NULL;
4504
4505 bool has_comma = FALSE;
4506
4507 PERL_ARGS_ASSERT_REGCURLY;
4508
4509 if (s >= e || *s++ != '{')
4510 return FALSE;
4511
4512 while (s < e && isBLANK(*s)) {
4513 s++;
4514 }
4515
4516 if isDIGIT(*s) {
4517 min_start = s;
4518 do {
4519 s++;
4520 } while (s < e && isDIGIT(*s));
4521 min_end = s;
4522 }
4523
4524 while (s < e && isBLANK(*s)) {
4525 s++;
4526 }
4527
4528 if (*s == ',') {
4529 has_comma = TRUE;
4530 s++;
4531
4532 while (s < e && isBLANK(*s)) {
4533 s++;
4534 }
4535
4536 if isDIGIT(*s) {
4537 max_start = s;
4538 do {
4539 s++;
4540 } while (s < e && isDIGIT(*s));
4541 max_end = s;
4542 }
4543 }
4544
4545 while (s < e && isBLANK(*s)) {
4546 s++;
4547 }
4548 /* Need at least one number */
4549 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4550 return FALSE;
4551 }
4552
4553 if (result) {
4554
4555 result[RBRACE] = s;
4556
4557 result[MIN_S] = min_start;
4558 result[MIN_E] = min_end;
4559 if (has_comma) {
4560 if (max_start) {
4561 result[MAX_S] = max_start;
4562 result[MAX_E] = max_end;
4563 }
4564 else {
4565 /* Having no value after the comma is signalled by setting
4566 * start and end to the same value. What that value is isn't
4567 * relevant; NULL is chosen simply because it will fail if the
4568 * caller mistakenly uses it */
4569 result[MAX_S] = result[MAX_E] = NULL;
4570 }
4571 }
4572 else { /* No comma means lower and upper bounds are the same */
4573 result[MAX_S] = min_start;
4574 result[MAX_E] = min_end;
4575 }
4576 }
4577
4578 return TRUE;
4579 }
4580 #endif
4581
4582 U32
S_get_quantifier_value(pTHX_ RExC_state_t * pRExC_state,const char * start,const char * end)4583 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4584 const char * start, const char * end)
4585 {
4586 /* This is a helper function for regpiece() to compute, given the
4587 * quantifier {m,n}, the value of either m or n, based on the starting
4588 * position 'start' in the string, through the byte 'end-1', returning it
4589 * if valid, and failing appropriately if not. It knows the restrictions
4590 * imposed on quantifier values */
4591
4592 UV uv;
4593 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4594
4595 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4596
4597 if (grok_atoUV(start, &uv, &end)) {
4598 if (uv < REG_INFTY) { /* A valid, small-enough number */
4599 return (U32) uv;
4600 }
4601 }
4602 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4603 leading zeros or overflow */
4604 RExC_parse_set((char * ) end);
4605
4606 /* Perhaps too generic a msg for what is only failure from having
4607 * leading zeros, but this is how it's always behaved. */
4608 vFAIL("Invalid quantifier in {,}");
4609 NOT_REACHED; /*NOTREACHED*/
4610 }
4611
4612 /* Here, found a quantifier, but was too large; either it overflowed or was
4613 * too big a legal number */
4614 RExC_parse_set((char * ) end);
4615 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4616
4617 NOT_REACHED; /*NOTREACHED*/
4618 return U32_MAX; /* Perhaps some compilers will be expecting a return */
4619 }
4620
4621 /*
4622 - regpiece - something followed by possible quantifier * + ? {n,m}
4623 *
4624 * Note that the branching code sequences used for ? and the general cases
4625 * of * and + are somewhat optimized: they use the same NOTHING node as
4626 * both the endmarker for their branch list and the body of the last branch.
4627 * It might seem that this node could be dispensed with entirely, but the
4628 * endmarker role is not redundant.
4629 *
4630 * On success, returns the offset at which any next node should be placed into
4631 * the regex engine program being compiled.
4632 *
4633 * Returns 0 otherwise, with *flagp set to indicate why:
4634 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
4635 * RESTART_PARSE if the parse needs to be restarted, or'd with
4636 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4637 */
4638 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)4639 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4640 {
4641 regnode_offset ret;
4642 char op;
4643 I32 flags;
4644 const char * const origparse = RExC_parse;
4645 I32 min;
4646 I32 max = REG_INFTY;
4647 I32 npar_before = RExC_npar-1;
4648
4649 /* Save the original in case we change the emitted regop to a FAIL. */
4650 const regnode_offset orig_emit = RExC_emit;
4651
4652 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4653
4654 PERL_ARGS_ASSERT_REGPIECE;
4655
4656 DEBUG_PARSE("piec");
4657
4658 ret = regatom(pRExC_state, &flags, depth+1);
4659 if (ret == 0) {
4660 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4661 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4662 }
4663 I32 npar_after = RExC_npar-1;
4664
4665 op = *RExC_parse;
4666 switch (op) {
4667 const char * regcurly_return[5];
4668
4669 case '*':
4670 nextchar(pRExC_state);
4671 min = 0;
4672 break;
4673
4674 case '+':
4675 nextchar(pRExC_state);
4676 min = 1;
4677 break;
4678
4679 case '?':
4680 nextchar(pRExC_state);
4681 min = 0; max = 1;
4682 break;
4683
4684 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
4685 to determine which */
4686 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4687 const char * min_start = regcurly_return[MIN_S];
4688 const char * min_end = regcurly_return[MIN_E];
4689 const char * max_start = regcurly_return[MAX_S];
4690 const char * max_end = regcurly_return[MAX_E];
4691
4692 if (min_start) {
4693 min = get_quantifier_value(pRExC_state, min_start, min_end);
4694 }
4695 else {
4696 min = 0;
4697 }
4698
4699 if (max_start == max_end) { /* Was of the form {m,} */
4700 max = REG_INFTY;
4701 }
4702 else if (max_start == min_start) { /* Was of the form {m} */
4703 max = min;
4704 }
4705 else { /* Was of the form {m,n} */
4706 assert(max_end >= max_start);
4707
4708 max = get_quantifier_value(pRExC_state, max_start, max_end);
4709 }
4710
4711 RExC_parse_set((char *) regcurly_return[RBRACE]);
4712 nextchar(pRExC_state);
4713
4714 if (max < min) { /* If can't match, warn and optimize to fail
4715 unconditionally */
4716 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4717 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4718 NEXT_OFF(REGNODE_p(orig_emit)) =
4719 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4720 return ret;
4721 }
4722 else if (min == max && *RExC_parse == '?') {
4723 ckWARN2reg(RExC_parse + 1,
4724 "Useless use of greediness modifier '%c'",
4725 *RExC_parse);
4726 }
4727
4728 break;
4729 } /* End of is {m,n} */
4730
4731 /* Here was a '{', but what followed it didn't form a quantifier. */
4732 /* FALLTHROUGH */
4733
4734 default:
4735 *flagp = flags;
4736 return(ret);
4737 NOT_REACHED; /*NOTREACHED*/
4738 }
4739
4740 /* Here we have a quantifier, and have calculated 'min' and 'max'.
4741 *
4742 * Check and possibly adjust a zero width operand */
4743 if (! (flags & (HASWIDTH|POSTPONED))) {
4744 if (max > REG_INFTY/3) {
4745 ckWARN2reg(RExC_parse,
4746 "%" UTF8f " matches null string many times",
4747 UTF8fARG(UTF, (RExC_parse >= origparse
4748 ? RExC_parse - origparse
4749 : 0),
4750 origparse));
4751 }
4752
4753 /* There's no point in trying to match something 0 length more than
4754 * once except for extra side effects, which we don't have here since
4755 * not POSTPONED */
4756 if (max > 1) {
4757 max = 1;
4758 if (min > max) {
4759 min = max;
4760 }
4761 }
4762 }
4763
4764 /* If this is a code block pass it up */
4765 *flagp |= (flags & POSTPONED);
4766
4767 if (max > 0) {
4768 *flagp |= (flags & HASWIDTH);
4769 if (max == REG_INFTY)
4770 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4771 }
4772
4773 /* 'SIMPLE' operands don't require full generality */
4774 if ((flags&SIMPLE)) {
4775 if (max == REG_INFTY) {
4776 if (min == 0) {
4777 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4778 goto min0_maxINF_wildcard_forbidden;
4779 }
4780
4781 reginsert(pRExC_state, STAR, ret, depth+1);
4782 MARK_NAUGHTY(4);
4783 goto done_main_op;
4784 }
4785 else if (min == 1) {
4786 reginsert(pRExC_state, PLUS, ret, depth+1);
4787 MARK_NAUGHTY(3);
4788 goto done_main_op;
4789 }
4790 }
4791
4792 /* Here, SIMPLE, but not the '*' and '+' special cases */
4793
4794 MARK_NAUGHTY_EXP(2, 2);
4795 reginsert(pRExC_state, CURLY, ret, depth+1);
4796 }
4797 else { /* not SIMPLE */
4798 const regnode_offset w = reg_node(pRExC_state, WHILEM);
4799
4800 FLAGS(REGNODE_p(w)) = 0;
4801 if (! REGTAIL(pRExC_state, ret, w)) {
4802 REQUIRE_BRANCHJ(flagp, 0);
4803 }
4804 if (RExC_use_BRANCHJ) {
4805 reginsert(pRExC_state, LONGJMP, ret, depth+1);
4806 reginsert(pRExC_state, NOTHING, ret, depth+1);
4807 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4808 }
4809 reginsert(pRExC_state, CURLYX, ret, depth+1);
4810 if (RExC_use_BRANCHJ)
4811 /* Go over NOTHING to LONGJMP. */
4812 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4813
4814 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4815 NOTHING)))
4816 {
4817 REQUIRE_BRANCHJ(flagp, 0);
4818 }
4819 RExC_whilem_seen++;
4820 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
4821 }
4822
4823 /* Finish up the CURLY/CURLYX case */
4824 FLAGS(REGNODE_p(ret)) = 0;
4825
4826 ARG1i_SET(REGNODE_p(ret), min);
4827 ARG2i_SET(REGNODE_p(ret), max);
4828
4829 /* if we had a npar_after then we need to increment npar_before,
4830 * we want to track the range of parens we need to reset each iteration
4831 */
4832 if (npar_after!=npar_before) {
4833 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4834 ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4835 } else {
4836 ARG3a_SET(REGNODE_p(ret), 0);
4837 ARG3b_SET(REGNODE_p(ret), 0);
4838 }
4839
4840 done_main_op:
4841
4842 /* Process any greediness modifiers */
4843 if (*RExC_parse == '?') {
4844 nextchar(pRExC_state);
4845 reginsert(pRExC_state, MINMOD, ret, depth+1);
4846 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4847 REQUIRE_BRANCHJ(flagp, 0);
4848 }
4849 }
4850 else if (*RExC_parse == '+') {
4851 regnode_offset ender;
4852 nextchar(pRExC_state);
4853 ender = reg_node(pRExC_state, SUCCEED);
4854 if (! REGTAIL(pRExC_state, ret, ender)) {
4855 REQUIRE_BRANCHJ(flagp, 0);
4856 }
4857 reginsert(pRExC_state, SUSPEND, ret, depth+1);
4858 ender = reg_node(pRExC_state, TAIL);
4859 if (! REGTAIL(pRExC_state, ret, ender)) {
4860 REQUIRE_BRANCHJ(flagp, 0);
4861 }
4862 }
4863
4864 /* Forbid extra quantifiers */
4865 if (isQUANTIFIER(RExC_parse, RExC_end)) {
4866 RExC_parse_inc_by(1);
4867 vFAIL("Nested quantifiers");
4868 }
4869
4870 return(ret);
4871
4872 min0_maxINF_wildcard_forbidden:
4873
4874 /* Here we are in a wildcard match, and the minimum match length is 0, and
4875 * the max could be infinity. This is currently forbidden. The only
4876 * reason is to make it harder to write patterns that take a long long time
4877 * to halt, and because the use of this construct isn't necessary in
4878 * matching Unicode property values */
4879 RExC_parse_inc_by(1);
4880 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4881 subpatterns in regex; marked by <-- HERE in m/%s/
4882 */
4883 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4884 " subpatterns");
4885
4886 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4887 * legal at all in wildcards, so can't get this far */
4888
4889 NOT_REACHED; /*NOTREACHED*/
4890 }
4891
4892 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)4893 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4894 regnode_offset * node_p,
4895 UV * code_point_p,
4896 int * cp_count,
4897 I32 * flagp,
4898 const bool strict,
4899 const U32 depth
4900 )
4901 {
4902 /* This routine teases apart the various meanings of \N and returns
4903 * accordingly. The input parameters constrain which meaning(s) is/are valid
4904 * in the current context.
4905 *
4906 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
4907 *
4908 * If <code_point_p> is not NULL, the context is expecting the result to be a
4909 * single code point. If this \N instance turns out to a single code point,
4910 * the function returns TRUE and sets *code_point_p to that code point.
4911 *
4912 * If <node_p> is not NULL, the context is expecting the result to be one of
4913 * the things representable by a regnode. If this \N instance turns out to be
4914 * one such, the function generates the regnode, returns TRUE and sets *node_p
4915 * to point to the offset of that regnode into the regex engine program being
4916 * compiled.
4917 *
4918 * If this instance of \N isn't legal in any context, this function will
4919 * generate a fatal error and not return.
4920 *
4921 * On input, RExC_parse should point to the first char following the \N at the
4922 * time of the call. On successful return, RExC_parse will have been updated
4923 * to point to just after the sequence identified by this routine. Also
4924 * *flagp has been updated as needed.
4925 *
4926 * When there is some problem with the current context and this \N instance,
4927 * the function returns FALSE, without advancing RExC_parse, nor setting
4928 * *node_p, nor *code_point_p, nor *flagp.
4929 *
4930 * If <cp_count> is not NULL, the caller wants to know the length (in code
4931 * points) that this \N sequence matches. This is set, and the input is
4932 * parsed for errors, even if the function returns FALSE, as detailed below.
4933 *
4934 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
4935 *
4936 * Probably the most common case is for the \N to specify a single code point.
4937 * *cp_count will be set to 1, and *code_point_p will be set to that code
4938 * point.
4939 *
4940 * Another possibility is for the input to be an empty \N{}. This is no
4941 * longer accepted, and will generate a fatal error.
4942 *
4943 * Another possibility is for a custom charnames handler to be in effect which
4944 * translates the input name to an empty string. *cp_count will be set to 0.
4945 * *node_p will be set to a generated NOTHING node.
4946 *
4947 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
4948 * set to 0. *node_p will be set to a generated REG_ANY node.
4949 *
4950 * The fifth possibility is that \N resolves to a sequence of more than one
4951 * code points. *cp_count will be set to the number of code points in the
4952 * sequence. *node_p will be set to a generated node returned by this
4953 * function calling S_reg().
4954 *
4955 * The sixth and final possibility is that it is premature to be calling this
4956 * function; the parse needs to be restarted. This can happen when this
4957 * changes from /d to /u rules, or when the pattern needs to be upgraded to
4958 * UTF-8. The latter occurs only when the fifth possibility would otherwise
4959 * be in effect, and is because one of those code points requires the pattern
4960 * to be recompiled as UTF-8. The function returns FALSE, and sets the
4961 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
4962 * happens, the caller needs to desist from continuing parsing, and return
4963 * this information to its caller. This is not set for when there is only one
4964 * code point, as this can be called as part of an ANYOF node, and they can
4965 * store above-Latin1 code points without the pattern having to be in UTF-8.
4966 *
4967 * For non-single-quoted regexes, the tokenizer has resolved character and
4968 * sequence names inside \N{...} into their Unicode values, normalizing the
4969 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
4970 * hex-represented code points in the sequence. This is done there because
4971 * the names can vary based on what charnames pragma is in scope at the time,
4972 * so we need a way to take a snapshot of what they resolve to at the time of
4973 * the original parse. [perl #56444].
4974 *
4975 * That parsing is skipped for single-quoted regexes, so here we may get
4976 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
4977 * like '\N{U+41}', that code point is Unicode, and has to be translated into
4978 * the native character set for non-ASCII platforms. The other possibilities
4979 * are already native, so no translation is done. */
4980
4981 char * endbrace; /* points to '}' following the name */
4982 char * e; /* points to final non-blank before endbrace */
4983 char* p = RExC_parse; /* Temporary */
4984
4985 SV * substitute_parse = NULL;
4986 char *orig_end;
4987 char *save_start;
4988 I32 flags;
4989
4990 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4991
4992 PERL_ARGS_ASSERT_GROK_BSLASH_N;
4993
4994 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
4995 assert(! (node_p && cp_count)); /* At most 1 should be set */
4996
4997 if (cp_count) { /* Initialize return for the most common case */
4998 *cp_count = 1;
4999 }
5000
5001 /* The [^\n] meaning of \N ignores spaces and comments under the /x
5002 * modifier. The other meanings do not (except blanks adjacent to and
5003 * within the braces), so use a temporary until we find out which we are
5004 * being called with */
5005 skip_to_be_ignored_text(pRExC_state, &p,
5006 FALSE /* Don't force to /x */ );
5007
5008 /* Disambiguate between \N meaning a named character versus \N meaning
5009 * [^\n]. The latter is assumed when the {...} following the \N is a legal
5010 * quantifier, or if there is no '{' at all */
5011 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5012 RExC_parse_set(p);
5013 if (cp_count) {
5014 *cp_count = -1;
5015 }
5016
5017 if (! node_p) {
5018 return FALSE;
5019 }
5020
5021 *node_p = reg_node(pRExC_state, REG_ANY);
5022 *flagp |= HASWIDTH|SIMPLE;
5023 MARK_NAUGHTY(1);
5024 return TRUE;
5025 }
5026
5027 /* The test above made sure that the next real character is a '{', but
5028 * under the /x modifier, it could be separated by space (or a comment and
5029 * \n) and this is not allowed (for consistency with \x{...} and the
5030 * tokenizer handling of \N{NAME}). */
5031 if (*RExC_parse != '{') {
5032 vFAIL("Missing braces on \\N{}");
5033 }
5034
5035 RExC_parse_inc_by(1); /* Skip past the '{' */
5036
5037 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5038 if (! endbrace) { /* no trailing brace */
5039 vFAIL2("Missing right brace on \\%c{}", 'N');
5040 }
5041
5042 /* Here, we have decided it should be a named character or sequence. These
5043 * imply Unicode semantics */
5044 REQUIRE_UNI_RULES(flagp, FALSE);
5045
5046 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5047 * nothing at all (not allowed under strict) */
5048 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5049 RExC_parse_set(endbrace);
5050 if (strict) {
5051 RExC_parse_inc_by(1); /* Position after the "}" */
5052 vFAIL("Zero length \\N{}");
5053 }
5054
5055 if (cp_count) {
5056 *cp_count = 0;
5057 }
5058 nextchar(pRExC_state);
5059 if (! node_p) {
5060 return FALSE;
5061 }
5062
5063 *node_p = reg_node(pRExC_state, NOTHING);
5064 return TRUE;
5065 }
5066
5067 while (isBLANK(*RExC_parse)) {
5068 RExC_parse_inc_by(1);
5069 }
5070
5071 e = endbrace;
5072 while (RExC_parse < e && isBLANK(*(e-1))) {
5073 e--;
5074 }
5075
5076 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5077
5078 /* Here, the name isn't of the form U+.... This can happen if the
5079 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
5080 * is the time to find out what the name means */
5081
5082 const STRLEN name_len = e - RExC_parse;
5083 SV * value_sv; /* What does this name evaluate to */
5084 SV ** value_svp;
5085 const U8 * value; /* string of name's value */
5086 STRLEN value_len; /* and its length */
5087
5088 /* RExC_unlexed_names is a hash of names that weren't evaluated by
5089 * toke.c, and their values. Make sure is initialized */
5090 if (! RExC_unlexed_names) {
5091 RExC_unlexed_names = newHV();
5092 }
5093
5094 /* If we have already seen this name in this pattern, use that. This
5095 * allows us to only call the charnames handler once per name per
5096 * pattern. A broken or malicious handler could return something
5097 * different each time, which could cause the results to vary depending
5098 * on if something gets added or subtracted from the pattern that
5099 * causes the number of passes to change, for example */
5100 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5101 name_len, 0)))
5102 {
5103 value_sv = *value_svp;
5104 }
5105 else { /* Otherwise we have to go out and get the name */
5106 const char * error_msg = NULL;
5107 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5108 UTF,
5109 &error_msg);
5110 if (error_msg) {
5111 RExC_parse_set(endbrace);
5112 vFAIL(error_msg);
5113 }
5114
5115 /* If no error message, should have gotten a valid return */
5116 assert (value_sv);
5117
5118 /* Save the name's meaning for later use */
5119 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5120 value_sv, 0))
5121 {
5122 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5123 }
5124 }
5125
5126 /* Here, we have the value the name evaluates to in 'value_sv' */
5127 value = (U8 *) SvPV(value_sv, value_len);
5128
5129 /* See if the result is one code point vs 0 or multiple */
5130 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5131 ? UTF8SKIP(value)
5132 : 1)))
5133 {
5134 /* Here, exactly one code point. If that isn't what is wanted,
5135 * fail */
5136 if (! code_point_p) {
5137 RExC_parse_set(p);
5138 return FALSE;
5139 }
5140
5141 /* Convert from string to numeric code point */
5142 *code_point_p = (SvUTF8(value_sv))
5143 ? valid_utf8_to_uvchr(value, NULL)
5144 : *value;
5145
5146 /* Have parsed this entire single code point \N{...}. *cp_count
5147 * has already been set to 1, so don't do it again. */
5148 RExC_parse_set(endbrace);
5149 nextchar(pRExC_state);
5150 return TRUE;
5151 } /* End of is a single code point */
5152
5153 /* Count the code points, if caller desires. The API says to do this
5154 * even if we will later return FALSE */
5155 if (cp_count) {
5156 *cp_count = 0;
5157
5158 *cp_count = (SvUTF8(value_sv))
5159 ? utf8_length(value, value + value_len)
5160 : value_len;
5161 }
5162
5163 /* Fail if caller doesn't want to handle a multi-code-point sequence.
5164 * But don't back the pointer up if the caller wants to know how many
5165 * code points there are (they need to handle it themselves in this
5166 * case). */
5167 if (! node_p) {
5168 if (! cp_count) {
5169 RExC_parse_set(p);
5170 }
5171 return FALSE;
5172 }
5173
5174 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5175 * reg recursively to parse it. That way, it retains its atomicness,
5176 * while not having to worry about any special handling that some code
5177 * points may have. */
5178
5179 substitute_parse = newSVpvs("?:");
5180 sv_catsv(substitute_parse, value_sv);
5181 sv_catpv(substitute_parse, ")");
5182
5183 /* The value should already be native, so no need to convert on EBCDIC
5184 * platforms.*/
5185 assert(! RExC_recode_x_to_native);
5186
5187 }
5188 else { /* \N{U+...} */
5189 Size_t count = 0; /* code point count kept internally */
5190
5191 /* We can get to here when the input is \N{U+...} or when toke.c has
5192 * converted a name to the \N{U+...} form. This include changing a
5193 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5194
5195 RExC_parse_inc_by(2); /* Skip past the 'U+' */
5196
5197 /* Code points are separated by dots. The '}' terminates the whole
5198 * thing. */
5199
5200 do { /* Loop until the ending brace */
5201 I32 flags = PERL_SCAN_SILENT_OVERFLOW
5202 | PERL_SCAN_SILENT_ILLDIGIT
5203 | PERL_SCAN_NOTIFY_ILLDIGIT
5204 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5205 | PERL_SCAN_DISALLOW_PREFIX;
5206 STRLEN len = e - RExC_parse;
5207 NV overflow_value;
5208 char * start_digit = RExC_parse;
5209 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5210
5211 if (len == 0) {
5212 RExC_parse_inc_by(1);
5213 bad_NU:
5214 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5215 }
5216
5217 RExC_parse_inc_by(len);
5218
5219 if (cp > MAX_LEGAL_CP) {
5220 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5221 }
5222
5223 if (RExC_parse >= e) { /* Got to the closing '}' */
5224 if (count) {
5225 goto do_concat;
5226 }
5227
5228 /* Here, is a single code point; fail if doesn't want that */
5229 if (! code_point_p) {
5230 RExC_parse_set(p);
5231 return FALSE;
5232 }
5233
5234 /* A single code point is easy to handle; just return it */
5235 *code_point_p = UNI_TO_NATIVE(cp);
5236 RExC_parse_set(endbrace);
5237 nextchar(pRExC_state);
5238 return TRUE;
5239 }
5240
5241 /* Here, the parse stopped bfore the ending brace. This is legal
5242 * only if that character is a dot separating code points, like a
5243 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5244 * So the next character must be a dot (and the one after that
5245 * can't be the ending brace, or we'd have something like
5246 * \N{U+100.} )
5247 * */
5248 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5249 /*point to after 1st invalid */
5250 RExC_parse_incf(RExC_orig_utf8);
5251 /*Guard against malformed utf8*/
5252 RExC_parse_set(MIN(e, RExC_parse));
5253 goto bad_NU;
5254 }
5255
5256 /* Here, looks like its really a multiple character sequence. Fail
5257 * if that's not what the caller wants. But continue with counting
5258 * and error checking if they still want a count */
5259 if (! node_p && ! cp_count) {
5260 return FALSE;
5261 }
5262
5263 /* What is done here is to convert this to a sub-pattern of the
5264 * form \x{char1}\x{char2}... and then call reg recursively to
5265 * parse it (enclosing in "(?: ... )" ). That way, it retains its
5266 * atomicness, while not having to worry about special handling
5267 * that some code points may have. We don't create a subpattern,
5268 * but go through the motions of code point counting and error
5269 * checking, if the caller doesn't want a node returned. */
5270
5271 if (node_p && ! substitute_parse) {
5272 substitute_parse = newSVpvs("?:");
5273 }
5274
5275 do_concat:
5276
5277 if (node_p) {
5278 /* Convert to notation the rest of the code understands */
5279 sv_catpvs(substitute_parse, "\\x{");
5280 sv_catpvn(substitute_parse, start_digit,
5281 RExC_parse - start_digit);
5282 sv_catpvs(substitute_parse, "}");
5283 }
5284
5285 /* Move to after the dot (or ending brace the final time through.)
5286 * */
5287 RExC_parse_inc_by(1);
5288 count++;
5289
5290 } while (RExC_parse < e);
5291
5292 if (! node_p) { /* Doesn't want the node */
5293 assert (cp_count);
5294
5295 *cp_count = count;
5296 return FALSE;
5297 }
5298
5299 sv_catpvs(substitute_parse, ")");
5300
5301 /* The values are Unicode, and therefore have to be converted to native
5302 * on a non-Unicode (meaning non-ASCII) platform. */
5303 SET_recode_x_to_native(1);
5304 }
5305
5306 /* Here, we have the string the name evaluates to, ready to be parsed,
5307 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5308 * constructs. This can be called from within a substitute parse already.
5309 * The error reporting mechanism doesn't work for 2 levels of this, but the
5310 * code above has validated this new construct, so there should be no
5311 * errors generated by the below. And this isn't an exact copy, so the
5312 * mechanism to seamlessly deal with this won't work, so turn off warnings
5313 * during it */
5314 save_start = RExC_start;
5315 orig_end = RExC_end;
5316
5317 RExC_start = SvPVX(substitute_parse);
5318 RExC_parse_set(RExC_start);
5319 RExC_end = RExC_parse + SvCUR(substitute_parse);
5320 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5321
5322 *node_p = reg(pRExC_state, 1, &flags, depth+1);
5323
5324 /* Restore the saved values */
5325 RESTORE_WARNINGS;
5326 RExC_start = save_start;
5327 RExC_parse_set(endbrace);
5328 RExC_end = orig_end;
5329 SET_recode_x_to_native(0);
5330
5331 SvREFCNT_dec_NN(substitute_parse);
5332
5333 if (! *node_p) {
5334 RETURN_FAIL_ON_RESTART(flags, flagp);
5335 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5336 (UV) flags);
5337 }
5338 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5339
5340 nextchar(pRExC_state);
5341
5342 return TRUE;
5343 }
5344
5345
5346 STATIC U8
S_compute_EXACTish(RExC_state_t * pRExC_state)5347 S_compute_EXACTish(RExC_state_t *pRExC_state)
5348 {
5349 U8 op;
5350
5351 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5352
5353 if (! FOLD) {
5354 return (LOC)
5355 ? EXACTL
5356 : EXACT;
5357 }
5358
5359 op = get_regex_charset(RExC_flags);
5360 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5361 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5362 been, so there is no hole */
5363 }
5364
5365 return op + EXACTF;
5366 }
5367
5368 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
5369 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5370
5371 static I32
S_backref_value(char * p,char * e)5372 S_backref_value(char *p, char *e)
5373 {
5374 const char* endptr = e;
5375 UV val;
5376 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5377 return (I32)val;
5378 return I32_MAX;
5379 }
5380
5381
5382 /*
5383 - regatom - the lowest level
5384
5385 Try to identify anything special at the start of the current parse position.
5386 If there is, then handle it as required. This may involve generating a
5387 single regop, such as for an assertion; or it may involve recursing, such as
5388 to handle a () structure.
5389
5390 If the string doesn't start with something special then we gobble up
5391 as much literal text as we can. If we encounter a quantifier, we have to
5392 back off the final literal character, as that quantifier applies to just it
5393 and not to the whole string of literals.
5394
5395 Once we have been able to handle whatever type of thing started the
5396 sequence, we return the offset into the regex engine program being compiled
5397 at which any next regnode should be placed.
5398
5399 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5400 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5401 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5402 Otherwise does not return 0.
5403
5404 Note: we have to be careful with escapes, as they can be both literal
5405 and special, and in the case of \10 and friends, context determines which.
5406
5407 A summary of the code structure is:
5408
5409 switch (first_byte) {
5410 cases for each special:
5411 handle this special;
5412 break;
5413 case '\\':
5414 switch (2nd byte) {
5415 cases for each unambiguous special:
5416 handle this special;
5417 break;
5418 cases for each ambiguous special/literal:
5419 disambiguate;
5420 if (special) handle here
5421 else goto defchar;
5422 default: // unambiguously literal:
5423 goto defchar;
5424 }
5425 default: // is a literal char
5426 // FALL THROUGH
5427 defchar:
5428 create EXACTish node for literal;
5429 while (more input and node isn't full) {
5430 switch (input_byte) {
5431 cases for each special;
5432 make sure parse pointer is set so that the next call to
5433 regatom will see this special first
5434 goto loopdone; // EXACTish node terminated by prev. char
5435 default:
5436 append char to EXACTISH node;
5437 }
5438 get next input byte;
5439 }
5440 loopdone:
5441 }
5442 return the generated node;
5443
5444 Specifically there are two separate switches for handling
5445 escape sequences, with the one for handling literal escapes requiring
5446 a dummy entry for all of the special escapes that are actually handled
5447 by the other.
5448
5449 */
5450
5451 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)5452 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5453 {
5454 regnode_offset ret = 0;
5455 I32 flags = 0;
5456 char *atom_parse_start;
5457 U8 op;
5458 int invert = 0;
5459
5460 DECLARE_AND_GET_RE_DEBUG_FLAGS;
5461
5462 *flagp = 0; /* Initialize. */
5463
5464 DEBUG_PARSE("atom");
5465
5466 PERL_ARGS_ASSERT_REGATOM;
5467
5468 tryagain:
5469 atom_parse_start = RExC_parse;
5470 assert(RExC_parse < RExC_end);
5471 switch ((U8)*RExC_parse) {
5472 case '^':
5473 RExC_seen_zerolen++;
5474 nextchar(pRExC_state);
5475 if (RExC_flags & RXf_PMf_MULTILINE)
5476 ret = reg_node(pRExC_state, MBOL);
5477 else
5478 ret = reg_node(pRExC_state, SBOL);
5479 break;
5480 case '$':
5481 nextchar(pRExC_state);
5482 if (*RExC_parse)
5483 RExC_seen_zerolen++;
5484 if (RExC_flags & RXf_PMf_MULTILINE)
5485 ret = reg_node(pRExC_state, MEOL);
5486 else
5487 ret = reg_node(pRExC_state, SEOL);
5488 break;
5489 case '.':
5490 nextchar(pRExC_state);
5491 if (RExC_flags & RXf_PMf_SINGLELINE)
5492 ret = reg_node(pRExC_state, SANY);
5493 else
5494 ret = reg_node(pRExC_state, REG_ANY);
5495 *flagp |= HASWIDTH|SIMPLE;
5496 MARK_NAUGHTY(1);
5497 break;
5498 case '[':
5499 {
5500 char * const cc_parse_start = ++RExC_parse;
5501 ret = regclass(pRExC_state, flagp, depth+1,
5502 FALSE, /* means parse the whole char class */
5503 TRUE, /* allow multi-char folds */
5504 FALSE, /* don't silence non-portable warnings. */
5505 (bool) RExC_strict,
5506 TRUE, /* Allow an optimized regnode result */
5507 NULL);
5508 if (ret == 0) {
5509 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5510 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5511 (UV) *flagp);
5512 }
5513 if (*RExC_parse != ']') {
5514 RExC_parse_set(cc_parse_start);
5515 vFAIL("Unmatched [");
5516 }
5517 nextchar(pRExC_state);
5518 break;
5519 }
5520 case '(':
5521 nextchar(pRExC_state);
5522 ret = reg(pRExC_state, 2, &flags, depth+1);
5523 if (ret == 0) {
5524 if (flags & TRYAGAIN) {
5525 if (RExC_parse >= RExC_end) {
5526 /* Make parent create an empty node if needed. */
5527 *flagp |= TRYAGAIN;
5528 return(0);
5529 }
5530 goto tryagain;
5531 }
5532 RETURN_FAIL_ON_RESTART(flags, flagp);
5533 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5534 (UV) flags);
5535 }
5536 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5537 break;
5538 case '|':
5539 case ')':
5540 if (flags & TRYAGAIN) {
5541 *flagp |= TRYAGAIN;
5542 return 0;
5543 }
5544 vFAIL("Internal urp");
5545 /* Supposed to be caught earlier. */
5546 break;
5547 case '?':
5548 case '+':
5549 case '*':
5550 RExC_parse_inc_by(1);
5551 vFAIL("Quantifier follows nothing");
5552 break;
5553 case '\\':
5554 /* Special Escapes
5555
5556 This switch handles escape sequences that resolve to some kind
5557 of special regop and not to literal text. Escape sequences that
5558 resolve to literal text are handled below in the switch marked
5559 "Literal Escapes".
5560
5561 Every entry in this switch *must* have a corresponding entry
5562 in the literal escape switch. However, the opposite is not
5563 required, as the default for this switch is to jump to the
5564 literal text handling code.
5565 */
5566 RExC_parse_inc_by(1);
5567 switch ((U8)*RExC_parse) {
5568 /* Special Escapes */
5569 case 'A':
5570 RExC_seen_zerolen++;
5571 /* Under wildcards, this is changed to match \n; should be
5572 * invisible to the user, as they have to compile under /m */
5573 if (RExC_pm_flags & PMf_WILDCARD) {
5574 ret = reg_node(pRExC_state, MBOL);
5575 }
5576 else {
5577 ret = reg_node(pRExC_state, SBOL);
5578 /* SBOL is shared with /^/ so we set the flags so we can tell
5579 * /\A/ from /^/ in split. */
5580 FLAGS(REGNODE_p(ret)) = 1;
5581 }
5582 goto finish_meta_pat;
5583 case 'G':
5584 if (RExC_pm_flags & PMf_WILDCARD) {
5585 RExC_parse_inc_by(1);
5586 /* diag_listed_as: Use of %s is not allowed in Unicode property
5587 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5588 */
5589 vFAIL("Use of '\\G' is not allowed in Unicode property"
5590 " wildcard subpatterns");
5591 }
5592 ret = reg_node(pRExC_state, GPOS);
5593 RExC_seen |= REG_GPOS_SEEN;
5594 goto finish_meta_pat;
5595 case 'K':
5596 if (!RExC_in_lookaround) {
5597 RExC_seen_zerolen++;
5598 ret = reg_node(pRExC_state, KEEPS);
5599 /* XXX:dmq : disabling in-place substitution seems to
5600 * be necessary here to avoid cases of memory corruption, as
5601 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5602 */
5603 RExC_seen |= REG_LOOKBEHIND_SEEN;
5604 goto finish_meta_pat;
5605 }
5606 else {
5607 ++RExC_parse; /* advance past the 'K' */
5608 vFAIL("\\K not permitted in lookahead/lookbehind");
5609 }
5610 case 'Z':
5611 if (RExC_pm_flags & PMf_WILDCARD) {
5612 /* See comment under \A above */
5613 ret = reg_node(pRExC_state, MEOL);
5614 }
5615 else {
5616 ret = reg_node(pRExC_state, SEOL);
5617 }
5618 RExC_seen_zerolen++; /* Do not optimize RE away */
5619 goto finish_meta_pat;
5620 case 'z':
5621 if (RExC_pm_flags & PMf_WILDCARD) {
5622 /* See comment under \A above */
5623 ret = reg_node(pRExC_state, MEOL);
5624 }
5625 else {
5626 ret = reg_node(pRExC_state, EOS);
5627 }
5628 RExC_seen_zerolen++; /* Do not optimize RE away */
5629 goto finish_meta_pat;
5630 case 'C':
5631 vFAIL("\\C no longer supported");
5632 case 'X':
5633 ret = reg_node(pRExC_state, CLUMP);
5634 *flagp |= HASWIDTH;
5635 goto finish_meta_pat;
5636
5637 case 'B':
5638 invert = 1;
5639 /* FALLTHROUGH */
5640 case 'b':
5641 {
5642 U8 flags = 0;
5643 regex_charset charset = get_regex_charset(RExC_flags);
5644
5645 RExC_seen_zerolen++;
5646 RExC_seen |= REG_LOOKBEHIND_SEEN;
5647 op = BOUND + charset;
5648
5649 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5650 flags = TRADITIONAL_BOUND;
5651 if (op > BOUNDA) { /* /aa is same as /a */
5652 op = BOUNDA;
5653 }
5654 }
5655 else {
5656 STRLEN length;
5657 char name = *RExC_parse;
5658 char * endbrace = (char *) memchr(RExC_parse, '}',
5659 RExC_end - RExC_parse);
5660 char * e = endbrace;
5661
5662 RExC_parse_inc_by(2);
5663
5664 if (! endbrace) {
5665 vFAIL2("Missing right brace on \\%c{}", name);
5666 }
5667
5668 while (isBLANK(*RExC_parse)) {
5669 RExC_parse_inc_by(1);
5670 }
5671
5672 while (RExC_parse < e && isBLANK(*(e - 1))) {
5673 e--;
5674 }
5675
5676 if (e == RExC_parse) {
5677 RExC_parse_set(endbrace + 1); /* After the '}' */
5678 vFAIL2("Empty \\%c{}", name);
5679 }
5680
5681 length = e - RExC_parse;
5682
5683 switch (*RExC_parse) {
5684 case 'g':
5685 if ( length != 1
5686 && (memNEs(RExC_parse + 1, length - 1, "cb")))
5687 {
5688 goto bad_bound_type;
5689 }
5690 flags = GCB_BOUND;
5691 break;
5692 case 'l':
5693 if (length != 2 || *(RExC_parse + 1) != 'b') {
5694 goto bad_bound_type;
5695 }
5696 flags = LB_BOUND;
5697 break;
5698 case 's':
5699 if (length != 2 || *(RExC_parse + 1) != 'b') {
5700 goto bad_bound_type;
5701 }
5702 flags = SB_BOUND;
5703 break;
5704 case 'w':
5705 if (length != 2 || *(RExC_parse + 1) != 'b') {
5706 goto bad_bound_type;
5707 }
5708 flags = WB_BOUND;
5709 break;
5710 default:
5711 bad_bound_type:
5712 RExC_parse_set(e);
5713 vFAIL2utf8f(
5714 "'%" UTF8f "' is an unknown bound type",
5715 UTF8fARG(UTF, length, e - length));
5716 NOT_REACHED; /*NOTREACHED*/
5717 }
5718 RExC_parse_set(endbrace);
5719 REQUIRE_UNI_RULES(flagp, 0);
5720
5721 if (op == BOUND) {
5722 op = BOUNDU;
5723 }
5724 else if (op >= BOUNDA) { /* /aa is same as /a */
5725 op = BOUNDU;
5726 length += 4;
5727
5728 /* Don't have to worry about UTF-8, in this message because
5729 * to get here the contents of the \b must be ASCII */
5730 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
5731 "Using /u for '%.*s' instead of /%s",
5732 (unsigned) length,
5733 endbrace - length + 1,
5734 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5735 ? ASCII_RESTRICT_PAT_MODS
5736 : ASCII_MORE_RESTRICT_PAT_MODS);
5737 }
5738 }
5739
5740 if (op == BOUND) {
5741 RExC_seen_d_op = TRUE;
5742 }
5743 else if (op == BOUNDL) {
5744 RExC_contains_locale = 1;
5745 }
5746
5747 if (invert) {
5748 op += NBOUND - BOUND;
5749 }
5750
5751 ret = reg_node(pRExC_state, op);
5752 FLAGS(REGNODE_p(ret)) = flags;
5753
5754 goto finish_meta_pat;
5755 }
5756
5757 case 'R':
5758 ret = reg_node(pRExC_state, LNBREAK);
5759 *flagp |= HASWIDTH|SIMPLE;
5760 goto finish_meta_pat;
5761
5762 case 'd':
5763 case 'D':
5764 case 'h':
5765 case 'H':
5766 case 'p':
5767 case 'P':
5768 case 's':
5769 case 'S':
5770 case 'v':
5771 case 'V':
5772 case 'w':
5773 case 'W':
5774 /* These all have the same meaning inside [brackets], and it knows
5775 * how to do the best optimizations for them. So, pretend we found
5776 * these within brackets, and let it do the work */
5777 RExC_parse--;
5778
5779 ret = regclass(pRExC_state, flagp, depth+1,
5780 TRUE, /* means just parse this element */
5781 FALSE, /* don't allow multi-char folds */
5782 FALSE, /* don't silence non-portable warnings. It
5783 would be a bug if these returned
5784 non-portables */
5785 (bool) RExC_strict,
5786 TRUE, /* Allow an optimized regnode result */
5787 NULL);
5788 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5789 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5790 * multi-char folds are allowed. */
5791 if (!ret)
5792 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5793 (UV) *flagp);
5794
5795 RExC_parse--; /* regclass() leaves this one too far ahead */
5796
5797 finish_meta_pat:
5798 /* The escapes above that don't take a parameter can't be
5799 * followed by a '{'. But 'pX', 'p{foo}' and
5800 * correspondingly 'P' can be */
5801 if ( RExC_parse - atom_parse_start == 1
5802 && UCHARAT(RExC_parse + 1) == '{'
5803 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5804 {
5805 RExC_parse_inc_by(2);
5806 vFAIL("Unescaped left brace in regex is illegal here");
5807 }
5808 nextchar(pRExC_state);
5809 break;
5810 case 'N':
5811 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5812 * \N{...} evaluates to a sequence of more than one code points).
5813 * The function call below returns a regnode, which is our result.
5814 * The parameters cause it to fail if the \N{} evaluates to a
5815 * single code point; we handle those like any other literal. The
5816 * reason that the multicharacter case is handled here and not as
5817 * part of the EXACtish code is because of quantifiers. In
5818 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5819 * this way makes that Just Happen. dmq.
5820 * join_exact() will join this up with adjacent EXACTish nodes
5821 * later on, if appropriate. */
5822 ++RExC_parse;
5823 if (grok_bslash_N(pRExC_state,
5824 &ret, /* Want a regnode returned */
5825 NULL, /* Fail if evaluates to a single code
5826 point */
5827 NULL, /* Don't need a count of how many code
5828 points */
5829 flagp,
5830 RExC_strict,
5831 depth)
5832 ) {
5833 break;
5834 }
5835
5836 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5837
5838 /* Here, evaluates to a single code point. Go get that */
5839 RExC_parse_set(atom_parse_start);
5840 goto defchar;
5841
5842 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5843 parse_named_seq: /* Also handle non-numeric \g{...} */
5844 {
5845 char ch;
5846 if ( RExC_parse >= RExC_end - 1
5847 || (( ch = RExC_parse[1]) != '<'
5848 && ch != '\''
5849 && ch != '{'))
5850 {
5851 RExC_parse_inc_by(1);
5852 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5853 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5854 } else {
5855 RExC_parse_inc_by(2);
5856 if (ch == '{') {
5857 while (isBLANK(*RExC_parse)) {
5858 RExC_parse_inc_by(1);
5859 }
5860 }
5861 ret = handle_named_backref(pRExC_state,
5862 flagp,
5863 atom_parse_start,
5864 (ch == '<')
5865 ? '>'
5866 : (ch == '{')
5867 ? '}'
5868 : '\'');
5869 }
5870 break;
5871 }
5872 case 'g':
5873 case '1': case '2': case '3': case '4':
5874 case '5': case '6': case '7': case '8': case '9':
5875 {
5876 I32 num;
5877 char * endbrace = NULL;
5878 char * s = RExC_parse;
5879 char * e = RExC_end;
5880
5881 if (*s == 'g') {
5882 bool isrel = 0;
5883
5884 s++;
5885 if (*s == '{') {
5886 endbrace = (char *) memchr(s, '}', RExC_end - s);
5887 if (! endbrace ) {
5888
5889 /* Missing '}'. Position after the number to give
5890 * a better indication to the user of where the
5891 * problem is. */
5892 s++;
5893 if (*s == '-') {
5894 s++;
5895 }
5896
5897 /* If it looks to be a name and not a number, go
5898 * handle it there */
5899 if (! isDIGIT(*s)) {
5900 goto parse_named_seq;
5901 }
5902
5903 do {
5904 s++;
5905 } while isDIGIT(*s);
5906
5907 RExC_parse_set(s);
5908 vFAIL("Unterminated \\g{...} pattern");
5909 }
5910
5911 s++; /* Past the '{' */
5912
5913 while (isBLANK(*s)) {
5914 s++;
5915 }
5916
5917 /* Ignore trailing blanks */
5918 e = endbrace;
5919 while (s < e && isBLANK(*(e - 1))) {
5920 e--;
5921 }
5922 }
5923
5924 /* Here, have isolated the meat of the construct from any
5925 * surrounding braces */
5926
5927 if (*s == '-') {
5928 isrel = 1;
5929 s++;
5930 }
5931
5932 if (endbrace && !isDIGIT(*s)) {
5933 goto parse_named_seq;
5934 }
5935
5936 RExC_parse_set(s);
5937 num = S_backref_value(RExC_parse, RExC_end);
5938 if (num == 0)
5939 vFAIL("Reference to invalid group 0");
5940 else if (num == I32_MAX) {
5941 if (isDIGIT(*RExC_parse))
5942 vFAIL("Reference to nonexistent group");
5943 else
5944 vFAIL("Unterminated \\g... pattern");
5945 }
5946
5947 if (isrel) {
5948 num = RExC_npar - num;
5949 if (num < 1)
5950 vFAIL("Reference to nonexistent or unclosed group");
5951 }
5952 else
5953 if (num < RExC_logical_npar) {
5954 num = RExC_logical_to_parno[num];
5955 }
5956 else
5957 if (ALL_PARENS_COUNTED) {
5958 if (num < RExC_logical_total_parens)
5959 num = RExC_logical_to_parno[num];
5960 else {
5961 num = -1;
5962 }
5963 }
5964 else{
5965 REQUIRE_PARENS_PASS;
5966 }
5967 }
5968 else {
5969 num = S_backref_value(RExC_parse, RExC_end);
5970 /* bare \NNN might be backref or octal - if it is larger
5971 * than or equal RExC_npar then it is assumed to be an
5972 * octal escape. Note RExC_npar is +1 from the actual
5973 * number of parens. */
5974 /* Note we do NOT check if num == I32_MAX here, as that is
5975 * handled by the RExC_npar check */
5976
5977 if ( /* any numeric escape < 10 is always a backref */
5978 num > 9
5979 /* any numeric escape < RExC_npar is a backref */
5980 && num >= RExC_logical_npar
5981 /* cannot be an octal escape if it starts with [89]
5982 * */
5983 && ! inRANGE(*RExC_parse, '8', '9')
5984 ) {
5985 /* Probably not meant to be a backref, instead likely
5986 * to be an octal character escape, e.g. \35 or \777.
5987 * The above logic should make it obvious why using
5988 * octal escapes in patterns is problematic. - Yves */
5989 RExC_parse_set(atom_parse_start);
5990 goto defchar;
5991 }
5992 if (num < RExC_logical_npar) {
5993 num = RExC_logical_to_parno[num];
5994 }
5995 else
5996 if (ALL_PARENS_COUNTED) {
5997 if (num < RExC_logical_total_parens) {
5998 num = RExC_logical_to_parno[num];
5999 } else {
6000 num = -1;
6001 }
6002 } else {
6003 REQUIRE_PARENS_PASS;
6004 }
6005 }
6006
6007 /* At this point RExC_parse points at a numeric escape like
6008 * \12 or \88 or the digits in \g{34} or \g34 or something
6009 * similar, which we should NOT treat as an octal escape. It
6010 * may or may not be a valid backref escape. For instance
6011 * \88888888 is unlikely to be a valid backref.
6012 *
6013 * We've already figured out what value the digits represent.
6014 * Now, move the parse to beyond them. */
6015 if (endbrace) {
6016 RExC_parse_set(endbrace + 1);
6017 }
6018 else while (isDIGIT(*RExC_parse)) {
6019 RExC_parse_inc_by(1);
6020 }
6021 if (num < 0)
6022 vFAIL("Reference to nonexistent group");
6023
6024 if (num >= (I32)RExC_npar) {
6025 /* It might be a forward reference; we can't fail until we
6026 * know, by completing the parse to get all the groups, and
6027 * then reparsing */
6028 if (ALL_PARENS_COUNTED) {
6029 if (num >= RExC_total_parens) {
6030 vFAIL("Reference to nonexistent group");
6031 }
6032 }
6033 else {
6034 REQUIRE_PARENS_PASS;
6035 }
6036 }
6037 RExC_sawback = 1;
6038 ret = reg2node(pRExC_state,
6039 ((! FOLD)
6040 ? REF
6041 : (ASCII_FOLD_RESTRICTED)
6042 ? REFFA
6043 : (AT_LEAST_UNI_SEMANTICS)
6044 ? REFFU
6045 : (LOC)
6046 ? REFFL
6047 : REFF),
6048 num, RExC_nestroot);
6049 if (RExC_nestroot && num >= RExC_nestroot)
6050 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6051 if (OP(REGNODE_p(ret)) == REFF) {
6052 RExC_seen_d_op = TRUE;
6053 }
6054 *flagp |= HASWIDTH;
6055
6056 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6057 FALSE /* Don't force to /x */ );
6058 }
6059 break;
6060 case '\0':
6061 if (RExC_parse >= RExC_end)
6062 FAIL("Trailing \\");
6063 /* FALLTHROUGH */
6064 default:
6065 /* Do not generate "unrecognized" warnings here, we fall
6066 back into the quick-grab loop below */
6067 RExC_parse_set(atom_parse_start);
6068 goto defchar;
6069 } /* end of switch on a \foo sequence */
6070 break;
6071
6072 case '#':
6073
6074 /* '#' comments should have been spaced over before this function was
6075 * called */
6076 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6077 /*
6078 if (RExC_flags & RXf_PMf_EXTENDED) {
6079 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6080 if (RExC_parse < RExC_end)
6081 goto tryagain;
6082 }
6083 */
6084
6085 /* FALLTHROUGH */
6086
6087 default:
6088 defchar: {
6089
6090 /* Here, we have determined that the next thing is probably a
6091 * literal character. RExC_parse points to the first byte of its
6092 * definition. (It still may be an escape sequence that evaluates
6093 * to a single character) */
6094
6095 STRLEN len = 0;
6096 UV ender = 0;
6097 char *p;
6098 char *s, *old_s = NULL, *old_old_s = NULL;
6099 char *s0;
6100 U32 max_string_len = 255;
6101
6102 /* We may have to reparse the node, artificially stopping filling
6103 * it early, based on info gleaned in the first parse. This
6104 * variable gives where we stop. Make it above the normal stopping
6105 * place first time through; otherwise it would stop too early */
6106 U32 upper_fill = max_string_len + 1;
6107
6108 /* We start out as an EXACT node, even if under /i, until we find a
6109 * character which is in a fold. The algorithm now segregates into
6110 * separate nodes, characters that fold from those that don't under
6111 * /i. (This hopefully will create nodes that are fixed strings
6112 * even under /i, giving the optimizer something to grab on to.)
6113 * So, if a node has something in it and the next character is in
6114 * the opposite category, that node is closed up, and the function
6115 * returns. Then regatom is called again, and a new node is
6116 * created for the new category. */
6117 U8 node_type = EXACT;
6118
6119 /* Assume the node will be fully used; the excess is given back at
6120 * the end. Under /i, we may need to temporarily add the fold of
6121 * an extra character or two at the end to check for splitting
6122 * multi-char folds, so allocate extra space for that. We can't
6123 * make any other length assumptions, as a byte input sequence
6124 * could shrink down. */
6125 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6126 + ((! FOLD)
6127 ? 0
6128 : 2 * ((UTF)
6129 ? UTF8_MAXBYTES_CASE
6130 /* Max non-UTF-8 expansion is 2 */ : 2)));
6131
6132 bool next_is_quantifier;
6133 char * oldp = NULL;
6134
6135 /* We can convert EXACTF nodes to EXACTFU if they contain only
6136 * characters that match identically regardless of the target
6137 * string's UTF8ness. The reason to do this is that EXACTF is not
6138 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6139 * runtime.
6140 *
6141 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6142 * contain only above-Latin1 characters (hence must be in UTF8),
6143 * which don't participate in folds with Latin1-range characters,
6144 * as the latter's folds aren't known until runtime. */
6145 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6146
6147 /* Single-character EXACTish nodes are almost always SIMPLE. This
6148 * allows us to override this as encountered */
6149 U8 maybe_SIMPLE = SIMPLE;
6150
6151 /* Does this node contain something that can't match unless the
6152 * target string is (also) in UTF-8 */
6153 bool requires_utf8_target = FALSE;
6154
6155 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6156 bool has_ss = FALSE;
6157
6158 /* So is the MICRO SIGN */
6159 bool has_micro_sign = FALSE;
6160
6161 /* Set when we fill up the current node and there is still more
6162 * text to process */
6163 bool overflowed;
6164
6165 /* Allocate an EXACT node. The node_type may change below to
6166 * another EXACTish node, but since the size of the node doesn't
6167 * change, it works */
6168 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6169 FILL_NODE(ret, node_type);
6170 RExC_emit += NODE_STEP_REGNODE;
6171
6172 s = STRING(REGNODE_p(ret));
6173
6174 s0 = s;
6175
6176 reparse:
6177
6178 p = RExC_parse;
6179 len = 0;
6180 s = s0;
6181 node_type = EXACT;
6182 oldp = NULL;
6183 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6184 maybe_SIMPLE = SIMPLE;
6185 requires_utf8_target = FALSE;
6186 has_ss = FALSE;
6187 has_micro_sign = FALSE;
6188
6189 continue_parse:
6190
6191 /* This breaks under rare circumstances. If folding, we do not
6192 * want to split a node at a character that is a non-final in a
6193 * multi-char fold, as an input string could just happen to want to
6194 * match across the node boundary. The code at the end of the loop
6195 * looks for this, and backs off until it finds not such a
6196 * character, but it is possible (though extremely, extremely
6197 * unlikely) for all characters in the node to be non-final fold
6198 * ones, in which case we just leave the node fully filled, and
6199 * hope that it doesn't match the string in just the wrong place */
6200
6201 assert( ! UTF /* Is at the beginning of a character */
6202 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6203 || UTF8_IS_START(UCHARAT(RExC_parse)));
6204
6205 overflowed = FALSE;
6206
6207 /* Here, we have a literal character. Find the maximal string of
6208 * them in the input that we can fit into a single EXACTish node.
6209 * We quit at the first non-literal or when the node gets full, or
6210 * under /i the categorization of folding/non-folding character
6211 * changes */
6212 while (p < RExC_end && len < upper_fill) {
6213
6214 /* In most cases each iteration adds one byte to the output.
6215 * The exceptions override this */
6216 Size_t added_len = 1;
6217
6218 oldp = p;
6219 old_old_s = old_s;
6220 old_s = s;
6221
6222 /* White space has already been ignored */
6223 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
6224 || ! is_PATWS_safe((p), RExC_end, UTF));
6225
6226 switch ((U8)*p) {
6227 const char* message;
6228 U32 packed_warn;
6229 U8 grok_c_char;
6230
6231 case '^':
6232 case '$':
6233 case '.':
6234 case '[':
6235 case '(':
6236 case ')':
6237 case '|':
6238 goto loopdone;
6239 case '\\':
6240 /* Literal Escapes Switch
6241
6242 This switch is meant to handle escape sequences that
6243 resolve to a literal character.
6244
6245 Every escape sequence that represents something
6246 else, like an assertion or a char class, is handled
6247 in the switch marked 'Special Escapes' above in this
6248 routine, but also has an entry here as anything that
6249 isn't explicitly mentioned here will be treated as
6250 an unescaped equivalent literal.
6251 */
6252
6253 switch ((U8)*++p) {
6254
6255 /* These are all the special escapes. */
6256 case 'A': /* Start assertion */
6257 case 'b': case 'B': /* Word-boundary assertion*/
6258 case 'C': /* Single char !DANGEROUS! */
6259 case 'd': case 'D': /* digit class */
6260 case 'g': case 'G': /* generic-backref, pos assertion */
6261 case 'h': case 'H': /* HORIZWS */
6262 case 'k': case 'K': /* named backref, keep marker */
6263 case 'p': case 'P': /* Unicode property */
6264 case 'R': /* LNBREAK */
6265 case 's': case 'S': /* space class */
6266 case 'v': case 'V': /* VERTWS */
6267 case 'w': case 'W': /* word class */
6268 case 'X': /* eXtended Unicode "combining
6269 character sequence" */
6270 case 'z': case 'Z': /* End of line/string assertion */
6271 --p;
6272 goto loopdone;
6273
6274 /* Anything after here is an escape that resolves to a
6275 literal. (Except digits, which may or may not)
6276 */
6277 case 'n':
6278 ender = '\n';
6279 p++;
6280 break;
6281 case 'N': /* Handle a single-code point named character. */
6282 RExC_parse_set( p + 1 );
6283 if (! grok_bslash_N(pRExC_state,
6284 NULL, /* Fail if evaluates to
6285 anything other than a
6286 single code point */
6287 &ender, /* The returned single code
6288 point */
6289 NULL, /* Don't need a count of
6290 how many code points */
6291 flagp,
6292 RExC_strict,
6293 depth)
6294 ) {
6295 if (*flagp & NEED_UTF8)
6296 FAIL("panic: grok_bslash_N set NEED_UTF8");
6297 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6298
6299 /* Here, it wasn't a single code point. Go close
6300 * up this EXACTish node. The switch() prior to
6301 * this switch handles the other cases */
6302 p = oldp;
6303 RExC_parse_set(p);
6304 goto loopdone;
6305 }
6306 p = RExC_parse;
6307 RExC_parse_set(atom_parse_start);
6308
6309 /* The \N{} means the pattern, if previously /d,
6310 * becomes /u. That means it can't be an EXACTF node,
6311 * but an EXACTFU */
6312 if (node_type == EXACTF) {
6313 node_type = EXACTFU;
6314
6315 /* If the node already contains something that
6316 * differs between EXACTF and EXACTFU, reparse it
6317 * as EXACTFU */
6318 if (! maybe_exactfu) {
6319 len = 0;
6320 s = s0;
6321 goto reparse;
6322 }
6323 }
6324
6325 break;
6326 case 'r':
6327 ender = '\r';
6328 p++;
6329 break;
6330 case 't':
6331 ender = '\t';
6332 p++;
6333 break;
6334 case 'f':
6335 ender = '\f';
6336 p++;
6337 break;
6338 case 'e':
6339 ender = ESC_NATIVE;
6340 p++;
6341 break;
6342 case 'a':
6343 ender = '\a';
6344 p++;
6345 break;
6346 case 'o':
6347 if (! grok_bslash_o(&p,
6348 RExC_end,
6349 &ender,
6350 &message,
6351 &packed_warn,
6352 (bool) RExC_strict,
6353 FALSE, /* No illegal cp's */
6354 UTF))
6355 {
6356 RExC_parse_set(p); /* going to die anyway; point to
6357 exact spot of failure */
6358 vFAIL(message);
6359 }
6360
6361 if (message && TO_OUTPUT_WARNINGS(p)) {
6362 warn_non_literal_string(p, packed_warn, message);
6363 }
6364 break;
6365 case 'x':
6366 if (! grok_bslash_x(&p,
6367 RExC_end,
6368 &ender,
6369 &message,
6370 &packed_warn,
6371 (bool) RExC_strict,
6372 FALSE, /* No illegal cp's */
6373 UTF))
6374 {
6375 RExC_parse_set(p); /* going to die anyway; point
6376 to exact spot of failure */
6377 vFAIL(message);
6378 }
6379
6380 if (message && TO_OUTPUT_WARNINGS(p)) {
6381 warn_non_literal_string(p, packed_warn, message);
6382 }
6383
6384 #ifdef EBCDIC
6385 if (ender < 0x100) {
6386 if (RExC_recode_x_to_native) {
6387 ender = LATIN1_TO_NATIVE(ender);
6388 }
6389 }
6390 #endif
6391 break;
6392 case 'c':
6393 p++;
6394 if (! grok_bslash_c(*p, &grok_c_char,
6395 &message, &packed_warn))
6396 {
6397 /* going to die anyway; point to exact spot of
6398 * failure */
6399 char *new_p= p + ((UTF)
6400 ? UTF8_SAFE_SKIP(p, RExC_end)
6401 : 1);
6402 RExC_parse_set(new_p);
6403 vFAIL(message);
6404 }
6405
6406 ender = grok_c_char;
6407 p++;
6408 if (message && TO_OUTPUT_WARNINGS(p)) {
6409 warn_non_literal_string(p, packed_warn, message);
6410 }
6411
6412 break;
6413 case '8': case '9': /* must be a backreference */
6414 --p;
6415 /* we have an escape like \8 which cannot be an octal escape
6416 * so we exit the loop, and let the outer loop handle this
6417 * escape which may or may not be a legitimate backref. */
6418 goto loopdone;
6419 case '1': case '2': case '3':case '4':
6420 case '5': case '6': case '7':
6421
6422 /* When we parse backslash escapes there is ambiguity
6423 * between backreferences and octal escapes. Any escape
6424 * from \1 - \9 is a backreference, any multi-digit
6425 * escape which does not start with 0 and which when
6426 * evaluated as decimal could refer to an already
6427 * parsed capture buffer is a back reference. Anything
6428 * else is octal.
6429 *
6430 * Note this implies that \118 could be interpreted as
6431 * 118 OR as "\11" . "8" depending on whether there
6432 * were 118 capture buffers defined already in the
6433 * pattern. */
6434
6435 /* NOTE, RExC_npar is 1 more than the actual number of
6436 * parens we have seen so far, hence the "<" as opposed
6437 * to "<=" */
6438 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6439 { /* Not to be treated as an octal constant, go
6440 find backref */
6441 p = oldp;
6442 goto loopdone;
6443 }
6444 /* FALLTHROUGH */
6445 case '0':
6446 {
6447 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6448 | PERL_SCAN_NOTIFY_ILLDIGIT;
6449 STRLEN numlen = 3;
6450 ender = grok_oct(p, &numlen, &flags, NULL);
6451 p += numlen;
6452 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6453 && isDIGIT(*p) /* like \08, \178 */
6454 && ckWARN(WARN_REGEXP))
6455 {
6456 reg_warn_non_literal_string(
6457 p + 1,
6458 form_alien_digit_msg(8, numlen, p,
6459 RExC_end, UTF, FALSE));
6460 }
6461 }
6462 break;
6463 case '\0':
6464 if (p >= RExC_end)
6465 FAIL("Trailing \\");
6466 /* FALLTHROUGH */
6467 default:
6468 if (isALPHANUMERIC(*p)) {
6469 /* An alpha followed by '{' is going to fail next
6470 * iteration, so don't output this warning in that
6471 * case */
6472 if (! isALPHA(*p) || *(p + 1) != '{') {
6473 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6474 " passed through", p);
6475 }
6476 }
6477 goto normal_default;
6478 } /* End of switch on '\' */
6479 break;
6480 case '{':
6481 /* Trying to gain new uses for '{' without breaking too
6482 * much existing code is hard. The solution currently
6483 * adopted is:
6484 * 1) If there is no ambiguity that a '{' should always
6485 * be taken literally, at the start of a construct, we
6486 * just do so.
6487 * 2) If the literal '{' conflicts with our desired use
6488 * of it as a metacharacter, we die. The deprecation
6489 * cycles for this have come and gone.
6490 * 3) If there is ambiguity, we raise a simple warning.
6491 * This could happen, for example, if the user
6492 * intended it to introduce a quantifier, but slightly
6493 * misspelled the quantifier. Without this warning,
6494 * the quantifier would silently be taken as a literal
6495 * string of characters instead of a meta construct */
6496 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6497 if ( RExC_strict
6498 || ( p > atom_parse_start + 1
6499 && isALPHA_A(*(p - 1))
6500 && *(p - 2) == '\\'))
6501 {
6502 RExC_parse_set(p + 1);
6503 vFAIL("Unescaped left brace in regex is "
6504 "illegal here");
6505 }
6506 ckWARNreg(p + 1, "Unescaped left brace in regex is"
6507 " passed through");
6508 }
6509 goto normal_default;
6510 case '}':
6511 case ']':
6512 if (p > RExC_parse && RExC_strict) {
6513 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6514 }
6515 /*FALLTHROUGH*/
6516 default: /* A literal character */
6517 normal_default:
6518 if (! UTF8_IS_INVARIANT(*p) && UTF) {
6519 STRLEN numlen;
6520 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6521 &numlen, UTF8_ALLOW_DEFAULT);
6522 p += numlen;
6523 }
6524 else
6525 ender = (U8) *p++;
6526 break;
6527 } /* End of switch on the literal */
6528
6529 /* Here, have looked at the literal character, and <ender>
6530 * contains its ordinal; <p> points to the character after it.
6531 * */
6532
6533 if (ender > 255) {
6534 REQUIRE_UTF8(flagp);
6535 if ( UNICODE_IS_PERL_EXTENDED(ender)
6536 && TO_OUTPUT_WARNINGS(p))
6537 {
6538 ckWARN2_non_literal_string(p,
6539 packWARN(WARN_PORTABLE),
6540 PL_extended_cp_format,
6541 ender);
6542 }
6543 }
6544
6545 /* We need to check if the next non-ignored thing is a
6546 * quantifier. Move <p> to after anything that should be
6547 * ignored, which, as a side effect, positions <p> for the next
6548 * loop iteration */
6549 skip_to_be_ignored_text(pRExC_state, &p,
6550 FALSE /* Don't force to /x */ );
6551
6552 /* If the next thing is a quantifier, it applies to this
6553 * character only, which means that this character has to be in
6554 * its own node and can't just be appended to the string in an
6555 * existing node, so if there are already other characters in
6556 * the node, close the node with just them, and set up to do
6557 * this character again next time through, when it will be the
6558 * only thing in its new node */
6559
6560 next_is_quantifier = LIKELY(p < RExC_end)
6561 && UNLIKELY(isQUANTIFIER(p, RExC_end));
6562
6563 if (next_is_quantifier && LIKELY(len)) {
6564 p = oldp;
6565 goto loopdone;
6566 }
6567
6568 /* Ready to add 'ender' to the node */
6569
6570 if (! FOLD) { /* The simple case, just append the literal */
6571 not_fold_common:
6572
6573 /* Don't output if it would overflow */
6574 if (UNLIKELY(len > max_string_len - ((UTF)
6575 ? UVCHR_SKIP(ender)
6576 : 1)))
6577 {
6578 overflowed = TRUE;
6579 break;
6580 }
6581
6582 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6583 *(s++) = (char) ender;
6584 }
6585 else {
6586 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6587 added_len = (char *) new_s - s;
6588 s = (char *) new_s;
6589
6590 if (ender > 255) {
6591 requires_utf8_target = TRUE;
6592 }
6593 }
6594 }
6595 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6596
6597 /* Here are folding under /l, and the code point is
6598 * problematic. If this is the first character in the
6599 * node, change the node type to folding. Otherwise, if
6600 * this is the first problematic character, close up the
6601 * existing node, so can start a new node with this one */
6602 if (! len) {
6603 node_type = EXACTFL;
6604 RExC_contains_locale = 1;
6605 }
6606 else if (node_type == EXACT) {
6607 p = oldp;
6608 goto loopdone;
6609 }
6610
6611 /* This problematic code point means we can't simplify
6612 * things */
6613 maybe_exactfu = FALSE;
6614
6615 /* Although these two characters have folds that are
6616 * locale-problematic, they also have folds to above Latin1
6617 * that aren't a problem. Doing these now helps at
6618 * runtime. */
6619 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
6620 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6621 {
6622 goto fold_anyway;
6623 }
6624
6625 /* Here, we are adding a problematic fold character.
6626 * "Problematic" in this context means that its fold isn't
6627 * known until runtime. (The non-problematic code points
6628 * are the above-Latin1 ones that fold to also all
6629 * above-Latin1. Their folds don't vary no matter what the
6630 * locale is.) But here we have characters whose fold
6631 * depends on the locale. We just add in the unfolded
6632 * character, and wait until runtime to fold it */
6633 goto not_fold_common;
6634 }
6635 else /* regular fold; see if actually is in a fold */
6636 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6637 || (ender > 255
6638 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6639 {
6640 /* Here, folding, but the character isn't in a fold.
6641 *
6642 * Start a new node if previous characters in the node were
6643 * folded */
6644 if (len && node_type != EXACT) {
6645 p = oldp;
6646 goto loopdone;
6647 }
6648
6649 /* Here, continuing a node with non-folded characters. Add
6650 * this one */
6651 goto not_fold_common;
6652 }
6653 else { /* Here, does participate in some fold */
6654
6655 /* If this is the first character in the node, change its
6656 * type to folding. Otherwise, if this is the first
6657 * folding character in the node, close up the existing
6658 * node, so can start a new node with this one. */
6659 if (! len) {
6660 node_type = compute_EXACTish(pRExC_state);
6661 }
6662 else if (node_type == EXACT) {
6663 p = oldp;
6664 goto loopdone;
6665 }
6666
6667 if (UTF) { /* Alway use the folded value for UTF-8
6668 patterns */
6669 if (UVCHR_IS_INVARIANT(ender)) {
6670 if (UNLIKELY(len + 1 > max_string_len)) {
6671 overflowed = TRUE;
6672 break;
6673 }
6674
6675 *(s)++ = (U8) toFOLD(ender);
6676 }
6677 else {
6678 UV folded;
6679
6680 fold_anyway:
6681 folded = _to_uni_fold_flags(
6682 ender,
6683 (U8 *) s, /* We have allocated extra space
6684 in 's' so can't run off the
6685 end */
6686 &added_len,
6687 FOLD_FLAGS_FULL
6688 | (( ASCII_FOLD_RESTRICTED
6689 || node_type == EXACTFL)
6690 ? FOLD_FLAGS_NOMIX_ASCII
6691 : 0));
6692 if (UNLIKELY(len + added_len > max_string_len)) {
6693 overflowed = TRUE;
6694 break;
6695 }
6696
6697 s += added_len;
6698
6699 if ( folded > 255
6700 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6701 {
6702 /* U+B5 folds to the MU, so its possible for a
6703 * non-UTF-8 target to match it */
6704 requires_utf8_target = TRUE;
6705 }
6706 }
6707 }
6708 else { /* Here is non-UTF8. */
6709
6710 /* The fold will be one or (rarely) two characters.
6711 * Check that there's room for at least a single one
6712 * before setting any flags, etc. Because otherwise an
6713 * overflowing character could cause a flag to be set
6714 * even though it doesn't end up in this node. (For
6715 * the two character fold, we check again, before
6716 * setting any flags) */
6717 if (UNLIKELY(len + 1 > max_string_len)) {
6718 overflowed = TRUE;
6719 break;
6720 }
6721
6722 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
6723 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
6724 || UNICODE_DOT_DOT_VERSION > 0)
6725
6726 /* On non-ancient Unicodes, check for the only possible
6727 * multi-char fold */
6728 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6729
6730 /* This potential multi-char fold means the node
6731 * can't be simple (because it could match more
6732 * than a single char). And in some cases it will
6733 * match 'ss', so set that flag */
6734 maybe_SIMPLE = 0;
6735 has_ss = TRUE;
6736
6737 /* It can't change to be an EXACTFU (unless already
6738 * is one). We fold it iff under /u rules. */
6739 if (node_type != EXACTFU) {
6740 maybe_exactfu = FALSE;
6741 }
6742 else {
6743 if (UNLIKELY(len + 2 > max_string_len)) {
6744 overflowed = TRUE;
6745 break;
6746 }
6747
6748 *(s++) = 's';
6749 *(s++) = 's';
6750 added_len = 2;
6751
6752 goto done_with_this_char;
6753 }
6754 }
6755 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6756 && LIKELY(len > 0)
6757 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6758 {
6759 /* Also, the sequence 'ss' is special when not
6760 * under /u. If the target string is UTF-8, it
6761 * should match SHARP S; otherwise it won't. So,
6762 * here we have to exclude the possibility of this
6763 * node moving to /u.*/
6764 has_ss = TRUE;
6765 maybe_exactfu = FALSE;
6766 }
6767 #endif
6768 /* Here, the fold will be a single character */
6769
6770 if (UNLIKELY(ender == MICRO_SIGN)) {
6771 has_micro_sign = TRUE;
6772 }
6773 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6774
6775 /* If the character's fold differs between /d and
6776 * /u, this can't change to be an EXACTFU node */
6777 maybe_exactfu = FALSE;
6778 }
6779
6780 *(s++) = (DEPENDS_SEMANTICS)
6781 ? (char) toFOLD(ender)
6782
6783 /* Under /u, the fold of any character in
6784 * the 0-255 range happens to be its
6785 * lowercase equivalent, except for LATIN
6786 * SMALL LETTER SHARP S, which was handled
6787 * above, and the MICRO SIGN, whose fold
6788 * requires UTF-8 to represent. */
6789 : (char) toLOWER_L1(ender);
6790 }
6791 } /* End of adding current character to the node */
6792
6793 done_with_this_char:
6794
6795 len += added_len;
6796
6797 if (next_is_quantifier) {
6798
6799 /* Here, the next input is a quantifier, and to get here,
6800 * the current character is the only one in the node. */
6801 goto loopdone;
6802 }
6803
6804 } /* End of loop through literal characters */
6805
6806 /* Here we have either exhausted the input or run out of room in
6807 * the node. If the former, we are done. (If we encountered a
6808 * character that can't be in the node, transfer is made directly
6809 * to <loopdone>, and so we wouldn't have fallen off the end of the
6810 * loop.) */
6811 if (LIKELY(! overflowed)) {
6812 goto loopdone;
6813 }
6814
6815 /* Here we have run out of room. We can grow plain EXACT and
6816 * LEXACT nodes. If the pattern is gigantic enough, though,
6817 * eventually we'll have to artificially chunk the pattern into
6818 * multiple nodes. */
6819 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6820 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6821 Size_t overhead_expansion = 0;
6822 char temp[256];
6823 Size_t max_nodes_for_string;
6824 Size_t achievable;
6825 SSize_t delta;
6826
6827 /* Here we couldn't fit the final character in the current
6828 * node, so it will have to be reparsed, no matter what else we
6829 * do */
6830 p = oldp;
6831
6832 /* If would have overflowed a regular EXACT node, switch
6833 * instead to an LEXACT. The code below is structured so that
6834 * the actual growing code is common to changing from an EXACT
6835 * or just increasing the LEXACT size. This means that we have
6836 * to save the string in the EXACT case before growing, and
6837 * then copy it afterwards to its new location */
6838 if (node_type == EXACT) {
6839 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6840 RExC_emit += overhead_expansion;
6841 Copy(s0, temp, len, char);
6842 }
6843
6844 /* Ready to grow. If it was a plain EXACT, the string was
6845 * saved, and the first few bytes of it overwritten by adding
6846 * an argument field. We assume, as we do elsewhere in this
6847 * file, that one byte of remaining input will translate into
6848 * one byte of output, and if that's too small, we grow again,
6849 * if too large the excess memory is freed at the end */
6850
6851 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6852 achievable = MIN(max_nodes_for_string,
6853 current_string_nodes + STR_SZ(RExC_end - p));
6854 delta = achievable - current_string_nodes;
6855
6856 /* If there is just no more room, go finish up this chunk of
6857 * the pattern. */
6858 if (delta <= 0) {
6859 goto loopdone;
6860 }
6861
6862 change_engine_size(pRExC_state, delta + overhead_expansion);
6863 current_string_nodes += delta;
6864 max_string_len
6865 = sizeof(struct regnode) * current_string_nodes;
6866 upper_fill = max_string_len + 1;
6867
6868 /* If the length was small, we know this was originally an
6869 * EXACT node now converted to LEXACT, and the string has to be
6870 * restored. Otherwise the string was untouched. 260 is just
6871 * a number safely above 255 so don't have to worry about
6872 * getting it precise */
6873 if (len < 260) {
6874 node_type = LEXACT;
6875 FILL_NODE(ret, node_type);
6876 s0 = STRING(REGNODE_p(ret));
6877 Copy(temp, s0, len, char);
6878 s = s0 + len;
6879 }
6880
6881 goto continue_parse;
6882 }
6883 else if (FOLD) {
6884 bool splittable = FALSE;
6885 bool backed_up = FALSE;
6886 char * e; /* should this be U8? */
6887 char * s_start; /* should this be U8? */
6888
6889 /* Here is /i. Running out of room creates a problem if we are
6890 * folding, and the split happens in the middle of a
6891 * multi-character fold, as a match that should have occurred,
6892 * won't, due to the way nodes are matched, and our artificial
6893 * boundary. So back off until we aren't splitting such a
6894 * fold. If there is no such place to back off to, we end up
6895 * taking the entire node as-is. This can happen if the node
6896 * consists entirely of 'f' or entirely of 's' characters (or
6897 * things that fold to them) as 'ff' and 'ss' are
6898 * multi-character folds.
6899 *
6900 * The Unicode standard says that multi character folds consist
6901 * of either two or three characters. That means we would be
6902 * splitting one if the final character in the node is at the
6903 * beginning of either type, or is the second of a three
6904 * character fold.
6905 *
6906 * At this point:
6907 * ender is the code point of the character that won't fit
6908 * in the node
6909 * s points to just beyond the final byte in the node.
6910 * It's where we would place ender if there were
6911 * room, and where in fact we do place ender's fold
6912 * in the code below, as we've over-allocated space
6913 * for s0 (hence s) to allow for this
6914 * e starts at 's' and advances as we append things.
6915 * old_s is the same as 's'. (If ender had fit, 's' would
6916 * have been advanced to beyond it).
6917 * old_old_s points to the beginning byte of the final
6918 * character in the node
6919 * p points to the beginning byte in the input of the
6920 * character beyond 'ender'.
6921 * oldp points to the beginning byte in the input of
6922 * 'ender'.
6923 *
6924 * In the case of /il, we haven't folded anything that could be
6925 * affected by the locale. That means only above-Latin1
6926 * characters that fold to other above-latin1 characters get
6927 * folded at compile time. To check where a good place to
6928 * split nodes is, everything in it will have to be folded.
6929 * The boolean 'maybe_exactfu' keeps track in /il if there are
6930 * any unfolded characters in the node. */
6931 bool need_to_fold_loc = LOC && ! maybe_exactfu;
6932
6933 /* If we do need to fold the node, we need a place to store the
6934 * folded copy, and a way to map back to the unfolded original
6935 * */
6936 char * locfold_buf = NULL;
6937 Size_t * loc_correspondence = NULL;
6938
6939 if (! need_to_fold_loc) { /* The normal case. Just
6940 initialize to the actual node */
6941 e = s;
6942 s_start = s0;
6943 s = old_old_s; /* Point to the beginning of the final char
6944 that fits in the node */
6945 }
6946 else {
6947
6948 /* Here, we have filled a /il node, and there are unfolded
6949 * characters in it. If the runtime locale turns out to be
6950 * UTF-8, there are possible multi-character folds, just
6951 * like when not under /l. The node hence can't terminate
6952 * in the middle of such a fold. To determine this, we
6953 * have to create a folded copy of this node. That means
6954 * reparsing the node, folding everything assuming a UTF-8
6955 * locale. (If at runtime it isn't such a locale, the
6956 * actions here wouldn't have been necessary, but we have
6957 * to assume the worst case.) If we find we need to back
6958 * off the folded string, we do so, and then map that
6959 * position back to the original unfolded node, which then
6960 * gets output, truncated at that spot */
6961
6962 char * redo_p = RExC_parse;
6963 char * redo_e;
6964 char * old_redo_e;
6965
6966 /* Allow enough space assuming a single byte input folds to
6967 * a single byte output, plus assume that the two unparsed
6968 * characters (that we may need) fold to the largest number
6969 * of bytes possible, plus extra for one more worst case
6970 * scenario. In the loop below, if we start eating into
6971 * that final spare space, we enlarge this initial space */
6972 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
6973
6974 Newxz(locfold_buf, size, char);
6975 Newxz(loc_correspondence, size, Size_t);
6976
6977 /* Redo this node's parse, folding into 'locfold_buf' */
6978 redo_p = RExC_parse;
6979 old_redo_e = redo_e = locfold_buf;
6980 while (redo_p <= oldp) {
6981
6982 old_redo_e = redo_e;
6983 loc_correspondence[redo_e - locfold_buf]
6984 = redo_p - RExC_parse;
6985
6986 if (UTF) {
6987 Size_t added_len;
6988
6989 (void) _to_utf8_fold_flags((U8 *) redo_p,
6990 (U8 *) RExC_end,
6991 (U8 *) redo_e,
6992 &added_len,
6993 FOLD_FLAGS_FULL);
6994 redo_e += added_len;
6995 redo_p += UTF8SKIP(redo_p);
6996 }
6997 else {
6998
6999 /* Note that if this code is run on some ancient
7000 * Unicode versions, SHARP S doesn't fold to 'ss',
7001 * but rather than clutter the code with #ifdef's,
7002 * as is done above, we ignore that possibility.
7003 * This is ok because this code doesn't affect what
7004 * gets matched, but merely where the node gets
7005 * split */
7006 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7007 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7008 }
7009 else {
7010 *redo_e++ = 's';
7011 *redo_e++ = 's';
7012 }
7013 redo_p++;
7014 }
7015
7016
7017 /* If we're getting so close to the end that a
7018 * worst-case fold in the next character would cause us
7019 * to overflow, increase, assuming one byte output byte
7020 * per one byte input one, plus room for another worst
7021 * case fold */
7022 if ( redo_p <= oldp
7023 && redo_e > locfold_buf + size
7024 - (UTF8_MAXBYTES_CASE + 1))
7025 {
7026 Size_t new_size = size
7027 + (oldp - redo_p)
7028 + UTF8_MAXBYTES_CASE + 1;
7029 Ptrdiff_t e_offset = redo_e - locfold_buf;
7030
7031 Renew(locfold_buf, new_size, char);
7032 Renew(loc_correspondence, new_size, Size_t);
7033 size = new_size;
7034
7035 redo_e = locfold_buf + e_offset;
7036 }
7037 }
7038
7039 /* Set so that things are in terms of the folded, temporary
7040 * string */
7041 s = old_redo_e;
7042 s_start = locfold_buf;
7043 e = redo_e;
7044
7045 }
7046
7047 /* Here, we have 's', 's_start' and 'e' set up to point to the
7048 * input that goes into the node, folded.
7049 *
7050 * If the final character of the node and the fold of ender
7051 * form the first two characters of a three character fold, we
7052 * need to peek ahead at the next (unparsed) character in the
7053 * input to determine if the three actually do form such a
7054 * fold. Just looking at that character is not generally
7055 * sufficient, as it could be, for example, an escape sequence
7056 * that evaluates to something else, and it needs to be folded.
7057 *
7058 * khw originally thought to just go through the parse loop one
7059 * extra time, but that doesn't work easily as that iteration
7060 * could cause things to think that the parse is over and to
7061 * goto loopdone. The character could be a '$' for example, or
7062 * the character beyond could be a quantifier, and other
7063 * glitches as well.
7064 *
7065 * The solution used here for peeking ahead is to look at that
7066 * next character. If it isn't ASCII punctuation, then it will
7067 * be something that would continue on in an EXACTish node if
7068 * there were space. We append the fold of it to s, having
7069 * reserved enough room in s0 for the purpose. If we can't
7070 * reasonably peek ahead, we instead assume the worst case:
7071 * that it is something that would form the completion of a
7072 * multi-char fold.
7073 *
7074 * If we can't split between s and ender, we work backwards
7075 * character-by-character down to s0. At each current point
7076 * see if we are at the beginning of a multi-char fold. If so,
7077 * that means we would be splitting the fold across nodes, and
7078 * so we back up one and try again.
7079 *
7080 * If we're not at the beginning, we still could be at the
7081 * final two characters of a (rare) three character fold. We
7082 * check if the sequence starting at the character before the
7083 * current position (and including the current and next
7084 * characters) is a three character fold. If not, the node can
7085 * be split here. If it is, we have to backup two characters
7086 * and try again.
7087 *
7088 * Otherwise, the node can be split at the current position.
7089 *
7090 * The same logic is used for UTF-8 patterns and not */
7091 if (UTF) {
7092 Size_t added_len;
7093
7094 /* Append the fold of ender */
7095 (void) _to_uni_fold_flags(
7096 ender,
7097 (U8 *) e,
7098 &added_len,
7099 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7100 ? FOLD_FLAGS_NOMIX_ASCII
7101 : 0));
7102 e += added_len;
7103
7104 /* 's' and the character folded to by ender may be the
7105 * first two of a three-character fold, in which case the
7106 * node should not be split here. That may mean examining
7107 * the so-far unparsed character starting at 'p'. But if
7108 * ender folded to more than one character, we already have
7109 * three characters to look at. Also, we first check if
7110 * the sequence consisting of s and the next character form
7111 * the first two of some three character fold. If not,
7112 * there's no need to peek ahead. */
7113 if ( added_len <= UTF8SKIP(e - added_len)
7114 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7115 {
7116 /* Here, the two do form the beginning of a potential
7117 * three character fold. The unexamined character may
7118 * or may not complete it. Peek at it. It might be
7119 * something that ends the node or an escape sequence,
7120 * in which case we don't know without a lot of work
7121 * what it evaluates to, so we have to assume the worst
7122 * case: that it does complete the fold, and so we
7123 * can't split here. All such instances will have
7124 * that character be an ASCII punctuation character,
7125 * like a backslash. So, for that case, backup one and
7126 * drop down to try at that position */
7127 if (isPUNCT(*p)) {
7128 s = (char *) utf8_hop_back((U8 *) s, -1,
7129 (U8 *) s_start);
7130 backed_up = TRUE;
7131 }
7132 else {
7133 /* Here, since it's not punctuation, it must be a
7134 * real character, and we can append its fold to
7135 * 'e' (having deliberately reserved enough space
7136 * for this eventuality) and drop down to check if
7137 * the three actually do form a folded sequence */
7138 (void) _to_utf8_fold_flags(
7139 (U8 *) p, (U8 *) RExC_end,
7140 (U8 *) e,
7141 &added_len,
7142 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7143 ? FOLD_FLAGS_NOMIX_ASCII
7144 : 0));
7145 e += added_len;
7146 }
7147 }
7148
7149 /* Here, we either have three characters available in
7150 * sequence starting at 's', or we have two characters and
7151 * know that the following one can't possibly be part of a
7152 * three character fold. We go through the node backwards
7153 * until we find a place where we can split it without
7154 * breaking apart a multi-character fold. At any given
7155 * point we have to worry about if such a fold begins at
7156 * the current 's', and also if a three-character fold
7157 * begins at s-1, (containing s and s+1). Splitting in
7158 * either case would break apart a fold */
7159 do {
7160 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7161 (U8 *) s_start);
7162
7163 /* If is a multi-char fold, can't split here. Backup
7164 * one char and try again */
7165 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7166 s = prev_s;
7167 backed_up = TRUE;
7168 continue;
7169 }
7170
7171 /* If the two characters beginning at 's' are part of a
7172 * three character fold starting at the character
7173 * before s, we can't split either before or after s.
7174 * Backup two chars and try again */
7175 if ( LIKELY(s > s_start)
7176 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7177 {
7178 s = prev_s;
7179 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7180 backed_up = TRUE;
7181 continue;
7182 }
7183
7184 /* Here there's no multi-char fold between s and the
7185 * next character following it. We can split */
7186 splittable = TRUE;
7187 break;
7188
7189 } while (s > s_start); /* End of loops backing up through the node */
7190
7191 /* Here we either couldn't find a place to split the node,
7192 * or else we broke out of the loop setting 'splittable' to
7193 * true. In the latter case, the place to split is between
7194 * the first and second characters in the sequence starting
7195 * at 's' */
7196 if (splittable) {
7197 s += UTF8SKIP(s);
7198 }
7199 }
7200 else { /* Pattern not UTF-8 */
7201 if ( ender != LATIN_SMALL_LETTER_SHARP_S
7202 || ASCII_FOLD_RESTRICTED)
7203 {
7204 assert( toLOWER_L1(ender) < 256 );
7205 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7206 }
7207 else {
7208 *e++ = 's';
7209 *e++ = 's';
7210 }
7211
7212 if ( e - s <= 1
7213 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7214 {
7215 if (isPUNCT(*p)) {
7216 s--;
7217 backed_up = TRUE;
7218 }
7219 else {
7220 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7221 || ASCII_FOLD_RESTRICTED)
7222 {
7223 assert( toLOWER_L1(ender) < 256 );
7224 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7225 }
7226 else {
7227 *e++ = 's';
7228 *e++ = 's';
7229 }
7230 }
7231 }
7232
7233 do {
7234 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7235 s--;
7236 backed_up = TRUE;
7237 continue;
7238 }
7239
7240 if ( LIKELY(s > s_start)
7241 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7242 {
7243 s -= 2;
7244 backed_up = TRUE;
7245 continue;
7246 }
7247
7248 splittable = TRUE;
7249 break;
7250
7251 } while (s > s_start);
7252
7253 if (splittable) {
7254 s++;
7255 }
7256 }
7257
7258 /* Here, we are done backing up. If we didn't backup at all
7259 * (the likely case), just proceed */
7260 if (backed_up) {
7261
7262 /* If we did find a place to split, reparse the entire node
7263 * stopping where we have calculated. */
7264 if (splittable) {
7265
7266 /* If we created a temporary folded string under /l, we
7267 * have to map that back to the original */
7268 if (need_to_fold_loc) {
7269 upper_fill = loc_correspondence[s - s_start];
7270 if (upper_fill == 0) {
7271 FAIL2("panic: loc_correspondence[%d] is 0",
7272 (int) (s - s_start));
7273 }
7274 Safefree(locfold_buf);
7275 Safefree(loc_correspondence);
7276 }
7277 else {
7278 upper_fill = s - s0;
7279 }
7280 goto reparse;
7281 }
7282
7283 /* Here the node consists entirely of non-final multi-char
7284 * folds. (Likely it is all 'f's or all 's's.) There's no
7285 * decent place to split it, so give up and just take the
7286 * whole thing */
7287 len = old_s - s0;
7288 }
7289
7290 if (need_to_fold_loc) {
7291 Safefree(locfold_buf);
7292 Safefree(loc_correspondence);
7293 }
7294 } /* End of verifying node ends with an appropriate char */
7295
7296 /* We need to start the next node at the character that didn't fit
7297 * in this one */
7298 p = oldp;
7299
7300 loopdone: /* Jumped to when encounters something that shouldn't be
7301 in the node */
7302
7303 /* Free up any over-allocated space; cast is to silence bogus
7304 * warning in MS VC */
7305 change_engine_size(pRExC_state,
7306 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7307
7308 /* I (khw) don't know if you can get here with zero length, but the
7309 * old code handled this situation by creating a zero-length EXACT
7310 * node. Might as well be NOTHING instead */
7311 if (len == 0) {
7312 OP(REGNODE_p(ret)) = NOTHING;
7313 }
7314 else {
7315
7316 /* If the node type is EXACT here, check to see if it
7317 * should be EXACTL, or EXACT_REQ8. */
7318 if (node_type == EXACT) {
7319 if (LOC) {
7320 node_type = EXACTL;
7321 }
7322 else if (requires_utf8_target) {
7323 node_type = EXACT_REQ8;
7324 }
7325 }
7326 else if (node_type == LEXACT) {
7327 if (requires_utf8_target) {
7328 node_type = LEXACT_REQ8;
7329 }
7330 }
7331 else if (FOLD) {
7332 if ( UNLIKELY(has_micro_sign || has_ss)
7333 && (node_type == EXACTFU || ( node_type == EXACTF
7334 && maybe_exactfu)))
7335 { /* These two conditions are problematic in non-UTF-8
7336 EXACTFU nodes. */
7337 assert(! UTF);
7338 node_type = EXACTFUP;
7339 }
7340 else if (node_type == EXACTFL) {
7341
7342 /* 'maybe_exactfu' is deliberately set above to
7343 * indicate this node type, where all code points in it
7344 * are above 255 */
7345 if (maybe_exactfu) {
7346 node_type = EXACTFLU8;
7347 }
7348 else if (UNLIKELY(
7349 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7350 {
7351 /* A character that folds to more than one will
7352 * match multiple characters, so can't be SIMPLE.
7353 * We don't have to worry about this with EXACTFLU8
7354 * nodes just above, as they have already been
7355 * folded (since the fold doesn't vary at run
7356 * time). Here, if the final character in the node
7357 * folds to multiple, it can't be simple. (This
7358 * only has an effect if the node has only a single
7359 * character, hence the final one, as elsewhere we
7360 * turn off simple for nodes whose length > 1 */
7361 maybe_SIMPLE = 0;
7362 }
7363 }
7364 else if (node_type == EXACTF) { /* Means is /di */
7365
7366 /* This intermediate variable is needed solely because
7367 * the asserts in the macro where used exceed Win32's
7368 * literal string capacity */
7369 char first_char = * STRING(REGNODE_p(ret));
7370
7371 /* If 'maybe_exactfu' is clear, then we need to stay
7372 * /di. If it is set, it means there are no code
7373 * points that match differently depending on UTF8ness
7374 * of the target string, so it can become an EXACTFU
7375 * node */
7376 if (! maybe_exactfu) {
7377 RExC_seen_d_op = TRUE;
7378 }
7379 else if ( isALPHA_FOLD_EQ(first_char, 's')
7380 || isALPHA_FOLD_EQ(ender, 's'))
7381 {
7382 /* But, if the node begins or ends in an 's' we
7383 * have to defer changing it into an EXACTFU, as
7384 * the node could later get joined with another one
7385 * that ends or begins with 's' creating an 'ss'
7386 * sequence which would then wrongly match the
7387 * sharp s without the target being UTF-8. We
7388 * create a special node that we resolve later when
7389 * we join nodes together */
7390
7391 node_type = EXACTFU_S_EDGE;
7392 }
7393 else {
7394 node_type = EXACTFU;
7395 }
7396 }
7397
7398 if (requires_utf8_target && node_type == EXACTFU) {
7399 node_type = EXACTFU_REQ8;
7400 }
7401 }
7402
7403 OP(REGNODE_p(ret)) = node_type;
7404 setSTR_LEN(REGNODE_p(ret), len);
7405 RExC_emit += STR_SZ(len);
7406
7407 /* If the node isn't a single character, it can't be SIMPLE */
7408 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7409 maybe_SIMPLE = 0;
7410 }
7411
7412 *flagp |= HASWIDTH | maybe_SIMPLE;
7413 }
7414
7415 RExC_parse_set(p);
7416
7417 {
7418 /* len is STRLEN which is unsigned, need to copy to signed */
7419 IV iv = len;
7420 if (iv < 0)
7421 vFAIL("Internal disaster");
7422 }
7423
7424 } /* End of label 'defchar:' */
7425 break;
7426 } /* End of giant switch on input character */
7427
7428 /* Position parse to next real character */
7429 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7430 FALSE /* Don't force to /x */ );
7431 if ( *RExC_parse == '{'
7432 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7433 {
7434 if (RExC_strict) {
7435 RExC_parse_inc_by(1);
7436 vFAIL("Unescaped left brace in regex is illegal here");
7437 }
7438 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7439 " passed through");
7440 }
7441
7442 return(ret);
7443 }
7444
7445
7446 void
Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)7447 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7448 {
7449 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
7450 * sets up the bitmap and any flags, removing those code points from the
7451 * inversion list, setting it to NULL should it become completely empty */
7452
7453
7454 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7455
7456 /* There is no bitmap for this node type */
7457 if (REGNODE_TYPE(OP(node)) != ANYOF) {
7458 return;
7459 }
7460
7461 ANYOF_BITMAP_ZERO(node);
7462 if (*invlist_ptr) {
7463
7464 /* This gets set if we actually need to modify things */
7465 bool change_invlist = FALSE;
7466
7467 UV start, end;
7468
7469 /* Start looking through *invlist_ptr */
7470 invlist_iterinit(*invlist_ptr);
7471 while (invlist_iternext(*invlist_ptr, &start, &end)) {
7472 UV high;
7473 int i;
7474
7475 /* Quit if are above what we should change */
7476 if (start >= NUM_ANYOF_CODE_POINTS) {
7477 break;
7478 }
7479
7480 change_invlist = TRUE;
7481
7482 /* Set all the bits in the range, up to the max that we are doing */
7483 high = (end < NUM_ANYOF_CODE_POINTS - 1)
7484 ? end
7485 : NUM_ANYOF_CODE_POINTS - 1;
7486 for (i = start; i <= (int) high; i++) {
7487 ANYOF_BITMAP_SET(node, i);
7488 }
7489 }
7490 invlist_iterfinish(*invlist_ptr);
7491
7492 /* Done with loop; remove any code points that are in the bitmap from
7493 * *invlist_ptr */
7494 if (change_invlist) {
7495 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7496 }
7497
7498 /* If have completely emptied it, remove it completely */
7499 if (_invlist_len(*invlist_ptr) == 0) {
7500 SvREFCNT_dec_NN(*invlist_ptr);
7501 *invlist_ptr = NULL;
7502 }
7503 }
7504 }
7505
7506 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7507 Character classes ([:foo:]) can also be negated ([:^foo:]).
7508 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7509 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7510 but trigger failures because they are currently unimplemented. */
7511
7512 #define POSIXCC_DONE(c) ((c) == ':')
7513 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7514 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7515 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7516
7517 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
7518 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
7519 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
7520
7521 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7522
7523 /* 'posix_warnings' and 'warn_text' are names of variables in the following
7524 * routine. q.v. */
7525 #define ADD_POSIX_WARNING(p, text) STMT_START { \
7526 if (posix_warnings) { \
7527 if (! RExC_warn_text ) RExC_warn_text = \
7528 (AV *) sv_2mortal((SV *) newAV()); \
7529 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \
7530 WARNING_PREFIX \
7531 text \
7532 REPORT_LOCATION, \
7533 REPORT_LOCATION_ARGS(p))); \
7534 } \
7535 } STMT_END
7536 #define CLEAR_POSIX_WARNINGS() \
7537 STMT_START { \
7538 if (posix_warnings && RExC_warn_text) \
7539 av_clear(RExC_warn_text); \
7540 } STMT_END
7541
7542 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
7543 STMT_START { \
7544 CLEAR_POSIX_WARNINGS(); \
7545 return ret; \
7546 } STMT_END
7547
7548 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)7549 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7550
7551 const char * const s, /* Where the putative posix class begins.
7552 Normally, this is one past the '['. This
7553 parameter exists so it can be somewhere
7554 besides RExC_parse. */
7555 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7556 NULL */
7557 AV ** posix_warnings, /* Where to place any generated warnings, or
7558 NULL */
7559 const bool check_only /* Don't die if error */
7560 )
7561 {
7562 /* This parses what the caller thinks may be one of the three POSIX
7563 * constructs:
7564 * 1) a character class, like [:blank:]
7565 * 2) a collating symbol, like [. .]
7566 * 3) an equivalence class, like [= =]
7567 * In the latter two cases, it croaks if it finds a syntactically legal
7568 * one, as these are not handled by Perl.
7569 *
7570 * The main purpose is to look for a POSIX character class. It returns:
7571 * a) the class number
7572 * if it is a completely syntactically and semantically legal class.
7573 * 'updated_parse_ptr', if not NULL, is set to point to just after the
7574 * closing ']' of the class
7575 * b) OOB_NAMEDCLASS
7576 * if it appears that one of the three POSIX constructs was meant, but
7577 * its specification was somehow defective. 'updated_parse_ptr', if
7578 * not NULL, is set to point to the character just after the end
7579 * character of the class. See below for handling of warnings.
7580 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7581 * if it doesn't appear that a POSIX construct was intended.
7582 * 'updated_parse_ptr' is not changed. No warnings nor errors are
7583 * raised.
7584 *
7585 * In b) there may be errors or warnings generated. If 'check_only' is
7586 * TRUE, then any errors are discarded. Warnings are returned to the
7587 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
7588 * instead it is NULL, warnings are suppressed.
7589 *
7590 * The reason for this function, and its complexity is that a bracketed
7591 * character class can contain just about anything. But it's easy to
7592 * mistype the very specific posix class syntax but yielding a valid
7593 * regular bracketed class, so it silently gets compiled into something
7594 * quite unintended.
7595 *
7596 * The solution adopted here maintains backward compatibility except that
7597 * it adds a warning if it looks like a posix class was intended but
7598 * improperly specified. The warning is not raised unless what is input
7599 * very closely resembles one of the 14 legal posix classes. To do this,
7600 * it uses fuzzy parsing. It calculates how many single-character edits it
7601 * would take to transform what was input into a legal posix class. Only
7602 * if that number is quite small does it think that the intention was a
7603 * posix class. Obviously these are heuristics, and there will be cases
7604 * where it errs on one side or another, and they can be tweaked as
7605 * experience informs.
7606 *
7607 * The syntax for a legal posix class is:
7608 *
7609 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7610 *
7611 * What this routine considers syntactically to be an intended posix class
7612 * is this (the comments indicate some restrictions that the pattern
7613 * doesn't show):
7614 *
7615 * qr/(?x: \[? # The left bracket, possibly
7616 * # omitted
7617 * \h* # possibly followed by blanks
7618 * (?: \^ \h* )? # possibly a misplaced caret
7619 * [:;]? # The opening class character,
7620 * # possibly omitted. A typo
7621 * # semi-colon can also be used.
7622 * \h*
7623 * \^? # possibly a correctly placed
7624 * # caret, but not if there was also
7625 * # a misplaced one
7626 * \h*
7627 * .{3,15} # The class name. If there are
7628 * # deviations from the legal syntax,
7629 * # its edit distance must be close
7630 * # to a real class name in order
7631 * # for it to be considered to be
7632 * # an intended posix class.
7633 * \h*
7634 * [[:punct:]]? # The closing class character,
7635 * # possibly omitted. If not a colon
7636 * # nor semi colon, the class name
7637 * # must be even closer to a valid
7638 * # one
7639 * \h*
7640 * \]? # The right bracket, possibly
7641 * # omitted.
7642 * )/
7643 *
7644 * In the above, \h must be ASCII-only.
7645 *
7646 * These are heuristics, and can be tweaked as field experience dictates.
7647 * There will be cases when someone didn't intend to specify a posix class
7648 * that this warns as being so. The goal is to minimize these, while
7649 * maximizing the catching of things intended to be a posix class that
7650 * aren't parsed as such.
7651 */
7652
7653 const char* p = s;
7654 const char * const e = RExC_end;
7655 unsigned complement = 0; /* If to complement the class */
7656 bool found_problem = FALSE; /* Assume OK until proven otherwise */
7657 bool has_opening_bracket = FALSE;
7658 bool has_opening_colon = FALSE;
7659 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
7660 valid class */
7661 const char * possible_end = NULL; /* used for a 2nd parse pass */
7662 const char* name_start; /* ptr to class name first char */
7663
7664 /* If the number of single-character typos the input name is away from a
7665 * legal name is no more than this number, it is considered to have meant
7666 * the legal name */
7667 int max_distance = 2;
7668
7669 /* to store the name. The size determines the maximum length before we
7670 * decide that no posix class was intended. Should be at least
7671 * sizeof("alphanumeric") */
7672 UV input_text[15];
7673 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7674
7675 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7676
7677 CLEAR_POSIX_WARNINGS();
7678
7679 if (p >= e) {
7680 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7681 }
7682
7683 if (*(p - 1) != '[') {
7684 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7685 found_problem = TRUE;
7686 }
7687 else {
7688 has_opening_bracket = TRUE;
7689 }
7690
7691 /* They could be confused and think you can put spaces between the
7692 * components */
7693 if (isBLANK(*p)) {
7694 found_problem = TRUE;
7695
7696 do {
7697 p++;
7698 } while (p < e && isBLANK(*p));
7699
7700 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7701 }
7702
7703 /* For [. .] and [= =]. These are quite different internally from [: :],
7704 * so they are handled separately. */
7705 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7706 and 1 for at least one char in it
7707 */
7708 {
7709 const char open_char = *p;
7710 const char * temp_ptr = p + 1;
7711
7712 /* These two constructs are not handled by perl, and if we find a
7713 * syntactically valid one, we croak. khw, who wrote this code, finds
7714 * this explanation of them very unclear:
7715 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7716 * And searching the rest of the internet wasn't very helpful either.
7717 * It looks like just about any byte can be in these constructs,
7718 * depending on the locale. But unless the pattern is being compiled
7719 * under /l, which is very rare, Perl runs under the C or POSIX locale.
7720 * In that case, it looks like [= =] isn't allowed at all, and that
7721 * [. .] could be any single code point, but for longer strings the
7722 * constituent characters would have to be the ASCII alphabetics plus
7723 * the minus-hyphen. Any sensible locale definition would limit itself
7724 * to these. And any portable one definitely should. Trying to parse
7725 * the general case is a nightmare (see [perl #127604]). So, this code
7726 * looks only for interiors of these constructs that match:
7727 * qr/.|[-\w]{2,}/
7728 * Using \w relaxes the apparent rules a little, without adding much
7729 * danger of mistaking something else for one of these constructs.
7730 *
7731 * [. .] in some implementations described on the internet is usable to
7732 * escape a character that otherwise is special in bracketed character
7733 * classes. For example [.].] means a literal right bracket instead of
7734 * the ending of the class
7735 *
7736 * [= =] can legitimately contain a [. .] construct, but we don't
7737 * handle this case, as that [. .] construct will later get parsed
7738 * itself and croak then. And [= =] is checked for even when not under
7739 * /l, as Perl has long done so.
7740 *
7741 * The code below relies on there being a trailing NUL, so it doesn't
7742 * have to keep checking if the parse ptr < e.
7743 */
7744 if (temp_ptr[1] == open_char) {
7745 temp_ptr++;
7746 }
7747 else while ( temp_ptr < e
7748 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7749 {
7750 temp_ptr++;
7751 }
7752
7753 if (*temp_ptr == open_char) {
7754 temp_ptr++;
7755 if (*temp_ptr == ']') {
7756 temp_ptr++;
7757 if (! found_problem && ! check_only) {
7758 RExC_parse_set((char *) temp_ptr);
7759 vFAIL3("POSIX syntax [%c %c] is reserved for future "
7760 "extensions", open_char, open_char);
7761 }
7762
7763 /* Here, the syntax wasn't completely valid, or else the call
7764 * is to check-only */
7765 if (updated_parse_ptr) {
7766 *updated_parse_ptr = (char *) temp_ptr;
7767 }
7768
7769 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7770 }
7771 }
7772
7773 /* If we find something that started out to look like one of these
7774 * constructs, but isn't, we continue below so that it can be checked
7775 * for being a class name with a typo of '.' or '=' instead of a colon.
7776 * */
7777 }
7778
7779 /* Here, we think there is a possibility that a [: :] class was meant, and
7780 * we have the first real character. It could be they think the '^' comes
7781 * first */
7782 if (*p == '^') {
7783 found_problem = TRUE;
7784 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7785 complement = 1;
7786 p++;
7787
7788 if (isBLANK(*p)) {
7789 found_problem = TRUE;
7790
7791 do {
7792 p++;
7793 } while (p < e && isBLANK(*p));
7794
7795 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7796 }
7797 }
7798
7799 /* But the first character should be a colon, which they could have easily
7800 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7801 * distinguish from a colon, so treat that as a colon). */
7802 if (*p == ':') {
7803 p++;
7804 has_opening_colon = TRUE;
7805 }
7806 else if (*p == ';') {
7807 found_problem = TRUE;
7808 p++;
7809 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7810 has_opening_colon = TRUE;
7811 }
7812 else {
7813 found_problem = TRUE;
7814 ADD_POSIX_WARNING(p, "there must be a starting ':'");
7815
7816 /* Consider an initial punctuation (not one of the recognized ones) to
7817 * be a left terminator */
7818 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7819 p++;
7820 }
7821 }
7822
7823 /* They may think that you can put spaces between the components */
7824 if (isBLANK(*p)) {
7825 found_problem = TRUE;
7826
7827 do {
7828 p++;
7829 } while (p < e && isBLANK(*p));
7830
7831 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7832 }
7833
7834 if (*p == '^') {
7835
7836 /* We consider something like [^:^alnum:]] to not have been intended to
7837 * be a posix class, but XXX maybe we should */
7838 if (complement) {
7839 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7840 }
7841
7842 complement = 1;
7843 p++;
7844 }
7845
7846 /* Again, they may think that you can put spaces between the components */
7847 if (isBLANK(*p)) {
7848 found_problem = TRUE;
7849
7850 do {
7851 p++;
7852 } while (p < e && isBLANK(*p));
7853
7854 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7855 }
7856
7857 if (*p == ']') {
7858
7859 /* XXX This ']' may be a typo, and something else was meant. But
7860 * treating it as such creates enough complications, that that
7861 * possibility isn't currently considered here. So we assume that the
7862 * ']' is what is intended, and if we've already found an initial '[',
7863 * this leaves this construct looking like [:] or [:^], which almost
7864 * certainly weren't intended to be posix classes */
7865 if (has_opening_bracket) {
7866 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7867 }
7868
7869 /* But this function can be called when we parse the colon for
7870 * something like qr/[alpha:]]/, so we back up to look for the
7871 * beginning */
7872 p--;
7873
7874 if (*p == ';') {
7875 found_problem = TRUE;
7876 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7877 }
7878 else if (*p != ':') {
7879
7880 /* XXX We are currently very restrictive here, so this code doesn't
7881 * consider the possibility that, say, /[alpha.]]/ was intended to
7882 * be a posix class. */
7883 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7884 }
7885
7886 /* Here we have something like 'foo:]'. There was no initial colon,
7887 * and we back up over 'foo. XXX Unlike the going forward case, we
7888 * don't handle typos of non-word chars in the middle */
7889 has_opening_colon = FALSE;
7890 p--;
7891
7892 while (p > RExC_start && isWORDCHAR(*p)) {
7893 p--;
7894 }
7895 p++;
7896
7897 /* Here, we have positioned ourselves to where we think the first
7898 * character in the potential class is */
7899 }
7900
7901 /* Now the interior really starts. There are certain key characters that
7902 * can end the interior, or these could just be typos. To catch both
7903 * cases, we may have to do two passes. In the first pass, we keep on
7904 * going unless we come to a sequence that matches
7905 * qr/ [[:punct:]] [[:blank:]]* \] /xa
7906 * This means it takes a sequence to end the pass, so two typos in a row if
7907 * that wasn't what was intended. If the class is perfectly formed, just
7908 * this one pass is needed. We also stop if there are too many characters
7909 * being accumulated, but this number is deliberately set higher than any
7910 * real class. It is set high enough so that someone who thinks that
7911 * 'alphanumeric' is a correct name would get warned that it wasn't.
7912 * While doing the pass, we keep track of where the key characters were in
7913 * it. If we don't find an end to the class, and one of the key characters
7914 * was found, we redo the pass, but stop when we get to that character.
7915 * Thus the key character was considered a typo in the first pass, but a
7916 * terminator in the second. If two key characters are found, we stop at
7917 * the second one in the first pass. Again this can miss two typos, but
7918 * catches a single one
7919 *
7920 * In the first pass, 'possible_end' starts as NULL, and then gets set to
7921 * point to the first key character. For the second pass, it starts as -1.
7922 * */
7923
7924 name_start = p;
7925 parse_name:
7926 {
7927 bool has_blank = FALSE;
7928 bool has_upper = FALSE;
7929 bool has_terminating_colon = FALSE;
7930 bool has_terminating_bracket = FALSE;
7931 bool has_semi_colon = FALSE;
7932 unsigned int name_len = 0;
7933 int punct_count = 0;
7934
7935 while (p < e) {
7936
7937 /* Squeeze out blanks when looking up the class name below */
7938 if (isBLANK(*p) ) {
7939 has_blank = TRUE;
7940 found_problem = TRUE;
7941 p++;
7942 continue;
7943 }
7944
7945 /* The name will end with a punctuation */
7946 if (isPUNCT(*p)) {
7947 const char * peek = p + 1;
7948
7949 /* Treat any non-']' punctuation followed by a ']' (possibly
7950 * with intervening blanks) as trying to terminate the class.
7951 * ']]' is very likely to mean a class was intended (but
7952 * missing the colon), but the warning message that gets
7953 * generated shows the error position better if we exit the
7954 * loop at the bottom (eventually), so skip it here. */
7955 if (*p != ']') {
7956 if (peek < e && isBLANK(*peek)) {
7957 has_blank = TRUE;
7958 found_problem = TRUE;
7959 do {
7960 peek++;
7961 } while (peek < e && isBLANK(*peek));
7962 }
7963
7964 if (peek < e && *peek == ']') {
7965 has_terminating_bracket = TRUE;
7966 if (*p == ':') {
7967 has_terminating_colon = TRUE;
7968 }
7969 else if (*p == ';') {
7970 has_semi_colon = TRUE;
7971 has_terminating_colon = TRUE;
7972 }
7973 else {
7974 found_problem = TRUE;
7975 }
7976 p = peek + 1;
7977 goto try_posix;
7978 }
7979 }
7980
7981 /* Here we have punctuation we thought didn't end the class.
7982 * Keep track of the position of the key characters that are
7983 * more likely to have been class-enders */
7984 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
7985
7986 /* Allow just one such possible class-ender not actually
7987 * ending the class. */
7988 if (possible_end) {
7989 break;
7990 }
7991 possible_end = p;
7992 }
7993
7994 /* If we have too many punctuation characters, no use in
7995 * keeping going */
7996 if (++punct_count > max_distance) {
7997 break;
7998 }
7999
8000 /* Treat the punctuation as a typo. */
8001 input_text[name_len++] = *p;
8002 p++;
8003 }
8004 else if (isUPPER(*p)) { /* Use lowercase for lookup */
8005 input_text[name_len++] = toLOWER(*p);
8006 has_upper = TRUE;
8007 found_problem = TRUE;
8008 p++;
8009 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8010 input_text[name_len++] = *p;
8011 p++;
8012 }
8013 else {
8014 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8015 p+= UTF8SKIP(p);
8016 }
8017
8018 /* The declaration of 'input_text' is how long we allow a potential
8019 * class name to be, before saying they didn't mean a class name at
8020 * all */
8021 if (name_len >= C_ARRAY_LENGTH(input_text)) {
8022 break;
8023 }
8024 }
8025
8026 /* We get to here when the possible class name hasn't been properly
8027 * terminated before:
8028 * 1) we ran off the end of the pattern; or
8029 * 2) found two characters, each of which might have been intended to
8030 * be the name's terminator
8031 * 3) found so many punctuation characters in the purported name,
8032 * that the edit distance to a valid one is exceeded
8033 * 4) we decided it was more characters than anyone could have
8034 * intended to be one. */
8035
8036 found_problem = TRUE;
8037
8038 /* In the final two cases, we know that looking up what we've
8039 * accumulated won't lead to a match, even a fuzzy one. */
8040 if ( name_len >= C_ARRAY_LENGTH(input_text)
8041 || punct_count > max_distance)
8042 {
8043 /* If there was an intermediate key character that could have been
8044 * an intended end, redo the parse, but stop there */
8045 if (possible_end && possible_end != (char *) -1) {
8046 possible_end = (char *) -1; /* Special signal value to say
8047 we've done a first pass */
8048 p = name_start;
8049 goto parse_name;
8050 }
8051
8052 /* Otherwise, it can't have meant to have been a class */
8053 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8054 }
8055
8056 /* If we ran off the end, and the final character was a punctuation
8057 * one, back up one, to look at that final one just below. Later, we
8058 * will restore the parse pointer if appropriate */
8059 if (name_len && p == e && isPUNCT(*(p-1))) {
8060 p--;
8061 name_len--;
8062 }
8063
8064 if (p < e && isPUNCT(*p)) {
8065 if (*p == ']') {
8066 has_terminating_bracket = TRUE;
8067
8068 /* If this is a 2nd ']', and the first one is just below this
8069 * one, consider that to be the real terminator. This gives a
8070 * uniform and better positioning for the warning message */
8071 if ( possible_end
8072 && possible_end != (char *) -1
8073 && *possible_end == ']'
8074 && name_len && input_text[name_len - 1] == ']')
8075 {
8076 name_len--;
8077 p = possible_end;
8078
8079 /* And this is actually equivalent to having done the 2nd
8080 * pass now, so set it to not try again */
8081 possible_end = (char *) -1;
8082 }
8083 }
8084 else {
8085 if (*p == ':') {
8086 has_terminating_colon = TRUE;
8087 }
8088 else if (*p == ';') {
8089 has_semi_colon = TRUE;
8090 has_terminating_colon = TRUE;
8091 }
8092 p++;
8093 }
8094 }
8095
8096 try_posix:
8097
8098 /* Here, we have a class name to look up. We can short circuit the
8099 * stuff below for short names that can't possibly be meant to be a
8100 * class name. (We can do this on the first pass, as any second pass
8101 * will yield an even shorter name) */
8102 if (name_len < 3) {
8103 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8104 }
8105
8106 /* Find which class it is. Initially switch on the length of the name.
8107 * */
8108 switch (name_len) {
8109 case 4:
8110 if (memEQs(name_start, 4, "word")) {
8111 /* this is not POSIX, this is the Perl \w */
8112 class_number = ANYOF_WORDCHAR;
8113 }
8114 break;
8115 case 5:
8116 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8117 * graph lower print punct space upper
8118 * Offset 4 gives the best switch position. */
8119 switch (name_start[4]) {
8120 case 'a':
8121 if (memBEGINs(name_start, 5, "alph")) /* alpha */
8122 class_number = ANYOF_ALPHA;
8123 break;
8124 case 'e':
8125 if (memBEGINs(name_start, 5, "spac")) /* space */
8126 class_number = ANYOF_SPACE;
8127 break;
8128 case 'h':
8129 if (memBEGINs(name_start, 5, "grap")) /* graph */
8130 class_number = ANYOF_GRAPH;
8131 break;
8132 case 'i':
8133 if (memBEGINs(name_start, 5, "asci")) /* ascii */
8134 class_number = ANYOF_ASCII;
8135 break;
8136 case 'k':
8137 if (memBEGINs(name_start, 5, "blan")) /* blank */
8138 class_number = ANYOF_BLANK;
8139 break;
8140 case 'l':
8141 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8142 class_number = ANYOF_CNTRL;
8143 break;
8144 case 'm':
8145 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8146 class_number = ANYOF_ALPHANUMERIC;
8147 break;
8148 case 'r':
8149 if (memBEGINs(name_start, 5, "lowe")) /* lower */
8150 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8151 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8152 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8153 break;
8154 case 't':
8155 if (memBEGINs(name_start, 5, "digi")) /* digit */
8156 class_number = ANYOF_DIGIT;
8157 else if (memBEGINs(name_start, 5, "prin")) /* print */
8158 class_number = ANYOF_PRINT;
8159 else if (memBEGINs(name_start, 5, "punc")) /* punct */
8160 class_number = ANYOF_PUNCT;
8161 break;
8162 }
8163 break;
8164 case 6:
8165 if (memEQs(name_start, 6, "xdigit"))
8166 class_number = ANYOF_XDIGIT;
8167 break;
8168 }
8169
8170 /* If the name exactly matches a posix class name the class number will
8171 * here be set to it, and the input almost certainly was meant to be a
8172 * posix class, so we can skip further checking. If instead the syntax
8173 * is exactly correct, but the name isn't one of the legal ones, we
8174 * will return that as an error below. But if neither of these apply,
8175 * it could be that no posix class was intended at all, or that one
8176 * was, but there was a typo. We tease these apart by doing fuzzy
8177 * matching on the name */
8178 if (class_number == OOB_NAMEDCLASS && found_problem) {
8179 const UV posix_names[][6] = {
8180 { 'a', 'l', 'n', 'u', 'm' },
8181 { 'a', 'l', 'p', 'h', 'a' },
8182 { 'a', 's', 'c', 'i', 'i' },
8183 { 'b', 'l', 'a', 'n', 'k' },
8184 { 'c', 'n', 't', 'r', 'l' },
8185 { 'd', 'i', 'g', 'i', 't' },
8186 { 'g', 'r', 'a', 'p', 'h' },
8187 { 'l', 'o', 'w', 'e', 'r' },
8188 { 'p', 'r', 'i', 'n', 't' },
8189 { 'p', 'u', 'n', 'c', 't' },
8190 { 's', 'p', 'a', 'c', 'e' },
8191 { 'u', 'p', 'p', 'e', 'r' },
8192 { 'w', 'o', 'r', 'd' },
8193 { 'x', 'd', 'i', 'g', 'i', 't' }
8194 };
8195 /* The names of the above all have added NULs to make them the same
8196 * size, so we need to also have the real lengths */
8197 const UV posix_name_lengths[] = {
8198 sizeof("alnum") - 1,
8199 sizeof("alpha") - 1,
8200 sizeof("ascii") - 1,
8201 sizeof("blank") - 1,
8202 sizeof("cntrl") - 1,
8203 sizeof("digit") - 1,
8204 sizeof("graph") - 1,
8205 sizeof("lower") - 1,
8206 sizeof("print") - 1,
8207 sizeof("punct") - 1,
8208 sizeof("space") - 1,
8209 sizeof("upper") - 1,
8210 sizeof("word") - 1,
8211 sizeof("xdigit")- 1
8212 };
8213 unsigned int i;
8214 int temp_max = max_distance; /* Use a temporary, so if we
8215 reparse, we haven't changed the
8216 outer one */
8217
8218 /* Use a smaller max edit distance if we are missing one of the
8219 * delimiters */
8220 if ( has_opening_bracket + has_opening_colon < 2
8221 || has_terminating_bracket + has_terminating_colon < 2)
8222 {
8223 temp_max--;
8224 }
8225
8226 /* See if the input name is close to a legal one */
8227 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8228
8229 /* Short circuit call if the lengths are too far apart to be
8230 * able to match */
8231 if (abs( (int) (name_len - posix_name_lengths[i]))
8232 > temp_max)
8233 {
8234 continue;
8235 }
8236
8237 if (edit_distance(input_text,
8238 posix_names[i],
8239 name_len,
8240 posix_name_lengths[i],
8241 temp_max
8242 )
8243 > -1)
8244 { /* If it is close, it probably was intended to be a class */
8245 goto probably_meant_to_be;
8246 }
8247 }
8248
8249 /* Here the input name is not close enough to a valid class name
8250 * for us to consider it to be intended to be a posix class. If
8251 * we haven't already done so, and the parse found a character that
8252 * could have been terminators for the name, but which we absorbed
8253 * as typos during the first pass, repeat the parse, signalling it
8254 * to stop at that character */
8255 if (possible_end && possible_end != (char *) -1) {
8256 possible_end = (char *) -1;
8257 p = name_start;
8258 goto parse_name;
8259 }
8260
8261 /* Here neither pass found a close-enough class name */
8262 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8263 }
8264
8265 probably_meant_to_be:
8266
8267 /* Here we think that a posix specification was intended. Update any
8268 * parse pointer */
8269 if (updated_parse_ptr) {
8270 *updated_parse_ptr = (char *) p;
8271 }
8272
8273 /* If a posix class name was intended but incorrectly specified, we
8274 * output or return the warnings */
8275 if (found_problem) {
8276
8277 /* We set flags for these issues in the parse loop above instead of
8278 * adding them to the list of warnings, because we can parse it
8279 * twice, and we only want one warning instance */
8280 if (has_upper) {
8281 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8282 }
8283 if (has_blank) {
8284 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8285 }
8286 if (has_semi_colon) {
8287 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8288 }
8289 else if (! has_terminating_colon) {
8290 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8291 }
8292 if (! has_terminating_bracket) {
8293 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8294 }
8295
8296 if ( posix_warnings
8297 && RExC_warn_text
8298 && av_count(RExC_warn_text) > 0)
8299 {
8300 *posix_warnings = RExC_warn_text;
8301 }
8302 }
8303 else if (class_number != OOB_NAMEDCLASS) {
8304 /* If it is a known class, return the class. The class number
8305 * #defines are structured so each complement is +1 to the normal
8306 * one */
8307 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8308 }
8309 else if (! check_only) {
8310
8311 /* Here, it is an unrecognized class. This is an error (unless the
8312 * call is to check only, which we've already handled above) */
8313 const char * const complement_string = (complement)
8314 ? "^"
8315 : "";
8316 RExC_parse_set((char *) p);
8317 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8318 complement_string,
8319 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8320 }
8321 }
8322
8323 return OOB_NAMEDCLASS;
8324 }
8325 #undef ADD_POSIX_WARNING
8326
8327 STATIC unsigned int
S_regex_set_precedence(const U8 my_operator)8328 S_regex_set_precedence(const U8 my_operator) {
8329
8330 /* Returns the precedence in the (?[...]) construct of the input operator,
8331 * specified by its character representation. The precedence follows
8332 * general Perl rules, but it extends this so that ')' and ']' have (low)
8333 * precedence even though they aren't really operators */
8334
8335 switch (my_operator) {
8336 case '!':
8337 return 5;
8338 case '&':
8339 return 4;
8340 case '^':
8341 case '|':
8342 case '+':
8343 case '-':
8344 return 3;
8345 case ')':
8346 return 2;
8347 case ']':
8348 return 1;
8349 }
8350
8351 NOT_REACHED; /* NOTREACHED */
8352 return 0; /* Silence compiler warning */
8353 }
8354
8355 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth)8356 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8357 I32 *flagp, U32 depth)
8358 {
8359 /* Handle the (?[...]) construct to do set operations */
8360
8361 U8 curchar; /* Current character being parsed */
8362 UV start, end; /* End points of code point ranges */
8363 SV* final = NULL; /* The end result inversion list */
8364 SV* result_string; /* 'final' stringified */
8365 AV* stack; /* stack of operators and operands not yet
8366 resolved */
8367 AV* fence_stack = NULL; /* A stack containing the positions in
8368 'stack' of where the undealt-with left
8369 parens would be if they were actually
8370 put there */
8371 /* The 'volatile' is a workaround for an optimiser bug
8372 * in Solaris Studio 12.3. See RT #127455 */
8373 volatile IV fence = 0; /* Position of where most recent undealt-
8374 with left paren in stack is; -1 if none.
8375 */
8376 STRLEN len; /* Temporary */
8377 regnode_offset node; /* Temporary, and final regnode returned by
8378 this function */
8379 const bool save_fold = FOLD; /* Temporary */
8380 char *save_end, *save_parse; /* Temporaries */
8381 const bool in_locale = LOC; /* we turn off /l during processing */
8382
8383 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8384
8385 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8386
8387 DEBUG_PARSE("xcls");
8388
8389 if (in_locale) {
8390 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8391 }
8392
8393 /* The use of this operator implies /u. This is required so that the
8394 * compile time values are valid in all runtime cases */
8395 REQUIRE_UNI_RULES(flagp, 0);
8396
8397 /* Everything in this construct is a metacharacter. Operands begin with
8398 * either a '\' (for an escape sequence), or a '[' for a bracketed
8399 * character class. Any other character should be an operator, or
8400 * parenthesis for grouping. Both types of operands are handled by calling
8401 * regclass() to parse them. It is called with a parameter to indicate to
8402 * return the computed inversion list. The parsing here is implemented via
8403 * a stack. Each entry on the stack is a single character representing one
8404 * of the operators; or else a pointer to an operand inversion list. */
8405
8406 #define IS_OPERATOR(a) SvIOK(a)
8407 #define IS_OPERAND(a) (! IS_OPERATOR(a))
8408
8409 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
8410 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8411 * with pronouncing it called it Reverse Polish instead, but now that YOU
8412 * know how to pronounce it you can use the correct term, thus giving due
8413 * credit to the person who invented it, and impressing your geek friends.
8414 * Wikipedia says that the pronunciation of "Ł" has been changing so that
8415 * it is now more like an English initial W (as in wonk) than an L.)
8416 *
8417 * This means that, for example, 'a | b & c' is stored on the stack as
8418 *
8419 * c [4]
8420 * b [3]
8421 * & [2]
8422 * a [1]
8423 * | [0]
8424 *
8425 * where the numbers in brackets give the stack [array] element number.
8426 * In this implementation, parentheses are not stored on the stack.
8427 * Instead a '(' creates a "fence" so that the part of the stack below the
8428 * fence is invisible except to the corresponding ')' (this allows us to
8429 * replace testing for parens, by using instead subtraction of the fence
8430 * position). As new operands are processed they are pushed onto the stack
8431 * (except as noted in the next paragraph). New operators of higher
8432 * precedence than the current final one are inserted on the stack before
8433 * the lhs operand (so that when the rhs is pushed next, everything will be
8434 * in the correct positions shown above. When an operator of equal or
8435 * lower precedence is encountered in parsing, all the stacked operations
8436 * of equal or higher precedence are evaluated, leaving the result as the
8437 * top entry on the stack. This makes higher precedence operations
8438 * evaluate before lower precedence ones, and causes operations of equal
8439 * precedence to left associate.
8440 *
8441 * The only unary operator '!' is immediately pushed onto the stack when
8442 * encountered. When an operand is encountered, if the top of the stack is
8443 * a '!", the complement is immediately performed, and the '!' popped. The
8444 * resulting value is treated as a new operand, and the logic in the
8445 * previous paragraph is executed. Thus in the expression
8446 * [a] + ! [b]
8447 * the stack looks like
8448 *
8449 * !
8450 * a
8451 * +
8452 *
8453 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8454 * becomes
8455 *
8456 * !b
8457 * a
8458 * +
8459 *
8460 * A ')' is treated as an operator with lower precedence than all the
8461 * aforementioned ones, which causes all operations on the stack above the
8462 * corresponding '(' to be evaluated down to a single resultant operand.
8463 * Then the fence for the '(' is removed, and the operand goes through the
8464 * algorithm above, without the fence.
8465 *
8466 * A separate stack is kept of the fence positions, so that the position of
8467 * the latest so-far unbalanced '(' is at the top of it.
8468 *
8469 * The ']' ending the construct is treated as the lowest operator of all,
8470 * so that everything gets evaluated down to a single operand, which is the
8471 * result */
8472
8473 stack = (AV*)newSV_type_mortal(SVt_PVAV);
8474 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8475
8476 while (RExC_parse < RExC_end) {
8477 I32 top_index; /* Index of top-most element in 'stack' */
8478 SV** top_ptr; /* Pointer to top 'stack' element */
8479 SV* current = NULL; /* To contain the current inversion list
8480 operand */
8481 SV* only_to_avoid_leaks;
8482
8483 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8484 TRUE /* Force /x */ );
8485 if (RExC_parse >= RExC_end) { /* Fail */
8486 break;
8487 }
8488
8489 curchar = UCHARAT(RExC_parse);
8490
8491 redo_curchar:
8492
8493 #ifdef ENABLE_REGEX_SETS_DEBUGGING
8494 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8495 DEBUG_U(dump_regex_sets_structures(pRExC_state,
8496 stack, fence, fence_stack));
8497 #endif
8498
8499 top_index = av_tindex_skip_len_mg(stack);
8500
8501 switch (curchar) {
8502 SV** stacked_ptr; /* Ptr to something already on 'stack' */
8503 char stacked_operator; /* The topmost operator on the 'stack'. */
8504 SV* lhs; /* Operand to the left of the operator */
8505 SV* rhs; /* Operand to the right of the operator */
8506 SV* fence_ptr; /* Pointer to top element of the fence
8507 stack */
8508 case '(':
8509
8510 if ( RExC_parse < RExC_end - 2
8511 && UCHARAT(RExC_parse + 1) == '?'
8512 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8513 {
8514 const regnode_offset orig_emit = RExC_emit;
8515 SV * resultant_invlist;
8516
8517 /* Here it could be an embedded '(?flags:(?[...])'.
8518 * This happens when we have some thing like
8519 *
8520 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8521 * ...
8522 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
8523 *
8524 * Here we would be handling the interpolated
8525 * '$thai_or_lao'. We handle this by a recursive call to
8526 * reg which returns the inversion list the
8527 * interpolated expression evaluates to. Actually, the
8528 * return is a special regnode containing a pointer to that
8529 * inversion list. If the return isn't that regnode alone,
8530 * we know that this wasn't such an interpolation, which is
8531 * an error: we need to get a single inversion list back
8532 * from the recursion */
8533
8534 RExC_parse_inc_by(1);
8535 RExC_sets_depth++;
8536
8537 node = reg(pRExC_state, 2, flagp, depth+1);
8538 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8539
8540 if ( OP(REGNODE_p(node)) != REGEX_SET
8541 /* If more than a single node returned, the nested
8542 * parens evaluated to more than just a (?[...]),
8543 * which isn't legal */
8544 || RExC_emit != orig_emit
8545 + NODE_STEP_REGNODE
8546 + REGNODE_ARG_LEN(REGEX_SET))
8547 {
8548 vFAIL("Expecting interpolated extended charclass");
8549 }
8550 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8551 current = invlist_clone(resultant_invlist, NULL);
8552 SvREFCNT_dec(resultant_invlist);
8553
8554 RExC_sets_depth--;
8555 RExC_emit = orig_emit;
8556 goto handle_operand;
8557 }
8558
8559 /* A regular '('. Look behind for illegal syntax */
8560 if (top_index - fence >= 0) {
8561 /* If the top entry on the stack is an operator, it had
8562 * better be a '!', otherwise the entry below the top
8563 * operand should be an operator */
8564 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
8565 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8566 || ( IS_OPERAND(*top_ptr)
8567 && ( top_index - fence < 1
8568 || ! (stacked_ptr = av_fetch(stack,
8569 top_index - 1,
8570 FALSE))
8571 || ! IS_OPERATOR(*stacked_ptr))))
8572 {
8573 RExC_parse_inc_by(1);
8574 vFAIL("Unexpected '(' with no preceding operator");
8575 }
8576 }
8577
8578 /* Stack the position of this undealt-with left paren */
8579 av_push_simple(fence_stack, newSViv(fence));
8580 fence = top_index + 1;
8581 break;
8582
8583 case '\\':
8584 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8585 * multi-char folds are allowed. */
8586 if (!regclass(pRExC_state, flagp, depth+1,
8587 TRUE, /* means parse just the next thing */
8588 FALSE, /* don't allow multi-char folds */
8589 FALSE, /* don't silence non-portable warnings. */
8590 TRUE, /* strict */
8591 FALSE, /* Require return to be an ANYOF */
8592 ¤t))
8593 {
8594 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8595 goto regclass_failed;
8596 }
8597
8598 assert(current);
8599
8600 /* regclass() will return with parsing just the \ sequence,
8601 * leaving the parse pointer at the next thing to parse */
8602 RExC_parse--;
8603 goto handle_operand;
8604
8605 case '[': /* Is a bracketed character class */
8606 {
8607 /* See if this is a [:posix:] class. */
8608 bool is_posix_class = (OOB_NAMEDCLASS
8609 < handle_possible_posix(pRExC_state,
8610 RExC_parse + 1,
8611 NULL,
8612 NULL,
8613 TRUE /* checking only */));
8614 /* If it is a posix class, leave the parse pointer at the '['
8615 * to fool regclass() into thinking it is part of a
8616 * '[[:posix:]]'. */
8617 if (! is_posix_class) {
8618 RExC_parse_inc_by(1);
8619 }
8620
8621 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8622 * multi-char folds are allowed. */
8623 if (!regclass(pRExC_state, flagp, depth+1,
8624 is_posix_class, /* parse the whole char
8625 class only if not a
8626 posix class */
8627 FALSE, /* don't allow multi-char folds */
8628 TRUE, /* silence non-portable warnings. */
8629 TRUE, /* strict */
8630 FALSE, /* Require return to be an ANYOF */
8631 ¤t))
8632 {
8633 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8634 goto regclass_failed;
8635 }
8636
8637 assert(current);
8638
8639 /* function call leaves parse pointing to the ']', except if we
8640 * faked it */
8641 if (is_posix_class) {
8642 RExC_parse--;
8643 }
8644
8645 goto handle_operand;
8646 }
8647
8648 case ']':
8649 if (top_index >= 1) {
8650 goto join_operators;
8651 }
8652
8653 /* Only a single operand on the stack: are done */
8654 goto done;
8655
8656 case ')':
8657 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8658 if (UCHARAT(RExC_parse - 1) == ']') {
8659 break;
8660 }
8661 RExC_parse_inc_by(1);
8662 vFAIL("Unexpected ')'");
8663 }
8664
8665 /* If nothing after the fence, is missing an operand */
8666 if (top_index - fence < 0) {
8667 RExC_parse_inc_by(1);
8668 goto bad_syntax;
8669 }
8670 /* If at least two things on the stack, treat this as an
8671 * operator */
8672 if (top_index - fence >= 1) {
8673 goto join_operators;
8674 }
8675
8676 /* Here only a single thing on the fenced stack, and there is a
8677 * fence. Get rid of it */
8678 fence_ptr = av_pop(fence_stack);
8679 assert(fence_ptr);
8680 fence = SvIV(fence_ptr);
8681 SvREFCNT_dec_NN(fence_ptr);
8682 fence_ptr = NULL;
8683
8684 if (fence < 0) {
8685 fence = 0;
8686 }
8687
8688 /* Having gotten rid of the fence, we pop the operand at the
8689 * stack top and process it as a newly encountered operand */
8690 current = av_pop(stack);
8691 if (IS_OPERAND(current)) {
8692 goto handle_operand;
8693 }
8694
8695 RExC_parse_inc_by(1);
8696 goto bad_syntax;
8697
8698 case '&':
8699 case '|':
8700 case '+':
8701 case '-':
8702 case '^':
8703
8704 /* These binary operators should have a left operand already
8705 * parsed */
8706 if ( top_index - fence < 0
8707 || top_index - fence == 1
8708 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8709 || ! IS_OPERAND(*top_ptr))
8710 {
8711 goto unexpected_binary;
8712 }
8713
8714 /* If only the one operand is on the part of the stack visible
8715 * to us, we just place this operator in the proper position */
8716 if (top_index - fence < 2) {
8717
8718 /* Place the operator before the operand */
8719
8720 SV* lhs = av_pop(stack);
8721 av_push_simple(stack, newSVuv(curchar));
8722 av_push_simple(stack, lhs);
8723 break;
8724 }
8725
8726 /* But if there is something else on the stack, we need to
8727 * process it before this new operator if and only if the
8728 * stacked operation has equal or higher precedence than the
8729 * new one */
8730
8731 join_operators:
8732
8733 /* The operator on the stack is supposed to be below both its
8734 * operands */
8735 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8736 || IS_OPERAND(*stacked_ptr))
8737 {
8738 /* But if not, it's legal and indicates we are completely
8739 * done if and only if we're currently processing a ']',
8740 * which should be the final thing in the expression */
8741 if (curchar == ']') {
8742 goto done;
8743 }
8744
8745 unexpected_binary:
8746 RExC_parse_inc_by(1);
8747 vFAIL2("Unexpected binary operator '%c' with no "
8748 "preceding operand", curchar);
8749 }
8750 stacked_operator = (char) SvUV(*stacked_ptr);
8751
8752 if (regex_set_precedence(curchar)
8753 > regex_set_precedence(stacked_operator))
8754 {
8755 /* Here, the new operator has higher precedence than the
8756 * stacked one. This means we need to add the new one to
8757 * the stack to await its rhs operand (and maybe more
8758 * stuff). We put it before the lhs operand, leaving
8759 * untouched the stacked operator and everything below it
8760 * */
8761 lhs = av_pop(stack);
8762 assert(IS_OPERAND(lhs));
8763 av_push_simple(stack, newSVuv(curchar));
8764 av_push_simple(stack, lhs);
8765 break;
8766 }
8767
8768 /* Here, the new operator has equal or lower precedence than
8769 * what's already there. This means the operation already
8770 * there should be performed now, before the new one. */
8771
8772 rhs = av_pop(stack);
8773 if (! IS_OPERAND(rhs)) {
8774
8775 /* This can happen when a ! is not followed by an operand,
8776 * like in /(?[\t &!])/ */
8777 goto bad_syntax;
8778 }
8779
8780 lhs = av_pop(stack);
8781
8782 if (! IS_OPERAND(lhs)) {
8783
8784 /* This can happen when there is an empty (), like in
8785 * /(?[[0]+()+])/ */
8786 goto bad_syntax;
8787 }
8788
8789 switch (stacked_operator) {
8790 case '&':
8791 _invlist_intersection(lhs, rhs, &rhs);
8792 break;
8793
8794 case '|':
8795 case '+':
8796 _invlist_union(lhs, rhs, &rhs);
8797 break;
8798
8799 case '-':
8800 _invlist_subtract(lhs, rhs, &rhs);
8801 break;
8802
8803 case '^': /* The union minus the intersection */
8804 {
8805 SV* i = NULL;
8806 SV* u = NULL;
8807
8808 _invlist_union(lhs, rhs, &u);
8809 _invlist_intersection(lhs, rhs, &i);
8810 _invlist_subtract(u, i, &rhs);
8811 SvREFCNT_dec_NN(i);
8812 SvREFCNT_dec_NN(u);
8813 break;
8814 }
8815 }
8816 SvREFCNT_dec(lhs);
8817
8818 /* Here, the higher precedence operation has been done, and the
8819 * result is in 'rhs'. We overwrite the stacked operator with
8820 * the result. Then we redo this code to either push the new
8821 * operator onto the stack or perform any higher precedence
8822 * stacked operation */
8823 only_to_avoid_leaks = av_pop(stack);
8824 SvREFCNT_dec(only_to_avoid_leaks);
8825 av_push_simple(stack, rhs);
8826 goto redo_curchar;
8827
8828 case '!': /* Highest priority, right associative */
8829
8830 /* If what's already at the top of the stack is another '!",
8831 * they just cancel each other out */
8832 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
8833 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8834 {
8835 only_to_avoid_leaks = av_pop(stack);
8836 SvREFCNT_dec(only_to_avoid_leaks);
8837 }
8838 else { /* Otherwise, since it's right associative, just push
8839 onto the stack */
8840 av_push_simple(stack, newSVuv(curchar));
8841 }
8842 break;
8843
8844 default:
8845 RExC_parse_inc();
8846 if (RExC_parse >= RExC_end) {
8847 break;
8848 }
8849 vFAIL("Unexpected character");
8850
8851 handle_operand:
8852
8853 /* Here 'current' is the operand. If something is already on the
8854 * stack, we have to check if it is a !. But first, the code above
8855 * may have altered the stack in the time since we earlier set
8856 * 'top_index'. */
8857
8858 top_index = av_tindex_skip_len_mg(stack);
8859 if (top_index - fence >= 0) {
8860 /* If the top entry on the stack is an operator, it had better
8861 * be a '!', otherwise the entry below the top operand should
8862 * be an operator */
8863 top_ptr = av_fetch(stack, top_index, FALSE);
8864 assert(top_ptr);
8865 if (IS_OPERATOR(*top_ptr)) {
8866
8867 /* The only permissible operator at the top of the stack is
8868 * '!', which is applied immediately to this operand. */
8869 curchar = (char) SvUV(*top_ptr);
8870 if (curchar != '!') {
8871 SvREFCNT_dec(current);
8872 vFAIL2("Unexpected binary operator '%c' with no "
8873 "preceding operand", curchar);
8874 }
8875
8876 _invlist_invert(current);
8877
8878 only_to_avoid_leaks = av_pop(stack);
8879 SvREFCNT_dec(only_to_avoid_leaks);
8880
8881 /* And we redo with the inverted operand. This allows
8882 * handling multiple ! in a row */
8883 goto handle_operand;
8884 }
8885 /* Single operand is ok only for the non-binary ')'
8886 * operator */
8887 else if ((top_index - fence == 0 && curchar != ')')
8888 || (top_index - fence > 0
8889 && (! (stacked_ptr = av_fetch(stack,
8890 top_index - 1,
8891 FALSE))
8892 || IS_OPERAND(*stacked_ptr))))
8893 {
8894 SvREFCNT_dec(current);
8895 vFAIL("Operand with no preceding operator");
8896 }
8897 }
8898
8899 /* Here there was nothing on the stack or the top element was
8900 * another operand. Just add this new one */
8901 av_push_simple(stack, current);
8902
8903 } /* End of switch on next parse token */
8904
8905 RExC_parse_inc();
8906 } /* End of loop parsing through the construct */
8907
8908 vFAIL("Syntax error in (?[...])");
8909
8910 done:
8911
8912 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
8913 if (RExC_parse < RExC_end) {
8914 RExC_parse_inc_by(1);
8915 }
8916
8917 vFAIL("Unexpected ']' with no following ')' in (?[...");
8918 }
8919
8920 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
8921 vFAIL("Unmatched (");
8922 }
8923
8924 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
8925 || ((final = av_pop(stack)) == NULL)
8926 || ! IS_OPERAND(final)
8927 || ! is_invlist(final)
8928 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
8929 {
8930 bad_syntax:
8931 SvREFCNT_dec(final);
8932 vFAIL("Incomplete expression within '(?[ ])'");
8933 }
8934
8935 /* Here, 'final' is the resultant inversion list from evaluating the
8936 * expression. Return it if so requested */
8937 if (return_invlist) {
8938 *return_invlist = final;
8939 return END;
8940 }
8941
8942 if (RExC_sets_depth) { /* If within a recursive call, return in a special
8943 regnode */
8944 RExC_parse_inc_by(1);
8945 node = regpnode(pRExC_state, REGEX_SET, final);
8946 }
8947 else {
8948
8949 /* Otherwise generate a resultant node, based on 'final'. regclass()
8950 * is expecting a string of ranges and individual code points */
8951 invlist_iterinit(final);
8952 result_string = newSVpvs("");
8953 while (invlist_iternext(final, &start, &end)) {
8954 if (start == end) {
8955 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
8956 }
8957 else {
8958 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
8959 UVXf "}", start, end);
8960 }
8961 }
8962
8963 /* About to generate an ANYOF (or similar) node from the inversion list
8964 * we have calculated */
8965 save_parse = RExC_parse;
8966 RExC_parse_set(SvPV(result_string, len));
8967 save_end = RExC_end;
8968 RExC_end = RExC_parse + len;
8969 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
8970
8971 /* We turn off folding around the call, as the class we have
8972 * constructed already has all folding taken into consideration, and we
8973 * don't want regclass() to add to that */
8974 RExC_flags &= ~RXf_PMf_FOLD;
8975 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
8976 * folds are allowed. */
8977 node = regclass(pRExC_state, flagp, depth+1,
8978 FALSE, /* means parse the whole char class */
8979 FALSE, /* don't allow multi-char folds */
8980 TRUE, /* silence non-portable warnings. The above may
8981 very well have generated non-portable code
8982 points, but they're valid on this machine */
8983 FALSE, /* similarly, no need for strict */
8984
8985 /* We can optimize into something besides an ANYOF,
8986 * except under /l, which needs to be ANYOF because of
8987 * runtime checks for locale sanity, etc */
8988 ! in_locale,
8989 NULL
8990 );
8991
8992 RESTORE_WARNINGS;
8993 RExC_parse_set(save_parse + 1);
8994 RExC_end = save_end;
8995 SvREFCNT_dec_NN(final);
8996 SvREFCNT_dec_NN(result_string);
8997
8998 if (save_fold) {
8999 RExC_flags |= RXf_PMf_FOLD;
9000 }
9001
9002 if (!node) {
9003 RETURN_FAIL_ON_RESTART(*flagp, flagp);
9004 goto regclass_failed;
9005 }
9006
9007 /* Fix up the node type if we are in locale. (We have pretended we are
9008 * under /u for the purposes of regclass(), as this construct will only
9009 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
9010 * (so as to cause any warnings about bad locales to be output in
9011 * regexec.c), and add the flag that indicates to check if not in a
9012 * UTF-8 locale. The reason we above forbid optimization into
9013 * something other than an ANYOF node is simply to minimize the number
9014 * of code changes in regexec.c. Otherwise we would have to create new
9015 * EXACTish node types and deal with them. This decision could be
9016 * revisited should this construct become popular.
9017 *
9018 * (One might think we could look at the resulting ANYOF node and
9019 * suppress the flag if everything is above 255, as those would be
9020 * UTF-8 only, but this isn't true, as the components that led to that
9021 * result could have been locale-affected, and just happen to cancel
9022 * each other out under UTF-8 locales.) */
9023 if (in_locale) {
9024 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9025
9026 assert(OP(REGNODE_p(node)) == ANYOF);
9027
9028 OP(REGNODE_p(node)) = ANYOFL;
9029 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9030 }
9031 }
9032
9033 nextchar(pRExC_state);
9034 return node;
9035
9036 regclass_failed:
9037 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9038 (UV) *flagp);
9039 }
9040
9041 #ifdef ENABLE_REGEX_SETS_DEBUGGING
9042
9043 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)9044 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9045 AV * stack, const IV fence, AV * fence_stack)
9046 { /* Dumps the stacks in handle_regex_sets() */
9047
9048 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9049 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9050 SSize_t i;
9051
9052 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9053
9054 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9055
9056 if (stack_top < 0) {
9057 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9058 }
9059 else {
9060 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9061 for (i = stack_top; i >= 0; i--) {
9062 SV ** element_ptr = av_fetch(stack, i, FALSE);
9063 if (! element_ptr) {
9064 }
9065
9066 if (IS_OPERATOR(*element_ptr)) {
9067 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9068 (int) i, (int) SvIV(*element_ptr));
9069 }
9070 else {
9071 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9072 sv_dump(*element_ptr);
9073 }
9074 }
9075 }
9076
9077 if (fence_stack_top < 0) {
9078 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9079 }
9080 else {
9081 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9082 for (i = fence_stack_top; i >= 0; i--) {
9083 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9084 if (! element_ptr) {
9085 }
9086
9087 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9088 (int) i, (int) SvIV(*element_ptr));
9089 }
9090 }
9091 }
9092
9093 #endif
9094
9095 #undef IS_OPERATOR
9096 #undef IS_OPERAND
9097
9098 void
Perl_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)9099 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9100 {
9101 /* This adds the Latin1/above-Latin1 folding rules.
9102 *
9103 * This should be called only for a Latin1-range code points, cp, which is
9104 * known to be involved in a simple fold with other code points above
9105 * Latin1. It would give false results if /aa has been specified.
9106 * Multi-char folds are outside the scope of this, and must be handled
9107 * specially. */
9108
9109 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9110
9111 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9112
9113 /* The rules that are valid for all Unicode versions are hard-coded in */
9114 switch (cp) {
9115 case 'k':
9116 case 'K':
9117 *invlist =
9118 add_cp_to_invlist(*invlist, KELVIN_SIGN);
9119 break;
9120 case 's':
9121 case 'S':
9122 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9123 break;
9124 case MICRO_SIGN:
9125 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9126 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9127 break;
9128 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9129 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9130 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9131 break;
9132 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9133 *invlist = add_cp_to_invlist(*invlist,
9134 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9135 break;
9136
9137 default: /* Other code points are checked against the data for the
9138 current Unicode version */
9139 {
9140 Size_t folds_count;
9141 U32 first_fold;
9142 const U32 * remaining_folds;
9143 UV folded_cp;
9144
9145 if (isASCII(cp)) {
9146 folded_cp = toFOLD(cp);
9147 }
9148 else {
9149 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9150 Size_t dummy_len;
9151 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9152 }
9153
9154 if (folded_cp > 255) {
9155 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9156 }
9157
9158 folds_count = _inverse_folds(folded_cp, &first_fold,
9159 &remaining_folds);
9160 if (folds_count == 0) {
9161
9162 /* Use deprecated warning to increase the chances of this being
9163 * output */
9164 ckWARN2reg_d(RExC_parse,
9165 "Perl folding rules are not up-to-date for 0x%02X;"
9166 " please use the perlbug utility to report;", cp);
9167 }
9168 else {
9169 unsigned int i;
9170
9171 if (first_fold > 255) {
9172 *invlist = add_cp_to_invlist(*invlist, first_fold);
9173 }
9174 for (i = 0; i < folds_count - 1; i++) {
9175 if (remaining_folds[i] > 255) {
9176 *invlist = add_cp_to_invlist(*invlist,
9177 remaining_folds[i]);
9178 }
9179 }
9180 }
9181 break;
9182 }
9183 }
9184 }
9185
9186 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)9187 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9188 {
9189 /* Output the elements of the array given by '*posix_warnings' as REGEXP
9190 * warnings. */
9191
9192 SV * msg;
9193 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9194
9195 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9196
9197 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9198 CLEAR_POSIX_WARNINGS();
9199 return;
9200 }
9201
9202 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9203 if (first_is_fatal) { /* Avoid leaking this */
9204 av_undef(posix_warnings); /* This isn't necessary if the
9205 array is mortal, but is a
9206 fail-safe */
9207 (void) sv_2mortal(msg);
9208 PREPARE_TO_DIE;
9209 }
9210 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9211 SvREFCNT_dec_NN(msg);
9212 }
9213
9214 UPDATE_WARNINGS_LOC(RExC_parse);
9215 }
9216
9217 PERL_STATIC_INLINE Size_t
S_find_first_differing_byte_pos(const U8 * s1,const U8 * s2,const Size_t max)9218 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9219 {
9220 const U8 * const start = s1;
9221 const U8 * const send = start + max;
9222
9223 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9224
9225 while (s1 < send && *s1 == *s2) {
9226 s1++; s2++;
9227 }
9228
9229 return s1 - start;
9230 }
9231
9232 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)9233 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9234 {
9235 /* This adds the string scalar <multi_string> to the array
9236 * <multi_char_matches>. <multi_string> is known to have exactly
9237 * <cp_count> code points in it. This is used when constructing a
9238 * bracketed character class and we find something that needs to match more
9239 * than a single character.
9240 *
9241 * <multi_char_matches> is actually an array of arrays. Each top-level
9242 * element is an array that contains all the strings known so far that are
9243 * the same length. And that length (in number of code points) is the same
9244 * as the index of the top-level array. Hence, the [2] element is an
9245 * array, each element thereof is a string containing TWO code points;
9246 * while element [3] is for strings of THREE characters, and so on. Since
9247 * this is for multi-char strings there can never be a [0] nor [1] element.
9248 *
9249 * When we rewrite the character class below, we will do so such that the
9250 * longest strings are written first, so that it prefers the longest
9251 * matching strings first. This is done even if it turns out that any
9252 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
9253 * Christiansen has agreed that this is ok. This makes the test for the
9254 * ligature 'ffi' come before the test for 'ff', for example */
9255
9256 AV* this_array;
9257 AV** this_array_ptr;
9258
9259 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9260
9261 if (! multi_char_matches) {
9262 multi_char_matches = newAV();
9263 }
9264
9265 if (av_exists(multi_char_matches, cp_count)) {
9266 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9267 this_array = *this_array_ptr;
9268 }
9269 else {
9270 this_array = newAV();
9271 av_store_simple(multi_char_matches, cp_count,
9272 (SV*) this_array);
9273 }
9274 av_push_simple(this_array, multi_string);
9275
9276 return multi_char_matches;
9277 }
9278
9279 /* The names of properties whose definitions are not known at compile time are
9280 * stored in this SV, after a constant heading. So if the length has been
9281 * changed since initialization, then there is a run-time definition. */
9282 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
9283 (SvCUR(listsv) != initial_listsv_len)
9284
9285 /* There is a restricted set of white space characters that are legal when
9286 * ignoring white space in a bracketed character class. This generates the
9287 * code to skip them.
9288 *
9289 * There is a line below that uses the same white space criteria but is outside
9290 * this macro. Both here and there must use the same definition */
9291 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
9292 STMT_START { \
9293 if (do_skip) { \
9294 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
9295 { \
9296 p++; \
9297 } \
9298 } \
9299 } STMT_END
9300
9301 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)9302 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9303 const bool stop_at_1, /* Just parse the next thing, don't
9304 look for a full character class */
9305 bool allow_mutiple_chars,
9306 const bool silence_non_portable, /* Don't output warnings
9307 about too large
9308 characters */
9309 const bool strict,
9310 bool optimizable, /* ? Allow a non-ANYOF return
9311 node */
9312 SV** ret_invlist /* Return an inversion list, not a node */
9313 )
9314 {
9315 /* parse a bracketed class specification. Most of these will produce an
9316 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9317 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
9318 * under /i with multi-character folds: it will be rewritten following the
9319 * paradigm of this example, where the <multi-fold>s are characters which
9320 * fold to multiple character sequences:
9321 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9322 * gets effectively rewritten as:
9323 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9324 * reg() gets called (recursively) on the rewritten version, and this
9325 * function will return what it constructs. (Actually the <multi-fold>s
9326 * aren't physically removed from the [abcdefghi], it's just that they are
9327 * ignored in the recursion by means of a flag:
9328 * <RExC_in_multi_char_class>.)
9329 *
9330 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9331 * characters, with the corresponding bit set if that character is in the
9332 * list. For characters above this, an inversion list is used. There
9333 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9334 * determinable at compile time
9335 *
9336 * On success, returns the offset at which any next node should be placed
9337 * into the regex engine program being compiled.
9338 *
9339 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9340 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9341 * UTF-8
9342 */
9343
9344 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9345 IV range = 0;
9346 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9347 regnode_offset ret = -1; /* Initialized to an illegal value */
9348 STRLEN numlen;
9349 int namedclass = OOB_NAMEDCLASS;
9350 char *rangebegin = NULL;
9351 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
9352 aren't available at the time this was called */
9353 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9354 than just initialized. */
9355 SV* properties = NULL; /* Code points that match \p{} \P{} */
9356 SV* posixes = NULL; /* Code points that match classes like [:word:],
9357 extended beyond the Latin1 range. These have to
9358 be kept separate from other code points for much
9359 of this function because their handling is
9360 different under /i, and for most classes under
9361 /d as well */
9362 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
9363 separate for a while from the non-complemented
9364 versions because of complications with /d
9365 matching */
9366 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9367 treated more simply than the general case,
9368 leading to less compilation and execution
9369 work */
9370 UV element_count = 0; /* Number of distinct elements in the class.
9371 Optimizations may be possible if this is tiny */
9372 AV * multi_char_matches = NULL; /* Code points that fold to more than one
9373 character; used under /i */
9374 UV n;
9375 char * stop_ptr = RExC_end; /* where to stop parsing */
9376
9377 /* ignore unescaped whitespace? */
9378 const bool skip_white = cBOOL( ret_invlist
9379 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9380
9381 /* inversion list of code points this node matches only when the target
9382 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
9383 * /d) */
9384 SV* upper_latin1_only_utf8_matches = NULL;
9385
9386 /* Inversion list of code points this node matches regardless of things
9387 * like locale, folding, utf8ness of the target string */
9388 SV* cp_list = NULL;
9389
9390 /* Like cp_list, but code points on this list need to be checked for things
9391 * that fold to/from them under /i */
9392 SV* cp_foldable_list = NULL;
9393
9394 /* Like cp_list, but code points on this list are valid only when the
9395 * runtime locale is UTF-8 */
9396 SV* only_utf8_locale_list = NULL;
9397
9398 /* In a range, if one of the endpoints is non-character-set portable,
9399 * meaning that it hard-codes a code point that may mean a different
9400 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9401 * mnemonic '\t' which each mean the same character no matter which
9402 * character set the platform is on. */
9403 unsigned int non_portable_endpoint = 0;
9404
9405 /* Is the range unicode? which means on a platform that isn't 1-1 native
9406 * to Unicode (i.e. non-ASCII), each code point in it should be considered
9407 * to be a Unicode value. */
9408 bool unicode_range = FALSE;
9409 bool invert = FALSE; /* Is this class to be complemented */
9410
9411 bool warn_super = ALWAYS_WARN_SUPER;
9412
9413 const char * orig_parse = RExC_parse;
9414
9415 /* This variable is used to mark where the end in the input is of something
9416 * that looks like a POSIX construct but isn't. During the parse, when
9417 * something looks like it could be such a construct is encountered, it is
9418 * checked for being one, but not if we've already checked this area of the
9419 * input. Only after this position is reached do we check again */
9420 char *not_posix_region_end = RExC_parse - 1;
9421
9422 AV* posix_warnings = NULL;
9423 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9424 U8 op = ANYOF; /* The returned node-type, initialized to the expected
9425 type. */
9426 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
9427 U32 posixl = 0; /* bit field of posix classes matched under /l */
9428
9429
9430 /* Flags as to what things aren't knowable until runtime. (Note that these are
9431 * mutually exclusive.) */
9432 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
9433 haven't been defined as of yet */
9434 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
9435 UTF-8 or not */
9436 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
9437 what gets folded */
9438 U32 has_runtime_dependency = 0; /* OR of the above flags */
9439
9440 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9441
9442 PERL_ARGS_ASSERT_REGCLASS;
9443 #ifndef DEBUGGING
9444 PERL_UNUSED_ARG(depth);
9445 #endif
9446
9447 assert(! (ret_invlist && allow_mutiple_chars));
9448
9449 /* If wants an inversion list returned, we can't optimize to something
9450 * else. */
9451 if (ret_invlist) {
9452 optimizable = FALSE;
9453 }
9454
9455 DEBUG_PARSE("clas");
9456
9457 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
9458 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
9459 && UNICODE_DOT_DOT_VERSION == 0)
9460 allow_mutiple_chars = FALSE;
9461 #endif
9462
9463 /* We include the /i status at the beginning of this so that we can
9464 * know it at runtime */
9465 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9466 initial_listsv_len = SvCUR(listsv);
9467 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
9468
9469 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9470
9471 assert(RExC_parse <= RExC_end);
9472
9473 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
9474 RExC_parse_inc_by(1);
9475 invert = TRUE;
9476 allow_mutiple_chars = FALSE;
9477 MARK_NAUGHTY(1);
9478 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9479 }
9480
9481 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9482 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9483 int maybe_class = handle_possible_posix(pRExC_state,
9484 RExC_parse,
9485 ¬_posix_region_end,
9486 NULL,
9487 TRUE /* checking only */);
9488 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9489 ckWARN4reg(not_posix_region_end,
9490 "POSIX syntax [%c %c] belongs inside character classes%s",
9491 *RExC_parse, *RExC_parse,
9492 (maybe_class == OOB_NAMEDCLASS)
9493 ? ((POSIXCC_NOTYET(*RExC_parse))
9494 ? " (but this one isn't implemented)"
9495 : " (but this one isn't fully valid)")
9496 : ""
9497 );
9498 }
9499 }
9500
9501 /* If the caller wants us to just parse a single element, accomplish this
9502 * by faking the loop ending condition */
9503 if (stop_at_1 && RExC_end > RExC_parse) {
9504 stop_ptr = RExC_parse + 1;
9505 }
9506
9507 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9508 if (UCHARAT(RExC_parse) == ']')
9509 goto charclassloop;
9510
9511 while (1) {
9512
9513 if ( posix_warnings
9514 && av_tindex_skip_len_mg(posix_warnings) >= 0
9515 && RExC_parse > not_posix_region_end)
9516 {
9517 /* Warnings about posix class issues are considered tentative until
9518 * we are far enough along in the parse that we can no longer
9519 * change our mind, at which point we output them. This is done
9520 * each time through the loop so that a later class won't zap them
9521 * before they have been dealt with. */
9522 output_posix_warnings(pRExC_state, posix_warnings);
9523 }
9524
9525 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9526
9527 if (RExC_parse >= stop_ptr) {
9528 break;
9529 }
9530
9531 if (UCHARAT(RExC_parse) == ']') {
9532 break;
9533 }
9534
9535 charclassloop:
9536
9537 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9538 save_value = value;
9539 save_prevvalue = prevvalue;
9540
9541 if (!range) {
9542 rangebegin = RExC_parse;
9543 element_count++;
9544 non_portable_endpoint = 0;
9545 }
9546 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9547 value = utf8n_to_uvchr((U8*)RExC_parse,
9548 RExC_end - RExC_parse,
9549 &numlen, UTF8_ALLOW_DEFAULT);
9550 RExC_parse_inc_by(numlen);
9551 }
9552 else {
9553 value = UCHARAT(RExC_parse);
9554 RExC_parse_inc_by(1);
9555 }
9556
9557 if (value == '[') {
9558 char * posix_class_end;
9559 namedclass = handle_possible_posix(pRExC_state,
9560 RExC_parse,
9561 &posix_class_end,
9562 do_posix_warnings ? &posix_warnings : NULL,
9563 FALSE /* die if error */);
9564 if (namedclass > OOB_NAMEDCLASS) {
9565
9566 /* If there was an earlier attempt to parse this particular
9567 * posix class, and it failed, it was a false alarm, as this
9568 * successful one proves */
9569 if ( posix_warnings
9570 && av_tindex_skip_len_mg(posix_warnings) >= 0
9571 && not_posix_region_end >= RExC_parse
9572 && not_posix_region_end <= posix_class_end)
9573 {
9574 av_undef(posix_warnings);
9575 }
9576
9577 RExC_parse_set(posix_class_end);
9578 }
9579 else if (namedclass == OOB_NAMEDCLASS) {
9580 not_posix_region_end = posix_class_end;
9581 }
9582 else {
9583 namedclass = OOB_NAMEDCLASS;
9584 }
9585 }
9586 else if ( RExC_parse - 1 > not_posix_region_end
9587 && MAYBE_POSIXCC(value))
9588 {
9589 (void) handle_possible_posix(
9590 pRExC_state,
9591 RExC_parse - 1, /* -1 because parse has already been
9592 advanced */
9593 ¬_posix_region_end,
9594 do_posix_warnings ? &posix_warnings : NULL,
9595 TRUE /* checking only */);
9596 }
9597 else if ( strict && ! skip_white
9598 && ( generic_isCC_(value, CC_VERTSPACE_)
9599 || is_VERTWS_cp_high(value)))
9600 {
9601 vFAIL("Literal vertical space in [] is illegal except under /x");
9602 }
9603 else if (value == '\\') {
9604 /* Is a backslash; get the code point of the char after it */
9605
9606 if (RExC_parse >= RExC_end) {
9607 vFAIL("Unmatched [");
9608 }
9609
9610 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9611 value = utf8n_to_uvchr((U8*)RExC_parse,
9612 RExC_end - RExC_parse,
9613 &numlen, UTF8_ALLOW_DEFAULT);
9614 RExC_parse_inc_by(numlen);
9615 }
9616 else {
9617 value = UCHARAT(RExC_parse);
9618 RExC_parse_inc_by(1);
9619 }
9620
9621 /* Some compilers cannot handle switching on 64-bit integer
9622 * values, therefore value cannot be an UV. Yes, this will
9623 * be a problem later if we want switch on Unicode.
9624 * A similar issue a little bit later when switching on
9625 * namedclass. --jhi */
9626
9627 /* If the \ is escaping white space when white space is being
9628 * skipped, it means that that white space is wanted literally, and
9629 * is already in 'value'. Otherwise, need to translate the escape
9630 * into what it signifies. */
9631 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9632 const char * message;
9633 U32 packed_warn;
9634 U8 grok_c_char;
9635
9636 case 'w': namedclass = ANYOF_WORDCHAR; break;
9637 case 'W': namedclass = ANYOF_NWORDCHAR; break;
9638 case 's': namedclass = ANYOF_SPACE; break;
9639 case 'S': namedclass = ANYOF_NSPACE; break;
9640 case 'd': namedclass = ANYOF_DIGIT; break;
9641 case 'D': namedclass = ANYOF_NDIGIT; break;
9642 case 'v': namedclass = ANYOF_VERTWS; break;
9643 case 'V': namedclass = ANYOF_NVERTWS; break;
9644 case 'h': namedclass = ANYOF_HORIZWS; break;
9645 case 'H': namedclass = ANYOF_NHORIZWS; break;
9646 case 'N': /* Handle \N{NAME} in class */
9647 {
9648 const char * const backslash_N_beg = RExC_parse - 2;
9649 int cp_count;
9650
9651 if (! grok_bslash_N(pRExC_state,
9652 NULL, /* No regnode */
9653 &value, /* Yes single value */
9654 &cp_count, /* Multiple code pt count */
9655 flagp,
9656 strict,
9657 depth)
9658 ) {
9659
9660 if (*flagp & NEED_UTF8)
9661 FAIL("panic: grok_bslash_N set NEED_UTF8");
9662
9663 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9664
9665 if (cp_count < 0) {
9666 vFAIL("\\N in a character class must be a named character: \\N{...}");
9667 }
9668 else if (cp_count == 0) {
9669 ckWARNreg(RExC_parse,
9670 "Ignoring zero length \\N{} in character class");
9671 }
9672 else { /* cp_count > 1 */
9673 assert(cp_count > 1);
9674 if (! RExC_in_multi_char_class) {
9675 if ( ! allow_mutiple_chars
9676 || invert
9677 || range
9678 || *RExC_parse == '-')
9679 {
9680 if (strict) {
9681 RExC_parse--;
9682 vFAIL("\\N{} here is restricted to one character");
9683 }
9684 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9685 break; /* <value> contains the first code
9686 point. Drop out of the switch to
9687 process it */
9688 }
9689 else {
9690 SV * multi_char_N = newSVpvn(backslash_N_beg,
9691 RExC_parse - backslash_N_beg);
9692 multi_char_matches
9693 = add_multi_match(multi_char_matches,
9694 multi_char_N,
9695 cp_count);
9696 }
9697 }
9698 } /* End of cp_count != 1 */
9699
9700 /* This element should not be processed further in this
9701 * class */
9702 element_count--;
9703 value = save_value;
9704 prevvalue = save_prevvalue;
9705 continue; /* Back to top of loop to get next char */
9706 }
9707
9708 /* Here, is a single code point, and <value> contains it */
9709 unicode_range = TRUE; /* \N{} are Unicode */
9710 }
9711 break;
9712 case 'p':
9713 case 'P':
9714 {
9715 char *e;
9716
9717 if (RExC_pm_flags & PMf_WILDCARD) {
9718 RExC_parse_inc_by(1);
9719 /* diag_listed_as: Use of %s is not allowed in Unicode
9720 property wildcard subpatterns in regex; marked by <--
9721 HERE in m/%s/ */
9722 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9723 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9724 }
9725
9726 /* \p means they want Unicode semantics */
9727 REQUIRE_UNI_RULES(flagp, 0);
9728
9729 if (RExC_parse >= RExC_end)
9730 vFAIL2("Empty \\%c", (U8)value);
9731 if (*RExC_parse == '{') {
9732 const U8 c = (U8)value;
9733 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9734 if (!e) {
9735 RExC_parse_inc_by(1);
9736 vFAIL2("Missing right brace on \\%c{}", c);
9737 }
9738
9739 RExC_parse_inc_by(1);
9740
9741 /* White space is allowed adjacent to the braces and after
9742 * any '^', even when not under /x */
9743 while (isSPACE(*RExC_parse)) {
9744 RExC_parse_inc_by(1);
9745 }
9746
9747 if (UCHARAT(RExC_parse) == '^') {
9748
9749 /* toggle. (The rhs xor gets the single bit that
9750 * differs between P and p; the other xor inverts just
9751 * that bit) */
9752 value ^= 'P' ^ 'p';
9753
9754 RExC_parse_inc_by(1);
9755 while (isSPACE(*RExC_parse)) {
9756 RExC_parse_inc_by(1);
9757 }
9758 }
9759
9760 if (e == RExC_parse)
9761 vFAIL2("Empty \\%c{}", c);
9762
9763 n = e - RExC_parse;
9764 while (isSPACE(*(RExC_parse + n - 1)))
9765 n--;
9766
9767 } /* The \p isn't immediately followed by a '{' */
9768 else if (! isALPHA(*RExC_parse)) {
9769 RExC_parse_inc_safe();
9770 vFAIL2("Character following \\%c must be '{' or a "
9771 "single-character Unicode property name",
9772 (U8) value);
9773 }
9774 else {
9775 e = RExC_parse;
9776 n = 1;
9777 }
9778 {
9779 char* name = RExC_parse;
9780
9781 /* Any message returned about expanding the definition */
9782 SV* msg = newSVpvs_flags("", SVs_TEMP);
9783
9784 /* If set TRUE, the property is user-defined as opposed to
9785 * official Unicode */
9786 bool user_defined = FALSE;
9787 AV * strings = NULL;
9788
9789 SV * prop_definition = parse_uniprop_string(
9790 name, n, UTF, FOLD,
9791 FALSE, /* This is compile-time */
9792
9793 /* We can't defer this defn when
9794 * the full result is required in
9795 * this call */
9796 ! cBOOL(ret_invlist),
9797
9798 &strings,
9799 &user_defined,
9800 msg,
9801 0 /* Base level */
9802 );
9803 if (SvCUR(msg)) { /* Assumes any error causes a msg */
9804 assert(prop_definition == NULL);
9805 RExC_parse_set(e + 1);
9806 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
9807 thing so, or else the display is
9808 mojibake */
9809 RExC_utf8 = TRUE;
9810 }
9811 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9812 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9813 SvCUR(msg), SvPVX(msg)));
9814 }
9815
9816 assert(prop_definition || strings);
9817
9818 if (strings) {
9819 if (ret_invlist) {
9820 if (! prop_definition) {
9821 RExC_parse_set(e + 1);
9822 vFAIL("Unicode string properties are not implemented in (?[...])");
9823 }
9824 else {
9825 ckWARNreg(e + 1,
9826 "Using just the single character results"
9827 " returned by \\p{} in (?[...])");
9828 }
9829 }
9830 else if (! RExC_in_multi_char_class) {
9831 if (invert ^ (value == 'P')) {
9832 RExC_parse_set(e + 1);
9833 vFAIL("Inverting a character class which contains"
9834 " a multi-character sequence is illegal");
9835 }
9836
9837 /* For each multi-character string ... */
9838 while (av_count(strings) > 0) {
9839 /* ... Each entry is itself an array of code
9840 * points. */
9841 AV * this_string = (AV *) av_shift( strings);
9842 STRLEN cp_count = av_count(this_string);
9843 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9844 SvPVCLEAR_FRESH(final);
9845
9846 /* Create another string of sequences of \x{...} */
9847 while (av_count(this_string) > 0) {
9848 SV * character = av_shift(this_string);
9849 UV cp = SvUV(character);
9850
9851 if (cp > 255) {
9852 REQUIRE_UTF8(flagp);
9853 }
9854 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9855 cp);
9856 SvREFCNT_dec_NN(character);
9857 }
9858 SvREFCNT_dec_NN(this_string);
9859
9860 /* And add that to the list of such things */
9861 multi_char_matches
9862 = add_multi_match(multi_char_matches,
9863 final,
9864 cp_count);
9865 }
9866 }
9867 SvREFCNT_dec_NN(strings);
9868 }
9869
9870 if (! prop_definition) { /* If we got only a string,
9871 this iteration didn't really
9872 find a character */
9873 element_count--;
9874 }
9875 else if (! is_invlist(prop_definition)) {
9876
9877 /* Here, the definition isn't known, so we have gotten
9878 * returned a string that will be evaluated if and when
9879 * encountered at runtime. We add it to the list of
9880 * such properties, along with whether it should be
9881 * complemented or not */
9882 if (value == 'P') {
9883 sv_catpvs(listsv, "!");
9884 }
9885 else {
9886 sv_catpvs(listsv, "+");
9887 }
9888 sv_catsv(listsv, prop_definition);
9889
9890 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9891
9892 /* We don't know yet what this matches, so have to flag
9893 * it */
9894 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9895 }
9896 else {
9897 assert (prop_definition && is_invlist(prop_definition));
9898
9899 /* Here we do have the complete property definition
9900 *
9901 * Temporary workaround for [GH #16520]. For this
9902 * precise input that is in the .t that is failing,
9903 * load utf8.pm, which is what the test wants, so that
9904 * that .t passes */
9905 if ( memEQs(RExC_start, e + 1 - RExC_start,
9906 "foo\\p{Alnum}")
9907 && ! hv_common(GvHVn(PL_incgv),
9908 NULL,
9909 "utf8.pm", sizeof("utf8.pm") - 1,
9910 0, HV_FETCH_ISEXISTS, NULL, 0))
9911 {
9912 require_pv("utf8.pm");
9913 }
9914
9915 if (! user_defined &&
9916 /* We warn on matching an above-Unicode code point
9917 * if the match would return true, except don't
9918 * warn for \p{All}, which has exactly one element
9919 * = 0 */
9920 (_invlist_contains_cp(prop_definition, 0x110000)
9921 && (! (_invlist_len(prop_definition) == 1
9922 && *invlist_array(prop_definition) == 0))))
9923 {
9924 warn_super = TRUE;
9925 }
9926
9927 /* Invert if asking for the complement */
9928 if (value == 'P') {
9929 _invlist_union_complement_2nd(properties,
9930 prop_definition,
9931 &properties);
9932 }
9933 else {
9934 _invlist_union(properties, prop_definition, &properties);
9935 }
9936 }
9937 }
9938
9939 RExC_parse_set(e + 1);
9940 namedclass = ANYOF_UNIPROP; /* no official name, but it's
9941 named */
9942 }
9943 break;
9944 case 'n': value = '\n'; break;
9945 case 'r': value = '\r'; break;
9946 case 't': value = '\t'; break;
9947 case 'f': value = '\f'; break;
9948 case 'b': value = '\b'; break;
9949 case 'e': value = ESC_NATIVE; break;
9950 case 'a': value = '\a'; break;
9951 case 'o':
9952 RExC_parse--; /* function expects to be pointed at the 'o' */
9953 if (! grok_bslash_o(&RExC_parse,
9954 RExC_end,
9955 &value,
9956 &message,
9957 &packed_warn,
9958 strict,
9959 cBOOL(range), /* MAX_UV allowed for range
9960 upper limit */
9961 UTF))
9962 {
9963 vFAIL(message);
9964 }
9965 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9966 warn_non_literal_string(RExC_parse, packed_warn, message);
9967 }
9968
9969 if (value < 256) {
9970 non_portable_endpoint++;
9971 }
9972 break;
9973 case 'x':
9974 RExC_parse--; /* function expects to be pointed at the 'x' */
9975 if (! grok_bslash_x(&RExC_parse,
9976 RExC_end,
9977 &value,
9978 &message,
9979 &packed_warn,
9980 strict,
9981 cBOOL(range), /* MAX_UV allowed for range
9982 upper limit */
9983 UTF))
9984 {
9985 vFAIL(message);
9986 }
9987 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9988 warn_non_literal_string(RExC_parse, packed_warn, message);
9989 }
9990
9991 if (value < 256) {
9992 non_portable_endpoint++;
9993 }
9994 break;
9995 case 'c':
9996 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
9997 &packed_warn))
9998 {
9999 /* going to die anyway; point to exact spot of
10000 * failure */
10001 RExC_parse_inc_safe();
10002 vFAIL(message);
10003 }
10004
10005 value = grok_c_char;
10006 RExC_parse_inc_by(1);
10007 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10008 warn_non_literal_string(RExC_parse, packed_warn, message);
10009 }
10010
10011 non_portable_endpoint++;
10012 break;
10013 case '0': case '1': case '2': case '3': case '4':
10014 case '5': case '6': case '7':
10015 {
10016 /* Take 1-3 octal digits */
10017 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10018 | PERL_SCAN_NOTIFY_ILLDIGIT;
10019 numlen = (strict) ? 4 : 3;
10020 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10021 RExC_parse_inc_by(numlen);
10022 if (numlen != 3) {
10023 if (strict) {
10024 RExC_parse_inc_safe();
10025 vFAIL("Need exactly 3 octal digits");
10026 }
10027 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10028 && RExC_parse < RExC_end
10029 && isDIGIT(*RExC_parse)
10030 && ckWARN(WARN_REGEXP))
10031 {
10032 reg_warn_non_literal_string(
10033 RExC_parse + 1,
10034 form_alien_digit_msg(8, numlen, RExC_parse,
10035 RExC_end, UTF, FALSE));
10036 }
10037 }
10038 if (value < 256) {
10039 non_portable_endpoint++;
10040 }
10041 break;
10042 }
10043 default:
10044 /* Allow \_ to not give an error */
10045 if (isWORDCHAR(value) && value != '_') {
10046 if (strict) {
10047 vFAIL2("Unrecognized escape \\%c in character class",
10048 (int)value);
10049 }
10050 else {
10051 ckWARN2reg(RExC_parse,
10052 "Unrecognized escape \\%c in character class passed through",
10053 (int)value);
10054 }
10055 }
10056 break;
10057 } /* End of switch on char following backslash */
10058 } /* end of handling backslash escape sequences */
10059
10060 /* Here, we have the current token in 'value' */
10061
10062 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10063 U8 classnum;
10064
10065 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10066 * literal, as is the character that began the false range, i.e.
10067 * the 'a' in the examples */
10068 if (range) {
10069 const int w = (RExC_parse >= rangebegin)
10070 ? RExC_parse - rangebegin
10071 : 0;
10072 if (strict) {
10073 vFAIL2utf8f(
10074 "False [] range \"%" UTF8f "\"",
10075 UTF8fARG(UTF, w, rangebegin));
10076 }
10077 else {
10078 ckWARN2reg(RExC_parse,
10079 "False [] range \"%" UTF8f "\"",
10080 UTF8fARG(UTF, w, rangebegin));
10081 cp_list = add_cp_to_invlist(cp_list, '-');
10082 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10083 prevvalue);
10084 }
10085
10086 range = 0; /* this was not a true range */
10087 element_count += 2; /* So counts for three values */
10088 }
10089
10090 classnum = namedclass_to_classnum(namedclass);
10091
10092 if (LOC && namedclass < ANYOF_POSIXL_MAX
10093 #ifndef HAS_ISASCII
10094 && classnum != CC_ASCII_
10095 #endif
10096 ) {
10097 SV* scratch_list = NULL;
10098
10099 /* What the Posix classes (like \w, [:space:]) match isn't
10100 * generally knowable under locale until actual match time. A
10101 * special node is used for these which has extra space for a
10102 * bitmap, with a bit reserved for each named class that is to
10103 * be matched against. (This isn't needed for \p{} and
10104 * pseudo-classes, as they are not affected by locale, and
10105 * hence are dealt with separately.) However, if a named class
10106 * and its complement are both present, then it matches
10107 * everything, and there is no runtime dependency. Odd numbers
10108 * are the complements of the next lower number, so xor works.
10109 * (Note that something like [\w\D] should match everything,
10110 * because \d should be a proper subset of \w. But rather than
10111 * trust that the locale is well behaved, we leave this to
10112 * runtime to sort out) */
10113 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10114 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10115 POSIXL_ZERO(posixl);
10116 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10117 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10118 continue; /* We could ignore the rest of the class, but
10119 best to parse it for any errors */
10120 }
10121 else { /* Here, isn't the complement of any already parsed
10122 class */
10123 POSIXL_SET(posixl, namedclass);
10124 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10125 anyof_flags |= ANYOF_MATCHES_POSIXL;
10126
10127 /* The above-Latin1 characters are not subject to locale
10128 * rules. Just add them to the unconditionally-matched
10129 * list */
10130
10131 /* Get the list of the above-Latin1 code points this
10132 * matches */
10133 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10134 PL_XPosix_ptrs[classnum],
10135
10136 /* Odd numbers are complements,
10137 * like NDIGIT, NASCII, ... */
10138 namedclass % 2 != 0,
10139 &scratch_list);
10140 /* Checking if 'cp_list' is NULL first saves an extra
10141 * clone. Its reference count will be decremented at the
10142 * next union, etc, or if this is the only instance, at the
10143 * end of the routine */
10144 if (! cp_list) {
10145 cp_list = scratch_list;
10146 }
10147 else {
10148 _invlist_union(cp_list, scratch_list, &cp_list);
10149 SvREFCNT_dec_NN(scratch_list);
10150 }
10151 continue; /* Go get next character */
10152 }
10153 }
10154 else {
10155
10156 /* Here, is not /l, or is a POSIX class for which /l doesn't
10157 * matter (or is a Unicode property, which is skipped here). */
10158 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
10159 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10160
10161 /* Here, should be \h, \H, \v, or \V. None of /d, /i
10162 * nor /l make a difference in what these match,
10163 * therefore we just add what they match to cp_list. */
10164 if (classnum != CC_VERTSPACE_) {
10165 assert( namedclass == ANYOF_HORIZWS
10166 || namedclass == ANYOF_NHORIZWS);
10167
10168 /* It turns out that \h is just a synonym for
10169 * XPosixBlank */
10170 classnum = CC_BLANK_;
10171 }
10172
10173 _invlist_union_maybe_complement_2nd(
10174 cp_list,
10175 PL_XPosix_ptrs[classnum],
10176 namedclass % 2 != 0, /* Complement if odd
10177 (NHORIZWS, NVERTWS)
10178 */
10179 &cp_list);
10180 }
10181 }
10182 else if ( AT_LEAST_UNI_SEMANTICS
10183 || classnum == CC_ASCII_
10184 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
10185 || classnum == CC_XDIGIT_)))
10186 {
10187 /* We usually have to worry about /d affecting what POSIX
10188 * classes match, with special code needed because we won't
10189 * know until runtime what all matches. But there is no
10190 * extra work needed under /u and /a; and [:ascii:] is
10191 * unaffected by /d; and :digit: and :xdigit: don't have
10192 * runtime differences under /d. So we can special case
10193 * these, and avoid some extra work below, and at runtime.
10194 * */
10195 _invlist_union_maybe_complement_2nd(
10196 simple_posixes,
10197 ((AT_LEAST_ASCII_RESTRICTED)
10198 ? PL_Posix_ptrs[classnum]
10199 : PL_XPosix_ptrs[classnum]),
10200 namedclass % 2 != 0,
10201 &simple_posixes);
10202 }
10203 else { /* Garden variety class. If is NUPPER, NALPHA, ...
10204 complement and use nposixes */
10205 SV** posixes_ptr = namedclass % 2 == 0
10206 ? &posixes
10207 : &nposixes;
10208 _invlist_union_maybe_complement_2nd(
10209 *posixes_ptr,
10210 PL_XPosix_ptrs[classnum],
10211 namedclass % 2 != 0,
10212 posixes_ptr);
10213 }
10214 }
10215 } /* end of namedclass \blah */
10216
10217 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10218
10219 /* If 'range' is set, 'value' is the ending of a range--check its
10220 * validity. (If value isn't a single code point in the case of a
10221 * range, we should have figured that out above in the code that
10222 * catches false ranges). Later, we will handle each individual code
10223 * point in the range. If 'range' isn't set, this could be the
10224 * beginning of a range, so check for that by looking ahead to see if
10225 * the next real character to be processed is the range indicator--the
10226 * minus sign */
10227
10228 if (range) {
10229 #ifdef EBCDIC
10230 /* For unicode ranges, we have to test that the Unicode as opposed
10231 * to the native values are not decreasing. (Above 255, there is
10232 * no difference between native and Unicode) */
10233 if (unicode_range && prevvalue < 255 && value < 255) {
10234 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10235 goto backwards_range;
10236 }
10237 }
10238 else
10239 #endif
10240 if (prevvalue > value) /* b-a */ {
10241 int w;
10242 #ifdef EBCDIC
10243 backwards_range:
10244 #endif
10245 w = RExC_parse - rangebegin;
10246 vFAIL2utf8f(
10247 "Invalid [] range \"%" UTF8f "\"",
10248 UTF8fARG(UTF, w, rangebegin));
10249 NOT_REACHED; /* NOTREACHED */
10250 }
10251 }
10252 else {
10253 prevvalue = value; /* save the beginning of the potential range */
10254 if (! stop_at_1 /* Can't be a range if parsing just one thing */
10255 && *RExC_parse == '-')
10256 {
10257 char* next_char_ptr = RExC_parse + 1;
10258
10259 /* Get the next real char after the '-' */
10260 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10261
10262 /* If the '-' is at the end of the class (just before the ']',
10263 * it is a literal minus; otherwise it is a range */
10264 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10265 RExC_parse_set(next_char_ptr);
10266
10267 /* a bad range like \w-, [:word:]- ? */
10268 if (namedclass > OOB_NAMEDCLASS) {
10269 if (strict || ckWARN(WARN_REGEXP)) {
10270 const int w = RExC_parse >= rangebegin
10271 ? RExC_parse - rangebegin
10272 : 0;
10273 if (strict) {
10274 vFAIL4("False [] range \"%*.*s\"",
10275 w, w, rangebegin);
10276 }
10277 else {
10278 vWARN4(RExC_parse,
10279 "False [] range \"%*.*s\"",
10280 w, w, rangebegin);
10281 }
10282 }
10283 cp_list = add_cp_to_invlist(cp_list, '-');
10284 element_count++;
10285 } else
10286 range = 1; /* yeah, it's a range! */
10287 continue; /* but do it the next time */
10288 }
10289 }
10290 }
10291
10292 if (namedclass > OOB_NAMEDCLASS) {
10293 continue;
10294 }
10295
10296 /* Here, we have a single value this time through the loop, and
10297 * <prevvalue> is the beginning of the range, if any; or <value> if
10298 * not. */
10299
10300 /* non-Latin1 code point implies unicode semantics. */
10301 if (value > 255) {
10302 if (value > MAX_LEGAL_CP && ( value != UV_MAX
10303 || prevvalue > MAX_LEGAL_CP))
10304 {
10305 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10306 }
10307 REQUIRE_UNI_RULES(flagp, 0);
10308 if ( ! silence_non_portable
10309 && UNICODE_IS_PERL_EXTENDED(value)
10310 && TO_OUTPUT_WARNINGS(RExC_parse))
10311 {
10312 ckWARN2_non_literal_string(RExC_parse,
10313 packWARN(WARN_PORTABLE),
10314 PL_extended_cp_format,
10315 value);
10316 }
10317 }
10318
10319 /* Ready to process either the single value, or the completed range.
10320 * For single-valued non-inverted ranges, we consider the possibility
10321 * of multi-char folds. (We made a conscious decision to not do this
10322 * for the other cases because it can often lead to non-intuitive
10323 * results. For example, you have the peculiar case that:
10324 * "s s" =~ /^[^\xDF]+$/i => Y
10325 * "ss" =~ /^[^\xDF]+$/i => N
10326 *
10327 * See [perl #89750] */
10328 if (FOLD && allow_mutiple_chars && value == prevvalue) {
10329 if ( value == LATIN_SMALL_LETTER_SHARP_S
10330 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10331 value)))
10332 {
10333 /* Here <value> is indeed a multi-char fold. Get what it is */
10334
10335 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10336 STRLEN foldlen;
10337
10338 UV folded = _to_uni_fold_flags(
10339 value,
10340 foldbuf,
10341 &foldlen,
10342 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10343 ? FOLD_FLAGS_NOMIX_ASCII
10344 : 0)
10345 );
10346
10347 /* Here, <folded> should be the first character of the
10348 * multi-char fold of <value>, with <foldbuf> containing the
10349 * whole thing. But, if this fold is not allowed (because of
10350 * the flags), <fold> will be the same as <value>, and should
10351 * be processed like any other character, so skip the special
10352 * handling */
10353 if (folded != value) {
10354
10355 /* Skip if we are recursed, currently parsing the class
10356 * again. Otherwise add this character to the list of
10357 * multi-char folds. */
10358 if (! RExC_in_multi_char_class) {
10359 STRLEN cp_count = utf8_length(foldbuf,
10360 foldbuf + foldlen);
10361 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10362
10363 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10364
10365 multi_char_matches
10366 = add_multi_match(multi_char_matches,
10367 multi_fold,
10368 cp_count);
10369
10370 }
10371
10372 /* This element should not be processed further in this
10373 * class */
10374 element_count--;
10375 value = save_value;
10376 prevvalue = save_prevvalue;
10377 continue;
10378 }
10379 }
10380 }
10381
10382 if (strict && ckWARN(WARN_REGEXP)) {
10383 if (range) {
10384
10385 /* If the range starts above 255, everything is portable and
10386 * likely to be so for any forseeable character set, so don't
10387 * warn. */
10388 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10389 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10390 }
10391 else if (prevvalue != value) {
10392
10393 /* Under strict, ranges that stop and/or end in an ASCII
10394 * printable should have each end point be a portable value
10395 * for it (preferably like 'A', but we don't warn if it is
10396 * a (portable) Unicode name or code point), and the range
10397 * must be all digits or all letters of the same case.
10398 * Otherwise, the range is non-portable and unclear as to
10399 * what it contains */
10400 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
10401 && ( non_portable_endpoint
10402 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10403 || (isLOWER_A(prevvalue) && isLOWER_A(value))
10404 || (isUPPER_A(prevvalue) && isUPPER_A(value))
10405 ))) {
10406 vWARN(RExC_parse, "Ranges of ASCII printables should"
10407 " be some subset of \"0-9\","
10408 " \"A-Z\", or \"a-z\"");
10409 }
10410 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10411 SSize_t index_start;
10412 SSize_t index_final;
10413
10414 /* But the nature of Unicode and languages mean we
10415 * can't do the same checks for above-ASCII ranges,
10416 * except in the case of digit ones. These should
10417 * contain only digits from the same group of 10. The
10418 * ASCII case is handled just above. Hence here, the
10419 * range could be a range of digits. First some
10420 * unlikely special cases. Grandfather in that a range
10421 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10422 * if its starting value is one of the 10 digits prior
10423 * to it. This is because it is an alternate way of
10424 * writing 19D1, and some people may expect it to be in
10425 * that group. But it is bad, because it won't give
10426 * the expected results. In Unicode 5.2 it was
10427 * considered to be in that group (of 11, hence), but
10428 * this was fixed in the next version */
10429
10430 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10431 goto warn_bad_digit_range;
10432 }
10433 else if (UNLIKELY( prevvalue >= 0x1D7CE
10434 && value <= 0x1D7FF))
10435 {
10436 /* This is the only other case currently in Unicode
10437 * where the algorithm below fails. The code
10438 * points just above are the end points of a single
10439 * range containing only decimal digits. It is 5
10440 * different series of 0-9. All other ranges of
10441 * digits currently in Unicode are just a single
10442 * series. (And mktables will notify us if a later
10443 * Unicode version breaks this.)
10444 *
10445 * If the range being checked is at most 9 long,
10446 * and the digit values represented are in
10447 * numerical order, they are from the same series.
10448 * */
10449 if ( value - prevvalue > 9
10450 || ((( value - 0x1D7CE) % 10)
10451 <= (prevvalue - 0x1D7CE) % 10))
10452 {
10453 goto warn_bad_digit_range;
10454 }
10455 }
10456 else {
10457
10458 /* For all other ranges of digits in Unicode, the
10459 * algorithm is just to check if both end points
10460 * are in the same series, which is the same range.
10461 * */
10462 index_start = _invlist_search(
10463 PL_XPosix_ptrs[CC_DIGIT_],
10464 prevvalue);
10465
10466 /* Warn if the range starts and ends with a digit,
10467 * and they are not in the same group of 10. */
10468 if ( index_start >= 0
10469 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10470 && (index_final =
10471 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10472 value)) != index_start
10473 && index_final >= 0
10474 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10475 {
10476 warn_bad_digit_range:
10477 vWARN(RExC_parse, "Ranges of digits should be"
10478 " from the same group of"
10479 " 10");
10480 }
10481 }
10482 }
10483 }
10484 }
10485 if ((! range || prevvalue == value) && non_portable_endpoint) {
10486 if (isPRINT_A(value)) {
10487 char literal[3];
10488 unsigned d = 0;
10489 if (isBACKSLASHED_PUNCT(value)) {
10490 literal[d++] = '\\';
10491 }
10492 literal[d++] = (char) value;
10493 literal[d++] = '\0';
10494
10495 vWARN4(RExC_parse,
10496 "\"%.*s\" is more clearly written simply as \"%s\"",
10497 (int) (RExC_parse - rangebegin),
10498 rangebegin,
10499 literal
10500 );
10501 }
10502 else if (isMNEMONIC_CNTRL(value)) {
10503 vWARN4(RExC_parse,
10504 "\"%.*s\" is more clearly written simply as \"%s\"",
10505 (int) (RExC_parse - rangebegin),
10506 rangebegin,
10507 cntrl_to_mnemonic((U8) value)
10508 );
10509 }
10510 }
10511 }
10512
10513 /* Deal with this element of the class */
10514
10515 #ifndef EBCDIC
10516 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10517 prevvalue, value);
10518 #else
10519 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10520 * that don't require special handling, we can just add the range like
10521 * we do for ASCII platforms */
10522 if ((UNLIKELY(prevvalue == 0) && value >= 255)
10523 || ! (prevvalue < 256
10524 && (unicode_range
10525 || (! non_portable_endpoint
10526 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10527 || (isUPPER_A(prevvalue)
10528 && isUPPER_A(value)))))))
10529 {
10530 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10531 prevvalue, value);
10532 }
10533 else {
10534 /* Here, requires special handling. This can be because it is a
10535 * range whose code points are considered to be Unicode, and so
10536 * must be individually translated into native, or because its a
10537 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10538 * EBCDIC, but we have defined them to include only the "expected"
10539 * upper or lower case ASCII alphabetics. Subranges above 255 are
10540 * the same in native and Unicode, so can be added as a range */
10541 U8 start = NATIVE_TO_LATIN1(prevvalue);
10542 unsigned j;
10543 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10544 for (j = start; j <= end; j++) {
10545 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10546 }
10547 if (value > 255) {
10548 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10549 256, value);
10550 }
10551 }
10552 #endif
10553
10554 range = 0; /* this range (if it was one) is done now */
10555 } /* End of loop through all the text within the brackets */
10556
10557 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10558 output_posix_warnings(pRExC_state, posix_warnings);
10559 }
10560
10561 /* If anything in the class expands to more than one character, we have to
10562 * deal with them by building up a substitute parse string, and recursively
10563 * calling reg() on it, instead of proceeding */
10564 if (multi_char_matches) {
10565 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10566 I32 cp_count;
10567 STRLEN len;
10568 char *save_end = RExC_end;
10569 char *save_parse = RExC_parse;
10570 char *save_start = RExC_start;
10571 Size_t constructed_prefix_len = 0; /* This gives the length of the
10572 constructed portion of the
10573 substitute parse. */
10574 bool first_time = TRUE; /* First multi-char occurrence doesn't get
10575 a "|" */
10576 I32 reg_flags;
10577
10578 assert(! invert);
10579 /* Only one level of recursion allowed */
10580 assert(RExC_copy_start_in_constructed == RExC_precomp);
10581
10582 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
10583 because too confusing */
10584 if (invert) {
10585 sv_catpvs(substitute_parse, "(?:");
10586 }
10587 #endif
10588
10589 /* Look at the longest strings first */
10590 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10591 cp_count > 0;
10592 cp_count--)
10593 {
10594
10595 if (av_exists(multi_char_matches, cp_count)) {
10596 AV** this_array_ptr;
10597 SV* this_sequence;
10598
10599 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10600 cp_count, FALSE);
10601 while ((this_sequence = av_pop(*this_array_ptr)) !=
10602 &PL_sv_undef)
10603 {
10604 if (! first_time) {
10605 sv_catpvs(substitute_parse, "|");
10606 }
10607 first_time = FALSE;
10608
10609 sv_catpv(substitute_parse, SvPVX(this_sequence));
10610 }
10611 }
10612 }
10613
10614 /* If the character class contains anything else besides these
10615 * multi-character strings, have to include it in recursive parsing */
10616 if (element_count) {
10617 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10618
10619 sv_catpvs(substitute_parse, "|");
10620 if (has_l_bracket) { /* Add an [ if the original had one */
10621 sv_catpvs(substitute_parse, "[");
10622 }
10623 constructed_prefix_len = SvCUR(substitute_parse);
10624 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10625
10626 /* Put in a closing ']' to match any opening one, but not if going
10627 * off the end, as otherwise we are adding something that really
10628 * isn't there */
10629 if (has_l_bracket && RExC_parse < RExC_end) {
10630 sv_catpvs(substitute_parse, "]");
10631 }
10632 }
10633
10634 sv_catpvs(substitute_parse, ")");
10635 #if 0
10636 if (invert) {
10637 /* This is a way to get the parse to skip forward a whole named
10638 * sequence instead of matching the 2nd character when it fails the
10639 * first */
10640 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10641 }
10642 #endif
10643
10644 /* Set up the data structure so that any errors will be properly
10645 * reported. See the comments at the definition of
10646 * REPORT_LOCATION_ARGS for details */
10647 RExC_copy_start_in_input = (char *) orig_parse;
10648 RExC_start = SvPV(substitute_parse, len);
10649 RExC_parse_set( RExC_start );
10650 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10651 RExC_end = RExC_parse + len;
10652 RExC_in_multi_char_class = 1;
10653
10654 ret = reg(pRExC_state, 1, ®_flags, depth+1);
10655
10656 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10657
10658 /* And restore so can parse the rest of the pattern */
10659 RExC_parse_set(save_parse);
10660 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10661 RExC_end = save_end;
10662 RExC_in_multi_char_class = 0;
10663 SvREFCNT_dec_NN(multi_char_matches);
10664 SvREFCNT_dec(properties);
10665 SvREFCNT_dec(cp_list);
10666 SvREFCNT_dec(simple_posixes);
10667 SvREFCNT_dec(posixes);
10668 SvREFCNT_dec(nposixes);
10669 SvREFCNT_dec(cp_foldable_list);
10670 return ret;
10671 }
10672
10673 /* If folding, we calculate all characters that could fold to or from the
10674 * ones already on the list */
10675 if (cp_foldable_list) {
10676 if (FOLD) {
10677 UV start, end; /* End points of code point ranges */
10678
10679 SV* fold_intersection = NULL;
10680 SV** use_list;
10681
10682 /* Our calculated list will be for Unicode rules. For locale
10683 * matching, we have to keep a separate list that is consulted at
10684 * runtime only when the locale indicates Unicode rules (and we
10685 * don't include potential matches in the ASCII/Latin1 range, as
10686 * any code point could fold to any other, based on the run-time
10687 * locale). For non-locale, we just use the general list */
10688 if (LOC) {
10689 use_list = &only_utf8_locale_list;
10690 }
10691 else {
10692 use_list = &cp_list;
10693 }
10694
10695 /* Only the characters in this class that participate in folds need
10696 * be checked. Get the intersection of this class and all the
10697 * possible characters that are foldable. This can quickly narrow
10698 * down a large class */
10699 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10700 &fold_intersection);
10701
10702 /* Now look at the foldable characters in this class individually */
10703 invlist_iterinit(fold_intersection);
10704 while (invlist_iternext(fold_intersection, &start, &end)) {
10705 UV j;
10706 UV folded;
10707
10708 /* Look at every character in the range */
10709 for (j = start; j <= end; j++) {
10710 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10711 STRLEN foldlen;
10712 unsigned int k;
10713 Size_t folds_count;
10714 U32 first_fold;
10715 const U32 * remaining_folds;
10716
10717 if (j < 256) {
10718
10719 /* Under /l, we don't know what code points below 256
10720 * fold to, except we do know the MICRO SIGN folds to
10721 * an above-255 character if the locale is UTF-8, so we
10722 * add it to the special list (in *use_list) Otherwise
10723 * we know now what things can match, though some folds
10724 * are valid under /d only if the target is UTF-8.
10725 * Those go in a separate list */
10726 if ( IS_IN_SOME_FOLD_L1(j)
10727 && ! (LOC && j != MICRO_SIGN))
10728 {
10729
10730 /* ASCII is always matched; non-ASCII is matched
10731 * only under Unicode rules (which could happen
10732 * under /l if the locale is a UTF-8 one */
10733 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10734 *use_list = add_cp_to_invlist(*use_list,
10735 PL_fold_latin1[j]);
10736 }
10737 else if (j != PL_fold_latin1[j]) {
10738 upper_latin1_only_utf8_matches
10739 = add_cp_to_invlist(
10740 upper_latin1_only_utf8_matches,
10741 PL_fold_latin1[j]);
10742 }
10743 }
10744
10745 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10746 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10747 {
10748 add_above_Latin1_folds(pRExC_state,
10749 (U8) j,
10750 use_list);
10751 }
10752 continue;
10753 }
10754
10755 /* Here is an above Latin1 character. We don't have the
10756 * rules hard-coded for it. First, get its fold. This is
10757 * the simple fold, as the multi-character folds have been
10758 * handled earlier and separated out */
10759 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10760 (ASCII_FOLD_RESTRICTED)
10761 ? FOLD_FLAGS_NOMIX_ASCII
10762 : 0);
10763
10764 /* Single character fold of above Latin1. Add everything
10765 * in its fold closure to the list that this node should
10766 * match. */
10767 folds_count = _inverse_folds(folded, &first_fold,
10768 &remaining_folds);
10769 for (k = 0; k <= folds_count; k++) {
10770 UV c = (k == 0) /* First time through use itself */
10771 ? folded
10772 : (k == 1) /* 2nd time use, the first fold */
10773 ? first_fold
10774
10775 /* Then the remaining ones */
10776 : remaining_folds[k-2];
10777
10778 /* /aa doesn't allow folds between ASCII and non- */
10779 if (( ASCII_FOLD_RESTRICTED
10780 && (isASCII(c) != isASCII(j))))
10781 {
10782 continue;
10783 }
10784
10785 /* Folds under /l which cross the 255/256 boundary are
10786 * added to a separate list. (These are valid only
10787 * when the locale is UTF-8.) */
10788 if (c < 256 && LOC) {
10789 *use_list = add_cp_to_invlist(*use_list, c);
10790 continue;
10791 }
10792
10793 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10794 {
10795 cp_list = add_cp_to_invlist(cp_list, c);
10796 }
10797 else {
10798 /* Similarly folds involving non-ascii Latin1
10799 * characters under /d are added to their list */
10800 upper_latin1_only_utf8_matches
10801 = add_cp_to_invlist(
10802 upper_latin1_only_utf8_matches,
10803 c);
10804 }
10805 }
10806 }
10807 }
10808 SvREFCNT_dec_NN(fold_intersection);
10809 }
10810
10811 /* Now that we have finished adding all the folds, there is no reason
10812 * to keep the foldable list separate */
10813 _invlist_union(cp_list, cp_foldable_list, &cp_list);
10814 SvREFCNT_dec_NN(cp_foldable_list);
10815 }
10816
10817 /* And combine the result (if any) with any inversion lists from posix
10818 * classes. The lists are kept separate up to now because we don't want to
10819 * fold the classes */
10820 if (simple_posixes) { /* These are the classes known to be unaffected by
10821 /a, /aa, and /d */
10822 if (cp_list) {
10823 _invlist_union(cp_list, simple_posixes, &cp_list);
10824 SvREFCNT_dec_NN(simple_posixes);
10825 }
10826 else {
10827 cp_list = simple_posixes;
10828 }
10829 }
10830 if (posixes || nposixes) {
10831 if (! DEPENDS_SEMANTICS) {
10832
10833 /* For everything but /d, we can just add the current 'posixes' and
10834 * 'nposixes' to the main list */
10835 if (posixes) {
10836 if (cp_list) {
10837 _invlist_union(cp_list, posixes, &cp_list);
10838 SvREFCNT_dec_NN(posixes);
10839 }
10840 else {
10841 cp_list = posixes;
10842 }
10843 }
10844 if (nposixes) {
10845 if (cp_list) {
10846 _invlist_union(cp_list, nposixes, &cp_list);
10847 SvREFCNT_dec_NN(nposixes);
10848 }
10849 else {
10850 cp_list = nposixes;
10851 }
10852 }
10853 }
10854 else {
10855 /* Under /d, things like \w match upper Latin1 characters only if
10856 * the target string is in UTF-8. But things like \W match all the
10857 * upper Latin1 characters if the target string is not in UTF-8.
10858 *
10859 * Handle the case with something like \W separately */
10860 if (nposixes) {
10861 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10862
10863 /* A complemented posix class matches all upper Latin1
10864 * characters if not in UTF-8. And it matches just certain
10865 * ones when in UTF-8. That means those certain ones are
10866 * matched regardless, so can just be added to the
10867 * unconditional list */
10868 if (cp_list) {
10869 _invlist_union(cp_list, nposixes, &cp_list);
10870 SvREFCNT_dec_NN(nposixes);
10871 nposixes = NULL;
10872 }
10873 else {
10874 cp_list = nposixes;
10875 }
10876
10877 /* Likewise for 'posixes' */
10878 _invlist_union(posixes, cp_list, &cp_list);
10879 SvREFCNT_dec(posixes);
10880
10881 /* Likewise for anything else in the range that matched only
10882 * under UTF-8 */
10883 if (upper_latin1_only_utf8_matches) {
10884 _invlist_union(cp_list,
10885 upper_latin1_only_utf8_matches,
10886 &cp_list);
10887 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10888 upper_latin1_only_utf8_matches = NULL;
10889 }
10890
10891 /* If we don't match all the upper Latin1 characters regardless
10892 * of UTF-8ness, we have to set a flag to match the rest when
10893 * not in UTF-8 */
10894 _invlist_subtract(only_non_utf8_list, cp_list,
10895 &only_non_utf8_list);
10896 if (_invlist_len(only_non_utf8_list) != 0) {
10897 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10898 }
10899 SvREFCNT_dec_NN(only_non_utf8_list);
10900 }
10901 else {
10902 /* Here there were no complemented posix classes. That means
10903 * the upper Latin1 characters in 'posixes' match only when the
10904 * target string is in UTF-8. So we have to add them to the
10905 * list of those types of code points, while adding the
10906 * remainder to the unconditional list.
10907 *
10908 * First calculate what they are */
10909 SV* nonascii_but_latin1_properties = NULL;
10910 _invlist_intersection(posixes, PL_UpperLatin1,
10911 &nonascii_but_latin1_properties);
10912
10913 /* And add them to the final list of such characters. */
10914 _invlist_union(upper_latin1_only_utf8_matches,
10915 nonascii_but_latin1_properties,
10916 &upper_latin1_only_utf8_matches);
10917
10918 /* Remove them from what now becomes the unconditional list */
10919 _invlist_subtract(posixes, nonascii_but_latin1_properties,
10920 &posixes);
10921
10922 /* And add those unconditional ones to the final list */
10923 if (cp_list) {
10924 _invlist_union(cp_list, posixes, &cp_list);
10925 SvREFCNT_dec_NN(posixes);
10926 posixes = NULL;
10927 }
10928 else {
10929 cp_list = posixes;
10930 }
10931
10932 SvREFCNT_dec(nonascii_but_latin1_properties);
10933
10934 /* Get rid of any characters from the conditional list that we
10935 * now know are matched unconditionally, which may make that
10936 * list empty */
10937 _invlist_subtract(upper_latin1_only_utf8_matches,
10938 cp_list,
10939 &upper_latin1_only_utf8_matches);
10940 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
10941 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10942 upper_latin1_only_utf8_matches = NULL;
10943 }
10944 }
10945 }
10946 }
10947
10948 /* And combine the result (if any) with any inversion list from properties.
10949 * The lists are kept separate up to now so that we can distinguish the two
10950 * in regards to matching above-Unicode. A run-time warning is generated
10951 * if a Unicode property is matched against a non-Unicode code point. But,
10952 * we allow user-defined properties to match anything, without any warning,
10953 * and we also suppress the warning if there is a portion of the character
10954 * class that isn't a Unicode property, and which matches above Unicode, \W
10955 * or [\x{110000}] for example.
10956 * (Note that in this case, unlike the Posix one above, there is no
10957 * <upper_latin1_only_utf8_matches>, because having a Unicode property
10958 * forces Unicode semantics */
10959 if (properties) {
10960 if (cp_list) {
10961
10962 /* If it matters to the final outcome, see if a non-property
10963 * component of the class matches above Unicode. If so, the
10964 * warning gets suppressed. This is true even if just a single
10965 * such code point is specified, as, though not strictly correct if
10966 * another such code point is matched against, the fact that they
10967 * are using above-Unicode code points indicates they should know
10968 * the issues involved */
10969 if (warn_super) {
10970 warn_super = ! (invert
10971 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
10972 }
10973
10974 _invlist_union(properties, cp_list, &cp_list);
10975 SvREFCNT_dec_NN(properties);
10976 }
10977 else {
10978 cp_list = properties;
10979 }
10980
10981 if (warn_super) {
10982 anyof_flags |= ANYOF_WARN_SUPER__shared;
10983
10984 /* Because an ANYOF node is the only one that warns, this node
10985 * can't be optimized into something else */
10986 optimizable = FALSE;
10987 }
10988 }
10989
10990 /* Here, we have calculated what code points should be in the character
10991 * class.
10992 *
10993 * Now we can see about various optimizations. Fold calculation (which we
10994 * did above) needs to take place before inversion. Otherwise /[^k]/i
10995 * would invert to include K, which under /i would match k, which it
10996 * shouldn't. Therefore we can't invert folded locale now, as it won't be
10997 * folded until runtime */
10998
10999 /* If we didn't do folding, it's because some information isn't available
11000 * until runtime; set the run-time fold flag for these We know to set the
11001 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11002 * at least one 0-255 range code point */
11003 if (LOC && FOLD) {
11004
11005 /* Some things on the list might be unconditionally included because of
11006 * other components. Remove them, and clean up the list if it goes to
11007 * 0 elements */
11008 if (only_utf8_locale_list && cp_list) {
11009 _invlist_subtract(only_utf8_locale_list, cp_list,
11010 &only_utf8_locale_list);
11011
11012 if (_invlist_len(only_utf8_locale_list) == 0) {
11013 SvREFCNT_dec_NN(only_utf8_locale_list);
11014 only_utf8_locale_list = NULL;
11015 }
11016 }
11017 if ( only_utf8_locale_list
11018 || ( cp_list
11019 && ( _invlist_contains_cp(cp_list,
11020 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11021 || _invlist_contains_cp(cp_list,
11022 LATIN_SMALL_LETTER_DOTLESS_I))))
11023 {
11024 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11025 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11026 }
11027 else if (cp_list && invlist_lowest(cp_list) < 256) {
11028 /* If nothing is below 256, has no locale dependency; otherwise it
11029 * does */
11030 anyof_flags |= ANYOFL_FOLD;
11031 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11032
11033 /* In a Turkish locale these could match, notify the run-time code
11034 * to check for that */
11035 if ( _invlist_contains_cp(cp_list, 'I')
11036 || _invlist_contains_cp(cp_list, 'i'))
11037 {
11038 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11039 }
11040 }
11041 }
11042 else if ( DEPENDS_SEMANTICS
11043 && ( upper_latin1_only_utf8_matches
11044 || ( anyof_flags
11045 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11046 {
11047 RExC_seen_d_op = TRUE;
11048 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11049 }
11050
11051 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11052 * compile time. */
11053 if ( cp_list
11054 && invert
11055 && ! has_runtime_dependency)
11056 {
11057 _invlist_invert(cp_list);
11058
11059 /* Clear the invert flag since have just done it here */
11060 invert = FALSE;
11061 }
11062
11063 /* All possible optimizations below still have these characteristics.
11064 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11065 * routine) */
11066 *flagp |= HASWIDTH|SIMPLE;
11067
11068 if (ret_invlist) {
11069 *ret_invlist = cp_list;
11070
11071 return (cp_list) ? RExC_emit : 0;
11072 }
11073
11074 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11075 RExC_contains_locale = 1;
11076 }
11077
11078 if (optimizable) {
11079
11080 /* Some character classes are equivalent to other nodes. Such nodes
11081 * take up less room, and some nodes require fewer operations to
11082 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
11083 * adjacent nodes to improve efficiency. */
11084 op = optimize_regclass(pRExC_state, cp_list,
11085 only_utf8_locale_list,
11086 upper_latin1_only_utf8_matches,
11087 has_runtime_dependency,
11088 posixl,
11089 &anyof_flags, &invert, &ret, flagp);
11090 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11091
11092 /* If optimized to something else and emitted, clean up and return */
11093 if (ret >= 0) {
11094 SvREFCNT_dec(cp_list);;
11095 SvREFCNT_dec(only_utf8_locale_list);
11096 SvREFCNT_dec(upper_latin1_only_utf8_matches);
11097 return ret;
11098 }
11099
11100 /* If no optimization was found, an END was returned and we will now
11101 * emit an ANYOF */
11102 if (op == END) {
11103 op = ANYOF;
11104 }
11105 }
11106
11107 /* Here are going to emit an ANYOF; set the particular type */
11108 if (op == ANYOF) {
11109 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11110 op = ANYOFD;
11111 }
11112 else if (posixl) {
11113 op = ANYOFPOSIXL;
11114 }
11115 else if (LOC) {
11116 op = ANYOFL;
11117 }
11118 }
11119
11120 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11121 FILL_NODE(ret, op); /* We set the argument later */
11122 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11123 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11124
11125 /* Here, <cp_list> contains all the code points we can determine at
11126 * compile time that match under all conditions. Go through it, and
11127 * for things that belong in the bitmap, put them there, and delete from
11128 * <cp_list>. While we are at it, see if everything above 255 is in the
11129 * list, and if so, set a flag to speed up execution */
11130
11131 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11132
11133 if (posixl) {
11134 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11135 }
11136
11137 if (invert) {
11138 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11139 }
11140
11141 /* Here, the bitmap has been populated with all the Latin1 code points that
11142 * always match. Can now add to the overall list those that match only
11143 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11144 * */
11145 if (upper_latin1_only_utf8_matches) {
11146 if (cp_list) {
11147 _invlist_union(cp_list,
11148 upper_latin1_only_utf8_matches,
11149 &cp_list);
11150 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11151 }
11152 else {
11153 cp_list = upper_latin1_only_utf8_matches;
11154 }
11155 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11156 }
11157
11158 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11159 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11160 ? listsv
11161 : NULL,
11162 only_utf8_locale_list);
11163
11164 SvREFCNT_dec(cp_list);;
11165 SvREFCNT_dec(only_utf8_locale_list);
11166 return ret;
11167 }
11168
11169 STATIC U8
S_optimize_regclass(pTHX_ RExC_state_t * pRExC_state,SV * cp_list,SV * only_utf8_locale_list,SV * upper_latin1_only_utf8_matches,const U32 has_runtime_dependency,const U32 posixl,U8 * anyof_flags,bool * invert,regnode_offset * ret,I32 * flagp)11170 S_optimize_regclass(pTHX_
11171 RExC_state_t *pRExC_state,
11172 SV * cp_list,
11173 SV* only_utf8_locale_list,
11174 SV* upper_latin1_only_utf8_matches,
11175 const U32 has_runtime_dependency,
11176 const U32 posixl,
11177 U8 * anyof_flags,
11178 bool * invert,
11179 regnode_offset * ret,
11180 I32 *flagp
11181 )
11182 {
11183 /* This function exists just to make S_regclass() smaller. It extracts out
11184 * the code that looks for potential optimizations away from a full generic
11185 * ANYOF node. The parameter names are the same as the corresponding
11186 * variables in S_regclass.
11187 *
11188 * It returns the new op (the impossible END one if no optimization found)
11189 * and sets *ret to any created regnode. If the new op is sufficiently
11190 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11191 *
11192 * Certain of the parameters may be updated as a result of the changes
11193 * herein */
11194
11195 U8 op = END; /* The returned node-type, initialized to an impossible
11196 one. */
11197 UV value = 0;
11198 PERL_UINT_FAST8_T i;
11199 UV partial_cp_count = 0;
11200 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11201 UV end[MAX_FOLD_FROMS+1] = { 0 };
11202 bool single_range = FALSE;
11203 UV lowest_cp = 0, highest_cp = 0;
11204
11205 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11206
11207 if (cp_list) { /* Count the code points in enough ranges that we would see
11208 all the ones possible in any fold in this version of
11209 Unicode */
11210
11211 invlist_iterinit(cp_list);
11212 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11213 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11214 break;
11215 }
11216 partial_cp_count += end[i] - start[i] + 1;
11217 }
11218
11219 if (i == 1) {
11220 single_range = TRUE;
11221 }
11222 invlist_iterfinish(cp_list);
11223
11224 /* If we know at compile time that this matches every possible code
11225 * point, any run-time dependencies don't matter */
11226 if (start[0] == 0 && end[0] == UV_MAX) {
11227 if (*invert) {
11228 goto return_OPFAIL;
11229 }
11230 else {
11231 goto return_SANY;
11232 }
11233 }
11234
11235 /* Use a clearer mnemonic for below */
11236 lowest_cp = start[0];
11237
11238 highest_cp = invlist_highest(cp_list);
11239 }
11240
11241 /* Similarly, for /l posix classes, if both a class and its complement
11242 * match, any run-time dependencies don't matter */
11243 if (posixl) {
11244 int namedclass;
11245 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11246 if ( POSIXL_TEST(posixl, namedclass) /* class */
11247 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11248 {
11249 if (*invert) {
11250 goto return_OPFAIL;
11251 }
11252 goto return_SANY;
11253 }
11254 }
11255
11256 /* For well-behaved locales, some classes are subsets of others, so
11257 * complementing the subset and including the non-complemented superset
11258 * should match everything, like [\D[:alnum:]], and
11259 * [[:^alpha:][:alnum:]], but some implementations of locales are
11260 * buggy, and khw thinks its a bad idea to have optimization change
11261 * behavior, even if it avoids an OS bug in a given case */
11262
11263 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11264
11265 /* If is a single posix /l class, can optimize to just that op. Such a
11266 * node will not match anything in the Latin1 range, as that is not
11267 * determinable until runtime, but will match whatever the class does
11268 * outside that range. (Note that some classes won't match anything
11269 * outside the range, like [:ascii:]) */
11270 if ( isSINGLE_BIT_SET(posixl)
11271 && (partial_cp_count == 0 || lowest_cp > 255))
11272 {
11273 U8 classnum;
11274 SV * class_above_latin1 = NULL;
11275 bool already_inverted;
11276 bool are_equivalent;
11277
11278
11279 namedclass = single_1bit_pos32(posixl);
11280 classnum = namedclass_to_classnum(namedclass);
11281
11282 /* The named classes are such that the inverted number is one
11283 * larger than the non-inverted one */
11284 already_inverted = namedclass - classnum_to_namedclass(classnum);
11285
11286 /* Create an inversion list of the official property, inverted if
11287 * the constructed node list is inverted, and restricted to only
11288 * the above latin1 code points, which are the only ones known at
11289 * compile time */
11290 _invlist_intersection_maybe_complement_2nd(
11291 PL_AboveLatin1,
11292 PL_XPosix_ptrs[classnum],
11293 already_inverted,
11294 &class_above_latin1);
11295 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11296 SvREFCNT_dec_NN(class_above_latin1);
11297
11298 if (are_equivalent) {
11299
11300 /* Resolve the run-time inversion flag with this possibly
11301 * inverted class */
11302 *invert = *invert ^ already_inverted;
11303
11304 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11305 *ret = reg_node(pRExC_state, op);
11306 FLAGS(REGNODE_p(*ret)) = classnum;
11307 return op;
11308 }
11309 }
11310 }
11311
11312 /* khw can't think of any other possible transformation involving these. */
11313 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11314 return END;
11315 }
11316
11317 if (! has_runtime_dependency) {
11318
11319 /* If the list is empty, nothing matches. This happens, for example,
11320 * when a Unicode property that doesn't match anything is the only
11321 * element in the character class (perluniprops.pod notes such
11322 * properties). */
11323 if (partial_cp_count == 0) {
11324 if (*invert) {
11325 goto return_SANY;
11326 }
11327 else {
11328 goto return_OPFAIL;
11329 }
11330 }
11331
11332 /* If matches everything but \n */
11333 if ( start[0] == 0 && end[0] == '\n' - 1
11334 && start[1] == '\n' + 1 && end[1] == UV_MAX)
11335 {
11336 assert (! *invert);
11337 op = REG_ANY;
11338 *ret = reg_node(pRExC_state, op);
11339 MARK_NAUGHTY(1);
11340 return op;
11341 }
11342 }
11343
11344 /* Next see if can optimize classes that contain just a few code points
11345 * into an EXACTish node. The reason to do this is to let the optimizer
11346 * join this node with adjacent EXACTish ones, and ANYOF nodes require
11347 * runtime conversion to code point from UTF-8, which we'd like to avoid.
11348 *
11349 * An EXACTFish node can be generated even if not under /i, and vice versa.
11350 * But care must be taken. An EXACTFish node has to be such that it only
11351 * matches precisely the code points in the class, but we want to generate
11352 * the least restrictive one that does that, to increase the odds of being
11353 * able to join with an adjacent node. For example, if the class contains
11354 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11355 * from matching. Whether we are under /i or not is irrelevant in this
11356 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
11357 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11358 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
11359 * is no simple fold that includes \X{02BC}, there is a multi-char fold
11360 * that does, and so the node generated for it must be an EXACTFish one.
11361 * On the other hand qr/:/i should generate a plain EXACT node since the
11362 * colon participates in no fold whatsoever, and having it be EXACT tells
11363 * the optimizer the target string cannot match unless it has a colon in
11364 * it. */
11365 if ( ! posixl
11366 && ! *invert
11367
11368 /* Only try if there are no more code points in the class than in
11369 * the max possible fold */
11370 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11371 {
11372 /* We can always make a single code point class into an EXACTish node.
11373 * */
11374 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11375 if (LOC) {
11376
11377 /* Here is /l: Use EXACTL, except if there is a fold not known
11378 * until runtime so shows as only a single code point here.
11379 * For code points above 255, we know which can cause problems
11380 * by having a potential fold to the Latin1 range. */
11381 if ( ! FOLD
11382 || ( lowest_cp > 255
11383 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11384 {
11385 op = EXACTL;
11386 }
11387 else {
11388 op = EXACTFL;
11389 }
11390 }
11391 else if (! FOLD) { /* Not /l and not /i */
11392 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11393 }
11394 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11395 small */
11396
11397 /* Under /i, it gets a little tricky. A code point that
11398 * doesn't participate in a fold should be an EXACT node. We
11399 * know this one isn't the result of a simple fold, or there'd
11400 * be more than one code point in the list, but it could be
11401 * part of a multi-character fold. In that case we better not
11402 * create an EXACT node, as we would wrongly be telling the
11403 * optimizer that this code point must be in the target string,
11404 * and that is wrong. This is because if the sequence around
11405 * this code point forms a multi-char fold, what needs to be in
11406 * the string could be the code point that folds to the
11407 * sequence.
11408 *
11409 * This handles the case of below-255 code points, as we have
11410 * an easy look up for those. The next clause handles the
11411 * above-256 one */
11412 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11413 ? EXACTFU
11414 : EXACT;
11415 }
11416 else { /* /i, larger code point. Since we are under /i, and have
11417 just this code point, we know that it can't fold to
11418 something else, so PL_InMultiCharFold applies to it */
11419 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11420 ? EXACTFU_REQ8
11421 : EXACT_REQ8;
11422 }
11423
11424 value = lowest_cp;
11425 }
11426 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11427 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11428 {
11429 /* Here, the only runtime dependency, if any, is from /d, and the
11430 * class matches more than one code point, and the lowest code
11431 * point participates in some fold. It might be that the other
11432 * code points are /i equivalent to this one, and hence they would
11433 * be representable by an EXACTFish node. Above, we eliminated
11434 * classes that contain too many code points to be EXACTFish, with
11435 * the test for MAX_FOLD_FROMS
11436 *
11437 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
11438 * do this because we have EXACTFAA at our disposal for the ASCII
11439 * range */
11440 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11441
11442 /* The only ASCII characters that participate in folds are
11443 * alphabetics */
11444 assert(isALPHA(lowest_cp));
11445 if ( end[0] == start[0] /* First range is a single
11446 character, so 2nd exists */
11447 && isALPHA_FOLD_EQ(start[0], start[1]))
11448 {
11449 /* Here, is part of an ASCII fold pair */
11450
11451 if ( ASCII_FOLD_RESTRICTED
11452 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11453 {
11454 /* If the second clause just above was true, it means
11455 * we can't be under /i, or else the list would have
11456 * included more than this fold pair. Therefore we
11457 * have to exclude the possibility of whatever else it
11458 * is that folds to these, by using EXACTFAA */
11459 op = EXACTFAA;
11460 }
11461 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11462
11463 /* Here, there's no simple fold that lowest_cp is part
11464 * of, but there is a multi-character one. If we are
11465 * not under /i, we want to exclude that possibility;
11466 * if under /i, we want to include it */
11467 op = (FOLD) ? EXACTFU : EXACTFAA;
11468 }
11469 else {
11470
11471 /* Here, the only possible fold lowest_cp participates in
11472 * is with start[1]. /i or not isn't relevant */
11473 op = EXACTFU;
11474 }
11475
11476 value = toFOLD(lowest_cp);
11477 }
11478 }
11479 else if ( ! upper_latin1_only_utf8_matches
11480 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
11481 && PL_fold_latin1[
11482 invlist_highest(upper_latin1_only_utf8_matches)]
11483 == lowest_cp))
11484 {
11485 /* Here, the smallest character is non-ascii or there are more
11486 * than 2 code points matched by this node. Also, we either
11487 * don't have /d UTF-8 dependent matches, or if we do, they
11488 * look like they could be a single character that is the fold
11489 * of the lowest one is in the always-match list. This test
11490 * quickly excludes most of the false positives when there are
11491 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
11492 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11493 * iff the target string is UTF-8. (We don't have to worry
11494 * above about exceeding the array bounds of PL_fold_latin1[]
11495 * because any code point in 'upper_latin1_only_utf8_matches'
11496 * is below 256.)
11497 *
11498 * EXACTFAA would apply only to pairs (hence exactly 2 code
11499 * points) in the ASCII range, so we can't use it here to
11500 * artificially restrict the fold domain, so we check if the
11501 * class does or does not match some EXACTFish node. Further,
11502 * if we aren't under /i, and and the folded-to character is
11503 * part of a multi-character fold, we can't do this
11504 * optimization, as the sequence around it could be that
11505 * multi-character fold, and we don't here know the context, so
11506 * we have to assume it is that multi-char fold, to prevent
11507 * potential bugs.
11508 *
11509 * To do the general case, we first find the fold of the lowest
11510 * code point (which may be higher than that lowest unfolded
11511 * one), then find everything that folds to it. (The data
11512 * structure we have only maps from the folded code points, so
11513 * we have to do the earlier step.) */
11514
11515 Size_t foldlen;
11516 U8 foldbuf[UTF8_MAXBYTES_CASE];
11517 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11518 U32 first_fold;
11519 const U32 * remaining_folds;
11520 Size_t folds_to_this_cp_count = _inverse_folds(
11521 folded,
11522 &first_fold,
11523 &remaining_folds);
11524 Size_t folds_count = folds_to_this_cp_count + 1;
11525 SV * fold_list = _new_invlist(folds_count);
11526 unsigned int i;
11527
11528 /* If there are UTF-8 dependent matches, create a temporary
11529 * list of what this node matches, including them. */
11530 SV * all_cp_list = NULL;
11531 SV ** use_this_list = &cp_list;
11532
11533 if (upper_latin1_only_utf8_matches) {
11534 all_cp_list = _new_invlist(0);
11535 use_this_list = &all_cp_list;
11536 _invlist_union(cp_list,
11537 upper_latin1_only_utf8_matches,
11538 use_this_list);
11539 }
11540
11541 /* Having gotten everything that participates in the fold
11542 * containing the lowest code point, we turn that into an
11543 * inversion list, making sure everything is included. */
11544 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11545 fold_list = add_cp_to_invlist(fold_list, folded);
11546 if (folds_to_this_cp_count > 0) {
11547 fold_list = add_cp_to_invlist(fold_list, first_fold);
11548 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11549 fold_list = add_cp_to_invlist(fold_list,
11550 remaining_folds[i]);
11551 }
11552 }
11553
11554 /* If the fold list is identical to what's in this ANYOF node,
11555 * the node can be represented by an EXACTFish one instead */
11556 if (_invlistEQ(*use_this_list, fold_list,
11557 0 /* Don't complement */ )
11558 ) {
11559
11560 /* But, we have to be careful, as mentioned above. Just
11561 * the right sequence of characters could match this if it
11562 * is part of a multi-character fold. That IS what we want
11563 * if we are under /i. But it ISN'T what we want if not
11564 * under /i, as it could match when it shouldn't. So, when
11565 * we aren't under /i and this character participates in a
11566 * multi-char fold, we don't optimize into an EXACTFish
11567 * node. So, for each case below we have to check if we
11568 * are folding, and if not, if it is not part of a
11569 * multi-char fold. */
11570 if (lowest_cp > 255) { /* Highish code point */
11571 if (FOLD || ! _invlist_contains_cp(
11572 PL_InMultiCharFold, folded))
11573 {
11574 op = (LOC)
11575 ? EXACTFLU8
11576 : (ASCII_FOLD_RESTRICTED)
11577 ? EXACTFAA
11578 : EXACTFU_REQ8;
11579 value = folded;
11580 }
11581 } /* Below, the lowest code point < 256 */
11582 else if ( FOLD
11583 && folded == 's'
11584 && DEPENDS_SEMANTICS)
11585 { /* An EXACTF node containing a single character 's',
11586 can be an EXACTFU if it doesn't get joined with an
11587 adjacent 's' */
11588 op = EXACTFU_S_EDGE;
11589 value = folded;
11590 }
11591 else if ( FOLD
11592 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11593 {
11594 if (upper_latin1_only_utf8_matches) {
11595 op = EXACTF;
11596
11597 /* We can't use the fold, as that only matches
11598 * under UTF-8 */
11599 value = lowest_cp;
11600 }
11601 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
11602 && ! UTF)
11603 { /* EXACTFUP is a special node for this character */
11604 op = (ASCII_FOLD_RESTRICTED)
11605 ? EXACTFAA
11606 : EXACTFUP;
11607 value = MICRO_SIGN;
11608 }
11609 else if ( ASCII_FOLD_RESTRICTED
11610 && ! isASCII(lowest_cp))
11611 { /* For ASCII under /iaa, we can use EXACTFU below
11612 */
11613 op = EXACTFAA;
11614 value = folded;
11615 }
11616 else {
11617 op = EXACTFU;
11618 value = folded;
11619 }
11620 }
11621 }
11622
11623 SvREFCNT_dec_NN(fold_list);
11624 SvREFCNT_dec(all_cp_list);
11625 }
11626 }
11627
11628 if (op != END) {
11629 U8 len;
11630
11631 /* Here, we have calculated what EXACTish node to use. Have to
11632 * convert to UTF-8 if not already there */
11633 if (value > 255) {
11634 if (! UTF) {
11635 SvREFCNT_dec(cp_list);;
11636 REQUIRE_UTF8(flagp);
11637 }
11638
11639 /* This is a kludge to the special casing issues with this
11640 * ligature under /aa. FB05 should fold to FB06, but the call
11641 * above to _to_uni_fold_flags() didn't find this, as it didn't
11642 * use the /aa restriction in order to not miss other folds
11643 * that would be affected. This is the only instance likely to
11644 * ever be a problem in all of Unicode. So special case it. */
11645 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
11646 && ASCII_FOLD_RESTRICTED)
11647 {
11648 value = LATIN_SMALL_LIGATURE_ST;
11649 }
11650 }
11651
11652 len = (UTF) ? UVCHR_SKIP(value) : 1;
11653
11654 *ret = REGNODE_GUTS(pRExC_state, op, len);
11655 FILL_NODE(*ret, op);
11656 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11657 setSTR_LEN(REGNODE_p(*ret), len);
11658 if (len == 1) {
11659 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11660 }
11661 else {
11662 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11663 }
11664
11665 return op;
11666 }
11667 }
11668
11669 if (! has_runtime_dependency) {
11670
11671 /* See if this can be turned into an ANYOFM node. Think about the bit
11672 * patterns in two different bytes. In some positions, the bits in
11673 * each will be 1; and in other positions both will be 0; and in some
11674 * positions the bit will be 1 in one byte, and 0 in the other. Let
11675 * 'n' be the number of positions where the bits differ. We create a
11676 * mask which has exactly 'n' 0 bits, each in a position where the two
11677 * bytes differ. Now take the set of all bytes that when ANDed with
11678 * the mask yield the same result. That set has 2**n elements, and is
11679 * representable by just two 8 bit numbers: the result and the mask.
11680 * Importantly, matching the set can be vectorized by creating a word
11681 * full of the result bytes, and a word full of the mask bytes,
11682 * yielding a significant speed up. Here, see if this node matches
11683 * such a set. As a concrete example consider [01], and the byte
11684 * representing '0' which is 0x30 on ASCII machines. It has the bits
11685 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
11686 * that mask we get 0x30. Any other bytes ANDed yield something else.
11687 * So [01], which is a common usage, is optimizable into ANYOFM, and
11688 * can benefit from the speed up. We can only do this on UTF-8
11689 * invariant bytes, because they have the same bit patterns under UTF-8
11690 * as not. */
11691 PERL_UINT_FAST8_T inverted = 0;
11692
11693 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11694 * EBCDIC */
11695 const PERL_UINT_FAST8_T max_permissible
11696 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11697
11698 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
11699 * that works we will instead later generate an NANYOFM, and invert
11700 * back when through */
11701 if (highest_cp > max_permissible) {
11702 _invlist_invert(cp_list);
11703 inverted = 1;
11704 }
11705
11706 if (invlist_highest(cp_list) <= max_permissible) {
11707 UV this_start, this_end;
11708 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
11709 U8 bits_differing = 0;
11710 Size_t full_cp_count = 0;
11711 bool first_time = TRUE;
11712
11713 /* Go through the bytes and find the bit positions that differ */
11714 invlist_iterinit(cp_list);
11715 while (invlist_iternext(cp_list, &this_start, &this_end)) {
11716 unsigned int i = this_start;
11717
11718 if (first_time) {
11719 if (! UVCHR_IS_INVARIANT(i)) {
11720 goto done_anyofm;
11721 }
11722
11723 first_time = FALSE;
11724 lowest_cp = this_start;
11725
11726 /* We have set up the code point to compare with. Don't
11727 * compare it with itself */
11728 i++;
11729 }
11730
11731 /* Find the bit positions that differ from the lowest code
11732 * point in the node. Keep track of all such positions by
11733 * OR'ing */
11734 for (; i <= this_end; i++) {
11735 if (! UVCHR_IS_INVARIANT(i)) {
11736 goto done_anyofm;
11737 }
11738
11739 bits_differing |= i ^ lowest_cp;
11740 }
11741
11742 full_cp_count += this_end - this_start + 1;
11743 }
11744
11745 /* At the end of the loop, we count how many bits differ from the
11746 * bits in lowest code point, call the count 'd'. If the set we
11747 * found contains 2**d elements, it is the closure of all code
11748 * points that differ only in those bit positions. To convince
11749 * yourself of that, first note that the number in the closure must
11750 * be a power of 2, which we test for. The only way we could have
11751 * that count and it be some differing set, is if we got some code
11752 * points that don't differ from the lowest code point in any
11753 * position, but do differ from each other in some other position.
11754 * That means one code point has a 1 in that position, and another
11755 * has a 0. But that would mean that one of them differs from the
11756 * lowest code point in that position, which possibility we've
11757 * already excluded. */
11758 if ( (inverted || full_cp_count > 1)
11759 && full_cp_count == 1U << PL_bitcount[bits_differing])
11760 {
11761 U8 ANYOFM_mask;
11762
11763 op = ANYOFM + inverted;;
11764
11765 /* We need to make the bits that differ be 0's */
11766 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11767
11768 /* The argument is the lowest code point */
11769 *ret = reg1node(pRExC_state, op, lowest_cp);
11770 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11771 }
11772
11773 done_anyofm:
11774 invlist_iterfinish(cp_list);
11775 }
11776
11777 if (inverted) {
11778 _invlist_invert(cp_list);
11779 }
11780
11781 if (op != END) {
11782 return op;
11783 }
11784
11785 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11786 * were invariants, it wasn't inverted, and there is a single range.
11787 * This would be faster than some of the posix nodes we create below
11788 * like /\d/a, but would be twice the size. Without having actually
11789 * measured the gain, khw doesn't think the tradeoff is really worth it
11790 * */
11791 }
11792
11793 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11794 PERL_UINT_FAST8_T type;
11795 SV * intersection = NULL;
11796 SV* d_invlist = NULL;
11797
11798 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
11799 * ones are about the same speed as ANYOF ops, but take less room; the
11800 * ones that have above-Latin1 code point matches are somewhat faster
11801 * than ANYOF. */
11802
11803 for (type = POSIXA; type >= POSIXD; type--) {
11804 int posix_class;
11805
11806 if (type == POSIXL) { /* But not /l posix classes */
11807 continue;
11808 }
11809
11810 for (posix_class = 0;
11811 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11812 posix_class++)
11813 {
11814 SV** our_code_points = &cp_list;
11815 SV** official_code_points;
11816 int try_inverted;
11817
11818 if (type == POSIXA) {
11819 official_code_points = &PL_Posix_ptrs[posix_class];
11820 }
11821 else {
11822 official_code_points = &PL_XPosix_ptrs[posix_class];
11823 }
11824
11825 /* Skip non-existent classes of this type. e.g. \v only has an
11826 * entry in PL_XPosix_ptrs */
11827 if (! *official_code_points) {
11828 continue;
11829 }
11830
11831 /* Try both the regular class, and its inversion */
11832 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11833 bool this_inverted = *invert ^ try_inverted;
11834
11835 if (type != POSIXD) {
11836
11837 /* This class that isn't /d can't match if we have /d
11838 * dependencies */
11839 if (has_runtime_dependency
11840 & HAS_D_RUNTIME_DEPENDENCY)
11841 {
11842 continue;
11843 }
11844 }
11845 else /* is /d */ if (! this_inverted) {
11846
11847 /* /d classes don't match anything non-ASCII below 256
11848 * unconditionally (which cp_list contains) */
11849 _invlist_intersection(cp_list, PL_UpperLatin1,
11850 &intersection);
11851 if (_invlist_len(intersection) != 0) {
11852 continue;
11853 }
11854
11855 SvREFCNT_dec(d_invlist);
11856 d_invlist = invlist_clone(cp_list, NULL);
11857
11858 /* But under UTF-8 it turns into using /u rules. Add
11859 * the things it matches under these conditions so that
11860 * we check below that these are identical to what the
11861 * tested class should match */
11862 if (upper_latin1_only_utf8_matches) {
11863 _invlist_union(
11864 d_invlist,
11865 upper_latin1_only_utf8_matches,
11866 &d_invlist);
11867 }
11868 our_code_points = &d_invlist;
11869 }
11870 else { /* POSIXD, inverted. If this doesn't have this
11871 flag set, it isn't /d. */
11872 if (! ( *anyof_flags
11873 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11874 {
11875 continue;
11876 }
11877
11878 our_code_points = &cp_list;
11879 }
11880
11881 /* Here, have weeded out some things. We want to see if
11882 * the list of characters this node contains
11883 * ('*our_code_points') precisely matches those of the
11884 * class we are currently checking against
11885 * ('*official_code_points'). */
11886 if (_invlistEQ(*our_code_points,
11887 *official_code_points,
11888 try_inverted))
11889 {
11890 /* Here, they precisely match. Optimize this ANYOF
11891 * node into its equivalent POSIX one of the correct
11892 * type, possibly inverted.
11893 *
11894 * Some of these nodes match a single range of
11895 * characters (or [:alpha:] matches two parallel ranges
11896 * on ASCII platforms). The array lookup at execution
11897 * time could be replaced by a range check for such
11898 * nodes. But regnodes are a finite resource, and the
11899 * possible performance boost isn't large, so this
11900 * hasn't been done. An attempt to use just one node
11901 * (and its inverse) to encompass all such cases was
11902 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
11903 * But the shifting/masking it used ended up being
11904 * slower than the array look up, so it was reverted */
11905 op = (try_inverted)
11906 ? type + NPOSIXA - POSIXA
11907 : type;
11908 *ret = reg_node(pRExC_state, op);
11909 FLAGS(REGNODE_p(*ret)) = posix_class;
11910 SvREFCNT_dec(d_invlist);
11911 SvREFCNT_dec(intersection);
11912 return op;
11913 }
11914 }
11915 }
11916 }
11917 SvREFCNT_dec(d_invlist);
11918 SvREFCNT_dec(intersection);
11919 }
11920
11921 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
11922 * in size and speed. Currently, a 20 bit range base (smallest code point
11923 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
11924 * This allows for using it on all of the Unicode code points except for
11925 * the highest plane, which is only for private use code points. khw
11926 * doubts that a bigger delta is likely in real world applications */
11927 if ( single_range
11928 && ! has_runtime_dependency
11929 && *anyof_flags == 0
11930 && start[0] < (1 << ANYOFR_BASE_BITS)
11931 && end[0] - start[0]
11932 < ((1U << (sizeof(ARG1u_LOC(NULL))
11933 * CHARBITS - ANYOFR_BASE_BITS))))
11934
11935 {
11936 U8 low_utf8[UTF8_MAXBYTES+1];
11937 U8 high_utf8[UTF8_MAXBYTES+1];
11938
11939 op = ANYOFR;
11940 *ret = reg1node(pRExC_state, op,
11941 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
11942
11943 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
11944 * efficient ruling out at run time of many possible inputs. */
11945 (void) uvchr_to_utf8(low_utf8, start[0]);
11946 (void) uvchr_to_utf8(high_utf8, end[0]);
11947
11948 /* If all code points share the same first byte, this can be an
11949 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
11950 * quickly rule out many inputs at run-time without having to compute
11951 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
11952 * transformation would not rule out nearly so many things */
11953 if (low_utf8[0] == high_utf8[0]) {
11954 op = ANYOFRb;
11955 OP(REGNODE_p(*ret)) = op;
11956 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
11957 }
11958 else {
11959 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
11960 }
11961
11962 return op;
11963 }
11964
11965 /* If didn't find an optimization and there is no need for a bitmap,
11966 * of the lowest code points, optimize to indicate that */
11967 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
11968 && ! LOC
11969 && ! upper_latin1_only_utf8_matches
11970 && *anyof_flags == 0)
11971 {
11972 U8 low_utf8[UTF8_MAXBYTES+1];
11973 UV highest_cp = invlist_highest(cp_list);
11974
11975 /* Currently the maximum allowed code point by the system is IV_MAX.
11976 * Higher ones are reserved for future internal use. This particular
11977 * regnode can be used for higher ones, but we can't calculate the code
11978 * point of those. IV_MAX suffices though, as it will be a large first
11979 * byte */
11980 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
11981 - low_utf8;
11982
11983 /* We store the lowest possible first byte of the UTF-8 representation,
11984 * using the flags field. This allows for quick ruling out of some
11985 * inputs without having to convert from UTF-8 to code point. For
11986 * EBCDIC, we use I8, as not doing that transformation would not rule
11987 * out nearly so many things */
11988 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
11989
11990 op = ANYOFH;
11991
11992 /* If the first UTF-8 start byte for the highest code point in the
11993 * range is suitably small, we may be able to get an upper bound as
11994 * well */
11995 if (highest_cp <= IV_MAX) {
11996 U8 high_utf8[UTF8_MAXBYTES+1];
11997 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
11998
11999 /* If the lowest and highest are the same, we can get an exact
12000 * first byte instead of a just minimum or even a sequence of exact
12001 * leading bytes. We signal these with different regnodes */
12002 if (low_utf8[0] == high_utf8[0]) {
12003 Size_t len = find_first_differing_byte_pos(low_utf8,
12004 high_utf8,
12005 MIN(low_len, high_len));
12006 if (len == 1) {
12007
12008 /* No need to convert to I8 for EBCDIC as this is an exact
12009 * match */
12010 *anyof_flags = low_utf8[0];
12011
12012 if (high_len == 2) {
12013 /* If the elements matched all have a 2-byte UTF-8
12014 * representation, with the first byte being the same,
12015 * we can use a compact, fast regnode. capable of
12016 * matching any combination of continuation byte
12017 * patterns.
12018 *
12019 * (A similar regnode could be created for the Latin1
12020 * range; the complication being that it could match
12021 * non-UTF8 targets. The internal bitmap would serve
12022 * both cases; with some extra code in regexec.c) */
12023 op = ANYOFHbbm;
12024 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12025 FILL_NODE(*ret, op);
12026 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12027
12028 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12029 * directly based on the continuation byte, without
12030 * needing to convert to code point */
12031 populate_bitmap_from_invlist(
12032 cp_list,
12033
12034 /* The base code point is from the start byte */
12035 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12036 UTF_CONTINUATION_MARK | 0),
12037
12038 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12039 REGNODE_BBM_BITMAP_LEN);
12040 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12041 return op;
12042 }
12043 else {
12044 op = ANYOFHb;
12045 }
12046 }
12047 else {
12048 op = ANYOFHs;
12049 *ret = REGNODE_GUTS(pRExC_state, op,
12050 REGNODE_ARG_LEN(op) + STR_SZ(len));
12051 FILL_NODE(*ret, op);
12052 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12053 = len;
12054 Copy(low_utf8, /* Add the common bytes */
12055 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12056 len, U8);
12057 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12058 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12059 NULL, only_utf8_locale_list);
12060 return op;
12061 }
12062 }
12063 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12064
12065 /* Here, the high byte is not the same as the low, but is small
12066 * enough that its reasonable to have a loose upper bound,
12067 * which is packed in with the strict lower bound. See
12068 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
12069 * platforms, I8 is used. On ASCII platforms I8 is the same
12070 * thing as UTF-8 */
12071
12072 U8 bits = 0;
12073 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12074 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12075 - *anyof_flags;
12076
12077 if (range_diff <= max_range_diff / 8) {
12078 bits = 3;
12079 }
12080 else if (range_diff <= max_range_diff / 4) {
12081 bits = 2;
12082 }
12083 else if (range_diff <= max_range_diff / 2) {
12084 bits = 1;
12085 }
12086 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12087 op = ANYOFHr;
12088 }
12089 }
12090 }
12091
12092 return op;
12093
12094 return_OPFAIL:
12095 op = OPFAIL;
12096 *ret = reg1node(pRExC_state, op, 0);
12097 return op;
12098
12099 return_SANY:
12100 op = SANY;
12101 *ret = reg_node(pRExC_state, op);
12102 MARK_NAUGHTY(1);
12103 return op;
12104 }
12105
12106 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12107
12108 void
Perl_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)12109 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12110 regnode* const node,
12111 SV* const cp_list,
12112 SV* const runtime_defns,
12113 SV* const only_utf8_locale_list)
12114 {
12115 /* Sets the arg field of an ANYOF-type node 'node', using information about
12116 * the node passed-in. If only the bitmap is needed to determine what
12117 * matches, the arg is set appropriately to either
12118 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12119 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12120 *
12121 * Otherwise, it sets the argument to the count returned by reg_add_data(),
12122 * having allocated and stored an array, av, as follows:
12123 * av[0] stores the inversion list defining this class as far as known at
12124 * this time, or PL_sv_undef if nothing definite is now known.
12125 * av[1] stores the inversion list of code points that match only if the
12126 * current locale is UTF-8, or if none, PL_sv_undef if there is an
12127 * av[2], or no entry otherwise.
12128 * av[2] stores the list of user-defined properties whose subroutine
12129 * definitions aren't known at this time, or no entry if none. */
12130
12131 UV n;
12132
12133 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12134
12135 /* If this is set, the final disposition won't be known until runtime, so
12136 * we can't do any of the compile time optimizations */
12137 if (! runtime_defns) {
12138
12139 /* On plain ANYOF nodes without the possibility of a runtime locale
12140 * making a difference, maybe there's no information to be gleaned
12141 * except for what's in the bitmap */
12142 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12143
12144 /* There are two such cases:
12145 * 1) there is no list of code points matched outside the bitmap
12146 */
12147 if (! cp_list) {
12148 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12149 return;
12150 }
12151
12152 /* 2) the list indicates everything outside the bitmap matches */
12153 if ( invlist_highest(cp_list) == UV_MAX
12154 && invlist_highest_range_start(cp_list)
12155 <= NUM_ANYOF_CODE_POINTS)
12156 {
12157 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12158 return;
12159 }
12160
12161 /* In all other cases there are things outside the bitmap that we
12162 * may need to check at runtime. */
12163 }
12164
12165 /* Here, we have resolved all the possible run-time matches, and they
12166 * are stored in one or both of two possible lists. (While some match
12167 * only under certain runtime circumstances, we know all the possible
12168 * ones for each such circumstance.)
12169 *
12170 * It may very well be that the pattern being compiled contains an
12171 * identical class, already encountered. Reusing that class here saves
12172 * space. Look through all classes so far encountered. */
12173 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12174 for (unsigned int i = 0; i < existing_items; i++) {
12175
12176 /* Only look at auxiliary data of this type */
12177 if (RExC_rxi->data->what[i] != 's') {
12178 continue;
12179 }
12180
12181 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12182 AV * const av = MUTABLE_AV(SvRV(rv));
12183
12184 /* If the already encountered class has data that won't be known
12185 * until runtime (stored in the final element of the array), we
12186 * can't share */
12187 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12188 continue;
12189 }
12190
12191 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12192 false /* no lvalue */);
12193
12194 /* The new and the existing one both have to have or both not
12195 * have this element, for this one to duplicate that one */
12196 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12197 continue;
12198 }
12199
12200 /* If the inversion lists aren't equivalent, can't share */
12201 if (cp_list && ! _invlistEQ(cp_list,
12202 *stored_cp_list_ptr,
12203 FALSE /* don't complement */))
12204 {
12205 continue;
12206 }
12207
12208 /* Similarly for the other list */
12209 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12210 av,
12211 ONLY_LOCALE_MATCHES_INDEX,
12212 false /* no lvalue */);
12213 if ( cBOOL(only_utf8_locale_list)
12214 != cBOOL(stored_only_utf8_locale_list_ptr))
12215 {
12216 continue;
12217 }
12218
12219 if (only_utf8_locale_list && ! _invlistEQ(
12220 only_utf8_locale_list,
12221 *stored_only_utf8_locale_list_ptr,
12222 FALSE /* don't complement */))
12223 {
12224 continue;
12225 }
12226
12227 /* Here, the existence and contents of both compile-time lists
12228 * are identical between the new and existing data. Re-use the
12229 * existing one */
12230 ARG1u_SET(node, i);
12231 return;
12232 } /* end of loop through existing classes */
12233 }
12234
12235 /* Here, we need to create a new auxiliary data element; either because
12236 * this doesn't duplicate an existing one, or we can't tell at this time if
12237 * it eventually will */
12238
12239 AV * const av = newAV();
12240 SV *rv;
12241
12242 if (cp_list) {
12243 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12244 }
12245
12246 /* (Note that if any of this changes, the size calculations in
12247 * S_optimize_regclass() might need to be updated.) */
12248
12249 if (only_utf8_locale_list) {
12250 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12251 SvREFCNT_inc_NN(only_utf8_locale_list));
12252 }
12253
12254 if (runtime_defns) {
12255 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12256 SvREFCNT_inc_NN(runtime_defns));
12257 }
12258
12259 rv = newRV_noinc(MUTABLE_SV(av));
12260 n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12261 RExC_rxi->data->data[n] = (void*)rv;
12262 ARG1u_SET(node, n);
12263 }
12264
12265 SV *
12266
12267 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
Perl_get_regclass_aux_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)12268 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12269 #else
12270 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12271 #endif
12272
12273 {
12274 /* For internal core use only.
12275 * Returns the inversion list for the input 'node' in the regex 'prog'.
12276 * If <doinit> is 'true', will attempt to create the inversion list if not
12277 * already done. If it is created, it will add to the normal inversion
12278 * list any that comes from user-defined properties. It croaks if this
12279 * is called before such a list is ready to be generated, that is when a
12280 * user-defined property has been declared, buyt still not yet defined.
12281 * If <listsvp> is non-null, will return the printable contents of the
12282 * property definition. This can be used to get debugging information
12283 * even before the inversion list exists, by calling this function with
12284 * 'doinit' set to false, in which case the components that will be used
12285 * to eventually create the inversion list are returned (in a printable
12286 * form).
12287 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12288 * store an inversion list of code points that should match only if the
12289 * execution-time locale is a UTF-8 one.
12290 * If <output_invlist> is not NULL, it is where this routine is to store an
12291 * inversion list of the code points that would be instead returned in
12292 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
12293 * when this parameter is used, is just the non-code point data that
12294 * will go into creating the inversion list. This currently should be just
12295 * user-defined properties whose definitions were not known at compile
12296 * time. Using this parameter allows for easier manipulation of the
12297 * inversion list's data by the caller. It is illegal to call this
12298 * function with this parameter set, but not <listsvp>
12299 *
12300 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
12301 * that, in spite of this function's name, the inversion list it returns
12302 * may include the bitmap data as well */
12303
12304 SV *si = NULL; /* Input initialization string */
12305 SV* invlist = NULL;
12306
12307 RXi_GET_DECL_NULL(prog, progi);
12308 const struct reg_data * const data = prog ? progi->data : NULL;
12309
12310 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12311 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12312 #else
12313 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12314 #endif
12315 assert(! output_invlist || listsvp);
12316
12317 if (data && data->count) {
12318 const U32 n = ARG1u(node);
12319
12320 if (data->what[n] == 's') {
12321 SV * const rv = MUTABLE_SV(data->data[n]);
12322 AV * const av = MUTABLE_AV(SvRV(rv));
12323 SV **const ary = AvARRAY(av);
12324
12325 invlist = ary[INVLIST_INDEX];
12326
12327 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12328 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12329 }
12330
12331 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12332 si = ary[DEFERRED_USER_DEFINED_INDEX];
12333 }
12334
12335 if (doinit && (si || invlist)) {
12336 if (si) {
12337 bool user_defined;
12338 SV * msg = newSVpvs_flags("", SVs_TEMP);
12339
12340 SV * prop_definition = handle_user_defined_property(
12341 "", 0, FALSE, /* There is no \p{}, \P{} */
12342 SvPVX_const(si)[1] - '0', /* /i or not has been
12343 stored here for just
12344 this occasion */
12345 TRUE, /* run time */
12346 FALSE, /* This call must find the defn */
12347 si, /* The property definition */
12348 &user_defined,
12349 msg,
12350 0 /* base level call */
12351 );
12352
12353 if (SvCUR(msg)) {
12354 assert(prop_definition == NULL);
12355
12356 Perl_croak(aTHX_ "%" UTF8f,
12357 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12358 }
12359
12360 if (invlist) {
12361 _invlist_union(invlist, prop_definition, &invlist);
12362 SvREFCNT_dec_NN(prop_definition);
12363 }
12364 else {
12365 invlist = prop_definition;
12366 }
12367
12368 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12369 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12370
12371 ary[INVLIST_INDEX] = invlist;
12372 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12373 ? ONLY_LOCALE_MATCHES_INDEX
12374 : INVLIST_INDEX);
12375 si = NULL;
12376 }
12377 }
12378 }
12379 }
12380
12381 /* If requested, return a printable version of what this ANYOF node matches
12382 * */
12383 if (listsvp) {
12384 SV* matches_string = NULL;
12385
12386 /* This function can be called at compile-time, before everything gets
12387 * resolved, in which case we return the currently best available
12388 * information, which is the string that will eventually be used to do
12389 * that resolving, 'si' */
12390 if (si) {
12391 /* Here, we only have 'si' (and possibly some passed-in data in
12392 * 'invlist', which is handled below) If the caller only wants
12393 * 'si', use that. */
12394 if (! output_invlist) {
12395 matches_string = newSVsv(si);
12396 }
12397 else {
12398 /* But if the caller wants an inversion list of the node, we
12399 * need to parse 'si' and place as much as possible in the
12400 * desired output inversion list, making 'matches_string' only
12401 * contain the currently unresolvable things */
12402 const char *si_string = SvPVX(si);
12403 STRLEN remaining = SvCUR(si);
12404 UV prev_cp = 0;
12405 U8 count = 0;
12406
12407 /* Ignore everything before and including the first new-line */
12408 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12409 assert (si_string != NULL);
12410 si_string++;
12411 remaining = SvPVX(si) + SvCUR(si) - si_string;
12412
12413 while (remaining > 0) {
12414
12415 /* The data consists of just strings defining user-defined
12416 * property names, but in prior incarnations, and perhaps
12417 * somehow from pluggable regex engines, it could still
12418 * hold hex code point definitions, all of which should be
12419 * legal (or it wouldn't have gotten this far). Each
12420 * component of a range would be separated by a tab, and
12421 * each range by a new-line. If these are found, instead
12422 * add them to the inversion list */
12423 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
12424 |PERL_SCAN_SILENT_NON_PORTABLE;
12425 STRLEN len = remaining;
12426 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12427
12428 /* If the hex decode routine found something, it should go
12429 * up to the next \n */
12430 if ( *(si_string + len) == '\n') {
12431 if (count) { /* 2nd code point on line */
12432 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12433 }
12434 else {
12435 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12436 }
12437 count = 0;
12438 goto prepare_for_next_iteration;
12439 }
12440
12441 /* If the hex decode was instead for the lower range limit,
12442 * save it, and go parse the upper range limit */
12443 if (*(si_string + len) == '\t') {
12444 assert(count == 0);
12445
12446 prev_cp = cp;
12447 count = 1;
12448 prepare_for_next_iteration:
12449 si_string += len + 1;
12450 remaining -= len + 1;
12451 continue;
12452 }
12453
12454 /* Here, didn't find a legal hex number. Just add the text
12455 * from here up to the next \n, omitting any trailing
12456 * markers. */
12457
12458 remaining -= len;
12459 len = strcspn(si_string,
12460 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12461 remaining -= len;
12462 if (matches_string) {
12463 sv_catpvn(matches_string, si_string, len);
12464 }
12465 else {
12466 matches_string = newSVpvn(si_string, len);
12467 }
12468 sv_catpvs(matches_string, " ");
12469
12470 si_string += len;
12471 if ( remaining
12472 && UCHARAT(si_string)
12473 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12474 {
12475 si_string++;
12476 remaining--;
12477 }
12478 if (remaining && UCHARAT(si_string) == '\n') {
12479 si_string++;
12480 remaining--;
12481 }
12482 } /* end of loop through the text */
12483
12484 assert(matches_string);
12485 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
12486 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12487 }
12488 } /* end of has an 'si' */
12489 }
12490
12491 /* Add the stuff that's already known */
12492 if (invlist) {
12493
12494 /* Again, if the caller doesn't want the output inversion list, put
12495 * everything in 'matches-string' */
12496 if (! output_invlist) {
12497 if ( ! matches_string) {
12498 matches_string = newSVpvs("\n");
12499 }
12500 sv_catsv(matches_string, invlist_contents(invlist,
12501 TRUE /* traditional style */
12502 ));
12503 }
12504 else if (! *output_invlist) {
12505 *output_invlist = invlist_clone(invlist, NULL);
12506 }
12507 else {
12508 _invlist_union(*output_invlist, invlist, output_invlist);
12509 }
12510 }
12511
12512 *listsvp = matches_string;
12513 }
12514
12515 return invlist;
12516 }
12517
12518 /* reg_skipcomment()
12519
12520 Absorbs an /x style # comment from the input stream,
12521 returning a pointer to the first character beyond the comment, or if the
12522 comment terminates the pattern without anything following it, this returns
12523 one past the final character of the pattern (in other words, RExC_end) and
12524 sets the REG_RUN_ON_COMMENT_SEEN flag.
12525
12526 Note it's the callers responsibility to ensure that we are
12527 actually in /x mode
12528
12529 */
12530
12531 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)12532 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12533 {
12534 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12535
12536 assert(*p == '#');
12537
12538 while (p < RExC_end) {
12539 if (*(++p) == '\n') {
12540 return p+1;
12541 }
12542 }
12543
12544 /* we ran off the end of the pattern without ending the comment, so we have
12545 * to add an \n when wrapping */
12546 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12547 return p;
12548 }
12549
12550 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)12551 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12552 char ** p,
12553 const bool force_to_xmod
12554 )
12555 {
12556 /* If the text at the current parse position '*p' is a '(?#...)' comment,
12557 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12558 * is /x whitespace, advance '*p' so that on exit it points to the first
12559 * byte past all such white space and comments */
12560
12561 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12562
12563 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12564
12565 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12566
12567 for (;;) {
12568 if (RExC_end - (*p) >= 3
12569 && *(*p) == '('
12570 && *(*p + 1) == '?'
12571 && *(*p + 2) == '#')
12572 {
12573 while (*(*p) != ')') {
12574 if ((*p) == RExC_end)
12575 FAIL("Sequence (?#... not terminated");
12576 (*p)++;
12577 }
12578 (*p)++;
12579 continue;
12580 }
12581
12582 if (use_xmod) {
12583 const char * save_p = *p;
12584 while ((*p) < RExC_end) {
12585 STRLEN len;
12586 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12587 (*p) += len;
12588 }
12589 else if (*(*p) == '#') {
12590 (*p) = reg_skipcomment(pRExC_state, (*p));
12591 }
12592 else {
12593 break;
12594 }
12595 }
12596 if (*p != save_p) {
12597 continue;
12598 }
12599 }
12600
12601 break;
12602 }
12603
12604 return;
12605 }
12606
12607 /* nextchar()
12608
12609 Advances the parse position by one byte, unless that byte is the beginning
12610 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
12611 those two cases, the parse position is advanced beyond all such comments and
12612 white space.
12613
12614 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12615 */
12616
12617 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)12618 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12619 {
12620 PERL_ARGS_ASSERT_NEXTCHAR;
12621
12622 if (RExC_parse < RExC_end) {
12623 assert( ! UTF
12624 || UTF8_IS_INVARIANT(*RExC_parse)
12625 || UTF8_IS_START(*RExC_parse));
12626
12627 RExC_parse_inc_safe();
12628
12629 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12630 FALSE /* Don't force /x */ );
12631 }
12632 }
12633
12634 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)12635 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12636 {
12637 /* 'size' is the delta number of smallest regnode equivalents to add or
12638 * subtract from the current memory allocated to the regex engine being
12639 * constructed. */
12640
12641 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12642
12643 RExC_size += size;
12644
12645 Renewc(RExC_rxi,
12646 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12647 /* +1 for REG_MAGIC */
12648 char,
12649 regexp_internal);
12650 if ( RExC_rxi == NULL )
12651 FAIL("Regexp out of space");
12652 RXi_SET(RExC_rx, RExC_rxi);
12653
12654 RExC_emit_start = RExC_rxi->program;
12655 if (size > 0) {
12656 Zero(REGNODE_p(RExC_emit), size, regnode);
12657 }
12658 }
12659
12660 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const STRLEN extra_size)12661 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12662 {
12663 /* Allocate a regnode that is (1 + extra_size) times as big as the
12664 * smallest regnode worth of space, and also aligns and increments
12665 * RExC_size appropriately.
12666 *
12667 * It returns the regnode's offset into the regex engine program */
12668
12669 const regnode_offset ret = RExC_emit;
12670
12671 PERL_ARGS_ASSERT_REGNODE_GUTS;
12672
12673 SIZE_ALIGN(RExC_size);
12674 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12675 NODE_ALIGN_FILL(REGNODE_p(ret));
12676 return(ret);
12677 }
12678
12679 #ifdef DEBUGGING
12680
12681 STATIC regnode_offset
S_regnode_guts_debug(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size)12682 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12683 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12684 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12685 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12686 }
12687
12688 #endif
12689
12690
12691
12692 /*
12693 - reg_node - emit a node
12694 */
12695 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)12696 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12697 {
12698 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12699 regnode_offset ptr = ret;
12700
12701 PERL_ARGS_ASSERT_REG_NODE;
12702
12703 assert(REGNODE_ARG_LEN(op) == 0);
12704
12705 FILL_ADVANCE_NODE(ptr, op);
12706 RExC_emit = ptr;
12707 return(ret);
12708 }
12709
12710 /*
12711 - reg1node - emit a node with an argument
12712 */
12713 STATIC regnode_offset /* Location. */
S_reg1node(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)12714 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12715 {
12716 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12717 regnode_offset ptr = ret;
12718
12719 PERL_ARGS_ASSERT_REG1NODE;
12720
12721 /* ANYOF are special cased to allow non-length 1 args */
12722 assert(REGNODE_ARG_LEN(op) == 1);
12723
12724 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12725 RExC_emit = ptr;
12726 return(ret);
12727 }
12728
12729 /*
12730 - regpnode - emit a temporary node with a SV* argument
12731 */
12732 STATIC regnode_offset /* Location. */
S_regpnode(pTHX_ RExC_state_t * pRExC_state,U8 op,SV * arg)12733 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12734 {
12735 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12736 regnode_offset ptr = ret;
12737
12738 PERL_ARGS_ASSERT_REGPNODE;
12739
12740 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12741 RExC_emit = ptr;
12742 return(ret);
12743 }
12744
12745 STATIC regnode_offset
S_reg2node(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)12746 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12747 {
12748 /* emit a node with U32 and I32 arguments */
12749
12750 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12751 regnode_offset ptr = ret;
12752
12753 PERL_ARGS_ASSERT_REG2NODE;
12754
12755 assert(REGNODE_ARG_LEN(op) == 2);
12756
12757 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12758 RExC_emit = ptr;
12759 return(ret);
12760 }
12761
12762 /*
12763 - reginsert - insert an operator in front of already-emitted operand
12764 *
12765 * That means that on exit 'operand' is the offset of the newly inserted
12766 * operator, and the original operand has been relocated.
12767 *
12768 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
12769 * set up NEXT_OFF() of the inserted node if needed. Something like this:
12770 *
12771 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12772 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12773 *
12774 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12775 */
12776 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)12777 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12778 const regnode_offset operand, const U32 depth)
12779 {
12780 regnode *src;
12781 regnode *dst;
12782 regnode *place;
12783 const int offset = REGNODE_ARG_LEN((U8)op);
12784 const int size = NODE_STEP_REGNODE + offset;
12785 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12786
12787 PERL_ARGS_ASSERT_REGINSERT;
12788 PERL_UNUSED_CONTEXT;
12789 PERL_UNUSED_ARG(depth);
12790 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12791 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12792 studying. If this is wrong then we need to adjust RExC_recurse
12793 below like we do with RExC_open_parens/RExC_close_parens. */
12794 change_engine_size(pRExC_state, (Ptrdiff_t) size);
12795 src = REGNODE_p(RExC_emit);
12796 RExC_emit += size;
12797 dst = REGNODE_p(RExC_emit);
12798
12799 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12800 * and [perl #133871] shows this can lead to problems, so skip this
12801 * realignment of parens until a later pass when they are reliable */
12802 if (! IN_PARENS_PASS && RExC_open_parens) {
12803 int paren;
12804 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12805 /* remember that RExC_npar is rex->nparens + 1,
12806 * iow it is 1 more than the number of parens seen in
12807 * the pattern so far. */
12808 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12809 /* note, RExC_open_parens[0] is the start of the
12810 * regex, it can't move. RExC_close_parens[0] is the end
12811 * of the regex, it *can* move. */
12812 if ( paren && RExC_open_parens[paren] >= operand ) {
12813 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12814 RExC_open_parens[paren] += size;
12815 } else {
12816 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12817 }
12818 if ( RExC_close_parens[paren] >= operand ) {
12819 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12820 RExC_close_parens[paren] += size;
12821 } else {
12822 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12823 }
12824 }
12825 }
12826 if (RExC_end_op)
12827 RExC_end_op += size;
12828
12829 while (src > REGNODE_p(operand)) {
12830 StructCopy(--src, --dst, regnode);
12831 }
12832
12833 place = REGNODE_p(operand); /* Op node, where operand used to be. */
12834 src = place + 1; /* NOT REGNODE_AFTER! */
12835 FLAGS(place) = 0;
12836 FILL_NODE(operand, op);
12837
12838 /* Zero out any arguments in the new node */
12839 Zero(src, offset, regnode);
12840 }
12841
12842 /*
12843 - regtail - set the next-pointer at the end of a node chain of p to val. If
12844 that value won't fit in the space available, instead returns FALSE.
12845 (Except asserts if we can't fit in the largest space the regex
12846 engine is designed for.)
12847 - SEE ALSO: regtail_study
12848 */
12849 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)12850 S_regtail(pTHX_ RExC_state_t * pRExC_state,
12851 const regnode_offset p,
12852 const regnode_offset val,
12853 const U32 depth)
12854 {
12855 regnode_offset scan;
12856 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12857
12858 PERL_ARGS_ASSERT_REGTAIL;
12859 #ifndef DEBUGGING
12860 PERL_UNUSED_ARG(depth);
12861 #endif
12862
12863 /* The final node in the chain is the first one with a nonzero next pointer
12864 * */
12865 scan = (regnode_offset) p;
12866 for (;;) {
12867 regnode * const temp = regnext(REGNODE_p(scan));
12868 DEBUG_PARSE_r({
12869 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12870 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12871 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
12872 SvPV_nolen_const(RExC_mysv), scan,
12873 (temp == NULL ? "->" : ""),
12874 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12875 );
12876 });
12877 if (temp == NULL)
12878 break;
12879 scan = REGNODE_OFFSET(temp);
12880 }
12881
12882 /* Populate this node's next pointer */
12883 assert(val >= scan);
12884 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12885 assert((UV) (val - scan) <= U32_MAX);
12886 ARG1u_SET(REGNODE_p(scan), val - scan);
12887 }
12888 else {
12889 if (val - scan > U16_MAX) {
12890 /* Populate this with something that won't loop and will likely
12891 * lead to a crash if the caller ignores the failure return, and
12892 * execution continues */
12893 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12894 return FALSE;
12895 }
12896 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12897 }
12898
12899 return TRUE;
12900 }
12901
12902 #ifdef DEBUGGING
12903 /*
12904 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12905 - Look for optimizable sequences at the same time.
12906 - currently only looks for EXACT chains.
12907
12908 This is experimental code. The idea is to use this routine to perform
12909 in place optimizations on branches and groups as they are constructed,
12910 with the long term intention of removing optimization from study_chunk so
12911 that it is purely analytical.
12912
12913 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12914 to control which is which.
12915
12916 This used to return a value that was ignored. It was a problem that it is
12917 #ifdef'd to be another function that didn't return a value. khw has changed it
12918 so both currently return a pass/fail return.
12919
12920 */
12921 /* TODO: All four parms should be const */
12922
12923 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)12924 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
12925 const regnode_offset val, U32 depth)
12926 {
12927 regnode_offset scan;
12928 U8 exact = PSEUDO;
12929 #ifdef EXPERIMENTAL_INPLACESCAN
12930 I32 min = 0;
12931 #endif
12932 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12933
12934 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12935
12936
12937 /* Find last node. */
12938
12939 scan = p;
12940 for (;;) {
12941 regnode * const temp = regnext(REGNODE_p(scan));
12942 #ifdef EXPERIMENTAL_INPLACESCAN
12943 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12944 bool unfolded_multi_char; /* Unexamined in this routine */
12945 if (join_exact(pRExC_state, scan, &min,
12946 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
12947 return TRUE; /* Was return EXACT */
12948 }
12949 #endif
12950 if ( exact ) {
12951 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12952 if (exact == PSEUDO )
12953 exact= OP(REGNODE_p(scan));
12954 else if (exact != OP(REGNODE_p(scan)) )
12955 exact= 0;
12956 }
12957 else if (OP(REGNODE_p(scan)) != NOTHING) {
12958 exact= 0;
12959 }
12960 }
12961 DEBUG_PARSE_r({
12962 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12963 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12964 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
12965 SvPV_nolen_const(RExC_mysv),
12966 scan,
12967 REGNODE_NAME(exact));
12968 });
12969 if (temp == NULL)
12970 break;
12971 scan = REGNODE_OFFSET(temp);
12972 }
12973 DEBUG_PARSE_r({
12974 DEBUG_PARSE_MSG("");
12975 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
12976 Perl_re_printf( aTHX_
12977 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
12978 SvPV_nolen_const(RExC_mysv),
12979 (IV)val,
12980 (IV)(val - scan)
12981 );
12982 });
12983 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12984 assert((UV) (val - scan) <= U32_MAX);
12985 ARG1u_SET(REGNODE_p(scan), val - scan);
12986 }
12987 else {
12988 if (val - scan > U16_MAX) {
12989 /* Populate this with something that won't loop and will likely
12990 * lead to a crash if the caller ignores the failure return, and
12991 * execution continues */
12992 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12993 return FALSE;
12994 }
12995 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12996 }
12997
12998 return TRUE; /* Was 'return exact' */
12999 }
13000 #endif
13001
13002 SV*
Perl_get_ANYOFM_contents(pTHX_ const regnode * n)13003 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13004
13005 /* Returns an inversion list of all the code points matched by the
13006 * ANYOFM/NANYOFM node 'n' */
13007
13008 SV * cp_list = _new_invlist(-1);
13009 const U8 lowest = (U8) ARG1u(n);
13010 unsigned int i;
13011 U8 count = 0;
13012 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13013
13014 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13015
13016 /* Starting with the lowest code point, any code point that ANDed with the
13017 * mask yields the lowest code point is in the set */
13018 for (i = lowest; i <= 0xFF; i++) {
13019 if ((i & FLAGS(n)) == ARG1u(n)) {
13020 cp_list = add_cp_to_invlist(cp_list, i);
13021 count++;
13022
13023 /* We know how many code points (a power of two) that are in the
13024 * set. No use looking once we've got that number */
13025 if (count >= needed) break;
13026 }
13027 }
13028
13029 if (OP(n) == NANYOFM) {
13030 _invlist_invert(cp_list);
13031 }
13032 return cp_list;
13033 }
13034
13035 SV *
Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n)13036 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13037 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13038
13039 SV * cp_list = NULL;
13040 populate_invlist_from_bitmap(
13041 ((struct regnode_bbm *) n)->bitmap,
13042 REGNODE_BBM_BITMAP_LEN * CHARBITS,
13043 &cp_list,
13044
13045 /* The base cp is from the start byte plus a zero continuation */
13046 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13047 UTF_CONTINUATION_MARK | 0));
13048 return cp_list;
13049 }
13050
13051
13052
13053 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)13054 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13055 { /* Assume that RE_INTUIT is set */
13056 /* Returns an SV containing a string that must appear in the target for it
13057 * to match, or NULL if nothing is known that must match.
13058 *
13059 * CAUTION: the SV can be freed during execution of the regex engine */
13060
13061 struct regexp *const prog = ReANY(r);
13062 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13063
13064 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13065 PERL_UNUSED_CONTEXT;
13066
13067 DEBUG_COMPILE_r(
13068 {
13069 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13070 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13071 ? prog->check_utf8 : prog->check_substr);
13072
13073 if (!PL_colorset) reginitcolors();
13074 Perl_re_printf( aTHX_
13075 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13076 PL_colors[4],
13077 RX_UTF8(r) ? "utf8 " : "",
13078 PL_colors[5], PL_colors[0],
13079 s,
13080 PL_colors[1],
13081 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13082 }
13083 } );
13084
13085 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13086 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13087 }
13088
13089 /*
13090 pregfree()
13091
13092 handles refcounting and freeing the perl core regexp structure. When
13093 it is necessary to actually free the structure the first thing it
13094 does is call the 'free' method of the regexp_engine associated to
13095 the regexp, allowing the handling of the void *pprivate; member
13096 first. (This routine is not overridable by extensions, which is why
13097 the extensions free is called first.)
13098
13099 See regdupe and regdupe_internal if you change anything here.
13100 */
13101 #ifndef PERL_IN_XSUB_RE
13102 void
Perl_pregfree(pTHX_ REGEXP * r)13103 Perl_pregfree(pTHX_ REGEXP *r)
13104 {
13105 SvREFCNT_dec(r);
13106 }
13107
13108 void
Perl_pregfree2(pTHX_ REGEXP * rx)13109 Perl_pregfree2(pTHX_ REGEXP *rx)
13110 {
13111 struct regexp *const r = ReANY(rx);
13112 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13113
13114 PERL_ARGS_ASSERT_PREGFREE2;
13115
13116 if (! r)
13117 return;
13118
13119 if (r->mother_re) {
13120 ReREFCNT_dec(r->mother_re);
13121 } else {
13122 CALLREGFREE_PVT(rx); /* free the private data */
13123 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13124 }
13125 if (r->substrs) {
13126 int i;
13127 for (i = 0; i < 2; i++) {
13128 SvREFCNT_dec(r->substrs->data[i].substr);
13129 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13130 }
13131 Safefree(r->substrs);
13132 }
13133 RX_MATCH_COPY_FREE(rx);
13134 #ifdef PERL_ANY_COW
13135 SvREFCNT_dec(r->saved_copy);
13136 #endif
13137 Safefree(RXp_OFFSp(r));
13138 if (r->logical_to_parno) {
13139 Safefree(r->logical_to_parno);
13140 Safefree(r->parno_to_logical);
13141 Safefree(r->parno_to_logical_next);
13142 }
13143
13144 SvREFCNT_dec(r->qr_anoncv);
13145 if (r->recurse_locinput)
13146 Safefree(r->recurse_locinput);
13147 }
13148
13149
13150 /* reg_temp_copy()
13151
13152 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13153 except that dsv will be created if NULL.
13154
13155 This function is used in two main ways. First to implement
13156 $r = qr/....; $s = $$r;
13157
13158 Secondly, it is used as a hacky workaround to the structural issue of
13159 match results
13160 being stored in the regexp structure which is in turn stored in
13161 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13162 could be PL_curpm in multiple contexts, and could require multiple
13163 result sets being associated with the pattern simultaneously, such
13164 as when doing a recursive match with (??{$qr})
13165
13166 The solution is to make a lightweight copy of the regexp structure
13167 when a qr// is returned from the code executed by (??{$qr}) this
13168 lightweight copy doesn't actually own any of its data except for
13169 the starp/end and the actual regexp structure itself.
13170
13171 */
13172
13173
13174 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)13175 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13176 {
13177 struct regexp *drx;
13178 struct regexp *const srx = ReANY(ssv);
13179 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13180
13181 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13182
13183 if (!dsv)
13184 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13185 else {
13186 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13187
13188 /* our only valid caller, sv_setsv_flags(), should have done
13189 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13190 assert(!SvOOK(dsv));
13191 assert(!SvIsCOW(dsv));
13192 assert(!SvROK(dsv));
13193
13194 if (SvPVX_const(dsv)) {
13195 if (SvLEN(dsv))
13196 Safefree(SvPVX(dsv));
13197 SvPVX(dsv) = NULL;
13198 }
13199 SvLEN_set(dsv, 0);
13200 SvCUR_set(dsv, 0);
13201 SvOK_off((SV *)dsv);
13202
13203 if (islv) {
13204 /* For PVLVs, the head (sv_any) points to an XPVLV, while
13205 * the LV's xpvlenu_rx will point to a regexp body, which
13206 * we allocate here */
13207 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13208 assert(!SvPVX(dsv));
13209 /* We "steal" the body from the newly allocated SV temp, changing
13210 * the pointer in its HEAD to NULL. We then change its type to
13211 * SVt_NULL so that when we immediately release its only reference,
13212 * no memory deallocation happens.
13213 *
13214 * The body will eventually be freed (from the PVLV) either in
13215 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13216 * the regexp body needs to be removed)
13217 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13218 * the PVLV itself is deallocated). */
13219 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13220 temp->sv_any = NULL;
13221 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13222 SvREFCNT_dec_NN(temp);
13223 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13224 ing below will not set it. */
13225 SvCUR_set(dsv, SvCUR(ssv));
13226 }
13227 }
13228 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13229 sv_force_normal(sv) is called. */
13230 SvFAKE_on(dsv);
13231 drx = ReANY(dsv);
13232
13233 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13234 SvPV_set(dsv, RX_WRAPPED(ssv));
13235 /* We share the same string buffer as the original regexp, on which we
13236 hold a reference count, incremented when mother_re is set below.
13237 The string pointer is copied here, being part of the regexp struct.
13238 */
13239 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13240 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13241
13242 if (!islv)
13243 SvLEN_set(dsv, 0);
13244 if (RXp_OFFSp(srx)) {
13245 const I32 npar = srx->nparens+1;
13246 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13247 }
13248 if (srx->substrs) {
13249 int i;
13250 Newx(drx->substrs, 1, struct reg_substr_data);
13251 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13252
13253 for (i = 0; i < 2; i++) {
13254 SvREFCNT_inc_void(drx->substrs->data[i].substr);
13255 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13256 }
13257
13258 /* check_substr and check_utf8, if non-NULL, point to either their
13259 anchored or float namesakes, and don't hold a second reference. */
13260 }
13261 if (srx->logical_to_parno) {
13262 NewCopy(srx->logical_to_parno,
13263 drx->logical_to_parno,
13264 srx->nparens+1, I32);
13265 NewCopy(srx->parno_to_logical,
13266 drx->parno_to_logical,
13267 srx->nparens+1, I32);
13268 NewCopy(srx->parno_to_logical_next,
13269 drx->parno_to_logical_next,
13270 srx->nparens+1, I32);
13271 } else {
13272 drx->logical_to_parno = NULL;
13273 drx->parno_to_logical = NULL;
13274 drx->parno_to_logical_next = NULL;
13275 }
13276 drx->logical_nparens = srx->logical_nparens;
13277
13278 RX_MATCH_COPIED_off(dsv);
13279 #ifdef PERL_ANY_COW
13280 RXp_SAVED_COPY(drx) = NULL;
13281 #endif
13282 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13283 SvREFCNT_inc_void(drx->qr_anoncv);
13284 if (srx->recurse_locinput)
13285 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13286
13287 return dsv;
13288 }
13289 #endif
13290
13291
13292 /* regfree_internal()
13293
13294 Free the private data in a regexp. This is overloadable by
13295 extensions. Perl takes care of the regexp structure in pregfree(),
13296 this covers the *pprivate pointer which technically perl doesn't
13297 know about, however of course we have to handle the
13298 regexp_internal structure when no extension is in use.
13299
13300 Note this is called before freeing anything in the regexp
13301 structure.
13302 */
13303
13304 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)13305 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13306 {
13307 struct regexp *const r = ReANY(rx);
13308 RXi_GET_DECL(r, ri);
13309 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13310
13311 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13312
13313 if (! ri) {
13314 return;
13315 }
13316
13317 DEBUG_COMPILE_r({
13318 if (!PL_colorset)
13319 reginitcolors();
13320 {
13321 SV *dsv= sv_newmortal();
13322 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13323 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13324 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13325 PL_colors[4], PL_colors[5], s);
13326 }
13327 });
13328
13329 if (ri->code_blocks)
13330 S_free_codeblocks(aTHX_ ri->code_blocks);
13331
13332 if (ri->data) {
13333 int n = ri->data->count;
13334
13335 while (--n >= 0) {
13336 /* If you add a ->what type here, update the comment in regcomp.h */
13337 switch (ri->data->what[n]) {
13338 case 'a':
13339 case 'r':
13340 case 's':
13341 case 'S':
13342 case 'u':
13343 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13344 break;
13345 case 'f':
13346 Safefree(ri->data->data[n]);
13347 break;
13348 case 'l':
13349 case 'L':
13350 break;
13351 case 'T':
13352 { /* Aho Corasick add-on structure for a trie node.
13353 Used in stclass optimization only */
13354 U32 refcount;
13355 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13356 OP_REFCNT_LOCK;
13357 refcount = --aho->refcount;
13358 OP_REFCNT_UNLOCK;
13359 if ( !refcount ) {
13360 PerlMemShared_free(aho->states);
13361 PerlMemShared_free(aho->fail);
13362 /* do this last!!!! */
13363 PerlMemShared_free(ri->data->data[n]);
13364 /* we should only ever get called once, so
13365 * assert as much, and also guard the free
13366 * which /might/ happen twice. At the least
13367 * it will make code anlyzers happy and it
13368 * doesn't cost much. - Yves */
13369 assert(ri->regstclass);
13370 if (ri->regstclass) {
13371 PerlMemShared_free(ri->regstclass);
13372 ri->regstclass = 0;
13373 }
13374 }
13375 }
13376 break;
13377 case 't':
13378 {
13379 /* trie structure. */
13380 U32 refcount;
13381 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13382 OP_REFCNT_LOCK;
13383 refcount = --trie->refcount;
13384 OP_REFCNT_UNLOCK;
13385 if ( !refcount ) {
13386 PerlMemShared_free(trie->charmap);
13387 PerlMemShared_free(trie->states);
13388 PerlMemShared_free(trie->trans);
13389 if (trie->bitmap)
13390 PerlMemShared_free(trie->bitmap);
13391 if (trie->jump)
13392 PerlMemShared_free(trie->jump);
13393 if (trie->j_before_paren)
13394 PerlMemShared_free(trie->j_before_paren);
13395 if (trie->j_after_paren)
13396 PerlMemShared_free(trie->j_after_paren);
13397 PerlMemShared_free(trie->wordinfo);
13398 /* do this last!!!! */
13399 PerlMemShared_free(ri->data->data[n]);
13400 }
13401 }
13402 break;
13403 case '%':
13404 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13405 * always returns non-zero, this should only ever happen in the
13406 * 0 index */
13407 assert(n==0);
13408 break;
13409 default:
13410 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13411 ri->data->what[n]);
13412 }
13413 }
13414 Safefree(ri->data->what);
13415 Safefree(ri->data);
13416 }
13417
13418 Safefree(ri);
13419 }
13420
13421 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
13422
13423 /*
13424 =for apidoc re_dup_guts
13425 Duplicate a regexp.
13426
13427 This routine is expected to clone a given regexp structure. It is only
13428 compiled under USE_ITHREADS.
13429
13430 After all of the core data stored in struct regexp is duplicated
13431 the C<regexp_engine.dupe> method is used to copy any private data
13432 stored in the *pprivate pointer. This allows extensions to handle
13433 any duplication they need to do.
13434
13435 =cut
13436
13437 See pregfree() and regfree_internal() if you change anything here.
13438 */
13439 #if defined(USE_ITHREADS)
13440 #ifndef PERL_IN_XSUB_RE
13441 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)13442 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13443 {
13444 I32 npar;
13445 const struct regexp *r = ReANY(sstr);
13446 struct regexp *ret = ReANY(dstr);
13447
13448 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13449
13450 npar = r->nparens+1;
13451 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13452
13453 if (ret->substrs) {
13454 /* Do it this way to avoid reading from *r after the StructCopy().
13455 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13456 cache, it doesn't matter. */
13457 int i;
13458 const bool anchored = r->check_substr
13459 ? r->check_substr == r->substrs->data[0].substr
13460 : r->check_utf8 == r->substrs->data[0].utf8_substr;
13461 Newx(ret->substrs, 1, struct reg_substr_data);
13462 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13463
13464 for (i = 0; i < 2; i++) {
13465 ret->substrs->data[i].substr =
13466 sv_dup_inc(ret->substrs->data[i].substr, param);
13467 ret->substrs->data[i].utf8_substr =
13468 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13469 }
13470
13471 /* check_substr and check_utf8, if non-NULL, point to either their
13472 anchored or float namesakes, and don't hold a second reference. */
13473
13474 if (ret->check_substr) {
13475 if (anchored) {
13476 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13477
13478 ret->check_substr = ret->substrs->data[0].substr;
13479 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13480 } else {
13481 assert(r->check_substr == r->substrs->data[1].substr);
13482 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
13483
13484 ret->check_substr = ret->substrs->data[1].substr;
13485 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13486 }
13487 } else if (ret->check_utf8) {
13488 if (anchored) {
13489 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13490 } else {
13491 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13492 }
13493 }
13494 }
13495
13496 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13497 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13498 if (r->recurse_locinput)
13499 Newx(ret->recurse_locinput, r->nparens + 1, char *);
13500
13501 if (ret->pprivate)
13502 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13503
13504 if (RX_MATCH_COPIED(dstr))
13505 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13506 else
13507 RXp_SUBBEG(ret) = NULL;
13508 #ifdef PERL_ANY_COW
13509 RXp_SAVED_COPY(ret) = NULL;
13510 #endif
13511
13512 if (r->logical_to_parno) {
13513 /* we use total_parens for all three just for symmetry */
13514 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13515 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13516 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13517 } else {
13518 ret->logical_to_parno = NULL;
13519 ret->parno_to_logical = NULL;
13520 ret->parno_to_logical_next = NULL;
13521 }
13522
13523 ret->logical_nparens = r->logical_nparens;
13524
13525 /* Whether mother_re be set or no, we need to copy the string. We
13526 cannot refrain from copying it when the storage points directly to
13527 our mother regexp, because that's
13528 1: a buffer in a different thread
13529 2: something we no longer hold a reference on
13530 so we need to copy it locally. */
13531 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13532 /* set malloced length to a non-zero value so it will be freed
13533 * (otherwise in combination with SVf_FAKE it looks like an alien
13534 * buffer). It doesn't have to be the actual malloced size, since it
13535 * should never be grown */
13536 SvLEN_set(dstr, SvCUR(sstr)+1);
13537 ret->mother_re = NULL;
13538 }
13539 #endif /* PERL_IN_XSUB_RE */
13540
13541 /*
13542 regdupe_internal()
13543
13544 This is the internal complement to regdupe() which is used to copy
13545 the structure pointed to by the *pprivate pointer in the regexp.
13546 This is the core version of the extension overridable cloning hook.
13547 The regexp structure being duplicated will be copied by perl prior
13548 to this and will be provided as the regexp *r argument, however
13549 with the /old/ structures pprivate pointer value. Thus this routine
13550 may override any copying normally done by perl.
13551
13552 It returns a pointer to the new regexp_internal structure.
13553 */
13554
13555 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)13556 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13557 {
13558 struct regexp *const r = ReANY(rx);
13559 regexp_internal *reti;
13560 int len;
13561 RXi_GET_DECL(r, ri);
13562
13563 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13564
13565 len = ProgLen(ri);
13566
13567 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13568 char, regexp_internal);
13569 Copy(ri->program, reti->program, len+1, regnode);
13570
13571
13572 if (ri->code_blocks) {
13573 int n;
13574 Newx(reti->code_blocks, 1, struct reg_code_blocks);
13575 Newx(reti->code_blocks->cb, ri->code_blocks->count,
13576 struct reg_code_block);
13577 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13578 ri->code_blocks->count, struct reg_code_block);
13579 for (n = 0; n < ri->code_blocks->count; n++)
13580 reti->code_blocks->cb[n].src_regex = (REGEXP*)
13581 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13582 reti->code_blocks->count = ri->code_blocks->count;
13583 reti->code_blocks->refcnt = 1;
13584 }
13585 else
13586 reti->code_blocks = NULL;
13587
13588 reti->regstclass = NULL;
13589
13590 if (ri->data) {
13591 struct reg_data *d;
13592 const int count = ri->data->count;
13593 int i;
13594
13595 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13596 char, struct reg_data);
13597 Newx(d->what, count, U8);
13598
13599 d->count = count;
13600 for (i = 0; i < count; i++) {
13601 d->what[i] = ri->data->what[i];
13602 switch (d->what[i]) {
13603 /* see also regcomp.h and regfree_internal() */
13604 case 'a': /* actually an AV, but the dup function is identical.
13605 values seem to be "plain sv's" generally. */
13606 case 'r': /* a compiled regex (but still just another SV) */
13607 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13608 this use case should go away, the code could have used
13609 'a' instead - see S_set_ANYOF_arg() for array contents. */
13610 case 'S': /* actually an SV, but the dup function is identical. */
13611 case 'u': /* actually an HV, but the dup function is identical.
13612 values are "plain sv's" */
13613 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13614 break;
13615 case 'f':
13616 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13617 * patterns which could start with several different things. Pre-TRIE
13618 * this was more important than it is now, however this still helps
13619 * in some places, for instance /x?a+/ might produce a SSC equivalent
13620 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13621 * in regexec.c
13622 */
13623 /* This is cheating. */
13624 Newx(d->data[i], 1, regnode_ssc);
13625 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13626 reti->regstclass = (regnode*)d->data[i];
13627 break;
13628 case 'T':
13629 /* AHO-CORASICK fail table */
13630 /* Trie stclasses are readonly and can thus be shared
13631 * without duplication. We free the stclass in pregfree
13632 * when the corresponding reg_ac_data struct is freed.
13633 */
13634 reti->regstclass= ri->regstclass;
13635 /* FALLTHROUGH */
13636 case 't':
13637 /* TRIE transition table */
13638 OP_REFCNT_LOCK;
13639 ((reg_trie_data*)ri->data->data[i])->refcount++;
13640 OP_REFCNT_UNLOCK;
13641 /* FALLTHROUGH */
13642 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13643 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13644 is not from another regexp */
13645 d->data[i] = ri->data->data[i];
13646 break;
13647 case '%':
13648 /* this is a placeholder type, it exists purely so that
13649 * reg_add_data always returns a non-zero value, this type of
13650 * entry should ONLY be present in the 0 slot of the array */
13651 assert(i == 0);
13652 d->data[i]= ri->data->data[i];
13653 break;
13654 default:
13655 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13656 ri->data->what[i]);
13657 }
13658 }
13659
13660 reti->data = d;
13661 }
13662 else
13663 reti->data = NULL;
13664
13665 if (ri->regstclass && !reti->regstclass) {
13666 /* Assume that the regstclass is a regnode which is inside of the
13667 * program which we have to copy over */
13668 regnode *node= ri->regstclass;
13669 assert(node >= ri->program && (node - ri->program) < len);
13670 reti->regstclass = reti->program + (node - ri->program);
13671 }
13672
13673
13674 reti->name_list_idx = ri->name_list_idx;
13675
13676 SetProgLen(reti, len);
13677
13678 return (void*)reti;
13679 }
13680
13681 #endif /* USE_ITHREADS */
13682
13683 STATIC void
S_re_croak(pTHX_ bool utf8,const char * pat,...)13684 S_re_croak(pTHX_ bool utf8, const char* pat,...)
13685 {
13686 va_list args;
13687 STRLEN len = strlen(pat);
13688 char buf[512];
13689 SV *msv;
13690 const char *message;
13691
13692 PERL_ARGS_ASSERT_RE_CROAK;
13693
13694 if (len > 510)
13695 len = 510;
13696 Copy(pat, buf, len , char);
13697 buf[len] = '\n';
13698 buf[len + 1] = '\0';
13699 va_start(args, pat);
13700 msv = vmess(buf, &args);
13701 va_end(args);
13702 message = SvPV_const(msv, len);
13703 if (len > 512)
13704 len = 512;
13705 Copy(message, buf, len , char);
13706 /* len-1 to avoid \n */
13707 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13708 }
13709
13710 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13711
13712 #ifndef PERL_IN_XSUB_RE
13713 void
Perl_save_re_context(pTHX)13714 Perl_save_re_context(pTHX)
13715 {
13716 I32 nparens = -1;
13717 I32 i;
13718
13719 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13720
13721 if (PL_curpm) {
13722 const REGEXP * const rx = PM_GETRE(PL_curpm);
13723 if (rx)
13724 nparens = RX_NPARENS(rx);
13725 }
13726
13727 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13728 * that PL_curpm will be null, but that utf8.pm and the modules it
13729 * loads will only use $1..$3.
13730 * The t/porting/re_context.t test file checks this assumption.
13731 */
13732 if (nparens == -1)
13733 nparens = 3;
13734
13735 for (i = 1; i <= nparens; i++) {
13736 char digits[TYPE_CHARS(long)];
13737 const STRLEN len = my_snprintf(digits, sizeof(digits),
13738 "%lu", (long)i);
13739 GV *const *const gvp
13740 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13741
13742 if (gvp) {
13743 GV * const gv = *gvp;
13744 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13745 save_scalar(gv);
13746 }
13747 }
13748 }
13749 #endif
13750
13751 #ifndef PERL_IN_XSUB_RE
13752
13753 # include "uni_keywords.h"
13754
13755 void
Perl_init_uniprops(pTHX)13756 Perl_init_uniprops(pTHX)
13757 {
13758
13759 # ifdef DEBUGGING
13760 char * dump_len_string;
13761
13762 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13763 if ( ! dump_len_string
13764 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13765 {
13766 PL_dump_re_max_len = 60; /* A reasonable default */
13767 }
13768 # endif
13769
13770 PL_user_def_props = newHV();
13771
13772 # ifdef USE_ITHREADS
13773
13774 HvSHAREKEYS_off(PL_user_def_props);
13775 PL_user_def_props_aTHX = aTHX;
13776
13777 # endif
13778
13779 /* Set up the inversion list interpreter-level variables */
13780
13781 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13782 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13783 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13784 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13785 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13786 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13787 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13788 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13789 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13790 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13791 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13792 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13793 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13794 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13795 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13796 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13797
13798 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13799 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13800 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13801 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13802 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13803 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13804 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13805 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13806 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13807 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13808 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13809 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13810 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13811 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13812 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13813 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13814
13815 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13816 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13817 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13818 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13819 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13820
13821 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13822 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13823 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13824 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13825
13826 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13827
13828 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13829 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13830
13831 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13832 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13833
13834 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13835 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13836 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13837 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13838 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13839 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13840 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13841 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13842 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13843 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13844 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13845 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13846 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13847 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13848
13849 # ifdef UNI_XIDC
13850 /* The below are used only by deprecated functions. They could be removed */
13851 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13852 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13853 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13854 # endif
13855 }
13856
13857 /* These four functions are compiled only in regcomp.c, where they have access
13858 * to the data they return. They are a way for re_comp.c to get access to that
13859 * data without having to compile the whole data structures. */
13860
13861 I16
Perl_do_uniprop_match(const char * const key,const U16 key_len)13862 Perl_do_uniprop_match(const char * const key, const U16 key_len)
13863 {
13864 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13865
13866 return match_uniprop((U8 *) key, key_len);
13867 }
13868
13869 SV *
Perl_get_prop_definition(pTHX_ const int table_index)13870 Perl_get_prop_definition(pTHX_ const int table_index)
13871 {
13872 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13873
13874 /* Create and return the inversion list */
13875 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13876 }
13877
13878 const char * const *
Perl_get_prop_values(const int table_index)13879 Perl_get_prop_values(const int table_index)
13880 {
13881 PERL_ARGS_ASSERT_GET_PROP_VALUES;
13882
13883 return UNI_prop_value_ptrs[table_index];
13884 }
13885
13886 const char *
Perl_get_deprecated_property_msg(const Size_t warning_offset)13887 Perl_get_deprecated_property_msg(const Size_t warning_offset)
13888 {
13889 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13890
13891 return deprecated_property_msgs[warning_offset];
13892 }
13893
13894 # if 0
13895
13896 This code was mainly added for backcompat to give a warning for non-portable
13897 code points in user-defined properties. But experiments showed that the
13898 warning in earlier perls were only omitted on overflow, which should be an
13899 error, so there really isnt a backcompat issue, and actually adding the
13900 warning when none was present before might cause breakage, for little gain. So
13901 khw left this code in, but not enabled. Tests were never added.
13902
13903 embed.fnc entry:
13904 Ei |const char *|get_extended_utf8_msg|const UV cp
13905
13906 PERL_STATIC_INLINE const char *
13907 S_get_extended_utf8_msg(pTHX_ const UV cp)
13908 {
13909 U8 dummy[UTF8_MAXBYTES + 1];
13910 HV *msgs;
13911 SV **msg;
13912
13913 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
13914 &msgs);
13915
13916 msg = hv_fetchs(msgs, "text", 0);
13917 assert(msg);
13918
13919 (void) sv_2mortal((SV *) msgs);
13920
13921 return SvPVX(*msg);
13922 }
13923
13924 # endif
13925 #endif /* end of ! PERL_IN_XSUB_RE */
13926
13927 STATIC REGEXP *
S_compile_wildcard(pTHX_ const char * subpattern,const STRLEN len,const bool ignore_case)13928 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
13929 const bool ignore_case)
13930 {
13931 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
13932 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
13933 * because nothing outside of ASCII will match. Use /m because the input
13934 * string may be a bunch of lines strung together.
13935 *
13936 * Also sets up the debugging info */
13937
13938 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
13939 U32 rx_flags;
13940 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
13941 REGEXP * subpattern_re;
13942 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13943
13944 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
13945
13946 if (ignore_case) {
13947 flags |= PMf_FOLD;
13948 }
13949 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
13950
13951 /* Like in op.c, we copy the compile time pm flags to the rx ones */
13952 rx_flags = flags & RXf_PMf_COMPILETIME;
13953
13954 #ifndef PERL_IN_XSUB_RE
13955 /* Use the core engine if this file is regcomp.c. That means no
13956 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
13957 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13958 &PL_core_reg_engine,
13959 NULL, NULL,
13960 rx_flags, flags);
13961 #else
13962 if (isDEBUG_WILDCARD) {
13963 /* Use the special debugging engine if this file is re_comp.c and wants
13964 * to output the wildcard matching. This uses whatever
13965 * 'use re "Debug ..." is in effect */
13966 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13967 &my_reg_engine,
13968 NULL, NULL,
13969 rx_flags, flags);
13970 }
13971 else {
13972 /* Use the special wildcard engine if this file is re_comp.c and
13973 * doesn't want to output the wildcard matching. This uses whatever
13974 * 'use re "Debug ..." is in effect for compilation, but this engine
13975 * structure has been set up so that it uses the core engine for
13976 * execution, so no execution debugging as a result of re.pm will be
13977 * displayed. */
13978 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13979 &wild_reg_engine,
13980 NULL, NULL,
13981 rx_flags, flags);
13982 /* XXX The above has the effect that any user-supplied regex engine
13983 * won't be called for matching wildcards. That might be good, or bad.
13984 * It could be changed in several ways. The reason it is done the
13985 * current way is to avoid having to save and restore
13986 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
13987 * could be used. Another suggestion is to keep the authoritative
13988 * value of the debug flags in a thread-local variable and add set/get
13989 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
13990 * Still another is to pass a flag, say in the engine's intflags that
13991 * would be checked each time before doing the debug output */
13992 }
13993 #endif
13994
13995 assert(subpattern_re); /* Should have died if didn't compile successfully */
13996 return subpattern_re;
13997 }
13998
13999 STATIC I32
S_execute_wildcard(pTHX_ REGEXP * const prog,char * stringarg,char * strend,char * strbeg,SSize_t minend,SV * screamer,U32 nosave)14000 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14001 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14002 {
14003 I32 result;
14004 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14005
14006 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14007
14008 ENTER;
14009
14010 /* The compilation has set things up so that if the program doesn't want to
14011 * see the wildcard matching procedure, it will get the core execution
14012 * engine, which is subject only to -Dr. So we have to turn that off
14013 * around this procedure */
14014 if (! isDEBUG_WILDCARD) {
14015 /* Note! Casts away 'volatile' */
14016 SAVEI32(PL_debug);
14017 PL_debug &= ~ DEBUG_r_FLAG;
14018 }
14019
14020 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14021 NULL, nosave);
14022 LEAVE;
14023
14024 return result;
14025 }
14026
14027 SV *
S_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)14028 S_handle_user_defined_property(pTHX_
14029
14030 /* Parses the contents of a user-defined property definition; returning the
14031 * expanded definition if possible. If so, the return is an inversion
14032 * list.
14033 *
14034 * If there are subroutines that are part of the expansion and which aren't
14035 * known at the time of the call to this function, this returns what
14036 * parse_uniprop_string() returned for the first one encountered.
14037 *
14038 * If an error was found, NULL is returned, and 'msg' gets a suitable
14039 * message appended to it. (Appending allows the back trace of how we got
14040 * to the faulty definition to be displayed through nested calls of
14041 * user-defined subs.)
14042 *
14043 * The caller IS responsible for freeing any returned SV.
14044 *
14045 * The syntax of the contents is pretty much described in perlunicode.pod,
14046 * but we also allow comments on each line */
14047
14048 const char * name, /* Name of property */
14049 const STRLEN name_len, /* The name's length in bytes */
14050 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14051 const bool to_fold, /* ? Is this under /i */
14052 const bool runtime, /* ? Are we in compile- or run-time */
14053 const bool deferrable, /* Is it ok for this property's full definition
14054 to be deferred until later? */
14055 SV* contents, /* The property's definition */
14056 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
14057 getting called unless this is thought to be
14058 a user-defined property */
14059 SV * msg, /* Any error or warning msg(s) are appended to
14060 this */
14061 const STRLEN level) /* Recursion level of this call */
14062 {
14063 STRLEN len;
14064 const char * string = SvPV_const(contents, len);
14065 const char * const e = string + len;
14066 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14067 const STRLEN msgs_length_on_entry = SvCUR(msg);
14068
14069 const char * s0 = string; /* Points to first byte in the current line
14070 being parsed in 'string' */
14071 const char overflow_msg[] = "Code point too large in \"";
14072 SV* running_definition = NULL;
14073
14074 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14075
14076 *user_defined_ptr = TRUE;
14077
14078 /* Look at each line */
14079 while (s0 < e) {
14080 const char * s; /* Current byte */
14081 char op = '+'; /* Default operation is 'union' */
14082 IV min = 0; /* range begin code point */
14083 IV max = -1; /* and range end */
14084 SV* this_definition;
14085
14086 /* Skip comment lines */
14087 if (*s0 == '#') {
14088 s0 = strchr(s0, '\n');
14089 if (s0 == NULL) {
14090 break;
14091 }
14092 s0++;
14093 continue;
14094 }
14095
14096 /* For backcompat, allow an empty first line */
14097 if (*s0 == '\n') {
14098 s0++;
14099 continue;
14100 }
14101
14102 /* First character in the line may optionally be the operation */
14103 if ( *s0 == '+'
14104 || *s0 == '!'
14105 || *s0 == '-'
14106 || *s0 == '&')
14107 {
14108 op = *s0++;
14109 }
14110
14111 /* If the line is one or two hex digits separated by blank space, its
14112 * a range; otherwise it is either another user-defined property or an
14113 * error */
14114
14115 s = s0;
14116
14117 if (! isXDIGIT(*s)) {
14118 goto check_if_property;
14119 }
14120
14121 do { /* Each new hex digit will add 4 bits. */
14122 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14123 s = strchr(s, '\n');
14124 if (s == NULL) {
14125 s = e;
14126 }
14127 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14128 sv_catpv(msg, overflow_msg);
14129 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14130 UTF8fARG(is_contents_utf8, s - s0, s0));
14131 sv_catpvs(msg, "\"");
14132 goto return_failure;
14133 }
14134
14135 /* Accumulate this digit into the value */
14136 min = (min << 4) + READ_XDIGIT(s);
14137 } while (isXDIGIT(*s));
14138
14139 while (isBLANK(*s)) { s++; }
14140
14141 /* We allow comments at the end of the line */
14142 if (*s == '#') {
14143 s = strchr(s, '\n');
14144 if (s == NULL) {
14145 s = e;
14146 }
14147 s++;
14148 }
14149 else if (s < e && *s != '\n') {
14150 if (! isXDIGIT(*s)) {
14151 goto check_if_property;
14152 }
14153
14154 /* Look for the high point of the range */
14155 max = 0;
14156 do {
14157 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14158 s = strchr(s, '\n');
14159 if (s == NULL) {
14160 s = e;
14161 }
14162 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14163 sv_catpv(msg, overflow_msg);
14164 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14165 UTF8fARG(is_contents_utf8, s - s0, s0));
14166 sv_catpvs(msg, "\"");
14167 goto return_failure;
14168 }
14169
14170 max = (max << 4) + READ_XDIGIT(s);
14171 } while (isXDIGIT(*s));
14172
14173 while (isBLANK(*s)) { s++; }
14174
14175 if (*s == '#') {
14176 s = strchr(s, '\n');
14177 if (s == NULL) {
14178 s = e;
14179 }
14180 }
14181 else if (s < e && *s != '\n') {
14182 goto check_if_property;
14183 }
14184 }
14185
14186 if (max == -1) { /* The line only had one entry */
14187 max = min;
14188 }
14189 else if (max < min) {
14190 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14191 sv_catpvs(msg, "Illegal range in \"");
14192 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14193 UTF8fARG(is_contents_utf8, s - s0, s0));
14194 sv_catpvs(msg, "\"");
14195 goto return_failure;
14196 }
14197
14198 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
14199
14200 if ( UNICODE_IS_PERL_EXTENDED(min)
14201 || UNICODE_IS_PERL_EXTENDED(max))
14202 {
14203 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14204
14205 /* If both code points are non-portable, warn only on the lower
14206 * one. */
14207 sv_catpv(msg, get_extended_utf8_msg(
14208 (UNICODE_IS_PERL_EXTENDED(min))
14209 ? min : max));
14210 sv_catpvs(msg, " in \"");
14211 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14212 UTF8fARG(is_contents_utf8, s - s0, s0));
14213 sv_catpvs(msg, "\"");
14214 }
14215
14216 # endif
14217
14218 /* Here, this line contains a legal range */
14219 this_definition = sv_2mortal(_new_invlist(2));
14220 this_definition = _add_range_to_invlist(this_definition, min, max);
14221 goto calculate;
14222
14223 check_if_property:
14224
14225 /* Here it isn't a legal range line. See if it is a legal property
14226 * line. First find the end of the meat of the line */
14227 s = strpbrk(s, "#\n");
14228 if (s == NULL) {
14229 s = e;
14230 }
14231
14232 /* Ignore trailing blanks in keeping with the requirements of
14233 * parse_uniprop_string() */
14234 s--;
14235 while (s > s0 && isBLANK_A(*s)) {
14236 s--;
14237 }
14238 s++;
14239
14240 this_definition = parse_uniprop_string(s0, s - s0,
14241 is_utf8, to_fold, runtime,
14242 deferrable,
14243 NULL,
14244 user_defined_ptr, msg,
14245 (name_len == 0)
14246 ? level /* Don't increase level
14247 if input is empty */
14248 : level + 1
14249 );
14250 if (this_definition == NULL) {
14251 goto return_failure; /* 'msg' should have had the reason
14252 appended to it by the above call */
14253 }
14254
14255 if (! is_invlist(this_definition)) { /* Unknown at this time */
14256 return newSVsv(this_definition);
14257 }
14258
14259 if (*s != '\n') {
14260 s = strchr(s, '\n');
14261 if (s == NULL) {
14262 s = e;
14263 }
14264 }
14265
14266 calculate:
14267
14268 switch (op) {
14269 case '+':
14270 _invlist_union(running_definition, this_definition,
14271 &running_definition);
14272 break;
14273 case '-':
14274 _invlist_subtract(running_definition, this_definition,
14275 &running_definition);
14276 break;
14277 case '&':
14278 _invlist_intersection(running_definition, this_definition,
14279 &running_definition);
14280 break;
14281 case '!':
14282 _invlist_union_complement_2nd(running_definition,
14283 this_definition, &running_definition);
14284 break;
14285 default:
14286 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14287 __FILE__, __LINE__, op);
14288 break;
14289 }
14290
14291 /* Position past the '\n' */
14292 s0 = s + 1;
14293 } /* End of loop through the lines of 'contents' */
14294
14295 /* Here, we processed all the lines in 'contents' without error. If we
14296 * didn't add any warnings, simply return success */
14297 if (msgs_length_on_entry == SvCUR(msg)) {
14298
14299 /* If the expansion was empty, the answer isn't nothing: its an empty
14300 * inversion list */
14301 if (running_definition == NULL) {
14302 running_definition = _new_invlist(1);
14303 }
14304
14305 return running_definition;
14306 }
14307
14308 /* Otherwise, add some explanatory text, but we will return success */
14309 goto return_msg;
14310
14311 return_failure:
14312 running_definition = NULL;
14313
14314 return_msg:
14315
14316 if (name_len > 0) {
14317 sv_catpvs(msg, " in expansion of ");
14318 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14319 }
14320
14321 return running_definition;
14322 }
14323
14324 /* As explained below, certain operations need to take place in the first
14325 * thread created. These macros switch contexts */
14326 # ifdef USE_ITHREADS
14327 # define DECLARATION_FOR_GLOBAL_CONTEXT \
14328 PerlInterpreter * save_aTHX = aTHX;
14329 # define SWITCH_TO_GLOBAL_CONTEXT \
14330 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14331 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
14332 # define CUR_CONTEXT aTHX
14333 # define ORIGINAL_CONTEXT save_aTHX
14334 # else
14335 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
14336 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
14337 # define RESTORE_CONTEXT NOOP
14338 # define CUR_CONTEXT NULL
14339 # define ORIGINAL_CONTEXT NULL
14340 # endif
14341
14342 STATIC void
S_delete_recursion_entry(pTHX_ void * key)14343 S_delete_recursion_entry(pTHX_ void *key)
14344 {
14345 /* Deletes the entry used to detect recursion when expanding user-defined
14346 * properties. This is a function so it can be set up to be called even if
14347 * the program unexpectedly quits */
14348
14349 SV ** current_entry;
14350 const STRLEN key_len = strlen((const char *) key);
14351 DECLARATION_FOR_GLOBAL_CONTEXT;
14352
14353 SWITCH_TO_GLOBAL_CONTEXT;
14354
14355 /* If the entry is one of these types, it is a permanent entry, and not the
14356 * one used to detect recursions. This function should delete only the
14357 * recursion entry */
14358 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14359 if ( current_entry
14360 && ! is_invlist(*current_entry)
14361 && ! SvPOK(*current_entry))
14362 {
14363 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14364 G_DISCARD);
14365 }
14366
14367 RESTORE_CONTEXT;
14368 }
14369
14370 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)14371 S_get_fq_name(pTHX_
14372 const char * const name, /* The first non-blank in the \p{}, \P{} */
14373 const Size_t name_len, /* Its length in bytes, not including any trailing space */
14374 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14375 const bool has_colon_colon
14376 )
14377 {
14378 /* Returns a mortal SV containing the fully qualified version of the input
14379 * name */
14380
14381 SV * fq_name;
14382
14383 fq_name = newSVpvs_flags("", SVs_TEMP);
14384
14385 /* Use the current package if it wasn't included in our input */
14386 if (! has_colon_colon) {
14387 const HV * pkg = (IN_PERL_COMPILETIME)
14388 ? PL_curstash
14389 : CopSTASH(PL_curcop);
14390 const char* pkgname = HvNAME(pkg);
14391
14392 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14393 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14394 sv_catpvs(fq_name, "::");
14395 }
14396
14397 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14398 UTF8fARG(is_utf8, name_len, name));
14399 return fq_name;
14400 }
14401
14402 STATIC SV *
S_parse_uniprop_string(pTHX_ const char * const name,Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,AV ** strings,bool * user_defined_ptr,SV * msg,const STRLEN level)14403 S_parse_uniprop_string(pTHX_
14404
14405 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
14406 * now. If so, the return is an inversion list.
14407 *
14408 * If the property is user-defined, it is a subroutine, which in turn
14409 * may call other subroutines. This function will call the whole nest of
14410 * them to get the definition they return; if some aren't known at the time
14411 * of the call to this function, the fully qualified name of the highest
14412 * level sub is returned. It is an error to call this function at runtime
14413 * without every sub defined.
14414 *
14415 * If an error was found, NULL is returned, and 'msg' gets a suitable
14416 * message appended to it. (Appending allows the back trace of how we got
14417 * to the faulty definition to be displayed through nested calls of
14418 * user-defined subs.)
14419 *
14420 * The caller should NOT try to free any returned inversion list.
14421 *
14422 * Other parameters will be set on return as described below */
14423
14424 const char * const name, /* The first non-blank in the \p{}, \P{} */
14425 Size_t name_len, /* Its length in bytes, not including any
14426 trailing space */
14427 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14428 const bool to_fold, /* ? Is this under /i */
14429 const bool runtime, /* TRUE if this is being called at run time */
14430 const bool deferrable, /* TRUE if it's ok for the definition to not be
14431 known at this call */
14432 AV ** strings, /* To return string property values, like named
14433 sequences */
14434 bool *user_defined_ptr, /* Upon return from this function it will be
14435 set to TRUE if any component is a
14436 user-defined property */
14437 SV * msg, /* Any error or warning msg(s) are appended to
14438 this */
14439 const STRLEN level) /* Recursion level of this call */
14440 {
14441 char* lookup_name; /* normalized name for lookup in our tables */
14442 unsigned lookup_len; /* Its length */
14443 enum { Not_Strict = 0, /* Some properties have stricter name */
14444 Strict, /* normalization rules, which we decide */
14445 As_Is /* upon based on parsing */
14446 } stricter = Not_Strict;
14447
14448 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14449 * (though it requires extra effort to download them from Unicode and
14450 * compile perl to know about them) */
14451 bool is_nv_type = FALSE;
14452
14453 unsigned int i = 0, i_zero = 0, j = 0;
14454 int equals_pos = -1; /* Where the '=' is found, or negative if none */
14455 int slash_pos = -1; /* Where the '/' is found, or negative if none */
14456 int table_index = 0; /* The entry number for this property in the table
14457 of all Unicode property names */
14458 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
14459 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
14460 the normalized name in certain situations */
14461 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
14462 part of a package name */
14463 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
14464 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
14465 property rather than a Unicode
14466 one. */
14467 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
14468 if an error. If it is an inversion list,
14469 it is the definition. Otherwise it is a
14470 string containing the fully qualified sub
14471 name of 'name' */
14472 SV * fq_name = NULL; /* For user-defined properties, the fully
14473 qualified name */
14474 bool invert_return = FALSE; /* ? Do we need to complement the result before
14475 returning it */
14476 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14477 explicit utf8:: package that we strip
14478 off */
14479 /* The expansion of properties that could be either user-defined or
14480 * official unicode ones is deferred until runtime, including a marker for
14481 * those that might be in the latter category. This boolean indicates if
14482 * we've seen that marker. If not, what we're parsing can't be such an
14483 * official Unicode property whose expansion was deferred */
14484 bool could_be_deferred_official = FALSE;
14485
14486 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14487
14488 /* The input will be normalized into 'lookup_name' */
14489 Newx(lookup_name, name_len, char);
14490 SAVEFREEPV(lookup_name);
14491
14492 /* Parse the input. */
14493 for (i = 0; i < name_len; i++) {
14494 char cur = name[i];
14495
14496 /* Most of the characters in the input will be of this ilk, being parts
14497 * of a name */
14498 if (isIDCONT_A(cur)) {
14499
14500 /* Case differences are ignored. Our lookup routine assumes
14501 * everything is lowercase, so normalize to that */
14502 if (isUPPER_A(cur)) {
14503 lookup_name[j++] = toLOWER_A(cur);
14504 continue;
14505 }
14506
14507 if (cur == '_') { /* Don't include these in the normalized name */
14508 continue;
14509 }
14510
14511 lookup_name[j++] = cur;
14512
14513 /* The first character in a user-defined name must be of this type.
14514 * */
14515 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14516 could_be_user_defined = FALSE;
14517 }
14518
14519 continue;
14520 }
14521
14522 /* Here, the character is not something typically in a name, But these
14523 * two types of characters (and the '_' above) can be freely ignored in
14524 * most situations. Later it may turn out we shouldn't have ignored
14525 * them, and we have to reparse, but we don't have enough information
14526 * yet to make that decision */
14527 if (cur == '-' || isSPACE_A(cur)) {
14528 could_be_user_defined = FALSE;
14529 continue;
14530 }
14531
14532 /* An equals sign or single colon mark the end of the first part of
14533 * the property name */
14534 if ( cur == '='
14535 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14536 {
14537 lookup_name[j++] = '='; /* Treat the colon as an '=' */
14538 equals_pos = j; /* Note where it occurred in the input */
14539 could_be_user_defined = FALSE;
14540 break;
14541 }
14542
14543 /* If this looks like it is a marker we inserted at compile time,
14544 * set a flag and otherwise ignore it. If it isn't in the final
14545 * position, keep it as it would have been user input. */
14546 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14547 && ! deferrable
14548 && could_be_user_defined
14549 && i == name_len - 1)
14550 {
14551 name_len--;
14552 could_be_deferred_official = TRUE;
14553 continue;
14554 }
14555
14556 /* Otherwise, this character is part of the name. */
14557 lookup_name[j++] = cur;
14558
14559 /* Here it isn't a single colon, so if it is a colon, it must be a
14560 * double colon */
14561 if (cur == ':') {
14562
14563 /* A double colon should be a package qualifier. We note its
14564 * position and continue. Note that one could have
14565 * pkg1::pkg2::...::foo
14566 * so that the position at the end of the loop will be just after
14567 * the final qualifier */
14568
14569 i++;
14570 non_pkg_begin = i + 1;
14571 lookup_name[j++] = ':';
14572 lun_non_pkg_begin = j;
14573 }
14574 else { /* Only word chars (and '::') can be in a user-defined name */
14575 could_be_user_defined = FALSE;
14576 }
14577 } /* End of parsing through the lhs of the property name (or all of it if
14578 no rhs) */
14579
14580 /* If there is a single package name 'utf8::', it is ambiguous. It could
14581 * be for a user-defined property, or it could be a Unicode property, as
14582 * all of them are considered to be for that package. For the purposes of
14583 * parsing the rest of the property, strip it off */
14584 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14585 lookup_name += STRLENs("utf8::");
14586 j -= STRLENs("utf8::");
14587 equals_pos -= STRLENs("utf8::");
14588 i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse
14589 from the beginning, it has to be
14590 set past what we're stripping
14591 off */
14592 stripped_utf8_pkg = TRUE;
14593 }
14594
14595 /* Here, we are either done with the whole property name, if it was simple;
14596 * or are positioned just after the '=' if it is compound. */
14597
14598 if (equals_pos >= 0) {
14599 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14600
14601 /* Space immediately after the '=' is ignored */
14602 i++;
14603 for (; i < name_len; i++) {
14604 if (! isSPACE_A(name[i])) {
14605 break;
14606 }
14607 }
14608
14609 /* Most punctuation after the equals indicates a subpattern, like
14610 * \p{foo=/bar/} */
14611 if ( isPUNCT_A(name[i])
14612 && name[i] != '-'
14613 && name[i] != '+'
14614 && name[i] != '_'
14615 && name[i] != '{'
14616 /* A backslash means the real delimiter is the next character,
14617 * but it must be punctuation */
14618 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14619 {
14620 bool special_property = memEQs(lookup_name, j - 1, "name")
14621 || memEQs(lookup_name, j - 1, "na");
14622 if (! special_property) {
14623 /* Find the property. The table includes the equals sign, so
14624 * we use 'j' as-is */
14625 table_index = do_uniprop_match(lookup_name, j);
14626 }
14627 if (special_property || table_index) {
14628 REGEXP * subpattern_re;
14629 char open = name[i++];
14630 char close;
14631 const char * pos_in_brackets;
14632 const char * const * prop_values;
14633 bool escaped = 0;
14634
14635 /* Backslash => delimiter is the character following. We
14636 * already checked that it is punctuation */
14637 if (open == '\\') {
14638 open = name[i++];
14639 escaped = 1;
14640 }
14641
14642 /* This data structure is constructed so that the matching
14643 * closing bracket is 3 past its matching opening. The second
14644 * set of closing is so that if the opening is something like
14645 * ']', the closing will be that as well. Something similar is
14646 * done in toke.c */
14647 pos_in_brackets = memCHRs("([<)]>)]>", open);
14648 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14649
14650 if ( i >= name_len
14651 || name[name_len-1] != close
14652 || (escaped && name[name_len-2] != '\\')
14653 /* Also make sure that there are enough characters.
14654 * e.g., '\\\' would show up incorrectly as legal even
14655 * though it is too short */
14656 || (SSize_t) (name_len - i - 1 - escaped) < 0)
14657 {
14658 sv_catpvs(msg, "Unicode property wildcard not terminated");
14659 goto append_name_to_msg;
14660 }
14661
14662 Perl_ck_warner_d(aTHX_
14663 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14664 "The Unicode property wildcards feature is experimental");
14665
14666 if (special_property) {
14667 const char * error_msg;
14668 const char * revised_name = name + i;
14669 Size_t revised_name_len = name_len - (i + 1 + escaped);
14670
14671 /* Currently, the only 'special_property' is name, which we
14672 * lookup in _charnames.pm */
14673
14674 if (! load_charnames(newSVpvs("placeholder"),
14675 revised_name, revised_name_len,
14676 &error_msg))
14677 {
14678 sv_catpv(msg, error_msg);
14679 goto append_name_to_msg;
14680 }
14681
14682 /* Farm this out to a function just to make the current
14683 * function less unwieldy */
14684 if (handle_names_wildcard(revised_name, revised_name_len,
14685 &prop_definition,
14686 strings))
14687 {
14688 return prop_definition;
14689 }
14690
14691 goto failed;
14692 }
14693
14694 prop_values = get_prop_values(table_index);
14695
14696 /* Now create and compile the wildcard subpattern. Use /i
14697 * because the property values are supposed to match with case
14698 * ignored. */
14699 subpattern_re = compile_wildcard(name + i,
14700 name_len - i - 1 - escaped,
14701 TRUE /* /i */
14702 );
14703
14704 /* For each legal property value, see if the supplied pattern
14705 * matches it. */
14706 while (*prop_values) {
14707 const char * const entry = *prop_values;
14708 const Size_t len = strlen(entry);
14709 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14710
14711 if (execute_wildcard(subpattern_re,
14712 (char *) entry,
14713 (char *) entry + len,
14714 (char *) entry, 0,
14715 entry_sv,
14716 0))
14717 { /* Here, matched. Add to the returned list */
14718 Size_t total_len = j + len;
14719 SV * sub_invlist = NULL;
14720 char * this_string;
14721
14722 /* We know this is a legal \p{property=value}. Call
14723 * the function to return the list of code points that
14724 * match it */
14725 Newxz(this_string, total_len + 1, char);
14726 Copy(lookup_name, this_string, j, char);
14727 my_strlcat(this_string, entry, total_len + 1);
14728 SAVEFREEPV(this_string);
14729 sub_invlist = parse_uniprop_string(this_string,
14730 total_len,
14731 is_utf8,
14732 to_fold,
14733 runtime,
14734 deferrable,
14735 NULL,
14736 user_defined_ptr,
14737 msg,
14738 level + 1);
14739 _invlist_union(prop_definition, sub_invlist,
14740 &prop_definition);
14741 }
14742
14743 prop_values++; /* Next iteration, look at next propvalue */
14744 } /* End of looking through property values; (the data
14745 structure is terminated by a NULL ptr) */
14746
14747 SvREFCNT_dec_NN(subpattern_re);
14748
14749 if (prop_definition) {
14750 return prop_definition;
14751 }
14752
14753 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14754 goto append_name_to_msg;
14755 }
14756
14757 /* Here's how khw thinks we should proceed to handle the properties
14758 * not yet done: Bidi Mirroring Glyph can map to ""
14759 Bidi Paired Bracket can map to ""
14760 Case Folding (both full and simple)
14761 Shouldn't /i be good enough for Full
14762 Decomposition Mapping
14763 Equivalent Unified Ideograph can map to ""
14764 Lowercase Mapping (both full and simple)
14765 NFKC Case Fold can map to ""
14766 Titlecase Mapping (both full and simple)
14767 Uppercase Mapping (both full and simple)
14768 * Handle these the same way Name is done, using say, _wild.pm, but
14769 * having both loose and full, like in charclass_invlists.h.
14770 * Perhaps move block and script to that as they are somewhat large
14771 * in charclass_invlists.h.
14772 * For properties where the default is the code point itself, such
14773 * as any of the case changing mappings, the string would otherwise
14774 * consist of all Unicode code points in UTF-8 strung together.
14775 * This would be impractical. So instead, examine their compiled
14776 * pattern, looking at the ssc. If none, reject the pattern as an
14777 * error. Otherwise run the pattern against every code point in
14778 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
14779 * And it might be good to create an API to return the ssc.
14780 * Or handle them like the algorithmic names are done
14781 */
14782 } /* End of is a wildcard subppattern */
14783
14784 /* \p{name=...} is handled specially. Instead of using the normal
14785 * mechanism involving charclass_invlists.h, it uses _charnames.pm
14786 * which has the necessary (huge) data accessible to it, and which
14787 * doesn't get loaded unless necessary. The legal syntax for names is
14788 * somewhat different than other properties due both to the vagaries of
14789 * a few outlier official names, and the fact that only a few ASCII
14790 * characters are permitted in them */
14791 if ( memEQs(lookup_name, j - 1, "name")
14792 || memEQs(lookup_name, j - 1, "na"))
14793 {
14794 dSP;
14795 HV * table;
14796 SV * character;
14797 const char * error_msg;
14798 CV* lookup_loose;
14799 SV * character_name;
14800 STRLEN character_len;
14801 UV cp;
14802
14803 stricter = As_Is;
14804
14805 /* Since the RHS (after skipping initial space) is passed unchanged
14806 * to charnames, and there are different criteria for what are
14807 * legal characters in the name, just parse it here. A character
14808 * name must begin with an ASCII alphabetic */
14809 if (! isALPHA(name[i])) {
14810 goto failed;
14811 }
14812 lookup_name[j++] = name[i];
14813
14814 for (++i; i < name_len; i++) {
14815 /* Official names can only be in the ASCII range, and only
14816 * certain characters */
14817 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14818 goto failed;
14819 }
14820 lookup_name[j++] = name[i];
14821 }
14822
14823 /* Finished parsing, save the name into an SV */
14824 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14825
14826 /* Make sure _charnames is loaded. (The parameters give context
14827 * for any errors generated */
14828 table = load_charnames(character_name, name, name_len, &error_msg);
14829 if (table == NULL) {
14830 sv_catpv(msg, error_msg);
14831 goto append_name_to_msg;
14832 }
14833
14834 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14835 if (! lookup_loose) {
14836 Perl_croak(aTHX_
14837 "panic: Can't find '_charnames::_loose_regcomp_lookup");
14838 }
14839
14840 PUSHSTACKi(PERLSI_REGCOMP);
14841 ENTER ;
14842 SAVETMPS;
14843 save_re_context();
14844
14845 PUSHMARK(SP) ;
14846 XPUSHs(character_name);
14847 PUTBACK;
14848 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14849
14850 SPAGAIN ;
14851
14852 character = POPs;
14853 SvREFCNT_inc_simple_void_NN(character);
14854
14855 PUTBACK ;
14856 FREETMPS ;
14857 LEAVE ;
14858 POPSTACK;
14859
14860 if (! SvOK(character)) {
14861 goto failed;
14862 }
14863
14864 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14865 if (character_len == SvCUR(character)) {
14866 prop_definition = add_cp_to_invlist(NULL, cp);
14867 }
14868 else {
14869 AV * this_string;
14870
14871 /* First of the remaining characters in the string. */
14872 char * remaining = SvPVX(character) + character_len;
14873
14874 if (strings == NULL) {
14875 goto failed; /* XXX Perhaps a specific msg instead, like
14876 'not available here' */
14877 }
14878
14879 if (*strings == NULL) {
14880 *strings = newAV();
14881 }
14882
14883 this_string = newAV();
14884 av_push_simple(this_string, newSVuv(cp));
14885
14886 do {
14887 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14888 av_push_simple(this_string, newSVuv(cp));
14889 remaining += character_len;
14890 } while (remaining < SvEND(character));
14891
14892 av_push_simple(*strings, (SV *) this_string);
14893 }
14894
14895 return prop_definition;
14896 }
14897
14898 /* Certain properties whose values are numeric need special handling.
14899 * They may optionally be prefixed by 'is'. Ignore that prefix for the
14900 * purposes of checking if this is one of those properties */
14901 if (memBEGINPs(lookup_name, j, "is")) {
14902 lookup_offset = 2;
14903 }
14904
14905 /* Then check if it is one of these specially-handled properties. The
14906 * possibilities are hard-coded because easier this way, and the list
14907 * is unlikely to change.
14908 *
14909 * All numeric value type properties are of this ilk, and are also
14910 * special in a different way later on. So find those first. There
14911 * are several numeric value type properties in the Unihan DB (which is
14912 * unlikely to be compiled with perl, but we handle it here in case it
14913 * does get compiled). They all end with 'numeric'. The interiors
14914 * aren't checked for the precise property. This would stop working if
14915 * a cjk property were to be created that ended with 'numeric' and
14916 * wasn't a numeric type */
14917 is_nv_type = memEQs(lookup_name + lookup_offset,
14918 j - 1 - lookup_offset, "numericvalue")
14919 || memEQs(lookup_name + lookup_offset,
14920 j - 1 - lookup_offset, "nv")
14921 || ( memENDPs(lookup_name + lookup_offset,
14922 j - 1 - lookup_offset, "numeric")
14923 && ( memBEGINPs(lookup_name + lookup_offset,
14924 j - 1 - lookup_offset, "cjk")
14925 || memBEGINPs(lookup_name + lookup_offset,
14926 j - 1 - lookup_offset, "k")));
14927 if ( is_nv_type
14928 || memEQs(lookup_name + lookup_offset,
14929 j - 1 - lookup_offset, "canonicalcombiningclass")
14930 || memEQs(lookup_name + lookup_offset,
14931 j - 1 - lookup_offset, "ccc")
14932 || memEQs(lookup_name + lookup_offset,
14933 j - 1 - lookup_offset, "age")
14934 || memEQs(lookup_name + lookup_offset,
14935 j - 1 - lookup_offset, "in")
14936 || memEQs(lookup_name + lookup_offset,
14937 j - 1 - lookup_offset, "presentin"))
14938 {
14939 unsigned int k;
14940
14941 /* Since the stuff after the '=' is a number, we can't throw away
14942 * '-' willy-nilly, as those could be a minus sign. Other stricter
14943 * rules also apply. However, these properties all can have the
14944 * rhs not be a number, in which case they contain at least one
14945 * alphabetic. In those cases, the stricter rules don't apply.
14946 * But the numeric type properties can have the alphas [Ee] to
14947 * signify an exponent, and it is still a number with stricter
14948 * rules. So look for an alpha that signifies not-strict */
14949 stricter = Strict;
14950 for (k = i; k < name_len; k++) {
14951 if ( isALPHA_A(name[k])
14952 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
14953 {
14954 stricter = Not_Strict;
14955 break;
14956 }
14957 }
14958 }
14959
14960 if (stricter) {
14961
14962 /* A number may have a leading '+' or '-'. The latter is retained
14963 * */
14964 if (name[i] == '+') {
14965 i++;
14966 }
14967 else if (name[i] == '-') {
14968 lookup_name[j++] = '-';
14969 i++;
14970 }
14971
14972 /* Skip leading zeros including single underscores separating the
14973 * zeros, or between the final leading zero and the first other
14974 * digit */
14975 for (; i < name_len - 1; i++) {
14976 if ( name[i] != '0'
14977 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
14978 {
14979 break;
14980 }
14981 }
14982
14983 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
14984 * underling libc implementation. */
14985 if ( i == name_len - 1
14986 && name[name_len-1] == '0'
14987 && lookup_name[j-1] == '-')
14988 {
14989 j--;
14990 }
14991 }
14992 }
14993 else { /* No '=' */
14994
14995 /* Only a few properties without an '=' should be parsed with stricter
14996 * rules. The list is unlikely to change. */
14997 if ( memBEGINPs(lookup_name, j, "perl")
14998 && memNEs(lookup_name + 4, j - 4, "space")
14999 && memNEs(lookup_name + 4, j - 4, "word"))
15000 {
15001 stricter = Strict;
15002
15003 /* We set the inputs back to 0 and the code below will reparse,
15004 * using strict */
15005 i = i_zero;
15006 j = 0;
15007 }
15008 }
15009
15010 /* Here, we have either finished the property, or are positioned to parse
15011 * the remainder, and we know if stricter rules apply. Finish out, if not
15012 * already done */
15013 for (; i < name_len; i++) {
15014 char cur = name[i];
15015
15016 /* In all instances, case differences are ignored, and we normalize to
15017 * lowercase */
15018 if (isUPPER_A(cur)) {
15019 lookup_name[j++] = toLOWER(cur);
15020 continue;
15021 }
15022
15023 /* An underscore is skipped, but not under strict rules unless it
15024 * separates two digits */
15025 if (cur == '_') {
15026 if ( stricter
15027 && ( i == i_zero || (int) i == equals_pos || i == name_len- 1
15028 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15029 {
15030 lookup_name[j++] = '_';
15031 }
15032 continue;
15033 }
15034
15035 /* Hyphens are skipped except under strict */
15036 if (cur == '-' && ! stricter) {
15037 continue;
15038 }
15039
15040 /* XXX Bug in documentation. It says white space skipped adjacent to
15041 * non-word char. Maybe we should, but shouldn't skip it next to a dot
15042 * in a number */
15043 if (isSPACE_A(cur) && ! stricter) {
15044 continue;
15045 }
15046
15047 lookup_name[j++] = cur;
15048
15049 /* Unless this is a non-trailing slash, we are done with it */
15050 if (i >= name_len - 1 || cur != '/') {
15051 continue;
15052 }
15053
15054 slash_pos = j;
15055
15056 /* A slash in the 'numeric value' property indicates that what follows
15057 * is a denominator. It can have a leading '+' and '0's that should be
15058 * skipped. But we have never allowed a negative denominator, so treat
15059 * a minus like every other character. (No need to rule out a second
15060 * '/', as that won't match anything anyway */
15061 if (is_nv_type) {
15062 i++;
15063 if (i < name_len && name[i] == '+') {
15064 i++;
15065 }
15066
15067 /* Skip leading zeros including underscores separating digits */
15068 for (; i < name_len - 1; i++) {
15069 if ( name[i] != '0'
15070 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15071 {
15072 break;
15073 }
15074 }
15075
15076 /* Store the first real character in the denominator */
15077 if (i < name_len) {
15078 lookup_name[j++] = name[i];
15079 }
15080 }
15081 }
15082
15083 /* Here are completely done parsing the input 'name', and 'lookup_name'
15084 * contains a copy, normalized.
15085 *
15086 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15087 * different from without the underscores. */
15088 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
15089 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15090 && UNLIKELY(name[name_len-1] == '_'))
15091 {
15092 lookup_name[j++] = '&';
15093 }
15094
15095 /* If the original input began with 'In' or 'Is', it could be a subroutine
15096 * call to a user-defined property instead of a Unicode property name. */
15097 if ( name_len - non_pkg_begin > 2
15098 && name[non_pkg_begin+0] == 'I'
15099 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15100 {
15101 /* Names that start with In have different characteristics than those
15102 * that start with Is */
15103 if (name[non_pkg_begin+1] == 's') {
15104 starts_with_Is = TRUE;
15105 }
15106 }
15107 else {
15108 could_be_user_defined = FALSE;
15109 }
15110
15111 if (could_be_user_defined) {
15112 CV* user_sub;
15113
15114 /* If the user defined property returns the empty string, it could
15115 * easily be because the pattern is being compiled before the data it
15116 * actually needs to compile is available. This could be argued to be
15117 * a bug in the perl code, but this is a change of behavior for Perl,
15118 * so we handle it. This means that intentionally returning nothing
15119 * will not be resolved until runtime */
15120 bool empty_return = FALSE;
15121
15122 /* Here, the name could be for a user defined property, which are
15123 * implemented as subs. */
15124 user_sub = get_cvn_flags(name, name_len, 0);
15125 if (! user_sub) {
15126
15127 /* Here, the property name could be a user-defined one, but there
15128 * is no subroutine to handle it (as of now). Defer handling it
15129 * until runtime. Otherwise, a block defined by Unicode in a later
15130 * release would get the synonym InFoo added for it, and existing
15131 * code that used that name would suddenly break if it referred to
15132 * the property before the sub was declared. See [perl #134146] */
15133 if (deferrable) {
15134 goto definition_deferred;
15135 }
15136
15137 /* Here, we are at runtime, and didn't find the user property. It
15138 * could be an official property, but only if no package was
15139 * specified, or just the utf8:: package. */
15140 if (could_be_deferred_official) {
15141 lookup_name += lun_non_pkg_begin;
15142 j -= lun_non_pkg_begin;
15143 }
15144 else if (! stripped_utf8_pkg) {
15145 goto unknown_user_defined;
15146 }
15147
15148 /* Drop down to look up in the official properties */
15149 }
15150 else {
15151 const char insecure[] = "Insecure user-defined property";
15152
15153 /* Here, there is a sub by the correct name. Normally we call it
15154 * to get the property definition */
15155 dSP;
15156 SV * user_sub_sv = MUTABLE_SV(user_sub);
15157 SV * error; /* Any error returned by calling 'user_sub' */
15158 SV * key; /* The key into the hash of user defined sub names
15159 */
15160 SV * placeholder;
15161 SV ** saved_user_prop_ptr; /* Hash entry for this property */
15162
15163 /* How many times to retry when another thread is in the middle of
15164 * expanding the same definition we want */
15165 PERL_INT_FAST8_T retry_countdown = 10;
15166
15167 DECLARATION_FOR_GLOBAL_CONTEXT;
15168
15169 /* If we get here, we know this property is user-defined */
15170 *user_defined_ptr = TRUE;
15171
15172 /* We refuse to call a potentially tainted subroutine; returning an
15173 * error instead */
15174 if (TAINT_get) {
15175 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15176 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15177 goto append_name_to_msg;
15178 }
15179
15180 /* In principal, we only call each subroutine property definition
15181 * once during the life of the program. This guarantees that the
15182 * property definition never changes. The results of the single
15183 * sub call are stored in a hash, which is used instead for future
15184 * references to this property. The property definition is thus
15185 * immutable. But, to allow the user to have a /i-dependent
15186 * definition, we call the sub once for non-/i, and once for /i,
15187 * should the need arise, passing the /i status as a parameter.
15188 *
15189 * We start by constructing the hash key name, consisting of the
15190 * fully qualified subroutine name, preceded by the /i status, so
15191 * that there is a key for /i and a different key for non-/i */
15192 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15193 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15194 non_pkg_begin != 0);
15195 sv_catsv(key, fq_name);
15196
15197 /* We only call the sub once throughout the life of the program
15198 * (with the /i, non-/i exception noted above). That means the
15199 * hash must be global and accessible to all threads. It is
15200 * created at program start-up, before any threads are created, so
15201 * is accessible to all children. But this creates some
15202 * complications.
15203 *
15204 * 1) The keys can't be shared, or else problems arise; sharing is
15205 * turned off at hash creation time
15206 * 2) All SVs in it are there for the remainder of the life of the
15207 * program, and must be created in the same interpreter context
15208 * as the hash, or else they will be freed from the wrong pool
15209 * at global destruction time. This is handled by switching to
15210 * the hash's context to create each SV going into it, and then
15211 * immediately switching back
15212 * 3) All accesses to the hash must be controlled by a mutex, to
15213 * prevent two threads from getting an unstable state should
15214 * they simultaneously be accessing it. The code below is
15215 * crafted so that the mutex is locked whenever there is an
15216 * access and unlocked only when the next stable state is
15217 * achieved.
15218 *
15219 * The hash stores either the definition of the property if it was
15220 * valid, or, if invalid, the error message that was raised. We
15221 * use the type of SV to distinguish.
15222 *
15223 * There's also the need to guard against the definition expansion
15224 * from infinitely recursing. This is handled by storing the aTHX
15225 * of the expanding thread during the expansion. Again the SV type
15226 * is used to distinguish this from the other two cases. If we
15227 * come to here and the hash entry for this property is our aTHX,
15228 * it means we have recursed, and the code assumes that we would
15229 * infinitely recurse, so instead stops and raises an error.
15230 * (Any recursion has always been treated as infinite recursion in
15231 * this feature.)
15232 *
15233 * If instead, the entry is for a different aTHX, it means that
15234 * that thread has gotten here first, and hasn't finished expanding
15235 * the definition yet. We just have to wait until it is done. We
15236 * sleep and retry a few times, returning an error if the other
15237 * thread doesn't complete. */
15238
15239 re_fetch:
15240 USER_PROP_MUTEX_LOCK;
15241
15242 /* If we have an entry for this key, the subroutine has already
15243 * been called once with this /i status. */
15244 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15245 SvPVX(key), SvCUR(key), 0);
15246 if (saved_user_prop_ptr) {
15247
15248 /* If the saved result is an inversion list, it is the valid
15249 * definition of this property */
15250 if (is_invlist(*saved_user_prop_ptr)) {
15251 prop_definition = *saved_user_prop_ptr;
15252
15253 /* The SV in the hash won't be removed until global
15254 * destruction, so it is stable and we can unlock */
15255 USER_PROP_MUTEX_UNLOCK;
15256
15257 /* The caller shouldn't try to free this SV */
15258 return prop_definition;
15259 }
15260
15261 /* Otherwise, if it is a string, it is the error message
15262 * that was returned when we first tried to evaluate this
15263 * property. Fail, and append the message */
15264 if (SvPOK(*saved_user_prop_ptr)) {
15265 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15266 sv_catsv(msg, *saved_user_prop_ptr);
15267
15268 /* The SV in the hash won't be removed until global
15269 * destruction, so it is stable and we can unlock */
15270 USER_PROP_MUTEX_UNLOCK;
15271
15272 return NULL;
15273 }
15274
15275 assert(SvIOK(*saved_user_prop_ptr));
15276
15277 /* Here, we have an unstable entry in the hash. Either another
15278 * thread is in the middle of expanding the property's
15279 * definition, or we are ourselves recursing. We use the aTHX
15280 * in it to distinguish */
15281 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15282
15283 /* Here, it's another thread doing the expanding. We've
15284 * looked as much as we are going to at the contents of the
15285 * hash entry. It's safe to unlock. */
15286 USER_PROP_MUTEX_UNLOCK;
15287
15288 /* Retry a few times */
15289 if (retry_countdown-- > 0) {
15290 PerlProc_sleep(1);
15291 goto re_fetch;
15292 }
15293
15294 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15295 sv_catpvs(msg, "Timeout waiting for another thread to "
15296 "define");
15297 goto append_name_to_msg;
15298 }
15299
15300 /* Here, we are recursing; don't dig any deeper */
15301 USER_PROP_MUTEX_UNLOCK;
15302
15303 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15304 sv_catpvs(msg,
15305 "Infinite recursion in user-defined property");
15306 goto append_name_to_msg;
15307 }
15308
15309 /* Here, this thread has exclusive control, and there is no entry
15310 * for this property in the hash. So we have the go ahead to
15311 * expand the definition ourselves. */
15312
15313 PUSHSTACKi(PERLSI_REGCOMP);
15314 ENTER;
15315
15316 /* Create a temporary placeholder in the hash to detect recursion
15317 * */
15318 SWITCH_TO_GLOBAL_CONTEXT;
15319 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15320 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15321 RESTORE_CONTEXT;
15322
15323 /* Now that we have a placeholder, we can let other threads
15324 * continue */
15325 USER_PROP_MUTEX_UNLOCK;
15326
15327 /* Make sure the placeholder always gets destroyed */
15328 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15329
15330 PUSHMARK(SP);
15331 SAVETMPS;
15332
15333 /* Call the user's function, with the /i status as a parameter.
15334 * Note that we have gone to a lot of trouble to keep this call
15335 * from being within the locked mutex region. */
15336 XPUSHs(boolSV(to_fold));
15337 PUTBACK;
15338
15339 /* The following block was taken from swash_init(). Presumably
15340 * they apply to here as well, though we no longer use a swash --
15341 * khw */
15342 SAVEHINTS();
15343 save_re_context();
15344 /* We might get here via a subroutine signature which uses a utf8
15345 * parameter name, at which point PL_subname will have been set
15346 * but not yet used. */
15347 save_item(PL_subname);
15348
15349 /* G_SCALAR guarantees a single return value */
15350 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15351
15352 SPAGAIN;
15353
15354 error = ERRSV;
15355 if (TAINT_get || SvTRUE(error)) {
15356 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15357 if (SvTRUE(error)) {
15358 sv_catpvs(msg, "Error \"");
15359 sv_catsv(msg, error);
15360 sv_catpvs(msg, "\"");
15361 }
15362 if (TAINT_get) {
15363 if (SvTRUE(error)) sv_catpvs(msg, "; ");
15364 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15365 }
15366
15367 if (name_len > 0) {
15368 sv_catpvs(msg, " in expansion of ");
15369 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15370 name_len,
15371 name));
15372 }
15373
15374 (void) POPs;
15375 prop_definition = NULL;
15376 }
15377 else {
15378 SV * contents = POPs;
15379
15380 /* The contents is supposed to be the expansion of the property
15381 * definition. If the definition is deferrable, and we got an
15382 * empty string back, set a flag to later defer it (after clean
15383 * up below). */
15384 if ( deferrable
15385 && (! SvPOK(contents) || SvCUR(contents) == 0))
15386 {
15387 empty_return = TRUE;
15388 }
15389 else { /* Otherwise, call a function to check for valid syntax,
15390 and handle it */
15391
15392 prop_definition = handle_user_defined_property(
15393 name, name_len,
15394 is_utf8, to_fold, runtime,
15395 deferrable,
15396 contents, user_defined_ptr,
15397 msg,
15398 level);
15399 }
15400 }
15401
15402 /* Here, we have the results of the expansion. Delete the
15403 * placeholder, and if the definition is now known, replace it with
15404 * that definition. We need exclusive access to the hash, and we
15405 * can't let anyone else in, between when we delete the placeholder
15406 * and add the permanent entry */
15407 USER_PROP_MUTEX_LOCK;
15408
15409 S_delete_recursion_entry(aTHX_ SvPVX(key));
15410
15411 if ( ! empty_return
15412 && (! prop_definition || is_invlist(prop_definition)))
15413 {
15414 /* If we got success we use the inversion list defining the
15415 * property; otherwise use the error message */
15416 SWITCH_TO_GLOBAL_CONTEXT;
15417 (void) hv_store_ent(PL_user_def_props,
15418 key,
15419 ((prop_definition)
15420 ? newSVsv(prop_definition)
15421 : newSVsv(msg)),
15422 0);
15423 RESTORE_CONTEXT;
15424 }
15425
15426 /* All done, and the hash now has a permanent entry for this
15427 * property. Give up exclusive control */
15428 USER_PROP_MUTEX_UNLOCK;
15429
15430 FREETMPS;
15431 LEAVE;
15432 POPSTACK;
15433
15434 if (empty_return) {
15435 goto definition_deferred;
15436 }
15437
15438 if (prop_definition) {
15439
15440 /* If the definition is for something not known at this time,
15441 * we toss it, and go return the main property name, as that's
15442 * the one the user will be aware of */
15443 if (! is_invlist(prop_definition)) {
15444 SvREFCNT_dec_NN(prop_definition);
15445 goto definition_deferred;
15446 }
15447
15448 sv_2mortal(prop_definition);
15449 }
15450
15451 /* And return */
15452 return prop_definition;
15453
15454 } /* End of calling the subroutine for the user-defined property */
15455 } /* End of it could be a user-defined property */
15456
15457 /* Here it wasn't a user-defined property that is known at this time. See
15458 * if it is a Unicode property */
15459
15460 lookup_len = j; /* This is a more mnemonic name than 'j' */
15461
15462 /* Get the index into our pointer table of the inversion list corresponding
15463 * to the property */
15464 table_index = do_uniprop_match(lookup_name, lookup_len);
15465
15466 /* If it didn't find the property ... */
15467 if (table_index == 0) {
15468
15469 /* Try again stripping off any initial 'Is'. This is because we
15470 * promise that an initial Is is optional. The same isn't true of
15471 * names that start with 'In'. Those can match only blocks, and the
15472 * lookup table already has those accounted for. The lookup table also
15473 * has already accounted for Perl extensions (without and = sign)
15474 * starting with 'i's'. */
15475 if (starts_with_Is && equals_pos >= 0) {
15476 lookup_name += 2;
15477 lookup_len -= 2;
15478 equals_pos -= 2;
15479 slash_pos -= 2;
15480
15481 table_index = do_uniprop_match(lookup_name, lookup_len);
15482 }
15483
15484 if (table_index == 0) {
15485 char * canonical;
15486
15487 /* Here, we didn't find it. If not a numeric type property, and
15488 * can't be a user-defined one, it isn't a legal property */
15489 if (! is_nv_type) {
15490 if (! could_be_user_defined) {
15491 goto failed;
15492 }
15493
15494 /* Here, the property name is legal as a user-defined one. At
15495 * compile time, it might just be that the subroutine for that
15496 * property hasn't been encountered yet, but at runtime, it's
15497 * an error to try to use an undefined one */
15498 if (! deferrable) {
15499 goto unknown_user_defined;;
15500 }
15501
15502 goto definition_deferred;
15503 } /* End of isn't a numeric type property */
15504
15505 /* The numeric type properties need more work to decide. What we
15506 * do is make sure we have the number in canonical form and look
15507 * that up. */
15508
15509 if (slash_pos < 0) { /* No slash */
15510
15511 /* When it isn't a rational, take the input, convert it to a
15512 * NV, then create a canonical string representation of that
15513 * NV. */
15514
15515 NV value;
15516 SSize_t value_len = lookup_len - equals_pos;
15517
15518 /* Get the value */
15519 if ( value_len <= 0
15520 || my_atof3(lookup_name + equals_pos, &value,
15521 value_len)
15522 != lookup_name + lookup_len)
15523 {
15524 goto failed;
15525 }
15526
15527 /* If the value is an integer, the canonical value is integral
15528 * */
15529 if (Perl_ceil(value) == value) {
15530 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15531 equals_pos, lookup_name, value);
15532 }
15533 else { /* Otherwise, it is %e with a known precision */
15534 char * exp_ptr;
15535
15536 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15537 equals_pos, lookup_name,
15538 PL_E_FORMAT_PRECISION, value);
15539
15540 /* The exponent generated is expecting two digits, whereas
15541 * %e on some systems will generate three. Remove leading
15542 * zeros in excess of 2 from the exponent. We start
15543 * looking for them after the '=' */
15544 exp_ptr = strchr(canonical + equals_pos, 'e');
15545 if (exp_ptr) {
15546 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15547 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15548
15549 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15550
15551 if (excess_exponent_len > 0) {
15552 SSize_t leading_zeros = strspn(cur_ptr, "0");
15553 SSize_t excess_leading_zeros
15554 = MIN(leading_zeros, excess_exponent_len);
15555 if (excess_leading_zeros > 0) {
15556 Move(cur_ptr + excess_leading_zeros,
15557 cur_ptr,
15558 strlen(cur_ptr) - excess_leading_zeros
15559 + 1, /* Copy the NUL as well */
15560 char);
15561 }
15562 }
15563 }
15564 }
15565 }
15566 else { /* Has a slash. Create a rational in canonical form */
15567 UV numerator, denominator, gcd, trial;
15568 const char * end_ptr;
15569 const char * sign = "";
15570
15571 /* We can't just find the numerator, denominator, and do the
15572 * division, then use the method above, because that is
15573 * inexact. And the input could be a rational that is within
15574 * epsilon (given our precision) of a valid rational, and would
15575 * then incorrectly compare valid.
15576 *
15577 * We're only interested in the part after the '=' */
15578 const char * this_lookup_name = lookup_name + equals_pos;
15579 lookup_len -= equals_pos;
15580 slash_pos -= equals_pos;
15581
15582 /* Handle any leading minus */
15583 if (this_lookup_name[0] == '-') {
15584 sign = "-";
15585 this_lookup_name++;
15586 lookup_len--;
15587 slash_pos--;
15588 }
15589
15590 /* Convert the numerator to numeric */
15591 end_ptr = this_lookup_name + slash_pos;
15592 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15593 goto failed;
15594 }
15595
15596 /* It better have included all characters before the slash */
15597 if (*end_ptr != '/') {
15598 goto failed;
15599 }
15600
15601 /* Set to look at just the denominator */
15602 this_lookup_name += slash_pos;
15603 lookup_len -= slash_pos;
15604 end_ptr = this_lookup_name + lookup_len;
15605
15606 /* Convert the denominator to numeric */
15607 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15608 goto failed;
15609 }
15610
15611 /* It better be the rest of the characters, and don't divide by
15612 * 0 */
15613 if ( end_ptr != this_lookup_name + lookup_len
15614 || denominator == 0)
15615 {
15616 goto failed;
15617 }
15618
15619 /* Get the greatest common denominator using
15620 http://en.wikipedia.org/wiki/Euclidean_algorithm */
15621 gcd = numerator;
15622 trial = denominator;
15623 while (trial != 0) {
15624 UV temp = trial;
15625 trial = gcd % trial;
15626 gcd = temp;
15627 }
15628
15629 /* If already in lowest possible terms, we have already tried
15630 * looking this up */
15631 if (gcd == 1) {
15632 goto failed;
15633 }
15634
15635 /* Reduce the rational, which should put it in canonical form
15636 * */
15637 numerator /= gcd;
15638 denominator /= gcd;
15639
15640 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15641 equals_pos, lookup_name, sign, numerator, denominator);
15642 }
15643
15644 /* Here, we have the number in canonical form. Try that */
15645 table_index = do_uniprop_match(canonical, strlen(canonical));
15646 if (table_index == 0) {
15647 goto failed;
15648 }
15649 } /* End of still didn't find the property in our table */
15650 } /* End of didn't find the property in our table */
15651
15652 /* Here, we have a non-zero return, which is an index into a table of ptrs.
15653 * A negative return signifies that the real index is the absolute value,
15654 * but the result needs to be inverted */
15655 if (table_index < 0) {
15656 invert_return = TRUE;
15657 table_index = -table_index;
15658 }
15659
15660 /* Out-of band indices indicate a deprecated property. The proper index is
15661 * modulo it with the table size. And dividing by the table size yields
15662 * an offset into a table constructed by regen/mk_invlists.pl to contain
15663 * the corresponding warning message */
15664 if (table_index > MAX_UNI_KEYWORD_INDEX) {
15665 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15666 table_index %= MAX_UNI_KEYWORD_INDEX;
15667 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15668 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15669 (int) name_len, name,
15670 get_deprecated_property_msg(warning_offset));
15671 }
15672
15673 /* In a few properties, a different property is used under /i. These are
15674 * unlikely to change, so are hard-coded here. */
15675 if (to_fold) {
15676 if ( table_index == UNI_XPOSIXUPPER
15677 || table_index == UNI_XPOSIXLOWER
15678 || table_index == UNI_TITLE)
15679 {
15680 table_index = UNI_CASED;
15681 }
15682 else if ( table_index == UNI_UPPERCASELETTER
15683 || table_index == UNI_LOWERCASELETTER
15684 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
15685 || table_index == UNI_TITLECASELETTER
15686 # endif
15687 ) {
15688 table_index = UNI_CASEDLETTER;
15689 }
15690 else if ( table_index == UNI_POSIXUPPER
15691 || table_index == UNI_POSIXLOWER)
15692 {
15693 table_index = UNI_POSIXALPHA;
15694 }
15695 }
15696
15697 /* Create and return the inversion list */
15698 prop_definition = get_prop_definition(table_index);
15699 sv_2mortal(prop_definition);
15700
15701 /* See if there is a private use override to add to this definition */
15702 {
15703 COPHH * hinthash = (IN_PERL_COMPILETIME)
15704 ? CopHINTHASH_get(&PL_compiling)
15705 : CopHINTHASH_get(PL_curcop);
15706 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15707
15708 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15709
15710 /* See if there is an element in the hints hash for this table */
15711 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15712 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15713
15714 if (pos) {
15715 bool dummy;
15716 SV * pu_definition;
15717 SV * pu_invlist;
15718 SV * expanded_prop_definition =
15719 sv_2mortal(invlist_clone(prop_definition, NULL));
15720
15721 /* If so, it's definition is the string from here to the next
15722 * \a character. And its format is the same as a user-defined
15723 * property */
15724 pos += SvCUR(pu_lookup);
15725 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15726 pu_invlist = handle_user_defined_property(lookup_name,
15727 lookup_len,
15728 0, /* Not UTF-8 */
15729 0, /* Not folded */
15730 runtime,
15731 deferrable,
15732 pu_definition,
15733 &dummy,
15734 msg,
15735 level);
15736 if (TAINT_get) {
15737 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15738 sv_catpvs(msg, "Insecure private-use override");
15739 goto append_name_to_msg;
15740 }
15741
15742 /* For now, as a safety measure, make sure that it doesn't
15743 * override non-private use code points */
15744 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15745
15746 /* Add it to the list to be returned */
15747 _invlist_union(prop_definition, pu_invlist,
15748 &expanded_prop_definition);
15749 prop_definition = expanded_prop_definition;
15750 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15751 }
15752 }
15753 }
15754
15755 if (invert_return) {
15756 _invlist_invert(prop_definition);
15757 }
15758 return prop_definition;
15759
15760 unknown_user_defined:
15761 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15762 sv_catpvs(msg, "Unknown user-defined property name");
15763 goto append_name_to_msg;
15764
15765 failed:
15766 if (non_pkg_begin != 0) {
15767 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15768 sv_catpvs(msg, "Illegal user-defined property name");
15769 }
15770 else {
15771 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15772 sv_catpvs(msg, "Can't find Unicode property definition");
15773 }
15774 /* FALLTHROUGH */
15775
15776 append_name_to_msg:
15777 {
15778 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
15779 const char * suffix = (runtime && level == 0) ? "}" : "\"";
15780
15781 sv_catpv(msg, prefix);
15782 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15783 sv_catpv(msg, suffix);
15784 }
15785
15786 return NULL;
15787
15788 definition_deferred:
15789
15790 {
15791 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
15792
15793 /* Here it could yet to be defined, so defer evaluation of this until
15794 * its needed at runtime. We need the fully qualified property name to
15795 * avoid ambiguity */
15796 if (! fq_name) {
15797 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15798 is_qualified);
15799 }
15800
15801 /* If it didn't come with a package, or the package is utf8::, this
15802 * actually could be an official Unicode property whose inclusion we
15803 * are deferring until runtime to make sure that it isn't overridden by
15804 * a user-defined property of the same name (which we haven't
15805 * encountered yet). Add a marker to indicate this possibility, for
15806 * use at such time when we first need the definition during pattern
15807 * matching execution */
15808 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15809 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15810 }
15811
15812 /* We also need a trailing newline */
15813 sv_catpvs(fq_name, "\n");
15814
15815 *user_defined_ptr = TRUE;
15816 return fq_name;
15817 }
15818 }
15819
15820 STATIC bool
S_handle_names_wildcard(pTHX_ const char * wname,const STRLEN wname_len,SV ** prop_definition,AV ** strings)15821 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15822 const STRLEN wname_len, /* Its length */
15823 SV ** prop_definition,
15824 AV ** strings)
15825 {
15826 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15827 * any matches, adding them to prop_definition */
15828
15829 dSP;
15830
15831 CV * get_names_info; /* entry to charnames.pm to get info we need */
15832 SV * names_string; /* Contains all character names, except algo */
15833 SV * algorithmic_names; /* Contains info about algorithmically
15834 generated character names */
15835 REGEXP * subpattern_re; /* The user's pattern to match with */
15836 struct regexp * prog; /* The compiled pattern */
15837 char * all_names_start; /* lib/unicore/Name.pl string of every
15838 (non-algorithmic) character name */
15839 char * cur_pos; /* We match, effectively using /gc; this is
15840 where we are now */
15841 bool found_matches = FALSE; /* Did any name match so far? */
15842 SV * empty; /* For matching zero length names */
15843 SV * must_sv; /* Contains the substring, if any, that must be
15844 in a name for the subpattern to match */
15845 const char * must; /* The PV of 'must' */
15846 STRLEN must_len; /* And its length */
15847 SV * syllable_name = NULL; /* For Hangul syllables */
15848 const char hangul_prefix[] = "HANGUL SYLLABLE ";
15849 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15850
15851 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15852 * syllable name, and these are immutable and guaranteed by the Unicode
15853 * standard to never be extended */
15854 const STRLEN syl_max_len = hangul_prefix_len + 7;
15855
15856 IV i;
15857
15858 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15859
15860 /* Make sure _charnames is loaded. (The parameters give context
15861 * for any errors generated */
15862 get_names_info = get_cv("_charnames::_get_names_info", 0);
15863 if (! get_names_info) {
15864 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15865 }
15866
15867 /* Get the charnames data */
15868 PUSHSTACKi(PERLSI_REGCOMP);
15869 ENTER ;
15870 SAVETMPS;
15871 save_re_context();
15872
15873 PUSHMARK(SP) ;
15874 PUTBACK;
15875
15876 /* Special _charnames entry point that returns the info this routine
15877 * requires */
15878 call_sv(MUTABLE_SV(get_names_info), G_LIST);
15879
15880 SPAGAIN ;
15881
15882 /* Data structure for names which end in their very own code points */
15883 algorithmic_names = POPs;
15884 SvREFCNT_inc_simple_void_NN(algorithmic_names);
15885
15886 /* The lib/unicore/Name.pl string */
15887 names_string = POPs;
15888 SvREFCNT_inc_simple_void_NN(names_string);
15889
15890 PUTBACK ;
15891 FREETMPS ;
15892 LEAVE ;
15893 POPSTACK;
15894
15895 if ( ! SvROK(names_string)
15896 || ! SvROK(algorithmic_names))
15897 { /* Perhaps should panic instead XXX */
15898 SvREFCNT_dec(names_string);
15899 SvREFCNT_dec(algorithmic_names);
15900 return FALSE;
15901 }
15902
15903 names_string = sv_2mortal(SvRV(names_string));
15904 all_names_start = SvPVX(names_string);
15905 cur_pos = all_names_start;
15906
15907 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
15908
15909 /* Compile the subpattern consisting of the name being looked for */
15910 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
15911
15912 must_sv = re_intuit_string(subpattern_re);
15913 if (must_sv) {
15914 /* regexec.c can free the re_intuit_string() return. GH #17734 */
15915 must_sv = sv_2mortal(newSVsv(must_sv));
15916 must = SvPV(must_sv, must_len);
15917 }
15918 else {
15919 must = "";
15920 must_len = 0;
15921 }
15922
15923 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
15924 * This works because the NUL causes the function to return early, thus
15925 * showing that there are characters in it other than the acceptable ones,
15926 * which is our desired result.) */
15927
15928 prog = ReANY(subpattern_re);
15929
15930 /* If only nothing is matched, skip to where empty names are looked for */
15931 if (prog->maxlen == 0) {
15932 goto check_empty;
15933 }
15934
15935 /* And match against the string of all names /gc. Don't even try if it
15936 * must match a character not found in any name. */
15937 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
15938 {
15939 while (execute_wildcard(subpattern_re,
15940 cur_pos,
15941 SvEND(names_string),
15942 all_names_start, 0,
15943 names_string,
15944 0))
15945 { /* Here, matched. */
15946
15947 /* Note the string entries look like
15948 * 00001\nSTART OF HEADING\n\n
15949 * so we could match anywhere in that string. We have to rule out
15950 * matching a code point line */
15951 char * this_name_start = all_names_start
15952 + RX_OFFS_START(subpattern_re,0);
15953 char * this_name_end = all_names_start
15954 + RX_OFFS_END(subpattern_re,0);
15955 char * cp_start;
15956 char * cp_end;
15957 UV cp = 0; /* Silences some compilers */
15958 AV * this_string = NULL;
15959 bool is_multi = FALSE;
15960
15961 /* If matched nothing, advance to next possible match */
15962 if (this_name_start == this_name_end) {
15963 cur_pos = (char *) memchr(this_name_end + 1, '\n',
15964 SvEND(names_string) - this_name_end);
15965 if (cur_pos == NULL) {
15966 break;
15967 }
15968 }
15969 else {
15970 /* Position the next match to start beyond the current returned
15971 * entry */
15972 cur_pos = (char *) memchr(this_name_end, '\n',
15973 SvEND(names_string) - this_name_end);
15974 }
15975
15976 /* Back up to the \n just before the beginning of the character. */
15977 cp_end = (char *) my_memrchr(all_names_start,
15978 '\n',
15979 this_name_start - all_names_start);
15980
15981 /* If we didn't find a \n, it means it matched somewhere in the
15982 * initial '00000' in the string, so isn't a real match */
15983 if (cp_end == NULL) {
15984 continue;
15985 }
15986
15987 this_name_start = cp_end + 1; /* The name starts just after */
15988 cp_end--; /* the \n, and the code point */
15989 /* ends just before it */
15990
15991 /* All code points are 5 digits long */
15992 cp_start = cp_end - 4;
15993
15994 /* This shouldn't happen, as we found a \n, and the first \n is
15995 * further along than what we subtracted */
15996 assert(cp_start >= all_names_start);
15997
15998 if (cp_start == all_names_start) {
15999 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16000 continue;
16001 }
16002
16003 /* If the character is a blank, we either have a named sequence, or
16004 * something is wrong */
16005 if (*(cp_start - 1) == ' ') {
16006 cp_start = (char *) my_memrchr(all_names_start,
16007 '\n',
16008 cp_start - all_names_start);
16009 cp_start++;
16010 }
16011
16012 assert(cp_start != NULL && cp_start >= all_names_start + 2);
16013
16014 /* Except for the first line in the string, the sequence before the
16015 * code point is \n\n. If that isn't the case here, we didn't
16016 * match the name of a character. (We could have matched a named
16017 * sequence, not currently handled */
16018 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16019 continue;
16020 }
16021
16022 /* We matched! Add this to the list */
16023 found_matches = TRUE;
16024
16025 /* Loop through all the code points in the sequence */
16026 while (cp_start < cp_end) {
16027
16028 /* Calculate this code point from its 5 digits */
16029 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16030 + (XDIGIT_VALUE(cp_start[1]) << 12)
16031 + (XDIGIT_VALUE(cp_start[2]) << 8)
16032 + (XDIGIT_VALUE(cp_start[3]) << 4)
16033 + XDIGIT_VALUE(cp_start[4]);
16034
16035 cp_start += 6; /* Go past any blank */
16036
16037 if (cp_start < cp_end || is_multi) {
16038 if (this_string == NULL) {
16039 this_string = newAV();
16040 }
16041
16042 is_multi = TRUE;
16043 av_push_simple(this_string, newSVuv(cp));
16044 }
16045 }
16046
16047 if (is_multi) { /* Was more than one code point */
16048 if (*strings == NULL) {
16049 *strings = newAV();
16050 }
16051
16052 av_push_simple(*strings, (SV *) this_string);
16053 }
16054 else { /* Only a single code point */
16055 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16056 }
16057 } /* End of loop through the non-algorithmic names string */
16058 }
16059
16060 /* There are also character names not in 'names_string'. These are
16061 * algorithmically generatable. Try this pattern on each possible one.
16062 * (khw originally planned to leave this out given the large number of
16063 * matches attempted; but the speed turned out to be quite acceptable
16064 *
16065 * There are plenty of opportunities to optimize to skip many of the tests.
16066 * beyond the rudimentary ones already here */
16067
16068 /* First see if the subpattern matches any of the algorithmic generatable
16069 * Hangul syllable names.
16070 *
16071 * We know none of these syllable names will match if the input pattern
16072 * requires more bytes than any syllable has, or if the input pattern only
16073 * matches an empty name, or if the pattern has something it must match and
16074 * one of the characters in that isn't in any Hangul syllable. */
16075 if ( prog->minlen <= (SSize_t) syl_max_len
16076 && prog->maxlen > 0
16077 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16078 {
16079 /* These constants, names, values, and algorithm are adapted from the
16080 * Unicode standard, version 5.1, section 3.12, and should never
16081 * change. */
16082 const char * JamoL[] = {
16083 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16084 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16085 };
16086 const int LCount = C_ARRAY_LENGTH(JamoL);
16087
16088 const char * JamoV[] = {
16089 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16090 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16091 "I"
16092 };
16093 const int VCount = C_ARRAY_LENGTH(JamoV);
16094
16095 const char * JamoT[] = {
16096 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16097 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16098 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16099 };
16100 const int TCount = C_ARRAY_LENGTH(JamoT);
16101
16102 int L, V, T;
16103
16104 /* This is the initial Hangul syllable code point; each time through the
16105 * inner loop, it maps to the next higher code point. For more info,
16106 * see the Hangul syllable section of the Unicode standard. */
16107 int cp = 0xAC00;
16108
16109 syllable_name = sv_2mortal(newSV(syl_max_len));
16110 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16111
16112 for (L = 0; L < LCount; L++) {
16113 for (V = 0; V < VCount; V++) {
16114 for (T = 0; T < TCount; T++) {
16115
16116 /* Truncate back to the prefix, which is unvarying */
16117 SvCUR_set(syllable_name, hangul_prefix_len);
16118
16119 sv_catpv(syllable_name, JamoL[L]);
16120 sv_catpv(syllable_name, JamoV[V]);
16121 sv_catpv(syllable_name, JamoT[T]);
16122
16123 if (execute_wildcard(subpattern_re,
16124 SvPVX(syllable_name),
16125 SvEND(syllable_name),
16126 SvPVX(syllable_name), 0,
16127 syllable_name,
16128 0))
16129 {
16130 *prop_definition = add_cp_to_invlist(*prop_definition,
16131 cp);
16132 found_matches = TRUE;
16133 }
16134
16135 cp++;
16136 }
16137 }
16138 }
16139 }
16140
16141 /* The rest of the algorithmically generatable names are of the form
16142 * "PREFIX-code_point". The prefixes and the code point limits of each
16143 * were returned to us in the array 'algorithmic_names' from data in
16144 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
16145 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16146 IV j;
16147
16148 /* Each element of the array is a hash, giving the details for the
16149 * series of names it covers. There is the base name of the characters
16150 * in the series, and the low and high code points in the series. And,
16151 * for optimization purposes a string containing all the legal
16152 * characters that could possibly be in a name in this series. */
16153 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16154 SV * prefix = * hv_fetchs(this_series, "name", 0);
16155 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16156 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16157 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16158
16159 /* Pre-allocate an SV with enough space */
16160 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16161 SvPVX(prefix)));
16162 if (high >= 0x10000) {
16163 sv_catpvs(algo_name, "0");
16164 }
16165
16166 /* This series can be skipped entirely if the pattern requires
16167 * something longer than any name in the series, or can only match an
16168 * empty name, or contains a character not found in any name in the
16169 * series */
16170 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
16171 && prog->maxlen > 0
16172 && (strspn(must, legal) == must_len))
16173 {
16174 for (j = low; j <= high; j++) { /* For each code point in the series */
16175
16176 /* Get its name, and see if it matches the subpattern */
16177 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16178 (unsigned) j);
16179
16180 if (execute_wildcard(subpattern_re,
16181 SvPVX(algo_name),
16182 SvEND(algo_name),
16183 SvPVX(algo_name), 0,
16184 algo_name,
16185 0))
16186 {
16187 *prop_definition = add_cp_to_invlist(*prop_definition, j);
16188 found_matches = TRUE;
16189 }
16190 }
16191 }
16192 }
16193
16194 check_empty:
16195 /* Finally, see if the subpattern matches an empty string */
16196 empty = newSVpvs("");
16197 if (execute_wildcard(subpattern_re,
16198 SvPVX(empty),
16199 SvEND(empty),
16200 SvPVX(empty), 0,
16201 empty,
16202 0))
16203 {
16204 /* Many code points have empty names. Currently these are the \p{GC=C}
16205 * ones, minus CC and CF */
16206
16207 SV * empty_names_ref = get_prop_definition(UNI_C);
16208 SV * empty_names = invlist_clone(empty_names_ref, NULL);
16209
16210 SV * subtract = get_prop_definition(UNI_CC);
16211
16212 _invlist_subtract(empty_names, subtract, &empty_names);
16213 SvREFCNT_dec_NN(empty_names_ref);
16214 SvREFCNT_dec_NN(subtract);
16215
16216 subtract = get_prop_definition(UNI_CF);
16217 _invlist_subtract(empty_names, subtract, &empty_names);
16218 SvREFCNT_dec_NN(subtract);
16219
16220 _invlist_union(*prop_definition, empty_names, prop_definition);
16221 found_matches = TRUE;
16222 SvREFCNT_dec_NN(empty_names);
16223 }
16224 SvREFCNT_dec_NN(empty);
16225
16226 #if 0
16227 /* If we ever were to accept aliases for, say private use names, we would
16228 * need to do something fancier to find empty names. The code below works
16229 * (at the time it was written), and is slower than the above */
16230 const char empties_pat[] = "^.";
16231 if (strNE(name, empties_pat)) {
16232 SV * empty = newSVpvs("");
16233 if (execute_wildcard(subpattern_re,
16234 SvPVX(empty),
16235 SvEND(empty),
16236 SvPVX(empty), 0,
16237 empty,
16238 0))
16239 {
16240 SV * empties = NULL;
16241
16242 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16243
16244 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16245 SvREFCNT_dec_NN(empties);
16246
16247 found_matches = TRUE;
16248 }
16249 SvREFCNT_dec_NN(empty);
16250 }
16251 #endif
16252
16253 SvREFCNT_dec_NN(subpattern_re);
16254 return found_matches;
16255 }
16256
16257 /*
16258 * ex: set ts=8 sts=4 sw=4 et:
16259 */
16260